1 // Copyright Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 //! Functions for converting between different in-RAM representations of text
11 //! and for quickly checking if the Unicode Bidirectional Algorithm can be
12 //! avoided.
13 //!
14 //! By using slices for output, the functions here seek to enable by-register
15 //! (ALU register or SIMD register as available) operations in order to
16 //! outperform iterator-based conversions available in the Rust standard
17 //! library.
18 //!
19 //! _Note:_ "Latin1" in this module refers to the Unicode range from U+0000 to
20 //! U+00FF, inclusive, and does not refer to the windows-1252 range. This
21 //! in-memory encoding is sometimes used as a storage optimization of text
22 //! when UTF-16 indexing and length semantics are exposed.
23 //!
//! The FFI bindings for this module are in the
25 //! [encoding_c_mem crate](https://github.com/hsivonen/encoding_c_mem).
26 
27 #[cfg(feature = "alloc")]
28 use alloc::borrow::Cow;
29 #[cfg(feature = "alloc")]
30 use alloc::string::String;
31 #[cfg(feature = "alloc")]
32 use alloc::vec::Vec;
33 
34 use super::in_inclusive_range16;
35 use super::in_inclusive_range32;
36 use super::in_inclusive_range8;
37 use super::in_range16;
38 use super::in_range32;
39 use super::DecoderResult;
40 use crate::ascii::*;
41 use crate::utf_8::*;
42 
/// Forwards its arguments to `debug_assert!`, except that the assertion is
/// skipped entirely when the crate is built with `--cfg fuzzing`.
macro_rules! non_fuzz_debug_assert {
    ($($arg:tt)*) => {
        if !cfg!(fuzzing) {
            debug_assert!($($arg)*);
        }
    };
}
46 
cfg_if! {
    if #[cfg(feature = "simd-accel")] {
        // With `simd-accel` enabled, use the branch-prediction hint
        // intrinsics from `core` directly.
        use ::core::intrinsics::likely;
        use ::core::intrinsics::unlikely;
    } else {
        // Without `simd-accel`, provide identity stand-ins under the same
        // names. They simply return their argument unchanged.
        #[inline(always)]
        // Unsafe to match the intrinsic, which is needlessly unsafe.
        unsafe fn likely(b: bool) -> bool {
            b
        }
        #[inline(always)]
        // Unsafe to match the intrinsic, which is needlessly unsafe.
        unsafe fn unlikely(b: bool) -> bool {
            b
        }
    }
}
64 
/// Classification of text as Latin1 (all code points are below U+0100),
/// left-to-right with some non-Latin1 characters or as containing at least
/// some right-to-left characters.
///
/// The enum is field-less, `#[repr(C)]` and has explicit discriminants, so
/// it is a plain value type; it derives `Clone` and `Copy` so that a
/// classification result can be stored and reused without being moved.
#[must_use]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[repr(C)]
pub enum Latin1Bidi {
    /// Every character is below U+0100.
    Latin1 = 0,
    /// There is at least one character that's U+0100 or higher, but there
    /// are no right-to-left characters.
    LeftToRight = 1,
    /// There is at least one right-to-left character.
    Bidi = 2,
}
80 
// Word-sized mask selecting the high byte of every 16-bit lane, i.e. the
// bits that are set only in UTF-16 code units above U+00FF.
// The literal is written as a 64-bit value; `as` truncates, so works on
// 32-bit, too.
// In the code visible in this part of the file the constant is only
// referenced from the non-SIMD (`else`) configurations, hence the
// `dead_code` allowance.
#[allow(dead_code)]
const LATIN1_MASK: usize = 0xFF00_FF00_FF00_FF00u64 as usize;
84 
/// Generates `fn $name(buffer: &[$unit]) -> bool`, which ORs the code units
/// of `buffer` together one machine word at a time and reports whether none
/// of the bits selected by `$mask` were set.
///
/// Macro parameters:
/// * `$name`: name of the generated function.
/// * `$unit`: code unit type of the slice (instantiated in this file with
///   `u8` and `u16`).
/// * `$bound`: smallest out-of-range code unit value; used for the scalar
///   early exits.
/// * `$mask`: name of a `usize` constant that is ANDed with whole words (and
///   with the final accumulator) to detect out-of-range code units.
///
/// The expansion refers to `ALU_ALIGNMENT` and `ALU_ALIGNMENT_MASK`, which
/// must be in scope at the expansion site.
///
/// The generated function is not guaranteed to fail fast: only the first
/// code unit, the unaligned prefix and the four-word unrolled loop have early
/// exits; the single-word loop and the scalar tail just accumulate.
#[allow(unused_macros)]
macro_rules! by_unit_check_alu {
    ($name:ident, $unit:ty, $bound:expr, $mask:ident) => {
        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            // `offset` counts code units, not bytes.
            let mut offset = 0usize;
            // Running OR of everything inspected without an early exit.
            let mut accu = 0usize;
            let unit_size = ::core::mem::size_of::<$unit>();
            let len = buffer.len();
            // Word-at-a-time processing is only attempted when the buffer
            // holds at least one word's worth of code units.
            if len >= ALU_ALIGNMENT / unit_size {
                // The most common reason to return `false` is for the first code
                // unit to fail the test, so check that first.
                if buffer[0] >= $bound {
                    return false;
                }
                let src = buffer.as_ptr();
                // Number of code units before the next `ALU_ALIGNMENT`-aligned
                // address (zero if `src` is already aligned).
                let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK))
                    & ALU_ALIGNMENT_MASK)
                    / unit_size;
                // Require room for the unaligned prefix plus at least one full
                // aligned word; otherwise fall through to the scalar loop.
                if until_alignment + ALU_ALIGNMENT / unit_size <= len {
                    if until_alignment != 0 {
                        // Scalar pass over the unaligned prefix. The first
                        // iteration is peeled off ahead of the `while` loop.
                        accu |= buffer[offset] as usize;
                        offset += 1;
                        until_alignment -= 1;
                        while until_alignment != 0 {
                            accu |= buffer[offset] as usize;
                            offset += 1;
                            until_alignment -= 1;
                        }
                        // `accu` is the OR of the prefix units. With the
                        // power-of-two bounds used in this file, it reaches
                        // `$bound` exactly when some prefix unit does.
                        if accu >= $bound {
                            return false;
                        }
                    }
                    // Largest offset at which one full word can still be read.
                    let len_minus_stride = len - ALU_ALIGNMENT / unit_size;
                    if offset + (4 * (ALU_ALIGNMENT / unit_size)) <= len {
                        // Largest offset at which four full words can still be read.
                        let len_minus_unroll = len - (4 * (ALU_ALIGNMENT / unit_size));
                        loop {
                            // SAFETY (review note): `offset` is at the aligned
                            // position computed above and
                            // `offset + 4 * (ALU_ALIGNMENT / unit_size) <= len`
                            // holds on entry and is re-established by the check
                            // at the bottom of the loop, so the four reads stay
                            // inside `buffer`. This assumes `ALU_ALIGNMENT`
                            // equals `size_of::<usize>()`; it is defined outside
                            // this part of the file, so confirm there.
                            let unroll_accu = unsafe { *(src.add(offset) as *const usize) }
                                | unsafe {
                                    *(src.add(offset + (ALU_ALIGNMENT / unit_size)) as *const usize)
                                }
                                | unsafe {
                                    *(src.add(offset + (2 * (ALU_ALIGNMENT / unit_size)))
                                        as *const usize)
                                }
                                | unsafe {
                                    *(src.add(offset + (3 * (ALU_ALIGNMENT / unit_size)))
                                        as *const usize)
                                };
                            // Early exit once per four words.
                            if unroll_accu & $mask != 0 {
                                return false;
                            }
                            offset += 4 * (ALU_ALIGNMENT / unit_size);
                            if offset > len_minus_unroll {
                                break;
                            }
                        }
                    }
                    // Remaining full words (fewer than four): accumulate only,
                    // the verdict is deferred to the final mask test.
                    while offset <= len_minus_stride {
                        // SAFETY (review note): `offset <= len_minus_stride`
                        // means one full word starting at `offset` is inside
                        // `buffer`; same `ALU_ALIGNMENT` assumption as above.
                        accu |= unsafe { *(src.add(offset) as *const usize) };
                        offset += ALU_ALIGNMENT / unit_size;
                    }
                }
            }
            // Scalar tail; also handles the whole buffer when the word path
            // was not taken.
            for &unit in &buffer[offset..] {
                accu |= unit as usize;
            }
            accu & $mask == 0
        }
    };
}
157 
/// Generates `fn $name(buffer: &[$unit]) -> bool`, the SIMD counterpart of
/// `by_unit_check_alu!`: code units are ORed together one SIMD vector at a
/// time and the vector predicate `$func` decides whether they are in range.
///
/// Macro parameters:
/// * `$name`: name of the generated function.
/// * `$unit`: code unit type of the slice.
/// * `$splat`: expression producing the initial (all-zero in this file's
///   instantiations) accumulator vector.
/// * `$simd_ty`: SIMD vector type used for the aligned loads.
/// * `$bound`: smallest out-of-range code unit value, used by the scalar
///   checks.
/// * `$func`: function taking a `$simd_ty` and returning `true` when the
///   vector passes the check.
///
/// The expansion refers to `SIMD_STRIDE_SIZE`, `SIMD_ALIGNMENT` and
/// `SIMD_ALIGNMENT_MASK`, which must be in scope at the expansion site.
///
/// The generated function is not guaranteed to fail fast.
#[allow(unused_macros)]
macro_rules! by_unit_check_simd {
    ($name:ident, $unit:ty, $splat:expr, $simd_ty:ty, $bound:expr, $func:ident) => {
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            // `offset` counts code units, not bytes.
            let mut offset = 0usize;
            // Running OR of the code units handled by the scalar paths.
            let mut accu = 0usize;
            let unit_size = ::core::mem::size_of::<$unit>();
            let len = buffer.len();
            // The SIMD path is only attempted when the buffer holds at least
            // one stride's worth of code units.
            if len >= SIMD_STRIDE_SIZE / unit_size {
                // The most common reason to return `false` is for the first code
                // unit to fail the test, so check that first.
                if buffer[0] >= $bound {
                    return false;
                }
                let src = buffer.as_ptr();
                // Number of code units before the next `SIMD_ALIGNMENT`-aligned
                // address (zero if `src` is already aligned).
                let mut until_alignment = ((SIMD_ALIGNMENT
                    - ((src as usize) & SIMD_ALIGNMENT_MASK))
                    & SIMD_ALIGNMENT_MASK)
                    / unit_size;
                // Require room for the unaligned prefix plus at least one full
                // aligned stride; otherwise fall through to the scalar loop.
                if until_alignment + SIMD_STRIDE_SIZE / unit_size <= len {
                    if until_alignment != 0 {
                        // Scalar pass over the unaligned prefix. The first
                        // iteration is peeled off ahead of the `while` loop.
                        accu |= buffer[offset] as usize;
                        offset += 1;
                        until_alignment -= 1;
                        while until_alignment != 0 {
                            accu |= buffer[offset] as usize;
                            offset += 1;
                            until_alignment -= 1;
                        }
                        // `accu` is the OR of the prefix units. With the
                        // power-of-two bounds used in this file, it reaches
                        // `$bound` exactly when some prefix unit does.
                        if accu >= $bound {
                            return false;
                        }
                    }
                    // Largest offset at which one full stride can still be read.
                    let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
                    if offset + (4 * (SIMD_STRIDE_SIZE / unit_size)) <= len {
                        // Largest offset at which four full strides can still be read.
                        let len_minus_unroll = len - (4 * (SIMD_STRIDE_SIZE / unit_size));
                        loop {
                            // SAFETY (review note): `offset` is at the aligned
                            // position computed above and
                            // `offset + 4 * (SIMD_STRIDE_SIZE / unit_size) <= len`
                            // holds on entry and is re-established by the check
                            // at the bottom of the loop, so the four loads stay
                            // inside `buffer`. This assumes `SIMD_STRIDE_SIZE`
                            // is the byte size of `$simd_ty` and a multiple of
                            // `SIMD_ALIGNMENT`; it is defined outside this part
                            // of the file, so confirm there.
                            let unroll_accu = unsafe { *(src.add(offset) as *const $simd_ty) }
                                | unsafe {
                                    *(src.add(offset + (SIMD_STRIDE_SIZE / unit_size))
                                        as *const $simd_ty)
                                }
                                | unsafe {
                                    *(src.add(offset + (2 * (SIMD_STRIDE_SIZE / unit_size)))
                                        as *const $simd_ty)
                                }
                                | unsafe {
                                    *(src.add(offset + (3 * (SIMD_STRIDE_SIZE / unit_size)))
                                        as *const $simd_ty)
                                };
                            // Early exit once per four strides.
                            if !$func(unroll_accu) {
                                return false;
                            }
                            offset += 4 * (SIMD_STRIDE_SIZE / unit_size);
                            if offset > len_minus_unroll {
                                break;
                            }
                        }
                    }
                    // Remaining full strides (fewer than four) are ORed into a
                    // vector accumulator and tested once after the loop.
                    let mut simd_accu = $splat;
                    while offset <= len_minus_stride {
                        // SAFETY (review note): `offset <= len_minus_stride`
                        // means one full stride starting at `offset` is inside
                        // `buffer`; same assumptions as for the unrolled loop.
                        simd_accu = simd_accu | unsafe { *(src.add(offset) as *const $simd_ty) };
                        offset += SIMD_STRIDE_SIZE / unit_size;
                    }
                    if !$func(simd_accu) {
                        return false;
                    }
                }
            }
            // Scalar tail; also handles the whole buffer when the SIMD path
            // was not taken.
            for &unit in &buffer[offset..] {
                accu |= unit as usize;
            }
            accu < $bound
        }
    };
}
235 
cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        // SIMD configuration: 16-byte vectors over `u8` and `u16` lanes.
        use crate::simd_funcs::*;
        use packed_simd::u8x16;
        use packed_simd::u16x8;

        // Alignment, in bytes, required before the aligned SIMD loads below.
        const SIMD_ALIGNMENT: usize = 16;

        // Mask extracting the misalignment of an address relative to
        // `SIMD_ALIGNMENT`.
        const SIMD_ALIGNMENT_MASK: usize = 15;

        // SIMD-backed range checks for ASCII bytes, Basic Latin UTF-16 and
        // Latin1 UTF-16.
        by_unit_check_simd!(is_ascii_impl, u8, u8x16::splat(0), u8x16, 0x80, simd_is_ascii);
        by_unit_check_simd!(is_basic_latin_impl, u16, u16x8::splat(0), u16x8, 0x80, simd_is_basic_latin);
        by_unit_check_simd!(is_utf16_latin1_impl, u16, u16x8::splat(0), u16x8, 0x100, simd_is_latin1);

        // Returns the number of leading code units of `buffer` that form
        // valid UTF-16, i.e. the index of the first unpaired surrogate or
        // `buffer.len()` if there is none. Aligned strides are screened with
        // `contains_surrogates`; strides that contain surrogates, the
        // unaligned prefix and the tail are validated by
        // `utf16_valid_up_to_alu`.
        #[inline(always)]
        fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize {
            // This function is a mess, because it simultaneously tries to do
            // only aligned SIMD (perhaps misguidedly) and needs to deal with
            // the last code unit in a SIMD stride being part of a valid
            // surrogate pair.
            let unit_size = ::core::mem::size_of::<u16>();
            let src = buffer.as_ptr();
            let len = buffer.len();
            // `offset` counts code units, not bytes.
            let mut offset = 0usize;
            'outer: loop {
                // Code units between `offset` and the next aligned address.
                // Recomputed on every pass because consuming a surrogate pair
                // that straddles a stride boundary leaves `offset` unaligned.
                let until_alignment = ((SIMD_ALIGNMENT - ((unsafe { src.add(offset) } as usize) & SIMD_ALIGNMENT_MASK)) &
                                        SIMD_ALIGNMENT_MASK) / unit_size;
                if until_alignment == 0 {
                    // Already aligned: give up on SIMD if less than one full
                    // stride remains.
                    if offset + SIMD_STRIDE_SIZE / unit_size > len {
                        break;
                    }
                } else {
                    let offset_plus_until_alignment = offset + until_alignment;
                    // One unit past the alignment point is included in the
                    // scalar check so that a pair straddling that point can
                    // be recognized.
                    let offset_plus_until_alignment_plus_one = offset_plus_until_alignment + 1;
                    if offset_plus_until_alignment_plus_one + SIMD_STRIDE_SIZE / unit_size > len {
                        break;
                    }
                    let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_until_alignment_plus_one]);
                    if up_to < until_alignment {
                        // Unpaired surrogate inside the unaligned prefix.
                        return offset + up_to;
                    }
                    if last_valid_low {
                        // The extra unit was the low half of a valid pair and
                        // has been consumed; realign from just past it.
                        offset = offset_plus_until_alignment_plus_one;
                        continue;
                    }
                    // The extra unit is left for the SIMD loop to look at.
                    offset = offset_plus_until_alignment;
                }
                // Largest offset at which a full stride can still be read.
                let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
                loop {
                    let offset_plus_stride = offset + SIMD_STRIDE_SIZE / unit_size;
                    // SAFETY (review note): every path reaching this load has
                    // made `offset` aligned and established
                    // `offset + SIMD_STRIDE_SIZE / unit_size <= len`. This
                    // assumes `SIMD_STRIDE_SIZE` is the byte size of `u16x8`
                    // (defined outside this part of the file; confirm there).
                    if contains_surrogates(unsafe { *(src.add(offset) as *const u16x8) }) {
                        if offset_plus_stride == len {
                            // Final stride: let the scalar tail validate it.
                            break 'outer;
                        }
                        // Re-validate this stride plus one extra unit with
                        // the scalar code.
                        let offset_plus_stride_plus_one = offset_plus_stride + 1;
                        let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_stride_plus_one]);
                        if up_to < SIMD_STRIDE_SIZE / unit_size {
                            // Unpaired surrogate inside the stride.
                            return offset + up_to;
                        }
                        if last_valid_low {
                            // A pair straddled the stride boundary and has
                            // been consumed; realign in the outer loop.
                            offset = offset_plus_stride_plus_one;
                            continue 'outer;
                        }
                    }
                    offset = offset_plus_stride;
                    if offset > len_minus_stride {
                        break 'outer;
                    }
                }
            }
            // Scalar validation of whatever the SIMD loop did not cover.
            let (up_to, _) = utf16_valid_up_to_alu(&buffer[offset..]);
            offset + up_to
        }
    } else {
        // Non-SIMD configuration: word-at-a-time range checks.
        by_unit_check_alu!(is_ascii_impl, u8, 0x80, ASCII_MASK);
        by_unit_check_alu!(is_basic_latin_impl, u16, 0x80, BASIC_LATIN_MASK);
        by_unit_check_alu!(is_utf16_latin1_impl, u16, 0x100, LATIN1_MASK);

        // Returns the number of leading code units of `buffer` that form
        // valid UTF-16, delegating entirely to the scalar implementation.
        #[inline(always)]
        fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize {
            let (up_to, _) = utf16_valid_up_to_alu(buffer);
            up_to
        }
    }
}
321 
/// Scalar UTF-16 validation.
///
/// The first return value is the index of the first unpaired surrogate in
/// `buffer`, or `buffer.len()` if there is none. The second return value is
/// `true` iff the end of the slice was reached and the last code unit turned
/// out to be a low surrogate that completes a valid pair.
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
#[inline(always)]
fn utf16_valid_up_to_alu(buffer: &[u16]) -> (usize, bool) {
    let mut pos = 0usize;
    // Tracks whether the most recently consumed item was a surrogate pair.
    let mut ended_on_paired_low = false;
    while let Some(&unit) = buffer.get(pos) {
        match unit {
            // High surrogate: valid only when directly followed by a low
            // surrogate, in which case both units are consumed together.
            0xD800..=0xDBFF => match buffer.get(pos + 1).copied() {
                Some(0xDC00..=0xDFFF) => {
                    pos += 2;
                    ended_on_paired_low = true;
                }
                _ => return (pos, false),
            },
            // Low surrogate without a preceding high surrogate.
            0xDC00..=0xDFFF => return (pos, false),
            // Not a surrogate.
            _ => {
                pos += 1;
                ended_on_paired_low = false;
            }
        }
    }
    (pos, ended_on_paired_low)
}
367 
cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        // Returns `None` when no byte of `buffer` is above 0xC3, i.e. when
        // the (valid UTF-8) string contains only code points up to U+00FF.
        // Otherwise returns `Some(index)`.
        //
        // On the scalar paths `index` is the position of the first byte
        // above 0xC3. On the SIMD path it is the start of the first stride
        // that fails `simd_is_str_latin1`, moved forward past any leading
        // continuation bytes; it is therefore a position at or before the
        // offending byte, not necessarily the offending byte itself.
        #[inline(always)]
        fn is_str_latin1_impl(buffer: &str) -> Option<usize> {
            // `offset` is a byte index into `bytes`.
            let mut offset = 0usize;
            let bytes = buffer.as_bytes();
            let len = bytes.len();
            if len >= SIMD_STRIDE_SIZE {
                let src = bytes.as_ptr();
                // Bytes before the next `SIMD_ALIGNMENT`-aligned address.
                let mut until_alignment = (SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
                                           SIMD_ALIGNMENT_MASK;
                // Require room for the unaligned prefix plus one full stride.
                if until_alignment + SIMD_STRIDE_SIZE <= len {
                    // Scalar pass over the unaligned prefix.
                    while until_alignment != 0 {
                        if bytes[offset] > 0xC3 {
                            return Some(offset);
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    // Largest offset at which a full stride can still be read.
                    let len_minus_stride = len - SIMD_STRIDE_SIZE;
                    loop {
                        // SAFETY (review note): `offset` is aligned and
                        // `offset + SIMD_STRIDE_SIZE <= len` holds on entry
                        // and after the check at the bottom of the loop.
                        // Assumes `SIMD_STRIDE_SIZE` is the byte size of
                        // `u8x16` (defined elsewhere; confirm there).
                        if !simd_is_str_latin1(unsafe { *(src.add(offset) as *const u8x16) }) {
                            // Skip continuation bytes (`10xxxxxx`) at the
                            // start of the stride so that the reported index
                            // does not point into the middle of a sequence.
                            // TODO: Ensure this compiles away when inlined into `is_str_latin1()`.
                            while bytes[offset] & 0xC0 == 0x80 {
                                offset += 1;
                            }
                            return Some(offset);
                        }
                        offset += SIMD_STRIDE_SIZE;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Scalar tail; also handles the whole string when the SIMD path
            // was not taken.
            for i in offset..len {
                if bytes[i] > 0xC3 {
                    return Some(i);
                }
            }
            None
        }
    } else {
        // Returns `None` when `buffer` contains only code points up to
        // U+00FF; otherwise returns `Some(index)` where `index` is the
        // position of the first non-ASCII byte that is above 0xC3.
        #[inline(always)]
        fn is_str_latin1_impl(buffer: &str) -> Option<usize> {
            // Unprocessed remainder of the input.
            let mut bytes = buffer.as_bytes();
            // Number of bytes already consumed from the front of `buffer`.
            let mut total = 0;
            loop {
                // `validate_ascii` is defined elsewhere; it is used here as
                // yielding the first non-ASCII byte together with its index
                // in `bytes`, or `None` if `bytes` is all ASCII.
                if let Some((byte, offset)) = validate_ascii(bytes) {
                    total += offset;
                    if byte > 0xC3 {
                        return Some(total);
                    }
                    // A non-ASCII byte no greater than 0xC3 is taken to start
                    // a two-byte sequence, which is skipped as a whole. The
                    // slicing relies on `buffer` being a `str` (valid UTF-8),
                    // so the trail byte is present.
                    bytes = &bytes[offset + 2..];
                    total += 2;
                } else {
                    return None;
                }
            }
        }
    }
}
430 
431 #[inline(always)]
is_utf8_latin1_impl(buffer: &[u8]) -> Option<usize>432 fn is_utf8_latin1_impl(buffer: &[u8]) -> Option<usize> {
433     let mut bytes = buffer;
434     let mut total = 0;
435     loop {
436         if let Some((byte, offset)) = validate_ascii(bytes) {
437             total += offset;
438             if in_inclusive_range8(byte, 0xC2, 0xC3) {
439                 let next = offset + 1;
440                 if next == bytes.len() {
441                     return Some(total);
442                 }
443                 if bytes[next] & 0xC0 != 0x80 {
444                     return Some(total);
445                 }
446                 bytes = &bytes[offset + 2..];
447                 total += 2;
448             } else {
449                 return Some(total);
450             }
451         } else {
452             return None;
453         }
454     }
455 }
456 
457 cfg_if! {
458     if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
459         #[inline(always)]
460         fn is_utf16_bidi_impl(buffer: &[u16]) -> bool {
461             let mut offset = 0usize;
462             let len = buffer.len();
463             if len >= SIMD_STRIDE_SIZE / 2 {
464                 let src = buffer.as_ptr();
465                 let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
466                                            SIMD_ALIGNMENT_MASK) / 2;
467                 if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len {
468                     while until_alignment != 0 {
469                         if is_utf16_code_unit_bidi(buffer[offset]) {
470                             return true;
471                         }
472                         offset += 1;
473                         until_alignment -= 1;
474                     }
475                     let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
476                     loop {
477                         if is_u16x8_bidi(unsafe { *(src.add(offset) as *const u16x8) }) {
478                             return true;
479                         }
480                         offset += SIMD_STRIDE_SIZE / 2;
481                         if offset > len_minus_stride {
482                             break;
483                         }
484                     }
485                 }
486             }
487             for &u in &buffer[offset..] {
488                 if is_utf16_code_unit_bidi(u) {
489                     return true;
490                 }
491             }
492             false
493         }
494     } else {
495         #[inline(always)]
496         fn is_utf16_bidi_impl(buffer: &[u16]) -> bool {
497             for &u in buffer {
498                 if is_utf16_code_unit_bidi(u) {
499                     return true;
500                 }
501             }
502             false
503         }
504     }
505 }
506 
cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        // Classifies `buffer` in a single pass.
        //
        // While only code units up to 0xFF have been seen, the scan looks
        // for the first code unit above 0xFF. If none is found, the result
        // is `Latin1Bidi::Latin1`. From the first such code unit onwards,
        // the rest of the buffer is scanned for right-to-left code units:
        // `Latin1Bidi::Bidi` if one is found, `Latin1Bidi::LeftToRight`
        // otherwise.
        #[inline(always)]
        fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
            // `offset` counts code units, not bytes.
            let mut offset = 0usize;
            let len = buffer.len();
            if len >= SIMD_STRIDE_SIZE / 2 {
                let src = buffer.as_ptr();
                // Code units before the next `SIMD_ALIGNMENT`-aligned address.
                let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
                                           SIMD_ALIGNMENT_MASK) / 2;
                // Require room for the unaligned prefix plus one full stride.
                if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len {
                    // Scalar pass over the unaligned prefix.
                    while until_alignment != 0 {
                        if buffer[offset] > 0xFF {
                            // This transition isn't optimal, since the alignment
                            // is recomputed inside `is_utf16_bidi_impl`, but not
                            // tweaking further today.
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    // Largest offset at which a full stride can still be read.
                    let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
                    loop {
                        // SAFETY (review note): `offset` is aligned and
                        // `offset + SIMD_STRIDE_SIZE / 2 <= len` holds on
                        // entry and after the check at the bottom of the
                        // loop. Assumes `SIMD_STRIDE_SIZE` is the byte size of
                        // `u16x8` (defined elsewhere; confirm there).
                        let mut s = unsafe { *(src.add(offset) as *const u16x8) };
                        if !simd_is_latin1(s) {
                            // First non-Latin1 stride: from here on only the
                            // bidi check matters, starting with this stride.
                            loop {
                                if is_u16x8_bidi(s) {
                                    return Latin1Bidi::Bidi;
                                }
                                offset += SIMD_STRIDE_SIZE / 2;
                                if offset > len_minus_stride {
                                    // No full stride left: finish the bidi
                                    // scan one code unit at a time.
                                    for &u in &buffer[offset..] {
                                        if is_utf16_code_unit_bidi(u) {
                                            return Latin1Bidi::Bidi;
                                        }
                                    }
                                    return Latin1Bidi::LeftToRight;
                                }
                                // SAFETY (review note): the check just above
                                // ensures a full stride remains at `offset`.
                                s = unsafe { *(src.add(offset) as *const u16x8) };
                            }
                        }
                        offset += SIMD_STRIDE_SIZE / 2;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Scalar tail (or the whole buffer when SIMD was not used), still
            // in the "only Latin1 seen so far" state.
            let mut iter = (&buffer[offset..]).iter();
            loop {
                if let Some(&u) = iter.next() {
                    if u > 0xFF {
                        // First non-Latin1 code unit: keep consuming the same
                        // iterator, now checking for right-to-left units.
                        let mut inner_u = u;
                        loop {
                            if is_utf16_code_unit_bidi(inner_u) {
                                return Latin1Bidi::Bidi;
                            }
                            if let Some(&code_unit) = iter.next() {
                                inner_u = code_unit;
                            } else {
                                return Latin1Bidi::LeftToRight;
                            }
                        }
                    }
                } else {
                    return Latin1Bidi::Latin1;
                }
            }
        }
    } else {
        // Classifies `buffer` in a single pass.
        //
        // While only code units up to 0xFF have been seen, the scan looks
        // for the first code unit above 0xFF, one aligned machine word at a
        // time where possible. If none is found, the result is
        // `Latin1Bidi::Latin1`. Otherwise the rest of the buffer, starting
        // at the code unit or word where it was found, is handed to the bidi
        // scan: `Latin1Bidi::Bidi` if a right-to-left code unit is found,
        // `Latin1Bidi::LeftToRight` otherwise.
        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
        #[inline(always)]
        fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
            // `offset` counts code units, not bytes.
            let mut offset = 0usize;
            let len = buffer.len();
            if len >= ALU_ALIGNMENT / 2 {
                let src = buffer.as_ptr();
                // Code units before the next `ALU_ALIGNMENT`-aligned address.
                let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK)) &
                                           ALU_ALIGNMENT_MASK) / 2;
                // Require room for the unaligned prefix plus one full word.
                if until_alignment + ALU_ALIGNMENT / 2 <= len {
                    // Scalar pass over the unaligned prefix.
                    while until_alignment != 0 {
                        if buffer[offset] > 0xFF {
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    // Largest offset at which a full word can still be read.
                    let len_minus_stride = len - ALU_ALIGNMENT / 2;
                    loop {
                        // SAFETY (review note): `offset` is aligned and
                        // `offset + ALU_ALIGNMENT / 2 <= len` holds on entry
                        // and after the check at the bottom of the loop.
                        // Assumes `ALU_ALIGNMENT` equals `size_of::<usize>()`
                        // (defined elsewhere; confirm there).
                        if unsafe { *(src.add(offset) as *const usize) } & LATIN1_MASK != 0 {
                            // Some code unit in this word is above 0xFF.
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += ALU_ALIGNMENT / 2;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Scalar tail (or the whole buffer when the word path was not
            // used), still in the "only Latin1 seen so far" state.
            let mut iter = (&buffer[offset..]).iter();
            loop {
                if let Some(&u) = iter.next() {
                    if u > 0xFF {
                        // First non-Latin1 code unit: keep consuming the same
                        // iterator, now checking for right-to-left units.
                        let mut inner_u = u;
                        loop {
                            if is_utf16_code_unit_bidi(inner_u) {
                                return Latin1Bidi::Bidi;
                            }
                            if let Some(&code_unit) = iter.next() {
                                inner_u = code_unit;
                            } else {
                                return Latin1Bidi::LeftToRight;
                            }
                        }
                    }
                } else {
                    return Latin1Bidi::Latin1;
                }
            }
        }
    }
}
637 
/// Checks whether the buffer is all-ASCII.
///
/// Returns `true` if `buffer` is all-ASCII (an empty buffer counts as
/// all-ASCII) and `false` otherwise.
///
/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_ascii(buffer: &[u8]) -> bool {
    is_ascii_impl(buffer)
}
645 
/// Checks whether the buffer is all-Basic Latin (i.e. UTF-16 representing
/// only ASCII characters).
///
/// Returns `true` if `buffer` is all-Basic Latin (an empty buffer counts as
/// all-Basic Latin) and `false` otherwise.
///
/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_basic_latin(buffer: &[u16]) -> bool {
    is_basic_latin_impl(buffer)
}
654 
/// Checks whether the buffer is valid UTF-8 representing only code points
/// less than or equal to U+00FF.
///
/// Returns `true` if so and `false` otherwise.
///
/// Fails fast. (I.e. returns before having read the whole buffer if UTF-8
/// invalidity or code points above U+00FF are discovered.)
pub fn is_utf8_latin1(buffer: &[u8]) -> bool {
    // The implementation yields the index of the first offending byte, or
    // `None` when there is no such byte.
    is_utf8_latin1_impl(buffer).is_none()
}
663 
/// Checks whether the buffer represents only code points less than or equal
/// to U+00FF.
///
/// Returns `true` if so and `false` otherwise.
///
/// Fails fast. (I.e. returns before having read the whole buffer if code
/// points above U+00FF are discovered.)
pub fn is_str_latin1(buffer: &str) -> bool {
    // The implementation yields `Some(index)` when a code point above U+00FF
    // is found and `None` otherwise.
    is_str_latin1_impl(buffer).is_none()
}
672 
/// Checks whether the buffer represents only code points less than or equal
/// to U+00FF.
///
/// Returns `true` if so (an empty buffer counts as all-Latin1) and `false`
/// otherwise.
///
/// May read the entire buffer even if it isn't all-Latin1. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_utf16_latin1(buffer: &[u16]) -> bool {
    is_utf16_latin1_impl(buffer)
}
681 
682 /// Checks whether a potentially-invalid UTF-8 buffer contains code points
683 /// that trigger right-to-left processing.
684 ///
685 /// The check is done on a Unicode block basis without regard to assigned
686 /// vs. unassigned code points in the block. Hebrew presentation forms in
687 /// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. they are treated as right-to-left). Additionally,
689 /// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
690 /// for. Control characters that are technically bidi controls but do not
691 /// cause right-to-left behavior without the presence of right-to-left
692 /// characters or right-to-left controls are not checked for. As a special
693 /// case, U+FEFF is excluded from Arabic Presentation Forms-B.
694 ///
695 /// Returns `true` if the input is invalid UTF-8 or the input contains an
696 /// RTL character. Returns `false` if the input is valid UTF-8 and contains
697 /// no RTL characters.
698 #[cfg_attr(feature = "cargo-clippy", allow(collapsible_if, cyclomatic_complexity))]
699 #[inline]
is_utf8_bidi(buffer: &[u8]) -> bool700 pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
701     // As of rustc 1.25.0-nightly (73ac5d6a8 2018-01-11), this is faster
702     // than UTF-8 validation followed by `is_str_bidi()` for German,
703     // Russian and Japanese. However, this is considerably slower for Thai.
704     // Chances are that the compiler makes some branch predictions that are
705     // unfortunate for Thai. Not spending the time to manually optimize
706     // further at this time, since it's unclear if this variant even has
707     // use cases. However, this is worth revisiting once Rust gets the
708     // ability to annotate relative priorities of match arms.
709 
710     // U+058F: D6 8F
711     // U+0590: D6 90
712     // U+08FF: E0 A3 BF
713     // U+0900: E0 A4 80
714     //
715     // U+200F: E2 80 8F
716     // U+202B: E2 80 AB
717     // U+202E: E2 80 AE
718     // U+2067: E2 81 A7
719     //
720     // U+FB1C: EF AC 9C
721     // U+FB1D: EF AC 9D
722     // U+FDFF: EF B7 BF
723     // U+FE00: EF B8 80
724     //
725     // U+FE6F: EF B9 AF
726     // U+FE70: EF B9 B0
727     // U+FEFE: EF BB BE
728     // U+FEFF: EF BB BF
729     //
730     // U+107FF: F0 90 9F BF
731     // U+10800: F0 90 A0 80
732     // U+10FFF: F0 90 BF BF
733     // U+11000: F0 91 80 80
734     //
735     // U+1E7FF: F0 9E 9F BF
736     // U+1E800: F0 9E A0 80
737     // U+1EFFF: F0 9E BF BF
738     // U+1F000: F0 9F 80 80
739     let mut src = buffer;
740     'outer: loop {
741         if let Some((mut byte, mut read)) = validate_ascii(src) {
742             // Check for the longest sequence to avoid checking twice for the
743             // multi-byte sequences.
744             if read + 4 <= src.len() {
745                 'inner: loop {
746                     // At this point, `byte` is not included in `read`.
747                     match byte {
748                         0..=0x7F => {
749                             // ASCII: go back to SIMD.
750                             read += 1;
751                             src = &src[read..];
752                             continue 'outer;
753                         }
754                         0xC2..=0xD5 => {
755                             // Two-byte
756                             let second = unsafe { *(src.get_unchecked(read + 1)) };
757                             if !in_inclusive_range8(second, 0x80, 0xBF) {
758                                 return true;
759                             }
760                             read += 2;
761                         }
762                         0xD6 => {
763                             // Two-byte
764                             let second = unsafe { *(src.get_unchecked(read + 1)) };
765                             if !in_inclusive_range8(second, 0x80, 0xBF) {
766                                 return true;
767                             }
768                             // XXX consider folding the above and below checks
769                             if second > 0x8F {
770                                 return true;
771                             }
772                             read += 2;
773                         }
774                         // two-byte starting with 0xD7 and above is bidi
775                         0xE1 | 0xE3..=0xEC | 0xEE => {
776                             // Three-byte normal
777                             let second = unsafe { *(src.get_unchecked(read + 1)) };
778                             let third = unsafe { *(src.get_unchecked(read + 2)) };
779                             if ((UTF8_DATA.table[usize::from(second)]
780                                 & unsafe {
781                                     *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
782                                 })
783                                 | (third >> 6))
784                                 != 2
785                             {
786                                 return true;
787                             }
788                             read += 3;
789                         }
790                         0xE2 => {
791                             // Three-byte normal, potentially bidi
792                             let second = unsafe { *(src.get_unchecked(read + 1)) };
793                             let third = unsafe { *(src.get_unchecked(read + 2)) };
794                             if ((UTF8_DATA.table[usize::from(second)]
795                                 & unsafe {
796                                     *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
797                                 })
798                                 | (third >> 6))
799                                 != 2
800                             {
801                                 return true;
802                             }
803                             if second == 0x80 {
804                                 if third == 0x8F || third == 0xAB || third == 0xAE {
805                                     return true;
806                                 }
807                             } else if second == 0x81 {
808                                 if third == 0xA7 {
809                                     return true;
810                                 }
811                             }
812                             read += 3;
813                         }
814                         0xEF => {
815                             // Three-byte normal, potentially bidi
816                             let second = unsafe { *(src.get_unchecked(read + 1)) };
817                             let third = unsafe { *(src.get_unchecked(read + 2)) };
818                             if ((UTF8_DATA.table[usize::from(second)]
819                                 & unsafe {
820                                     *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
821                                 })
822                                 | (third >> 6))
823                                 != 2
824                             {
825                                 return true;
826                             }
827                             if in_inclusive_range8(second, 0xAC, 0xB7) {
828                                 if second == 0xAC {
829                                     if third > 0x9C {
830                                         return true;
831                                     }
832                                 } else {
833                                     return true;
834                                 }
835                             } else if in_inclusive_range8(second, 0xB9, 0xBB) {
836                                 if second == 0xB9 {
837                                     if third > 0xAF {
838                                         return true;
839                                     }
840                                 } else if second == 0xBB {
841                                     if third != 0xBF {
842                                         return true;
843                                     }
844                                 } else {
845                                     return true;
846                                 }
847                             }
848                             read += 3;
849                         }
850                         0xE0 => {
851                             // Three-byte special lower bound, potentially bidi
852                             let second = unsafe { *(src.get_unchecked(read + 1)) };
853                             let third = unsafe { *(src.get_unchecked(read + 2)) };
854                             if ((UTF8_DATA.table[usize::from(second)]
855                                 & unsafe {
856                                     *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
857                                 })
858                                 | (third >> 6))
859                                 != 2
860                             {
861                                 return true;
862                             }
863                             // XXX can this be folded into the above validity check
864                             if second < 0xA4 {
865                                 return true;
866                             }
867                             read += 3;
868                         }
869                         0xED => {
870                             // Three-byte special upper bound
871                             let second = unsafe { *(src.get_unchecked(read + 1)) };
872                             let third = unsafe { *(src.get_unchecked(read + 2)) };
873                             if ((UTF8_DATA.table[usize::from(second)]
874                                 & unsafe {
875                                     *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
876                                 })
877                                 | (third >> 6))
878                                 != 2
879                             {
880                                 return true;
881                             }
882                             read += 3;
883                         }
884                         0xF1..=0xF4 => {
885                             // Four-byte normal
886                             let second = unsafe { *(src.get_unchecked(read + 1)) };
887                             let third = unsafe { *(src.get_unchecked(read + 2)) };
888                             let fourth = unsafe { *(src.get_unchecked(read + 3)) };
889                             if (u16::from(
890                                 UTF8_DATA.table[usize::from(second)]
891                                     & unsafe {
892                                         *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
893                                     },
894                             ) | u16::from(third >> 6)
895                                 | (u16::from(fourth & 0xC0) << 2))
896                                 != 0x202
897                             {
898                                 return true;
899                             }
900                             read += 4;
901                         }
902                         0xF0 => {
903                             // Four-byte special lower bound, potentially bidi
904                             let second = unsafe { *(src.get_unchecked(read + 1)) };
905                             let third = unsafe { *(src.get_unchecked(read + 2)) };
906                             let fourth = unsafe { *(src.get_unchecked(read + 3)) };
907                             if (u16::from(
908                                 UTF8_DATA.table[usize::from(second)]
909                                     & unsafe {
910                                         *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
911                                     },
912                             ) | u16::from(third >> 6)
913                                 | (u16::from(fourth & 0xC0) << 2))
914                                 != 0x202
915                             {
916                                 return true;
917                             }
918                             if unsafe { unlikely(second == 0x90 || second == 0x9E) } {
919                                 let third = src[read + 2];
920                                 if third >= 0xA0 {
921                                     return true;
922                                 }
923                             }
924                             read += 4;
925                         }
926                         _ => {
927                             // Invalid lead or bidi-only lead
928                             return true;
929                         }
930                     }
931                     if read + 4 > src.len() {
932                         if read == src.len() {
933                             return false;
934                         }
935                         byte = src[read];
936                         break 'inner;
937                     }
938                     byte = src[read];
939                     continue 'inner;
940                 }
941             }
942             // We can't have a complete 4-byte sequence, but we could still have
943             // a complete shorter sequence.
944 
945             // At this point, `byte` is not included in `read`.
946             match byte {
947                 0..=0x7F => {
948                     // ASCII: go back to SIMD.
949                     read += 1;
950                     src = &src[read..];
951                     continue 'outer;
952                 }
953                 0xC2..=0xD5 => {
954                     // Two-byte
955                     let new_read = read + 2;
956                     if new_read > src.len() {
957                         return true;
958                     }
959                     let second = unsafe { *(src.get_unchecked(read + 1)) };
960                     if !in_inclusive_range8(second, 0x80, 0xBF) {
961                         return true;
962                     }
963                     read = new_read;
964                     // We need to deal with the case where we came here with 3 bytes
965                     // left, so we need to take a look at the last one.
966                     src = &src[read..];
967                     continue 'outer;
968                 }
969                 0xD6 => {
970                     // Two-byte, potentially bidi
971                     let new_read = read + 2;
972                     if new_read > src.len() {
973                         return true;
974                     }
975                     let second = unsafe { *(src.get_unchecked(read + 1)) };
976                     if !in_inclusive_range8(second, 0x80, 0xBF) {
977                         return true;
978                     }
979                     // XXX consider folding the above and below checks
980                     if second > 0x8F {
981                         return true;
982                     }
983                     read = new_read;
984                     // We need to deal with the case where we came here with 3 bytes
985                     // left, so we need to take a look at the last one.
986                     src = &src[read..];
987                     continue 'outer;
988                 }
989                 // two-byte starting with 0xD7 and above is bidi
990                 0xE1 | 0xE3..=0xEC | 0xEE => {
991                     // Three-byte normal
992                     let new_read = read + 3;
993                     if new_read > src.len() {
994                         return true;
995                     }
996                     let second = unsafe { *(src.get_unchecked(read + 1)) };
997                     let third = unsafe { *(src.get_unchecked(read + 2)) };
998                     if ((UTF8_DATA.table[usize::from(second)]
999                         & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1000                         | (third >> 6))
1001                         != 2
1002                     {
1003                         return true;
1004                     }
1005                 }
1006                 0xE2 => {
1007                     // Three-byte normal, potentially bidi
1008                     let new_read = read + 3;
1009                     if new_read > src.len() {
1010                         return true;
1011                     }
1012                     let second = unsafe { *(src.get_unchecked(read + 1)) };
1013                     let third = unsafe { *(src.get_unchecked(read + 2)) };
1014                     if ((UTF8_DATA.table[usize::from(second)]
1015                         & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1016                         | (third >> 6))
1017                         != 2
1018                     {
1019                         return true;
1020                     }
1021                     if second == 0x80 {
1022                         if third == 0x8F || third == 0xAB || third == 0xAE {
1023                             return true;
1024                         }
1025                     } else if second == 0x81 {
1026                         if third == 0xA7 {
1027                             return true;
1028                         }
1029                     }
1030                 }
1031                 0xEF => {
1032                     // Three-byte normal, potentially bidi
1033                     let new_read = read + 3;
1034                     if new_read > src.len() {
1035                         return true;
1036                     }
1037                     let second = unsafe { *(src.get_unchecked(read + 1)) };
1038                     let third = unsafe { *(src.get_unchecked(read + 2)) };
1039                     if ((UTF8_DATA.table[usize::from(second)]
1040                         & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1041                         | (third >> 6))
1042                         != 2
1043                     {
1044                         return true;
1045                     }
1046                     if in_inclusive_range8(second, 0xAC, 0xB7) {
1047                         if second == 0xAC {
1048                             if third > 0x9C {
1049                                 return true;
1050                             }
1051                         } else {
1052                             return true;
1053                         }
1054                     } else if in_inclusive_range8(second, 0xB9, 0xBB) {
1055                         if second == 0xB9 {
1056                             if third > 0xAF {
1057                                 return true;
1058                             }
1059                         } else if second == 0xBB {
1060                             if third != 0xBF {
1061                                 return true;
1062                             }
1063                         } else {
1064                             return true;
1065                         }
1066                     }
1067                 }
1068                 0xE0 => {
1069                     // Three-byte special lower bound, potentially bidi
1070                     let new_read = read + 3;
1071                     if new_read > src.len() {
1072                         return true;
1073                     }
1074                     let second = unsafe { *(src.get_unchecked(read + 1)) };
1075                     let third = unsafe { *(src.get_unchecked(read + 2)) };
1076                     if ((UTF8_DATA.table[usize::from(second)]
1077                         & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1078                         | (third >> 6))
1079                         != 2
1080                     {
1081                         return true;
1082                     }
1083                     // XXX can this be folded into the above validity check
1084                     if second < 0xA4 {
1085                         return true;
1086                     }
1087                 }
1088                 0xED => {
1089                     // Three-byte special upper bound
1090                     let new_read = read + 3;
1091                     if new_read > src.len() {
1092                         return true;
1093                     }
1094                     let second = unsafe { *(src.get_unchecked(read + 1)) };
1095                     let third = unsafe { *(src.get_unchecked(read + 2)) };
1096                     if ((UTF8_DATA.table[usize::from(second)]
1097                         & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1098                         | (third >> 6))
1099                         != 2
1100                     {
1101                         return true;
1102                     }
1103                 }
1104                 _ => {
1105                     // Invalid lead, 4-byte lead or 2-byte bidi-only lead
1106                     return true;
1107                 }
1108             }
1109             return false;
1110         } else {
1111             return false;
1112         }
1113     }
1114 }
1115 
1116 /// Checks whether a valid UTF-8 buffer contains code points that trigger
1117 /// right-to-left processing.
1118 ///
1119 /// The check is done on a Unicode block basis without regard to assigned
1120 /// vs. unassigned code points in the block. Hebrew presentation forms in
1121 /// the Alphabetic Presentation Forms block are treated as if they formed
1122 /// a block on their own (i.e. it treated as right-to-left). Additionally,
1123 /// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
1124 /// for. Control characters that are technically bidi controls but do not
1125 /// cause right-to-left behavior without the presence of right-to-left
1126 /// characters or right-to-left controls are not checked for. As a special
1127 /// case, U+FEFF is excluded from Arabic Presentation Forms-B.
1128 #[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
1129 #[inline]
is_str_bidi(buffer: &str) -> bool1130 pub fn is_str_bidi(buffer: &str) -> bool {
1131     // U+058F: D6 8F
1132     // U+0590: D6 90
1133     // U+08FF: E0 A3 BF
1134     // U+0900: E0 A4 80
1135     //
1136     // U+200F: E2 80 8F
1137     // U+202B: E2 80 AB
1138     // U+202E: E2 80 AE
1139     // U+2067: E2 81 A7
1140     //
1141     // U+FB1C: EF AC 9C
1142     // U+FB1D: EF AC 9D
1143     // U+FDFF: EF B7 BF
1144     // U+FE00: EF B8 80
1145     //
1146     // U+FE6F: EF B9 AF
1147     // U+FE70: EF B9 B0
1148     // U+FEFE: EF BB BE
1149     // U+FEFF: EF BB BF
1150     //
1151     // U+107FF: F0 90 9F BF
1152     // U+10800: F0 90 A0 80
1153     // U+10FFF: F0 90 BF BF
1154     // U+11000: F0 91 80 80
1155     //
1156     // U+1E7FF: F0 9E 9F BF
1157     // U+1E800: F0 9E A0 80
1158     // U+1EFFF: F0 9E BF BF
1159     // U+1F000: F0 9F 80 80
1160     let mut bytes = buffer.as_bytes();
1161     'outer: loop {
1162         // TODO: Instead of just validating ASCII using SIMD, use SIMD
1163         // to check for non-ASCII lead bytes, too, to quickly conclude
1164         // that the vector consist entirely of CJK and below-Hebrew
1165         // code points.
1166         // Unfortunately, scripts above Arabic but below CJK share
1167         // lead bytes with RTL.
1168         if let Some((mut byte, mut read)) = validate_ascii(bytes) {
1169             'inner: loop {
1170                 // At this point, `byte` is not included in `read`.
1171                 if byte < 0xE0 {
1172                     if byte >= 0x80 {
1173                         // Two-byte
1174                         // Adding `unlikely` here improved throughput on
1175                         // Russian plain text by 33%!
1176                         if unsafe { unlikely(byte >= 0xD6) } {
1177                             if byte == 0xD6 {
1178                                 let second = bytes[read + 1];
1179                                 if second > 0x8F {
1180                                     return true;
1181                                 }
1182                             } else {
1183                                 return true;
1184                             }
1185                         }
1186                         read += 2;
1187                     } else {
1188                         // ASCII: write and go back to SIMD.
1189                         read += 1;
1190                         // Intuitively, we should go back to the outer loop only
1191                         // if byte is 0x30 or above, so as to avoid trashing on
1192                         // ASCII space, comma and period in non-Latin context.
1193                         // However, the extra branch seems to cost more than it's
1194                         // worth.
1195                         bytes = &bytes[read..];
1196                         continue 'outer;
1197                     }
1198                 } else if byte < 0xF0 {
1199                     // Three-byte
1200                     if unsafe { unlikely(!in_inclusive_range8(byte, 0xE3, 0xEE) && byte != 0xE1) } {
1201                         let second = bytes[read + 1];
1202                         if byte == 0xE0 {
1203                             if second < 0xA4 {
1204                                 return true;
1205                             }
1206                         } else if byte == 0xE2 {
1207                             let third = bytes[read + 2];
1208                             if second == 0x80 {
1209                                 if third == 0x8F || third == 0xAB || third == 0xAE {
1210                                     return true;
1211                                 }
1212                             } else if second == 0x81 {
1213                                 if third == 0xA7 {
1214                                     return true;
1215                                 }
1216                             }
1217                         } else {
1218                             debug_assert_eq!(byte, 0xEF);
1219                             if in_inclusive_range8(second, 0xAC, 0xB7) {
1220                                 if second == 0xAC {
1221                                     let third = bytes[read + 2];
1222                                     if third > 0x9C {
1223                                         return true;
1224                                     }
1225                                 } else {
1226                                     return true;
1227                                 }
1228                             } else if in_inclusive_range8(second, 0xB9, 0xBB) {
1229                                 if second == 0xB9 {
1230                                     let third = bytes[read + 2];
1231                                     if third > 0xAF {
1232                                         return true;
1233                                     }
1234                                 } else if second == 0xBB {
1235                                     let third = bytes[read + 2];
1236                                     if third != 0xBF {
1237                                         return true;
1238                                     }
1239                                 } else {
1240                                     return true;
1241                                 }
1242                             }
1243                         }
1244                     }
1245                     read += 3;
1246                 } else {
1247                     // Four-byte
1248                     let second = bytes[read + 1];
1249                     if unsafe { unlikely(byte == 0xF0 && (second == 0x90 || second == 0x9E)) } {
1250                         let third = bytes[read + 2];
1251                         if third >= 0xA0 {
1252                             return true;
1253                         }
1254                     }
1255                     read += 4;
1256                 }
1257                 // The comparison is always < or == and never >, but including
1258                 // > here to let the compiler assume that < is true if this
1259                 // comparison is false.
1260                 if read >= bytes.len() {
1261                     return false;
1262                 }
1263                 byte = bytes[read];
1264                 continue 'inner;
1265             }
1266         } else {
1267             return false;
1268         }
1269     }
1270 }
1271 
/// Checks whether a UTF-16 buffer contains code points that trigger
/// right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. they are treated as right-to-left).
/// Additionally, the four RIGHT-TO-LEFT FOO controls in General Punctuation
/// are checked for. Control characters that are technically bidi controls
/// but do not cause right-to-left behavior without the presence of
/// right-to-left characters or right-to-left controls are not checked for.
/// As a special case, U+FEFF is excluded from Arabic Presentation Forms-B.
///
/// Returns `true` if the input contains an RTL character or an unpaired
/// high surrogate that could be the high half of an RTL character.
/// Returns `false` if the input contains neither RTL characters nor
/// unpaired high surrogates that could be higher halves of RTL characters.
pub fn is_utf16_bidi(buffer: &[u16]) -> bool {
    // Thin public wrapper: the verdict of `is_utf16_bidi_impl` is returned
    // unchanged.
    is_utf16_bidi_impl(buffer)
}
1292 
/// Reports whether a scalar value triggers right-to-left processing.
///
/// The classification works on a Unicode block basis and does not
/// distinguish assigned from unassigned code points within a block. The
/// Hebrew presentation forms in the Alphabetic Presentation Forms block are
/// treated as if they formed a block of their own (i.e. they are treated as
/// right-to-left). In addition, the four RIGHT-TO-LEFT FOO controls in
/// General Punctuation are detected. Controls that are technically bidi
/// controls but cannot cause right-to-left behavior unless right-to-left
/// characters or right-to-left controls are also present are not detected.
/// As a special case, U+FEFF is excluded from Arabic Presentation Forms-B.
#[inline(always)]
pub fn is_char_bidi(c: char) -> bool {
    let code_point = u32::from(c);
    // Everything below Hebrew is left-to-right; bail out before looking at
    // any of the individual ranges.
    if code_point < 0x0590 {
        return false;
    }
    match code_point {
        // BMP RTL blocks (https://www.unicode.org/roadmaps/bmp/):
        // Hebrew up to and including Arabic Extended-A
        0x0590..=0x08FF => true,
        // Every control with RIGHT-TO-LEFT in its name in
        // https://www.unicode.org/charts/PDF/U2000.pdf:
        // RLM, RLE, RLO and RLI
        0x200F | 0x202B | 0x202E | 0x2067 => true,
        // Hebrew presentation forms and Arabic Presentation Forms A
        0xFB1D..=0xFDFF => true,
        // Arabic Presentation Forms B (excl. BOM)
        0xFE70..=0xFEFE => true,
        // Supplementary RTL (https://www.unicode.org/roadmaps/smp/):
        // lead surrogate U+D802 or U+D803
        0x10800..=0x10FFF => true,
        // lead surrogate U+D83A or U+D83B
        0x1E800..=0x1EFFF => true,
        // Everything in between and above (emoji included)
        _ => false,
    }
}
1360 
/// Reports whether a single UTF-16 code unit calls for right-to-left
/// processing.
///
/// Classification happens at Unicode block granularity and pays no attention
/// to whether a particular code point of a block is assigned. The Hebrew
/// presentation forms inside the Alphabetic Presentation Forms block are
/// handled as though they were a block of their own (i.e. they are treated
/// as right-to-left). On top of the blocks, the four RIGHT-TO-LEFT FOO
/// controls in General Punctuation (U+200F, U+202B, U+202E and U+2067) are
/// recognized. Controls that are technically bidi controls but cannot cause
/// right-to-left behavior unless right-to-left characters or right-to-left
/// controls are present as well are not recognized. U+FEFF is special-cased
/// as not belonging to Arabic Presentation Forms-B.
///
/// Supplementary-plane right-to-left blocks can be told apart by looking at
/// the high surrogate only, so `true` is returned for those high surrogates
/// (U+D802, U+D803, U+D83A and U+D83B). This makes the function usable on
/// supplementary-plane text without decoding surrogate pairs into scalar
/// values. The flip side is that such a high surrogate is reported as
/// right-to-left even when it is unpaired.
#[inline(always)]
pub fn is_utf16_code_unit_bidi(u: u16) -> bool {
    match u {
        // Below Hebrew
        0x0000..=0x058F => false,
        // Hebrew up to the end of Arabic Extended-A
        0x0590..=0x08FF => true,
        // The RTL controls; this arm has to precede the wide left-to-right
        // range below, which overlaps these four values.
        0x200F | 0x202B | 0x202E | 0x2067 => true,
        // Above Arabic Extended-A and below the first RTL high surrogate
        0x0900..=0xD801 => false,
        // High surrogates for U+10800...U+10FFF
        0xD802 | 0xD803 => true,
        // Between the RTL high surrogates
        0xD804..=0xD839 => false,
        // High surrogates for U+1E800...U+1EFFF
        0xD83A | 0xD83B => true,
        // Between the astral RTL high surrogates and the Hebrew presentation
        // forms (Emoji is here)
        0xD83C..=0xFB1C => false,
        // Hebrew presentation forms and Arabic Presentation Forms-A
        0xFB1D..=0xFDFF => true,
        // Between the two Arabic Presentation Forms blocks
        0xFE00..=0xFE6F => false,
        // Arabic Presentation Forms-B (excl. BOM)
        0xFE70..=0xFEFE => true,
        // U+FEFF and everything above it
        0xFEFF..=0xFFFF => false,
    }
}
1412 
1413 /// Checks whether a potentially invalid UTF-8 buffer contains code points
1414 /// that trigger right-to-left processing or is all-Latin1.
1415 ///
1416 /// Possibly more efficient than performing the checks separately.
1417 ///
1418 /// Returns `Latin1Bidi::Latin1` if `is_utf8_latin1()` would return `true`.
1419 /// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf8_bidi()` would return
1420 /// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
check_utf8_for_latin1_and_bidi(buffer: &[u8]) -> Latin1Bidi1421 pub fn check_utf8_for_latin1_and_bidi(buffer: &[u8]) -> Latin1Bidi {
1422     if let Some(offset) = is_utf8_latin1_impl(buffer) {
1423         if is_utf8_bidi(&buffer[offset..]) {
1424             Latin1Bidi::Bidi
1425         } else {
1426             Latin1Bidi::LeftToRight
1427         }
1428     } else {
1429         Latin1Bidi::Latin1
1430     }
1431 }
1432 
1433 /// Checks whether a valid UTF-8 buffer contains code points
1434 /// that trigger right-to-left processing or is all-Latin1.
1435 ///
1436 /// Possibly more efficient than performing the checks separately.
1437 ///
1438 /// Returns `Latin1Bidi::Latin1` if `is_str_latin1()` would return `true`.
1439 /// Otherwise, returns `Latin1Bidi::Bidi` if `is_str_bidi()` would return
1440 /// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
check_str_for_latin1_and_bidi(buffer: &str) -> Latin1Bidi1441 pub fn check_str_for_latin1_and_bidi(buffer: &str) -> Latin1Bidi {
1442     // The transition from the latin1 check to the bidi check isn't
1443     // optimal but not tweaking it to perfection today.
1444     if let Some(offset) = is_str_latin1_impl(buffer) {
1445         if is_str_bidi(&buffer[offset..]) {
1446             Latin1Bidi::Bidi
1447         } else {
1448             Latin1Bidi::LeftToRight
1449         }
1450     } else {
1451         Latin1Bidi::Latin1
1452     }
1453 }
1454 
/// Checks whether a potentially invalid UTF-16 buffer contains code points
/// that trigger right-to-left processing or is all-Latin1.
///
/// Possibly more efficient than performing the checks separately.
///
/// Returns `Latin1Bidi::Latin1` if `is_utf16_latin1()` would return `true`.
/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf16_bidi()` would return
/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
pub fn check_utf16_for_latin1_and_bidi(buffer: &[u16]) -> Latin1Bidi {
    // Public entry point only; all of the work is delegated to the
    // implementation function.
    check_utf16_for_latin1_and_bidi_impl(buffer)
}
1466 
1467 /// Converts potentially-invalid UTF-8 to valid UTF-16 with errors replaced
1468 /// with the REPLACEMENT CHARACTER.
1469 ///
1470 /// The length of the destination buffer must be at least the length of the
1471 /// source buffer _plus one_.
1472 ///
1473 /// Returns the number of `u16`s written.
1474 ///
1475 /// # Panics
1476 ///
1477 /// Panics if the destination buffer is shorter than stated above.
convert_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> usize1478 pub fn convert_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> usize {
1479     // TODO: Can the requirement for dst to be at least one unit longer
1480     // be eliminated?
1481     assert!(dst.len() > src.len());
1482     let mut decoder = Utf8Decoder::new_inner();
1483     let mut total_read = 0usize;
1484     let mut total_written = 0usize;
1485     loop {
1486         let (result, read, written) =
1487             decoder.decode_to_utf16_raw(&src[total_read..], &mut dst[total_written..], true);
1488         total_read += read;
1489         total_written += written;
1490         match result {
1491             DecoderResult::InputEmpty => {
1492                 return total_written;
1493             }
1494             DecoderResult::OutputFull => {
1495                 unreachable!("The assert at the top of the function should have caught this.");
1496             }
1497             DecoderResult::Malformed(_, _) => {
1498                 // There should always be space for the U+FFFD, because
1499                 // otherwise we'd have gotten OutputFull already.
1500                 dst[total_written] = 0xFFFD;
1501                 total_written += 1;
1502             }
1503         }
1504     }
1505 }
1506 
/// Converts valid UTF-8 to valid UTF-16.
///
/// The length of the destination buffer must be at least the length of the
/// source buffer. This is sufficient because no UTF-8 sequence handled below
/// produces more UTF-16 code units than it has bytes (1, 2 and 3 bytes
/// produce one unit each; 4 bytes produce two units).
///
/// Returns the number of `u16`s written.
///
/// # Panics
///
/// Panics if the destination buffer is shorter than stated above.
pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize {
    assert!(
        dst.len() >= src.len(),
        "Destination must not be shorter than the source."
    );
    let bytes = src.as_bytes();
    // `read` counts consumed source bytes and `written` counts produced
    // code units. Every step below advances `written` by no more than it
    // advances `read`, so `written <= read` holds throughout.
    let mut read = 0;
    let mut written = 0;
    'outer: loop {
        // Bulk-convert the ASCII run at the current position. The block
        // evaluates to the first non-ASCII byte, if there is one.
        let mut byte = {
            let src_remaining = &bytes[read..];
            let dst_remaining = &mut dst[written..];
            let length = src_remaining.len();
            // SAFETY: `dst_remaining` has at least `length` elements because
            // `dst.len() >= src.len()` and `written <= read`.
            // NOTE(review): assumes `ascii_to_basic_latin` touches at most
            // `length` elements of either buffer -- confirm against its
            // definition.
            match unsafe {
                ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
            } {
                None => {
                    // The whole remainder was ASCII; nothing is left to do.
                    written += length;
                    return written;
                }
                Some((non_ascii, consumed)) => {
                    read += consumed;
                    written += consumed;
                    non_ascii
                }
            }
        };
        'inner: loop {
            // At this point, `byte` is not included in `read`.
            //
            // SAFETY (for all unchecked accesses in this loop): `src` is a
            // `&str` and therefore valid UTF-8, and `read` only ever advances
            // by whole sequences, so `byte` is a lead byte that is followed
            // by all of its continuation bytes within `bytes`. The writes to
            // `dst` are in bounds because `written <= read`, each sequence
            // writes fewer units than it has bytes (or the same number for
            // ASCII), and `dst.len() >= src.len()`.
            if byte < 0xE0 {
                if byte >= 0x80 {
                    // Two-byte
                    let second = unsafe { *(bytes.get_unchecked(read + 1)) };
                    let point = ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F);
                    unsafe { *(dst.get_unchecked_mut(written)) = point };
                    read += 2;
                    written += 1;
                } else {
                    // ASCII: write and go back to SIMD.
                    unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
                    read += 1;
                    written += 1;
                    // Intuitively, we should go back to the outer loop only
                    // if byte is 0x30 or above, so as to avoid thrashing on
                    // ASCII space, comma and period in non-Latin context.
                    // However, the extra branch seems to cost more than it's
                    // worth.
                    continue 'outer;
                }
            } else if byte < 0xF0 {
                // Three-byte
                let second = unsafe { *(bytes.get_unchecked(read + 1)) };
                let third = unsafe { *(bytes.get_unchecked(read + 2)) };
                let point = ((u16::from(byte) & 0xF) << 12)
                    | ((u16::from(second) & 0x3F) << 6)
                    | (u16::from(third) & 0x3F);
                unsafe { *(dst.get_unchecked_mut(written)) = point };
                read += 3;
                written += 1;
            } else {
                // Four-byte
                let second = unsafe { *(bytes.get_unchecked(read + 1)) };
                let third = unsafe { *(bytes.get_unchecked(read + 2)) };
                let fourth = unsafe { *(bytes.get_unchecked(read + 3)) };
                let point = ((u32::from(byte) & 0x7) << 18)
                    | ((u32::from(second) & 0x3F) << 12)
                    | ((u32::from(third) & 0x3F) << 6)
                    | (u32::from(fourth) & 0x3F);
                // Surrogate pair. 0xD7C0 is 0xD800 - (0x10000 >> 10), so the
                // subtraction of 0x10000 from the scalar value is folded into
                // the constant for the high surrogate; the low ten bits are
                // unaffected by that subtraction.
                unsafe { *(dst.get_unchecked_mut(written)) = (0xD7C0 + (point >> 10)) as u16 };
                unsafe {
                    *(dst.get_unchecked_mut(written + 1)) = (0xDC00 + (point & 0x3FF)) as u16
                };
                read += 4;
                written += 2;
            }
            // The comparison is always < or == and never >, but including
            // > here to let the compiler assume that < is true if this
            // comparison is false.
            if read >= src.len() {
                return written;
            }
            // Stay in the scalar loop for the next lead byte; only an ASCII
            // byte sends control back to the bulk ASCII path.
            byte = bytes[read];
            continue 'inner;
        }
    }
}
1603 
1604 /// Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error.
1605 ///
1606 /// The length of the destination buffer must be at least the length of the
1607 /// source buffer.
1608 ///
1609 /// Returns the number of `u16`s written or `None` if the input was invalid.
1610 ///
1611 /// When the input was invalid, some output may have been written.
1612 ///
1613 /// # Panics
1614 ///
1615 /// Panics if the destination buffer is shorter than stated above.
convert_utf8_to_utf16_without_replacement(src: &[u8], dst: &mut [u16]) -> Option<usize>1616 pub fn convert_utf8_to_utf16_without_replacement(src: &[u8], dst: &mut [u16]) -> Option<usize> {
1617     assert!(
1618         dst.len() >= src.len(),
1619         "Destination must not be shorter than the source."
1620     );
1621     let (read, written) = convert_utf8_to_utf16_up_to_invalid(src, dst);
1622     if read == src.len() {
1623         return Some(written);
1624     }
1625     None
1626 }
1627 
/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
/// with the REPLACEMENT CHARACTER with potentially insufficient output
/// space.
///
/// Returns the number of code units read and the number of bytes written.
///
/// Guarantees that the bytes in the destination beyond the number of
/// bytes claimed as written by the second item of the return tuple
/// are left unmodified.
///
/// Not all code units are read if there isn't enough output space.
///
/// Note that this method isn't designed for general streamability but for
/// not allocating memory for the worst case up front. Specifically,
/// if the input starts with or ends with an unpaired surrogate, those are
/// replaced with the REPLACEMENT CHARACTER.
///
/// Matches the semantics of `TextEncoder.encodeInto()` from the
/// Encoding Standard.
///
/// # Safety
///
/// If you want to convert into a `&mut str`, use
/// `convert_utf16_to_str_partial()` instead of using this function
/// together with the `unsafe` method `as_bytes_mut()` on `&mut str`.
#[inline(always)]
pub fn convert_utf16_to_utf8_partial(src: &[u16], dst: &mut [u8]) -> (usize, usize) {
    // The two functions called below are marked `inline(never)` to make
    // transitions from the hot part (first function) into the cold part
    // (second function) go through a return and another call to discourage
    // the CPU from speculating from the hot code into the cold code.
    // Letting the transitions be mere intra-function jumps, even to
    // basic blocks out-of-lined to the end of the function would wipe
    // away a quarter of Arabic encode performance on Haswell!
    let (read, written) = convert_utf16_to_utf8_partial_inner(src, dst);
    // Common case: the first function consumed the whole input.
    // NOTE(review): `likely` is presumably only a branch-prediction hint
    // with no effect on the result -- confirm against its definition.
    if unsafe { likely(read == src.len()) } {
        return (read, written);
    }
    // Otherwise hand the unread remainder of the input and the unwritten
    // remainder of the output to the second function and add up the counts.
    let (tail_read, tail_written) =
        convert_utf16_to_utf8_partial_tail(&src[read..], &mut dst[written..]);
    (read + tail_read, written + tail_written)
}
1670 
1671 /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1672 /// with the REPLACEMENT CHARACTER.
1673 ///
1674 /// The length of the destination buffer must be at least the length of the
1675 /// source buffer times three.
1676 ///
1677 /// Returns the number of bytes written.
1678 ///
1679 /// # Panics
1680 ///
1681 /// Panics if the destination buffer is shorter than stated above.
1682 ///
1683 /// # Safety
1684 ///
1685 /// If you want to convert into a `&mut str`, use `convert_utf16_to_str()`
1686 /// instead of using this function together with the `unsafe` method
1687 /// `as_bytes_mut()` on `&mut str`.
1688 #[inline(always)]
convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize1689 pub fn convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize {
1690     assert!(dst.len() >= src.len() * 3);
1691     let (read, written) = convert_utf16_to_utf8_partial(src, dst);
1692     debug_assert_eq!(read, src.len());
1693     written
1694 }
1695 
1696 /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1697 /// with the REPLACEMENT CHARACTER such that the validity of the output is
1698 /// signaled using the Rust type system with potentially insufficient output
1699 /// space.
1700 ///
1701 /// Returns the number of code units read and the number of bytes written.
1702 ///
1703 /// Not all code units are read if there isn't enough output space.
1704 ///
1705 /// Note  that this method isn't designed for general streamability but for
1706 /// not allocating memory for the worst case up front. Specifically,
1707 /// if the input starts with or ends with an unpaired surrogate, those are
1708 /// replaced with the REPLACEMENT CHARACTER.
convert_utf16_to_str_partial(src: &[u16], dst: &mut str) -> (usize, usize)1709 pub fn convert_utf16_to_str_partial(src: &[u16], dst: &mut str) -> (usize, usize) {
1710     let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
1711     let (read, written) = convert_utf16_to_utf8_partial(src, bytes);
1712     let len = bytes.len();
1713     let mut trail = written;
1714     while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
1715         bytes[trail] = 0;
1716         trail += 1;
1717     }
1718     (read, written)
1719 }
1720 
1721 /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1722 /// with the REPLACEMENT CHARACTER such that the validity of the output is
1723 /// signaled using the Rust type system.
1724 ///
1725 /// The length of the destination buffer must be at least the length of the
1726 /// source buffer times three.
1727 ///
1728 /// Returns the number of bytes written.
1729 ///
1730 /// # Panics
1731 ///
1732 /// Panics if the destination buffer is shorter than stated above.
1733 #[inline(always)]
convert_utf16_to_str(src: &[u16], dst: &mut str) -> usize1734 pub fn convert_utf16_to_str(src: &[u16], dst: &mut str) -> usize {
1735     assert!(dst.len() >= src.len() * 3);
1736     let (read, written) = convert_utf16_to_str_partial(src, dst);
1737     debug_assert_eq!(read, src.len());
1738     written
1739 }
1740 
/// Converts bytes whose unsigned value is interpreted as Unicode code point
/// (i.e. U+0000 to U+00FF, inclusive) to UTF-16.
///
/// The length of the destination buffer must be at least the length of the
/// source buffer.
///
/// The number of `u16`s written equals the length of the source buffer.
///
/// # Panics
///
/// Panics if the destination buffer is shorter than stated above.
pub fn convert_latin1_to_utf16(src: &[u8], dst: &mut [u16]) {
    assert!(
        dst.len() >= src.len(),
        "Destination must not be shorter than the source."
    );
    // TODO: On aarch64, the safe version autovectorizes to the same unpacking
    // instructions as this code, but, yet, the autovectorized version is
    // faster.
    //
    // SAFETY: The assert above guarantees that `dst` has room for
    // `src.len()` code units.
    // NOTE(review): assumes `unpack_latin1` reads `src.len()` bytes and
    // writes the same number of `u16`s -- confirm against its definition.
    unsafe {
        unpack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len());
    }
}
1764 
1765 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1766 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient
1767 /// output space.
1768 ///
1769 /// Returns the number of bytes read and the number of bytes written.
1770 ///
1771 /// If the output isn't large enough, not all input is consumed.
1772 ///
1773 /// # Safety
1774 ///
1775 /// If you want to convert into a `&mut str`, use
1776 /// `convert_utf16_to_str_partial()` instead of using this function
1777 /// together with the `unsafe` method `as_bytes_mut()` on `&mut str`.
convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usize)1778 pub fn convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usize) {
1779     let src_len = src.len();
1780     let src_ptr = src.as_ptr();
1781     let dst_ptr = dst.as_mut_ptr();
1782     let dst_len = dst.len();
1783     let mut total_read = 0usize;
1784     let mut total_written = 0usize;
1785     loop {
1786         // src can't advance more than dst
1787         let src_left = src_len - total_read;
1788         let dst_left = dst_len - total_written;
1789         let min_left = ::core::cmp::min(src_left, dst_left);
1790         if let Some((non_ascii, consumed)) = unsafe {
1791             ascii_to_ascii(
1792                 src_ptr.add(total_read),
1793                 dst_ptr.add(total_written),
1794                 min_left,
1795             )
1796         } {
1797             total_read += consumed;
1798             total_written += consumed;
1799             if total_written.checked_add(2).unwrap() > dst_len {
1800                 return (total_read, total_written);
1801             }
1802 
1803             total_read += 1; // consume `non_ascii`
1804 
1805             dst[total_written] = (non_ascii >> 6) | 0xC0;
1806             total_written += 1;
1807             dst[total_written] = (non_ascii & 0x3F) | 0x80;
1808             total_written += 1;
1809             continue;
1810         }
1811         return (total_read + min_left, total_written + min_left);
1812     }
1813 }
1814 
1815 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1816 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
1817 ///
1818 /// The length of the destination buffer must be at least the length of the
1819 /// source buffer times two.
1820 ///
1821 /// Returns the number of bytes written.
1822 ///
1823 /// # Panics
1824 ///
1825 /// Panics if the destination buffer is shorter than stated above.
1826 ///
1827 /// # Safety
1828 ///
1829 /// Note that this function may write garbage beyond the number of bytes
1830 /// indicated by the return value, so using a `&mut str` interpreted as
1831 /// `&mut [u8]` as the destination is not safe. If you want to convert into
1832 /// a `&mut str`, use `convert_utf16_to_str()` instead of this function.
1833 #[inline]
convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize1834 pub fn convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize {
1835     assert!(
1836         dst.len() >= src.len() * 2,
1837         "Destination must not be shorter than the source times two."
1838     );
1839     let (read, written) = convert_latin1_to_utf8_partial(src, dst);
1840     debug_assert_eq!(read, src.len());
1841     written
1842 }
1843 
1844 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1845 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the
1846 /// output is signaled using the Rust type system with potentially insufficient
1847 /// output space.
1848 ///
1849 /// Returns the number of bytes read and the number of bytes written.
1850 ///
1851 /// If the output isn't large enough, not all input is consumed.
1852 #[inline]
convert_latin1_to_str_partial(src: &[u8], dst: &mut str) -> (usize, usize)1853 pub fn convert_latin1_to_str_partial(src: &[u8], dst: &mut str) -> (usize, usize) {
1854     let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
1855     let (read, written) = convert_latin1_to_utf8_partial(src, bytes);
1856     let len = bytes.len();
1857     let mut trail = written;
1858     let max = ::core::cmp::min(len, trail + MAX_STRIDE_SIZE);
1859     while trail < max {
1860         bytes[trail] = 0;
1861         trail += 1;
1862     }
1863     while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
1864         bytes[trail] = 0;
1865         trail += 1;
1866     }
1867     (read, written)
1868 }
1869 
1870 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1871 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the
1872 /// output is signaled using the Rust type system.
1873 ///
1874 /// The length of the destination buffer must be at least the length of the
1875 /// source buffer times two.
1876 ///
1877 /// Returns the number of bytes written.
1878 ///
1879 /// # Panics
1880 ///
1881 /// Panics if the destination buffer is shorter than stated above.
1882 #[inline]
convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize1883 pub fn convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize {
1884     assert!(
1885         dst.len() >= src.len() * 2,
1886         "Destination must not be shorter than the source times two."
1887     );
1888     let (read, written) = convert_latin1_to_str_partial(src, dst);
1889     debug_assert_eq!(read, src.len());
1890     written
1891 }
1892 
1893 /// If the input is valid UTF-8 representing only Unicode code points from
1894 /// U+0000 to U+00FF, inclusive, converts the input into output that
1895 /// represents the value of each code point as the unsigned byte value of
1896 /// each output byte.
1897 ///
1898 /// If the input does not fulfill the condition stated above, this function
1899 /// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
1900 /// does something that is memory-safe without any promises about any
1901 /// properties of the output. In particular, callers shouldn't assume the
1902 /// output to be the same across crate versions or CPU architectures and
1903 /// should not assume that non-ASCII input can't map to ASCII output.
1904 ///
1905 /// The length of the destination buffer must be at least the length of the
1906 /// source buffer.
1907 ///
1908 /// Returns the number of bytes written.
1909 ///
1910 /// # Panics
1911 ///
1912 /// Panics if the destination buffer is shorter than stated above.
1913 ///
1914 /// If debug assertions are enabled (and not fuzzing) and the input is
1915 /// not in the range U+0000 to U+00FF, inclusive.
convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize1916 pub fn convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize {
1917     assert!(
1918         dst.len() >= src.len(),
1919         "Destination must not be shorter than the source."
1920     );
1921     non_fuzz_debug_assert!(is_utf8_latin1(src));
1922     let src_len = src.len();
1923     let src_ptr = src.as_ptr();
1924     let dst_ptr = dst.as_mut_ptr();
1925     let mut total_read = 0usize;
1926     let mut total_written = 0usize;
1927     loop {
1928         // dst can't advance more than src
1929         let src_left = src_len - total_read;
1930         if let Some((non_ascii, consumed)) = unsafe {
1931             ascii_to_ascii(
1932                 src_ptr.add(total_read),
1933                 dst_ptr.add(total_written),
1934                 src_left,
1935             )
1936         } {
1937             total_read += consumed + 1;
1938             total_written += consumed;
1939 
1940             if total_read == src_len {
1941                 return total_written;
1942             }
1943 
1944             let trail = src[total_read];
1945             total_read += 1;
1946 
1947             dst[total_written] = ((non_ascii & 0x1F) << 6) | (trail & 0x3F);
1948             total_written += 1;
1949             continue;
1950         }
1951         return total_written + src_left;
1952     }
1953 }
1954 
/// If the input is valid UTF-16 representing only Unicode code points from
/// U+0000 to U+00FF, inclusive, converts the input into output that
/// represents the value of each code point as the unsigned byte value of
/// each output byte.
///
/// If the input does not fulfill the condition stated above, does something
/// that is memory-safe without any promises about any properties of the
/// output and will probably assert in debug builds in future versions.
/// In particular, callers shouldn't assume the output to be the same across
/// crate versions or CPU architectures and should not assume that non-ASCII
/// input can't map to ASCII output.
///
/// The length of the destination buffer must be at least the length of the
/// source buffer.
///
/// The number of bytes written equals the length of the source buffer.
///
/// # Panics
///
/// Panics if the destination buffer is shorter than stated above.
///
/// (Probably in future versions if debug assertions are enabled (and not
/// fuzzing) and the input is not in the range U+0000 to U+00FF, inclusive.)
pub fn convert_utf16_to_latin1_lossy(src: &[u16], dst: &mut [u8]) {
    assert!(
        dst.len() >= src.len(),
        "Destination must not be shorter than the source."
    );
    // The input check below is intentionally still disabled; see the
    // "future versions" remarks in the documentation above.
    // non_fuzz_debug_assert!(is_utf16_latin1(src));
    //
    // SAFETY: The assert above guarantees that `dst` has room for
    // `src.len()` bytes.
    // NOTE(review): assumes `pack_latin1` reads `src.len()` code units and
    // writes the same number of bytes -- confirm against its definition.
    unsafe {
        pack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len());
    }
}
1988 
/// Converts bytes whose unsigned value is interpreted as Unicode code point
/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
///
/// Borrows if input is ASCII-only. Performs a single heap allocation
/// otherwise.
///
/// Only available if the `alloc` feature is enabled (enabled by default).
#[cfg(feature = "alloc")]
pub fn decode_latin1<'a>(bytes: &'a [u8]) -> Cow<'a, str> {
    let up_to = ascii_valid_up_to(bytes);
    // >= makes later things optimize better than ==
    if up_to >= bytes.len() {
        debug_assert_eq!(up_to, bytes.len());
        // SAFETY: The whole input was found to be ASCII, which is valid
        // UTF-8.
        let s: &str = unsafe { ::core::str::from_utf8_unchecked(bytes) };
        return Cow::Borrowed(s);
    }
    let (head, tail) = bytes.split_at(up_to);
    // The ASCII head is copied as-is; each byte of the tail takes at most
    // two bytes of UTF-8.
    let capacity = head.len() + tail.len() * 2;
    let mut vec: Vec<u8> = Vec::with_capacity(capacity);
    // Initialize the buffer instead of calling `set_len()` on uninitialized
    // capacity: `Vec::set_len` requires the exposed elements to be
    // initialized. Both calls stay within the capacity reserved above, so
    // this is still a single allocation.
    vec.extend_from_slice(head);
    vec.resize(capacity, 0);
    let written = convert_latin1_to_utf8(tail, &mut vec[up_to..]);
    vec.truncate(up_to + written);
    // SAFETY: The vector holds the ASCII head followed by the `written`
    // bytes that `convert_latin1_to_utf8` reported as its UTF-8 output.
    Cow::Owned(unsafe { String::from_utf8_unchecked(vec) })
}
2016 
/// If the input is valid UTF-8 representing only Unicode code points from
/// U+0000 to U+00FF, inclusive, converts the input into output that
/// represents the value of each code point as the unsigned byte value of
/// each output byte.
///
/// If the input does not fulfill the condition stated above, this function
/// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
/// does something that is memory-safe without any promises about any
/// properties of the output. In particular, callers shouldn't assume the
/// output to be the same across crate versions or CPU architectures and
/// should not assume that non-ASCII input can't map to ASCII output.
///
/// Borrows if input is ASCII-only. Performs a single heap allocation
/// otherwise.
///
/// Only available if the `alloc` feature is enabled (enabled by default).
#[cfg(feature = "alloc")]
pub fn encode_latin1_lossy<'a>(string: &'a str) -> Cow<'a, [u8]> {
    let bytes = string.as_bytes();
    let up_to = ascii_valid_up_to(bytes);
    // >= makes later things optimize better than ==
    if up_to >= bytes.len() {
        debug_assert_eq!(up_to, bytes.len());
        return Cow::Borrowed(bytes);
    }
    let (head, tail) = bytes.split_at(up_to);
    // The Latin1 output is never longer than the UTF-8 input.
    let capacity = bytes.len();
    let mut vec: Vec<u8> = Vec::with_capacity(capacity);
    // Initialize the buffer instead of calling `set_len()` on uninitialized
    // capacity: `Vec::set_len` requires the exposed elements to be
    // initialized. Both calls stay within the capacity reserved above, so
    // this is still a single allocation.
    vec.extend_from_slice(head);
    vec.resize(capacity, 0);
    let written = convert_utf8_to_latin1_lossy(tail, &mut vec[up_to..]);
    vec.truncate(up_to + written);
    Cow::Owned(vec)
}
2053 
/// Returns the index of the first unpaired surrogate or, if the input is
/// valid UTF-16 in its entirety, the length of the input.
pub fn utf16_valid_up_to(buffer: &[u16]) -> usize {
    // Public entry point only; all of the work is delegated to the
    // implementation function.
    utf16_valid_up_to_impl(buffer)
}
2059 
2060 /// Returns the index of first byte that starts an invalid byte
2061 /// sequence or a non-Latin1 byte sequence, or the length of the
2062 /// string if there are neither.
utf8_latin1_up_to(buffer: &[u8]) -> usize2063 pub fn utf8_latin1_up_to(buffer: &[u8]) -> usize {
2064     is_utf8_latin1_impl(buffer).unwrap_or(buffer.len())
2065 }
2066 
2067 /// Returns the index of first byte that starts a non-Latin1 byte
2068 /// sequence, or the length of the string if there are none.
str_latin1_up_to(buffer: &str) -> usize2069 pub fn str_latin1_up_to(buffer: &str) -> usize {
2070     is_str_latin1_impl(buffer).unwrap_or_else(|| buffer.len())
2071 }
2072 
2073 /// Replaces unpaired surrogates in the input with the REPLACEMENT CHARACTER.
2074 #[inline]
ensure_utf16_validity(buffer: &mut [u16])2075 pub fn ensure_utf16_validity(buffer: &mut [u16]) {
2076     let mut offset = 0;
2077     loop {
2078         offset += utf16_valid_up_to(&buffer[offset..]);
2079         if offset == buffer.len() {
2080             return;
2081         }
2082         buffer[offset] = 0xFFFD;
2083         offset += 1;
2084     }
2085 }
2086 
2087 /// Copies ASCII from source to destination up to the first non-ASCII byte
2088 /// (or the end of the input if it is ASCII in its entirety).
2089 ///
2090 /// The length of the destination buffer must be at least the length of the
2091 /// source buffer.
2092 ///
2093 /// Returns the number of bytes written.
2094 ///
2095 /// # Panics
2096 ///
2097 /// Panics if the destination buffer is shorter than stated above.
copy_ascii_to_ascii(src: &[u8], dst: &mut [u8]) -> usize2098 pub fn copy_ascii_to_ascii(src: &[u8], dst: &mut [u8]) -> usize {
2099     assert!(
2100         dst.len() >= src.len(),
2101         "Destination must not be shorter than the source."
2102     );
2103     if let Some((_, consumed)) =
2104         unsafe { ascii_to_ascii(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2105     {
2106         consumed
2107     } else {
2108         src.len()
2109     }
2110 }
2111 
2112 /// Copies ASCII from source to destination zero-extending it to UTF-16 up to
2113 /// the first non-ASCII byte (or the end of the input if it is ASCII in its
2114 /// entirety).
2115 ///
2116 /// The length of the destination buffer must be at least the length of the
2117 /// source buffer.
2118 ///
2119 /// Returns the number of `u16`s written.
2120 ///
2121 /// # Panics
2122 ///
2123 /// Panics if the destination buffer is shorter than stated above.
copy_ascii_to_basic_latin(src: &[u8], dst: &mut [u16]) -> usize2124 pub fn copy_ascii_to_basic_latin(src: &[u8], dst: &mut [u16]) -> usize {
2125     assert!(
2126         dst.len() >= src.len(),
2127         "Destination must not be shorter than the source."
2128     );
2129     if let Some((_, consumed)) =
2130         unsafe { ascii_to_basic_latin(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2131     {
2132         consumed
2133     } else {
2134         src.len()
2135     }
2136 }
2137 
2138 /// Copies Basic Latin from source to destination narrowing it to ASCII up to
2139 /// the first non-Basic Latin code unit (or the end of the input if it is
2140 /// Basic Latin in its entirety).
2141 ///
2142 /// The length of the destination buffer must be at least the length of the
2143 /// source buffer.
2144 ///
2145 /// Returns the number of bytes written.
2146 ///
2147 /// # Panics
2148 ///
2149 /// Panics if the destination buffer is shorter than stated above.
copy_basic_latin_to_ascii(src: &[u16], dst: &mut [u8]) -> usize2150 pub fn copy_basic_latin_to_ascii(src: &[u16], dst: &mut [u8]) -> usize {
2151     assert!(
2152         dst.len() >= src.len(),
2153         "Destination must not be shorter than the source."
2154     );
2155     if let Some((_, consumed)) =
2156         unsafe { basic_latin_to_ascii(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2157     {
2158         consumed
2159     } else {
2160         src.len()
2161     }
2162 }
2163 
2164 // Any copyright to the test code below this comment is dedicated to the
2165 // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
2166 
2167 #[cfg(all(test, feature = "alloc"))]
2168 mod tests {
2169     use super::*;
2170 
2171     #[test]
test_is_ascii_success()2172     fn test_is_ascii_success() {
2173         let mut src: Vec<u8> = Vec::with_capacity(128);
2174         src.resize(128, 0);
2175         for i in 0..src.len() {
2176             src[i] = i as u8;
2177         }
2178         for i in 0..src.len() {
2179             assert!(is_ascii(&src[i..]));
2180         }
2181     }
2182 
2183     #[test]
test_is_ascii_fail()2184     fn test_is_ascii_fail() {
2185         let mut src: Vec<u8> = Vec::with_capacity(128);
2186         src.resize(128, 0);
2187         for i in 0..src.len() {
2188             src[i] = i as u8;
2189         }
2190         for i in 0..src.len() {
2191             let tail = &mut src[i..];
2192             for j in 0..tail.len() {
2193                 tail[j] = 0xA0;
2194                 assert!(!is_ascii(tail));
2195             }
2196         }
2197     }
2198 
2199     #[test]
test_is_basic_latin_success()2200     fn test_is_basic_latin_success() {
2201         let mut src: Vec<u16> = Vec::with_capacity(128);
2202         src.resize(128, 0);
2203         for i in 0..src.len() {
2204             src[i] = i as u16;
2205         }
2206         for i in 0..src.len() {
2207             assert!(is_basic_latin(&src[i..]));
2208         }
2209     }
2210 
2211     #[test]
test_is_basic_latin_fail()2212     fn test_is_basic_latin_fail() {
2213         let mut src: Vec<u16> = Vec::with_capacity(128);
2214         src.resize(128, 0);
2215         for i in 0..src.len() {
2216             src[i] = i as u16;
2217         }
2218         for i in 0..src.len() {
2219             let tail = &mut src[i..];
2220             for j in 0..tail.len() {
2221                 tail[j] = 0xA0;
2222                 assert!(!is_basic_latin(tail));
2223             }
2224         }
2225     }
2226 
2227     #[test]
test_is_utf16_latin1_success()2228     fn test_is_utf16_latin1_success() {
2229         let mut src: Vec<u16> = Vec::with_capacity(256);
2230         src.resize(256, 0);
2231         for i in 0..src.len() {
2232             src[i] = i as u16;
2233         }
2234         for i in 0..src.len() {
2235             assert!(is_utf16_latin1(&src[i..]));
2236             assert_eq!(
2237                 check_utf16_for_latin1_and_bidi(&src[i..]),
2238                 Latin1Bidi::Latin1
2239             );
2240         }
2241     }
2242 
2243     #[test]
test_is_utf16_latin1_fail()2244     fn test_is_utf16_latin1_fail() {
2245         let len = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow
2246         let mut src: Vec<u16> = Vec::with_capacity(len);
2247         src.resize(len, 0);
2248         for i in 0..src.len() {
2249             src[i] = i as u16;
2250         }
2251         for i in 0..src.len() {
2252             let tail = &mut src[i..];
2253             for j in 0..tail.len() {
2254                 tail[j] = 0x100 + j as u16;
2255                 assert!(!is_utf16_latin1(tail));
2256                 assert_ne!(check_utf16_for_latin1_and_bidi(tail), Latin1Bidi::Latin1);
2257             }
2258         }
2259     }
2260 
2261     #[test]
test_is_str_latin1_success()2262     fn test_is_str_latin1_success() {
2263         let len = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow
2264         let mut src: Vec<u16> = Vec::with_capacity(len);
2265         src.resize(len, 0);
2266         for i in 0..src.len() {
2267             src[i] = i as u16;
2268         }
2269         for i in 0..src.len() {
2270             let s = String::from_utf16(&src[i..]).unwrap();
2271             assert!(is_str_latin1(&s[..]));
2272             assert_eq!(check_str_for_latin1_and_bidi(&s[..]), Latin1Bidi::Latin1);
2273         }
2274     }
2275 
2276     #[test]
test_is_str_latin1_fail()2277     fn test_is_str_latin1_fail() {
2278         let len = if cfg!(miri) { 32 } else { 256 }; // Miri is too slow
2279         let mut src: Vec<u16> = Vec::with_capacity(len);
2280         src.resize(len, 0);
2281         for i in 0..src.len() {
2282             src[i] = i as u16;
2283         }
2284         for i in 0..src.len() {
2285             let tail = &mut src[i..];
2286             for j in 0..tail.len() {
2287                 tail[j] = 0x100 + j as u16;
2288                 let s = String::from_utf16(tail).unwrap();
2289                 assert!(!is_str_latin1(&s[..]));
2290                 assert_ne!(check_str_for_latin1_and_bidi(&s[..]), Latin1Bidi::Latin1);
2291             }
2292         }
2293     }
2294 
2295     #[test]
test_is_utf8_latin1_success()2296     fn test_is_utf8_latin1_success() {
2297         let len = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow
2298         let mut src: Vec<u16> = Vec::with_capacity(len);
2299         src.resize(len, 0);
2300         for i in 0..src.len() {
2301             src[i] = i as u16;
2302         }
2303         for i in 0..src.len() {
2304             let s = String::from_utf16(&src[i..]).unwrap();
2305             assert!(is_utf8_latin1(s.as_bytes()));
2306             assert_eq!(
2307                 check_utf8_for_latin1_and_bidi(s.as_bytes()),
2308                 Latin1Bidi::Latin1
2309             );
2310         }
2311     }
2312 
2313     #[test]
test_is_utf8_latin1_fail()2314     fn test_is_utf8_latin1_fail() {
2315         let len = if cfg!(miri) { 32 } else { 256 }; // Miri is too slow
2316         let mut src: Vec<u16> = Vec::with_capacity(len);
2317         src.resize(len, 0);
2318         for i in 0..src.len() {
2319             src[i] = i as u16;
2320         }
2321         for i in 0..src.len() {
2322             let tail = &mut src[i..];
2323             for j in 0..tail.len() {
2324                 tail[j] = 0x100 + j as u16;
2325                 let s = String::from_utf16(tail).unwrap();
2326                 assert!(!is_utf8_latin1(s.as_bytes()));
2327                 assert_ne!(
2328                     check_utf8_for_latin1_and_bidi(s.as_bytes()),
2329                     Latin1Bidi::Latin1
2330                 );
2331             }
2332         }
2333     }
2334 
2335     #[test]
test_is_utf8_latin1_invalid()2336     fn test_is_utf8_latin1_invalid() {
2337         assert!(!is_utf8_latin1(b"\xC3"));
2338         assert!(!is_utf8_latin1(b"a\xC3"));
2339         assert!(!is_utf8_latin1(b"\xFF"));
2340         assert!(!is_utf8_latin1(b"a\xFF"));
2341         assert!(!is_utf8_latin1(b"\xC3\xFF"));
2342         assert!(!is_utf8_latin1(b"a\xC3\xFF"));
2343     }
2344 
2345     #[test]
test_convert_utf8_to_utf16()2346     fn test_convert_utf8_to_utf16() {
2347         let src = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
2348         let mut dst: Vec<u16> = Vec::with_capacity(src.len() + 1);
2349         dst.resize(src.len() + 1, 0);
2350         let len = convert_utf8_to_utf16(src.as_bytes(), &mut dst[..]);
2351         dst.truncate(len);
2352         let reference: Vec<u16> = src.encode_utf16().collect();
2353         assert_eq!(dst, reference);
2354     }
2355 
2356     #[test]
test_convert_str_to_utf16()2357     fn test_convert_str_to_utf16() {
2358         let src = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
2359         let mut dst: Vec<u16> = Vec::with_capacity(src.len());
2360         dst.resize(src.len(), 0);
2361         let len = convert_str_to_utf16(src, &mut dst[..]);
2362         dst.truncate(len);
2363         let reference: Vec<u16> = src.encode_utf16().collect();
2364         assert_eq!(dst, reference);
2365     }
2366 
2367     #[test]
test_convert_utf16_to_utf8_partial()2368     fn test_convert_utf16_to_utf8_partial() {
2369         let reference = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
2370         let src: Vec<u16> = reference.encode_utf16().collect();
2371         let mut dst: Vec<u8> = Vec::with_capacity(src.len() * 3 + 1);
2372         dst.resize(src.len() * 3 + 1, 0);
2373         let (read, written) = convert_utf16_to_utf8_partial(&src[..], &mut dst[..24]);
2374         let len = written + convert_utf16_to_utf8(&src[read..], &mut dst[written..]);
2375         dst.truncate(len);
2376         assert_eq!(dst, reference.as_bytes());
2377     }
2378 
2379     #[test]
test_convert_utf16_to_utf8()2380     fn test_convert_utf16_to_utf8() {
2381         let reference = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
2382         let src: Vec<u16> = reference.encode_utf16().collect();
2383         let mut dst: Vec<u8> = Vec::with_capacity(src.len() * 3 + 1);
2384         dst.resize(src.len() * 3 + 1, 0);
2385         let len = convert_utf16_to_utf8(&src[..], &mut dst[..]);
2386         dst.truncate(len);
2387         assert_eq!(dst, reference.as_bytes());
2388     }
2389 
2390     #[test]
test_convert_latin1_to_utf16()2391     fn test_convert_latin1_to_utf16() {
2392         let mut src: Vec<u8> = Vec::with_capacity(256);
2393         src.resize(256, 0);
2394         let mut reference: Vec<u16> = Vec::with_capacity(256);
2395         reference.resize(256, 0);
2396         for i in 0..256 {
2397             src[i] = i as u8;
2398             reference[i] = i as u16;
2399         }
2400         let mut dst: Vec<u16> = Vec::with_capacity(src.len());
2401         dst.resize(src.len(), 0);
2402         convert_latin1_to_utf16(&src[..], &mut dst[..]);
2403         assert_eq!(dst, reference);
2404     }
2405 
2406     #[test]
test_convert_latin1_to_utf8_partial()2407     fn test_convert_latin1_to_utf8_partial() {
2408         let mut dst = [0u8, 2];
2409         let (read, written) = convert_latin1_to_utf8_partial(b"a\xFF", &mut dst[..]);
2410         assert_eq!(read, 1);
2411         assert_eq!(written, 1);
2412     }
2413 
2414     #[test]
test_convert_latin1_to_utf8()2415     fn test_convert_latin1_to_utf8() {
2416         let mut src: Vec<u8> = Vec::with_capacity(256);
2417         src.resize(256, 0);
2418         let mut reference: Vec<u16> = Vec::with_capacity(256);
2419         reference.resize(256, 0);
2420         for i in 0..256 {
2421             src[i] = i as u8;
2422             reference[i] = i as u16;
2423         }
2424         let s = String::from_utf16(&reference[..]).unwrap();
2425         let mut dst: Vec<u8> = Vec::with_capacity(src.len() * 2);
2426         dst.resize(src.len() * 2, 0);
2427         let len = convert_latin1_to_utf8(&src[..], &mut dst[..]);
2428         dst.truncate(len);
2429         assert_eq!(&dst[..], s.as_bytes());
2430     }
2431 
2432     #[test]
test_convert_utf8_to_latin1_lossy()2433     fn test_convert_utf8_to_latin1_lossy() {
2434         let mut reference: Vec<u8> = Vec::with_capacity(256);
2435         reference.resize(256, 0);
2436         let mut src16: Vec<u16> = Vec::with_capacity(256);
2437         src16.resize(256, 0);
2438         for i in 0..256 {
2439             src16[i] = i as u16;
2440             reference[i] = i as u8;
2441         }
2442         let src = String::from_utf16(&src16[..]).unwrap();
2443         let mut dst: Vec<u8> = Vec::with_capacity(src.len());
2444         dst.resize(src.len(), 0);
2445         let len = convert_utf8_to_latin1_lossy(src.as_bytes(), &mut dst[..]);
2446         dst.truncate(len);
2447         assert_eq!(dst, reference);
2448     }
2449 
2450     #[cfg(all(debug_assertions, not(fuzzing)))]
2451     #[test]
2452     #[should_panic]
test_convert_utf8_to_latin1_lossy_panics()2453     fn test_convert_utf8_to_latin1_lossy_panics() {
2454         let mut dst = [0u8; 16];
2455         let _ = convert_utf8_to_latin1_lossy("\u{100}".as_bytes(), &mut dst[..]);
2456     }
2457 
2458     #[test]
test_convert_utf16_to_latin1_lossy()2459     fn test_convert_utf16_to_latin1_lossy() {
2460         let mut src: Vec<u16> = Vec::with_capacity(256);
2461         src.resize(256, 0);
2462         let mut reference: Vec<u8> = Vec::with_capacity(256);
2463         reference.resize(256, 0);
2464         for i in 0..256 {
2465             src[i] = i as u16;
2466             reference[i] = i as u8;
2467         }
2468         let mut dst: Vec<u8> = Vec::with_capacity(src.len());
2469         dst.resize(src.len(), 0);
2470         convert_utf16_to_latin1_lossy(&src[..], &mut dst[..]);
2471         assert_eq!(dst, reference);
2472     }
2473 
2474     #[test]
2475     // #[should_panic]
test_convert_utf16_to_latin1_lossy_panics()2476     fn test_convert_utf16_to_latin1_lossy_panics() {
2477         let mut dst = [0u8; 16];
2478         let _ = convert_utf16_to_latin1_lossy(&[0x0100u16], &mut dst[..]);
2479     }
2480 
2481     #[test]
test_utf16_valid_up_to()2482     fn test_utf16_valid_up_to() {
2483         let valid = vec![
2484             0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0x2603u16,
2485             0xD83Du16, 0xDCA9u16, 0x00B6u16,
2486         ];
2487         assert_eq!(utf16_valid_up_to(&valid[..]), 16);
2488         let lone_high = vec![
2489             0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2490             0x2603u16, 0xD83Du16, 0x00B6u16,
2491         ];
2492         assert_eq!(utf16_valid_up_to(&lone_high[..]), 14);
2493         let lone_low = vec![
2494             0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2495             0x2603u16, 0xDCA9u16, 0x00B6u16,
2496         ];
2497         assert_eq!(utf16_valid_up_to(&lone_low[..]), 14);
2498         let lone_high_at_end = vec![
2499             0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2500             0x2603u16, 0x00B6u16, 0xD83Du16,
2501         ];
2502         assert_eq!(utf16_valid_up_to(&lone_high_at_end[..]), 15);
2503     }
2504 
2505     #[test]
test_ensure_utf16_validity()2506     fn test_ensure_utf16_validity() {
2507         let mut src = vec![
2508             0u16, 0xD83Du16, 0u16, 0u16, 0u16, 0xD83Du16, 0xDCA9u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2509             0u16, 0xDCA9u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2510             0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2511         ];
2512         let reference = vec![
2513             0u16, 0xFFFDu16, 0u16, 0u16, 0u16, 0xD83Du16, 0xDCA9u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2514             0u16, 0xFFFDu16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2515             0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2516         ];
2517         ensure_utf16_validity(&mut src[..]);
2518         assert_eq!(src, reference);
2519     }
2520 
2521     #[test]
test_is_char_bidi()2522     fn test_is_char_bidi() {
2523         assert!(!is_char_bidi('a'));
2524         assert!(!is_char_bidi('\u{03B1}'));
2525         assert!(!is_char_bidi('\u{3041}'));
2526         assert!(!is_char_bidi('\u{1F4A9}'));
2527         assert!(!is_char_bidi('\u{FE00}'));
2528         assert!(!is_char_bidi('\u{202C}'));
2529         assert!(!is_char_bidi('\u{FEFF}'));
2530         assert!(is_char_bidi('\u{0590}'));
2531         assert!(is_char_bidi('\u{08FF}'));
2532         assert!(is_char_bidi('\u{061C}'));
2533         assert!(is_char_bidi('\u{FB50}'));
2534         assert!(is_char_bidi('\u{FDFF}'));
2535         assert!(is_char_bidi('\u{FE70}'));
2536         assert!(is_char_bidi('\u{FEFE}'));
2537         assert!(is_char_bidi('\u{200F}'));
2538         assert!(is_char_bidi('\u{202B}'));
2539         assert!(is_char_bidi('\u{202E}'));
2540         assert!(is_char_bidi('\u{2067}'));
2541         assert!(is_char_bidi('\u{10800}'));
2542         assert!(is_char_bidi('\u{10FFF}'));
2543         assert!(is_char_bidi('\u{1E800}'));
2544         assert!(is_char_bidi('\u{1EFFF}'));
2545     }
2546 
2547     #[test]
test_is_utf16_code_unit_bidi()2548     fn test_is_utf16_code_unit_bidi() {
2549         assert!(!is_utf16_code_unit_bidi(0x0062));
2550         assert!(!is_utf16_code_unit_bidi(0x03B1));
2551         assert!(!is_utf16_code_unit_bidi(0x3041));
2552         assert!(!is_utf16_code_unit_bidi(0xD801));
2553         assert!(!is_utf16_code_unit_bidi(0xFE00));
2554         assert!(!is_utf16_code_unit_bidi(0x202C));
2555         assert!(!is_utf16_code_unit_bidi(0xFEFF));
2556         assert!(is_utf16_code_unit_bidi(0x0590));
2557         assert!(is_utf16_code_unit_bidi(0x08FF));
2558         assert!(is_utf16_code_unit_bidi(0x061C));
2559         assert!(is_utf16_code_unit_bidi(0xFB1D));
2560         assert!(is_utf16_code_unit_bidi(0xFB50));
2561         assert!(is_utf16_code_unit_bidi(0xFDFF));
2562         assert!(is_utf16_code_unit_bidi(0xFE70));
2563         assert!(is_utf16_code_unit_bidi(0xFEFE));
2564         assert!(is_utf16_code_unit_bidi(0x200F));
2565         assert!(is_utf16_code_unit_bidi(0x202B));
2566         assert!(is_utf16_code_unit_bidi(0x202E));
2567         assert!(is_utf16_code_unit_bidi(0x2067));
2568         assert!(is_utf16_code_unit_bidi(0xD802));
2569         assert!(is_utf16_code_unit_bidi(0xD803));
2570         assert!(is_utf16_code_unit_bidi(0xD83A));
2571         assert!(is_utf16_code_unit_bidi(0xD83B));
2572     }
2573 
2574     #[test]
test_is_str_bidi()2575     fn test_is_str_bidi() {
2576         assert!(!is_str_bidi("abcdefghijklmnopaabcdefghijklmnop"));
2577         assert!(!is_str_bidi("abcdefghijklmnop\u{03B1}abcdefghijklmnop"));
2578         assert!(!is_str_bidi("abcdefghijklmnop\u{3041}abcdefghijklmnop"));
2579         assert!(!is_str_bidi("abcdefghijklmnop\u{1F4A9}abcdefghijklmnop"));
2580         assert!(!is_str_bidi("abcdefghijklmnop\u{FE00}abcdefghijklmnop"));
2581         assert!(!is_str_bidi("abcdefghijklmnop\u{202C}abcdefghijklmnop"));
2582         assert!(!is_str_bidi("abcdefghijklmnop\u{FEFF}abcdefghijklmnop"));
2583         assert!(is_str_bidi("abcdefghijklmnop\u{0590}abcdefghijklmnop"));
2584         assert!(is_str_bidi("abcdefghijklmnop\u{08FF}abcdefghijklmnop"));
2585         assert!(is_str_bidi("abcdefghijklmnop\u{061C}abcdefghijklmnop"));
2586         assert!(is_str_bidi("abcdefghijklmnop\u{FB50}abcdefghijklmnop"));
2587         assert!(is_str_bidi("abcdefghijklmnop\u{FDFF}abcdefghijklmnop"));
2588         assert!(is_str_bidi("abcdefghijklmnop\u{FE70}abcdefghijklmnop"));
2589         assert!(is_str_bidi("abcdefghijklmnop\u{FEFE}abcdefghijklmnop"));
2590         assert!(is_str_bidi("abcdefghijklmnop\u{200F}abcdefghijklmnop"));
2591         assert!(is_str_bidi("abcdefghijklmnop\u{202B}abcdefghijklmnop"));
2592         assert!(is_str_bidi("abcdefghijklmnop\u{202E}abcdefghijklmnop"));
2593         assert!(is_str_bidi("abcdefghijklmnop\u{2067}abcdefghijklmnop"));
2594         assert!(is_str_bidi("abcdefghijklmnop\u{10800}abcdefghijklmnop"));
2595         assert!(is_str_bidi("abcdefghijklmnop\u{10FFF}abcdefghijklmnop"));
2596         assert!(is_str_bidi("abcdefghijklmnop\u{1E800}abcdefghijklmnop"));
2597         assert!(is_str_bidi("abcdefghijklmnop\u{1EFFF}abcdefghijklmnop"));
2598     }
2599 
2600     #[test]
test_is_utf8_bidi()2601     fn test_is_utf8_bidi() {
2602         assert!(!is_utf8_bidi(
2603             "abcdefghijklmnopaabcdefghijklmnop".as_bytes()
2604         ));
2605         assert!(!is_utf8_bidi(
2606             "abcdefghijklmnop\u{03B1}abcdefghijklmnop".as_bytes()
2607         ));
2608         assert!(!is_utf8_bidi(
2609             "abcdefghijklmnop\u{3041}abcdefghijklmnop".as_bytes()
2610         ));
2611         assert!(!is_utf8_bidi(
2612             "abcdefghijklmnop\u{1F4A9}abcdefghijklmnop".as_bytes()
2613         ));
2614         assert!(!is_utf8_bidi(
2615             "abcdefghijklmnop\u{FE00}abcdefghijklmnop".as_bytes()
2616         ));
2617         assert!(!is_utf8_bidi(
2618             "abcdefghijklmnop\u{202C}abcdefghijklmnop".as_bytes()
2619         ));
2620         assert!(!is_utf8_bidi(
2621             "abcdefghijklmnop\u{FEFF}abcdefghijklmnop".as_bytes()
2622         ));
2623         assert!(is_utf8_bidi(
2624             "abcdefghijklmnop\u{0590}abcdefghijklmnop".as_bytes()
2625         ));
2626         assert!(is_utf8_bidi(
2627             "abcdefghijklmnop\u{08FF}abcdefghijklmnop".as_bytes()
2628         ));
2629         assert!(is_utf8_bidi(
2630             "abcdefghijklmnop\u{061C}abcdefghijklmnop".as_bytes()
2631         ));
2632         assert!(is_utf8_bidi(
2633             "abcdefghijklmnop\u{FB50}abcdefghijklmnop".as_bytes()
2634         ));
2635         assert!(is_utf8_bidi(
2636             "abcdefghijklmnop\u{FDFF}abcdefghijklmnop".as_bytes()
2637         ));
2638         assert!(is_utf8_bidi(
2639             "abcdefghijklmnop\u{FE70}abcdefghijklmnop".as_bytes()
2640         ));
2641         assert!(is_utf8_bidi(
2642             "abcdefghijklmnop\u{FEFE}abcdefghijklmnop".as_bytes()
2643         ));
2644         assert!(is_utf8_bidi(
2645             "abcdefghijklmnop\u{200F}abcdefghijklmnop".as_bytes()
2646         ));
2647         assert!(is_utf8_bidi(
2648             "abcdefghijklmnop\u{202B}abcdefghijklmnop".as_bytes()
2649         ));
2650         assert!(is_utf8_bidi(
2651             "abcdefghijklmnop\u{202E}abcdefghijklmnop".as_bytes()
2652         ));
2653         assert!(is_utf8_bidi(
2654             "abcdefghijklmnop\u{2067}abcdefghijklmnop".as_bytes()
2655         ));
2656         assert!(is_utf8_bidi(
2657             "abcdefghijklmnop\u{10800}abcdefghijklmnop".as_bytes()
2658         ));
2659         assert!(is_utf8_bidi(
2660             "abcdefghijklmnop\u{10FFF}abcdefghijklmnop".as_bytes()
2661         ));
2662         assert!(is_utf8_bidi(
2663             "abcdefghijklmnop\u{1E800}abcdefghijklmnop".as_bytes()
2664         ));
2665         assert!(is_utf8_bidi(
2666             "abcdefghijklmnop\u{1EFFF}abcdefghijklmnop".as_bytes()
2667         ));
2668     }
2669 
2670     #[test]
test_is_utf16_bidi()2671     fn test_is_utf16_bidi() {
2672         assert!(!is_utf16_bidi(&[
2673             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0062, 0x62, 0x63, 0x64, 0x65, 0x66,
2674             0x67, 0x68, 0x69,
2675         ]));
2676         assert!(!is_utf16_bidi(&[
2677             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x03B1, 0x62, 0x63, 0x64, 0x65, 0x66,
2678             0x67, 0x68, 0x69,
2679         ]));
2680         assert!(!is_utf16_bidi(&[
2681             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x3041, 0x62, 0x63, 0x64, 0x65, 0x66,
2682             0x67, 0x68, 0x69,
2683         ]));
2684         assert!(!is_utf16_bidi(&[
2685             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD801, 0x62, 0x63, 0x64, 0x65, 0x66,
2686             0x67, 0x68, 0x69,
2687         ]));
2688         assert!(!is_utf16_bidi(&[
2689             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE00, 0x62, 0x63, 0x64, 0x65, 0x66,
2690             0x67, 0x68, 0x69,
2691         ]));
2692         assert!(!is_utf16_bidi(&[
2693             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202C, 0x62, 0x63, 0x64, 0x65, 0x66,
2694             0x67, 0x68, 0x69,
2695         ]));
2696         assert!(!is_utf16_bidi(&[
2697             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFF, 0x62, 0x63, 0x64, 0x65, 0x66,
2698             0x67, 0x68, 0x69,
2699         ]));
2700         assert!(is_utf16_bidi(&[
2701             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x62, 0x63, 0x64, 0x65, 0x66,
2702             0x67, 0x68, 0x69,
2703         ]));
2704         assert!(is_utf16_bidi(&[
2705             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x08FF, 0x62, 0x63, 0x64, 0x65, 0x66,
2706             0x67, 0x68, 0x69,
2707         ]));
2708         assert!(is_utf16_bidi(&[
2709             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x061C, 0x62, 0x63, 0x64, 0x65, 0x66,
2710             0x67, 0x68, 0x69,
2711         ]));
2712         assert!(is_utf16_bidi(&[
2713             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB1D, 0x62, 0x63, 0x64, 0x65, 0x66,
2714             0x67, 0x68, 0x69,
2715         ]));
2716         assert!(is_utf16_bidi(&[
2717             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB50, 0x62, 0x63, 0x64, 0x65, 0x66,
2718             0x67, 0x68, 0x69,
2719         ]));
2720         assert!(is_utf16_bidi(&[
2721             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFDFF, 0x62, 0x63, 0x64, 0x65, 0x66,
2722             0x67, 0x68, 0x69,
2723         ]));
2724         assert!(is_utf16_bidi(&[
2725             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE70, 0x62, 0x63, 0x64, 0x65, 0x66,
2726             0x67, 0x68, 0x69,
2727         ]));
2728         assert!(is_utf16_bidi(&[
2729             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFE, 0x62, 0x63, 0x64, 0x65, 0x66,
2730             0x67, 0x68, 0x69,
2731         ]));
2732         assert!(is_utf16_bidi(&[
2733             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x200F, 0x62, 0x63, 0x64, 0x65, 0x66,
2734             0x67, 0x68, 0x69,
2735         ]));
2736         assert!(is_utf16_bidi(&[
2737             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202B, 0x62, 0x63, 0x64, 0x65, 0x66,
2738             0x67, 0x68, 0x69,
2739         ]));
2740         assert!(is_utf16_bidi(&[
2741             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202E, 0x62, 0x63, 0x64, 0x65, 0x66,
2742             0x67, 0x68, 0x69,
2743         ]));
2744         assert!(is_utf16_bidi(&[
2745             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x2067, 0x62, 0x63, 0x64, 0x65, 0x66,
2746             0x67, 0x68, 0x69,
2747         ]));
2748         assert!(is_utf16_bidi(&[
2749             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD802, 0x62, 0x63, 0x64, 0x65, 0x66,
2750             0x67, 0x68, 0x69,
2751         ]));
2752         assert!(is_utf16_bidi(&[
2753             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD803, 0x62, 0x63, 0x64, 0x65, 0x66,
2754             0x67, 0x68, 0x69,
2755         ]));
2756         assert!(is_utf16_bidi(&[
2757             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83A, 0x62, 0x63, 0x64, 0x65, 0x66,
2758             0x67, 0x68, 0x69,
2759         ]));
2760         assert!(is_utf16_bidi(&[
2761             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83B, 0x62, 0x63, 0x64, 0x65, 0x66,
2762             0x67, 0x68, 0x69,
2763         ]));
2764 
2765         assert!(is_utf16_bidi(&[
2766             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, 0x64, 0x65,
2767             0x66, 0x67, 0x68, 0x69,
2768         ]));
2769     }
2770 
2771     #[test]
test_check_str_for_latin1_and_bidi()2772     fn test_check_str_for_latin1_and_bidi() {
2773         assert_ne!(
2774             check_str_for_latin1_and_bidi("abcdefghijklmnopaabcdefghijklmnop"),
2775             Latin1Bidi::Bidi
2776         );
2777         assert_ne!(
2778             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{03B1}abcdefghijklmnop"),
2779             Latin1Bidi::Bidi
2780         );
2781         assert_ne!(
2782             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{3041}abcdefghijklmnop"),
2783             Latin1Bidi::Bidi
2784         );
2785         assert_ne!(
2786             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{1F4A9}abcdefghijklmnop"),
2787             Latin1Bidi::Bidi
2788         );
2789         assert_ne!(
2790             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FE00}abcdefghijklmnop"),
2791             Latin1Bidi::Bidi
2792         );
2793         assert_ne!(
2794             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{202C}abcdefghijklmnop"),
2795             Latin1Bidi::Bidi
2796         );
2797         assert_ne!(
2798             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FEFF}abcdefghijklmnop"),
2799             Latin1Bidi::Bidi
2800         );
2801         assert_eq!(
2802             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{0590}abcdefghijklmnop"),
2803             Latin1Bidi::Bidi
2804         );
2805         assert_eq!(
2806             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{08FF}abcdefghijklmnop"),
2807             Latin1Bidi::Bidi
2808         );
2809         assert_eq!(
2810             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{061C}abcdefghijklmnop"),
2811             Latin1Bidi::Bidi
2812         );
2813         assert_eq!(
2814             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FB50}abcdefghijklmnop"),
2815             Latin1Bidi::Bidi
2816         );
2817         assert_eq!(
2818             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FDFF}abcdefghijklmnop"),
2819             Latin1Bidi::Bidi
2820         );
2821         assert_eq!(
2822             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FE70}abcdefghijklmnop"),
2823             Latin1Bidi::Bidi
2824         );
2825         assert_eq!(
2826             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FEFE}abcdefghijklmnop"),
2827             Latin1Bidi::Bidi
2828         );
2829         assert_eq!(
2830             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{200F}abcdefghijklmnop"),
2831             Latin1Bidi::Bidi
2832         );
2833         assert_eq!(
2834             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{202B}abcdefghijklmnop"),
2835             Latin1Bidi::Bidi
2836         );
2837         assert_eq!(
2838             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{202E}abcdefghijklmnop"),
2839             Latin1Bidi::Bidi
2840         );
2841         assert_eq!(
2842             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{2067}abcdefghijklmnop"),
2843             Latin1Bidi::Bidi
2844         );
2845         assert_eq!(
2846             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{10800}abcdefghijklmnop"),
2847             Latin1Bidi::Bidi
2848         );
2849         assert_eq!(
2850             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{10FFF}abcdefghijklmnop"),
2851             Latin1Bidi::Bidi
2852         );
2853         assert_eq!(
2854             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{1E800}abcdefghijklmnop"),
2855             Latin1Bidi::Bidi
2856         );
2857         assert_eq!(
2858             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{1EFFF}abcdefghijklmnop"),
2859             Latin1Bidi::Bidi
2860         );
2861     }
2862 
2863     #[test]
test_check_utf8_for_latin1_and_bidi()2864     fn test_check_utf8_for_latin1_and_bidi() {
2865         assert_ne!(
2866             check_utf8_for_latin1_and_bidi("abcdefghijklmnopaabcdefghijklmnop".as_bytes()),
2867             Latin1Bidi::Bidi
2868         );
2869         assert_ne!(
2870             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{03B1}abcdefghijklmnop".as_bytes()),
2871             Latin1Bidi::Bidi
2872         );
2873         assert_ne!(
2874             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{3041}abcdefghijklmnop".as_bytes()),
2875             Latin1Bidi::Bidi
2876         );
2877         assert_ne!(
2878             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{1F4A9}abcdefghijklmnop".as_bytes()),
2879             Latin1Bidi::Bidi
2880         );
2881         assert_ne!(
2882             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FE00}abcdefghijklmnop".as_bytes()),
2883             Latin1Bidi::Bidi
2884         );
2885         assert_ne!(
2886             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{202C}abcdefghijklmnop".as_bytes()),
2887             Latin1Bidi::Bidi
2888         );
2889         assert_ne!(
2890             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FEFF}abcdefghijklmnop".as_bytes()),
2891             Latin1Bidi::Bidi
2892         );
2893         assert_eq!(
2894             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{0590}abcdefghijklmnop".as_bytes()),
2895             Latin1Bidi::Bidi
2896         );
2897         assert_eq!(
2898             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{08FF}abcdefghijklmnop".as_bytes()),
2899             Latin1Bidi::Bidi
2900         );
2901         assert_eq!(
2902             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{061C}abcdefghijklmnop".as_bytes()),
2903             Latin1Bidi::Bidi
2904         );
2905         assert_eq!(
2906             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FB50}abcdefghijklmnop".as_bytes()),
2907             Latin1Bidi::Bidi
2908         );
2909         assert_eq!(
2910             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FDFF}abcdefghijklmnop".as_bytes()),
2911             Latin1Bidi::Bidi
2912         );
2913         assert_eq!(
2914             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FE70}abcdefghijklmnop".as_bytes()),
2915             Latin1Bidi::Bidi
2916         );
2917         assert_eq!(
2918             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FEFE}abcdefghijklmnop".as_bytes()),
2919             Latin1Bidi::Bidi
2920         );
2921         assert_eq!(
2922             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{200F}abcdefghijklmnop".as_bytes()),
2923             Latin1Bidi::Bidi
2924         );
2925         assert_eq!(
2926             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{202B}abcdefghijklmnop".as_bytes()),
2927             Latin1Bidi::Bidi
2928         );
2929         assert_eq!(
2930             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{202E}abcdefghijklmnop".as_bytes()),
2931             Latin1Bidi::Bidi
2932         );
2933         assert_eq!(
2934             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{2067}abcdefghijklmnop".as_bytes()),
2935             Latin1Bidi::Bidi
2936         );
2937         assert_eq!(
2938             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{10800}abcdefghijklmnop".as_bytes()),
2939             Latin1Bidi::Bidi
2940         );
2941         assert_eq!(
2942             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{10FFF}abcdefghijklmnop".as_bytes()),
2943             Latin1Bidi::Bidi
2944         );
2945         assert_eq!(
2946             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{1E800}abcdefghijklmnop".as_bytes()),
2947             Latin1Bidi::Bidi
2948         );
2949         assert_eq!(
2950             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{1EFFF}abcdefghijklmnop".as_bytes()),
2951             Latin1Bidi::Bidi
2952         );
2953     }
2954 
2955     #[test]
test_check_utf16_for_latin1_and_bidi()2956     fn test_check_utf16_for_latin1_and_bidi() {
2957         assert_ne!(
2958             check_utf16_for_latin1_and_bidi(&[
2959                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0062, 0x62, 0x63, 0x64, 0x65,
2960                 0x66, 0x67, 0x68, 0x69,
2961             ]),
2962             Latin1Bidi::Bidi
2963         );
2964         assert_ne!(
2965             check_utf16_for_latin1_and_bidi(&[
2966                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x03B1, 0x62, 0x63, 0x64, 0x65,
2967                 0x66, 0x67, 0x68, 0x69,
2968             ]),
2969             Latin1Bidi::Bidi
2970         );
2971         assert_ne!(
2972             check_utf16_for_latin1_and_bidi(&[
2973                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x3041, 0x62, 0x63, 0x64, 0x65,
2974                 0x66, 0x67, 0x68, 0x69,
2975             ]),
2976             Latin1Bidi::Bidi
2977         );
2978         assert_ne!(
2979             check_utf16_for_latin1_and_bidi(&[
2980                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD801, 0x62, 0x63, 0x64, 0x65,
2981                 0x66, 0x67, 0x68, 0x69,
2982             ]),
2983             Latin1Bidi::Bidi
2984         );
2985         assert_ne!(
2986             check_utf16_for_latin1_and_bidi(&[
2987                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE00, 0x62, 0x63, 0x64, 0x65,
2988                 0x66, 0x67, 0x68, 0x69,
2989             ]),
2990             Latin1Bidi::Bidi
2991         );
2992         assert_ne!(
2993             check_utf16_for_latin1_and_bidi(&[
2994                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202C, 0x62, 0x63, 0x64, 0x65,
2995                 0x66, 0x67, 0x68, 0x69,
2996             ]),
2997             Latin1Bidi::Bidi
2998         );
2999         assert_ne!(
3000             check_utf16_for_latin1_and_bidi(&[
3001                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFF, 0x62, 0x63, 0x64, 0x65,
3002                 0x66, 0x67, 0x68, 0x69,
3003             ]),
3004             Latin1Bidi::Bidi
3005         );
3006         assert_eq!(
3007             check_utf16_for_latin1_and_bidi(&[
3008                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x62, 0x63, 0x64, 0x65,
3009                 0x66, 0x67, 0x68, 0x69,
3010             ]),
3011             Latin1Bidi::Bidi
3012         );
3013         assert_eq!(
3014             check_utf16_for_latin1_and_bidi(&[
3015                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x08FF, 0x62, 0x63, 0x64, 0x65,
3016                 0x66, 0x67, 0x68, 0x69,
3017             ]),
3018             Latin1Bidi::Bidi
3019         );
3020         assert_eq!(
3021             check_utf16_for_latin1_and_bidi(&[
3022                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x061C, 0x62, 0x63, 0x64, 0x65,
3023                 0x66, 0x67, 0x68, 0x69,
3024             ]),
3025             Latin1Bidi::Bidi
3026         );
3027         assert_eq!(
3028             check_utf16_for_latin1_and_bidi(&[
3029                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB1D, 0x62, 0x63, 0x64, 0x65,
3030                 0x66, 0x67, 0x68, 0x69,
3031             ]),
3032             Latin1Bidi::Bidi
3033         );
3034         assert_eq!(
3035             check_utf16_for_latin1_and_bidi(&[
3036                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB50, 0x62, 0x63, 0x64, 0x65,
3037                 0x66, 0x67, 0x68, 0x69,
3038             ]),
3039             Latin1Bidi::Bidi
3040         );
3041         assert_eq!(
3042             check_utf16_for_latin1_and_bidi(&[
3043                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFDFF, 0x62, 0x63, 0x64, 0x65,
3044                 0x66, 0x67, 0x68, 0x69,
3045             ]),
3046             Latin1Bidi::Bidi
3047         );
3048         assert_eq!(
3049             check_utf16_for_latin1_and_bidi(&[
3050                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE70, 0x62, 0x63, 0x64, 0x65,
3051                 0x66, 0x67, 0x68, 0x69,
3052             ]),
3053             Latin1Bidi::Bidi
3054         );
3055         assert_eq!(
3056             check_utf16_for_latin1_and_bidi(&[
3057                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFE, 0x62, 0x63, 0x64, 0x65,
3058                 0x66, 0x67, 0x68, 0x69,
3059             ]),
3060             Latin1Bidi::Bidi
3061         );
3062         assert_eq!(
3063             check_utf16_for_latin1_and_bidi(&[
3064                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x200F, 0x62, 0x63, 0x64, 0x65,
3065                 0x66, 0x67, 0x68, 0x69,
3066             ]),
3067             Latin1Bidi::Bidi
3068         );
3069         assert_eq!(
3070             check_utf16_for_latin1_and_bidi(&[
3071                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202B, 0x62, 0x63, 0x64, 0x65,
3072                 0x66, 0x67, 0x68, 0x69,
3073             ]),
3074             Latin1Bidi::Bidi
3075         );
3076         assert_eq!(
3077             check_utf16_for_latin1_and_bidi(&[
3078                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202E, 0x62, 0x63, 0x64, 0x65,
3079                 0x66, 0x67, 0x68, 0x69,
3080             ]),
3081             Latin1Bidi::Bidi
3082         );
3083         assert_eq!(
3084             check_utf16_for_latin1_and_bidi(&[
3085                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x2067, 0x62, 0x63, 0x64, 0x65,
3086                 0x66, 0x67, 0x68, 0x69,
3087             ]),
3088             Latin1Bidi::Bidi
3089         );
3090         assert_eq!(
3091             check_utf16_for_latin1_and_bidi(&[
3092                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD802, 0x62, 0x63, 0x64, 0x65,
3093                 0x66, 0x67, 0x68, 0x69,
3094             ]),
3095             Latin1Bidi::Bidi
3096         );
3097         assert_eq!(
3098             check_utf16_for_latin1_and_bidi(&[
3099                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD803, 0x62, 0x63, 0x64, 0x65,
3100                 0x66, 0x67, 0x68, 0x69,
3101             ]),
3102             Latin1Bidi::Bidi
3103         );
3104         assert_eq!(
3105             check_utf16_for_latin1_and_bidi(&[
3106                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83A, 0x62, 0x63, 0x64, 0x65,
3107                 0x66, 0x67, 0x68, 0x69,
3108             ]),
3109             Latin1Bidi::Bidi
3110         );
3111         assert_eq!(
3112             check_utf16_for_latin1_and_bidi(&[
3113                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83B, 0x62, 0x63, 0x64, 0x65,
3114                 0x66, 0x67, 0x68, 0x69,
3115             ]),
3116             Latin1Bidi::Bidi
3117         );
3118 
3119         assert_eq!(
3120             check_utf16_for_latin1_and_bidi(&[
3121                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, 0x64,
3122                 0x65, 0x66, 0x67, 0x68, 0x69,
3123             ]),
3124             Latin1Bidi::Bidi
3125         );
3126     }
3127 
3128     #[inline(always)]
reference_is_char_bidi(c: char) -> bool3129     pub fn reference_is_char_bidi(c: char) -> bool {
3130         match c {
3131             '\u{0590}'..='\u{08FF}'
3132             | '\u{FB1D}'..='\u{FDFF}'
3133             | '\u{FE70}'..='\u{FEFE}'
3134             | '\u{10800}'..='\u{10FFF}'
3135             | '\u{1E800}'..='\u{1EFFF}'
3136             | '\u{200F}'
3137             | '\u{202B}'
3138             | '\u{202E}'
3139             | '\u{2067}' => true,
3140             _ => false,
3141         }
3142     }
3143 
3144     #[inline(always)]
reference_is_utf16_code_unit_bidi(u: u16) -> bool3145     pub fn reference_is_utf16_code_unit_bidi(u: u16) -> bool {
3146         match u {
3147             0x0590..=0x08FF
3148             | 0xFB1D..=0xFDFF
3149             | 0xFE70..=0xFEFE
3150             | 0xD802
3151             | 0xD803
3152             | 0xD83A
3153             | 0xD83B
3154             | 0x200F
3155             | 0x202B
3156             | 0x202E
3157             | 0x2067 => true,
3158             _ => false,
3159         }
3160     }
3161 
3162     #[test]
3163     #[cfg_attr(miri, ignore)] // Miri is too slow
test_is_char_bidi_thoroughly()3164     fn test_is_char_bidi_thoroughly() {
3165         for i in 0..0xD800u32 {
3166             let c: char = ::core::char::from_u32(i).unwrap();
3167             assert_eq!(is_char_bidi(c), reference_is_char_bidi(c));
3168         }
3169         for i in 0xE000..0x110000u32 {
3170             let c: char = ::core::char::from_u32(i).unwrap();
3171             assert_eq!(is_char_bidi(c), reference_is_char_bidi(c));
3172         }
3173     }
3174 
3175     #[test]
3176     #[cfg_attr(miri, ignore)] // Miri is too slow
test_is_utf16_code_unit_bidi_thoroughly()3177     fn test_is_utf16_code_unit_bidi_thoroughly() {
3178         for i in 0..0x10000u32 {
3179             let u = i as u16;
3180             assert_eq!(
3181                 is_utf16_code_unit_bidi(u),
3182                 reference_is_utf16_code_unit_bidi(u)
3183             );
3184         }
3185     }
3186 
3187     #[test]
3188     #[cfg_attr(miri, ignore)] // Miri is too slow
test_is_str_bidi_thoroughly()3189     fn test_is_str_bidi_thoroughly() {
3190         let mut buf = [0; 4];
3191         for i in 0..0xD800u32 {
3192             let c: char = ::core::char::from_u32(i).unwrap();
3193             assert_eq!(
3194                 is_str_bidi(c.encode_utf8(&mut buf[..])),
3195                 reference_is_char_bidi(c)
3196             );
3197         }
3198         for i in 0xE000..0x110000u32 {
3199             let c: char = ::core::char::from_u32(i).unwrap();
3200             assert_eq!(
3201                 is_str_bidi(c.encode_utf8(&mut buf[..])),
3202                 reference_is_char_bidi(c)
3203             );
3204         }
3205     }
3206 
3207     #[test]
3208     #[cfg_attr(miri, ignore)] // Miri is too slow
test_is_utf8_bidi_thoroughly()3209     fn test_is_utf8_bidi_thoroughly() {
3210         let mut buf = [0; 8];
3211         for i in 0..0xD800u32 {
3212             let c: char = ::core::char::from_u32(i).unwrap();
3213             let expect = reference_is_char_bidi(c);
3214             {
3215                 let len = {
3216                     let bytes = c.encode_utf8(&mut buf[..]).as_bytes();
3217                     assert_eq!(is_utf8_bidi(bytes), expect);
3218                     bytes.len()
3219                 };
3220                 {
3221                     let tail = &mut buf[len..];
3222                     for b in tail.iter_mut() {
3223                         *b = 0;
3224                     }
3225                 }
3226             }
3227             assert_eq!(is_utf8_bidi(&buf[..]), expect);
3228         }
3229         for i in 0xE000..0x110000u32 {
3230             let c: char = ::core::char::from_u32(i).unwrap();
3231             let expect = reference_is_char_bidi(c);
3232             {
3233                 let len = {
3234                     let bytes = c.encode_utf8(&mut buf[..]).as_bytes();
3235                     assert_eq!(is_utf8_bidi(bytes), expect);
3236                     bytes.len()
3237                 };
3238                 {
3239                     let tail = &mut buf[len..];
3240                     for b in tail.iter_mut() {
3241                         *b = 0;
3242                     }
3243                 }
3244             }
3245             assert_eq!(is_utf8_bidi(&buf[..]), expect);
3246         }
3247     }
3248 
3249     #[test]
3250     #[cfg_attr(miri, ignore)] // Miri is too slow
test_is_utf16_bidi_thoroughly()3251     fn test_is_utf16_bidi_thoroughly() {
3252         let mut buf = [0; 32];
3253         for i in 0..0x10000u32 {
3254             let u = i as u16;
3255             buf[15] = u;
3256             assert_eq!(
3257                 is_utf16_bidi(&buf[..]),
3258                 reference_is_utf16_code_unit_bidi(u)
3259             );
3260         }
3261     }
3262 
3263     #[test]
test_is_utf8_bidi_edge_cases()3264     fn test_is_utf8_bidi_edge_cases() {
3265         assert!(!is_utf8_bidi(b"\xD5\xBF\x61"));
3266         assert!(!is_utf8_bidi(b"\xD6\x80\x61"));
3267         assert!(!is_utf8_bidi(b"abc"));
3268         assert!(is_utf8_bidi(b"\xD5\xBF\xC2"));
3269         assert!(is_utf8_bidi(b"\xD6\x80\xC2"));
3270         assert!(is_utf8_bidi(b"ab\xC2"));
3271     }
3272 
3273     #[test]
test_decode_latin1()3274     fn test_decode_latin1() {
3275         match decode_latin1(b"ab") {
3276             Cow::Borrowed(s) => {
3277                 assert_eq!(s, "ab");
3278             }
3279             Cow::Owned(_) => {
3280                 unreachable!("Should have borrowed");
3281             }
3282         }
3283         assert_eq!(decode_latin1(b"a\xE4"), "a\u{E4}");
3284     }
3285 
3286     #[test]
test_encode_latin1_lossy()3287     fn test_encode_latin1_lossy() {
3288         match encode_latin1_lossy("ab") {
3289             Cow::Borrowed(s) => {
3290                 assert_eq!(s, b"ab");
3291             }
3292             Cow::Owned(_) => {
3293                 unreachable!("Should have borrowed");
3294             }
3295         }
3296         assert_eq!(encode_latin1_lossy("a\u{E4}"), &(b"a\xE4")[..]);
3297     }
3298 
3299     #[test]
test_convert_utf8_to_utf16_without_replacement()3300     fn test_convert_utf8_to_utf16_without_replacement() {
3301         let mut buf = [0u16; 5];
3302         assert_eq!(
3303             convert_utf8_to_utf16_without_replacement(b"ab", &mut buf[..2]),
3304             Some(2)
3305         );
3306         assert_eq!(buf[0], u16::from(b'a'));
3307         assert_eq!(buf[1], u16::from(b'b'));
3308         assert_eq!(buf[2], 0);
3309         assert_eq!(
3310             convert_utf8_to_utf16_without_replacement(b"\xC3\xA4c", &mut buf[..3]),
3311             Some(2)
3312         );
3313         assert_eq!(buf[0], 0xE4);
3314         assert_eq!(buf[1], u16::from(b'c'));
3315         assert_eq!(buf[2], 0);
3316         assert_eq!(
3317             convert_utf8_to_utf16_without_replacement(b"\xE2\x98\x83", &mut buf[..3]),
3318             Some(1)
3319         );
3320         assert_eq!(buf[0], 0x2603);
3321         assert_eq!(buf[1], u16::from(b'c'));
3322         assert_eq!(buf[2], 0);
3323         assert_eq!(
3324             convert_utf8_to_utf16_without_replacement(b"\xE2\x98\x83d", &mut buf[..4]),
3325             Some(2)
3326         );
3327         assert_eq!(buf[0], 0x2603);
3328         assert_eq!(buf[1], u16::from(b'd'));
3329         assert_eq!(buf[2], 0);
3330         assert_eq!(
3331             convert_utf8_to_utf16_without_replacement(b"\xE2\x98\x83\xC3\xA4", &mut buf[..5]),
3332             Some(2)
3333         );
3334         assert_eq!(buf[0], 0x2603);
3335         assert_eq!(buf[1], 0xE4);
3336         assert_eq!(buf[2], 0);
3337         assert_eq!(
3338             convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93\x8E", &mut buf[..4]),
3339             Some(2)
3340         );
3341         assert_eq!(buf[0], 0xD83D);
3342         assert_eq!(buf[1], 0xDCCE);
3343         assert_eq!(buf[2], 0);
3344         assert_eq!(
3345             convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93\x8Ee", &mut buf[..5]),
3346             Some(3)
3347         );
3348         assert_eq!(buf[0], 0xD83D);
3349         assert_eq!(buf[1], 0xDCCE);
3350         assert_eq!(buf[2], u16::from(b'e'));
3351         assert_eq!(
3352             convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93", &mut buf[..5]),
3353             None
3354         );
3355     }
3356 }
3357