1 // Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 //! Functions for converting between different in-RAM representations of text
11 //! and for quickly checking if the Unicode Bidirectional Algorithm can be
12 //! avoided.
13 //!
14 //! By using slices for output, the functions here seek to enable by-register
15 //! (ALU register or SIMD register as available) operations in order to
16 //! outperform iterator-based conversions available in the Rust standard
17 //! library.
18 //!
19 //! _Note:_ "Latin1" in this module refers to the Unicode range from U+0000 to
20 //! U+00FF, inclusive, and does not refer to the windows-1252 range. This
21 //! in-memory encoding is sometimes used as a storage optimization of text
22 //! when UTF-16 indexing and length semantics are exposed.
23 //!
//! The FFI bindings for this module are in the
25 //! [encoding_c_mem crate](https://github.com/hsivonen/encoding_c_mem).
26 
27 use std::borrow::Cow;
28 
29 use super::in_inclusive_range16;
30 use super::in_inclusive_range32;
31 use super::in_inclusive_range8;
32 use super::in_range16;
33 use super::in_range32;
34 use super::DecoderResult;
35 use ascii::*;
36 use utf_8::*;
37 
// Forwards its arguments to `debug_assert!` unless the crate is being
// compiled with `cfg(fuzzing)`, in which case the assertion is skipped.
macro_rules! non_fuzz_debug_assert {
    ($($arg:tt)*) => (if !cfg!(fuzzing) { debug_assert!($($arg)*); })
}
41 
// Branch-prediction hints `likely()` / `unlikely()`.
//
// With the `simd-accel` feature the compiler intrinsics are imported.
// Without it, identity functions with the same names stand in; they are
// declared `unsafe` only so that call sites look the same in both
// configurations.
cfg_if! {
    if #[cfg(feature = "simd-accel")] {
        use ::std::intrinsics::likely;
        use ::std::intrinsics::unlikely;
    } else {
        #[inline(always)]
        // Unsafe to match the intrinsic, which is needlessly unsafe.
        unsafe fn likely(b: bool) -> bool {
            b
        }
        #[inline(always)]
        // Unsafe to match the intrinsic, which is needlessly unsafe.
        unsafe fn unlikely(b: bool) -> bool {
            b
        }
    }
}
59 
/// Classification of text as Latin1 (all code points are below U+0100),
/// left-to-right with some non-Latin1 characters or as containing at least
/// some right-to-left characters.
///
/// The enum is `#[repr(C)]` and every variant has an explicit discriminant,
/// so the numeric values are fixed. Since it is a field-less enum, it is
/// `Clone` and `Copy` and can be freely passed around and compared by value.
#[must_use]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(C)]
pub enum Latin1Bidi {
    /// Every character is below U+0100.
    Latin1 = 0,
    /// There is at least one character that's U+0100 or higher, but there
    /// are no right-to-left characters.
    LeftToRight = 1,
    /// There is at least one right-to-left character.
    Bidi = 2,
}
75 
// Mask with the high byte of every 16-bit lane of a `usize` set. When a
// `usize` is read from a `[u16]` buffer, `word & LATIN1_MASK == 0` holds
// exactly when every code unit in the word is at most 0xFF.
//
// `dead_code` is allowed because, in the code visible here, the constant is
// only referenced from the non-SIMD branches.
//
// `as` truncates, so works on 32-bit, too.
#[allow(dead_code)]
const LATIN1_MASK: usize = 0xFF00_FF00_FF00_FF00u64 as usize;
79 
// Generates `fn $name(buffer: &[$unit]) -> bool` which returns `true` when
// no unit of `buffer` has any of the bits selected by `$mask` set, reading
// the aligned middle part of the buffer one `usize` at a time.
//
// Parameters:
// * `$name`  - name of the generated function.
// * `$unit`  - element type of the slice (`u8` or `u16` in this file).
// * `$bound` - smallest unit value that fails the check. The invocations
//              visible in this file pass powers of two (0x80, 0x100), which
//              is what makes the `accu >= $bound` test on OR-ed units valid.
// * `$mask`  - `usize` mask that selects the failing bits of every unit
//              packed into a word.
//
// NOTE(review): assumes `ALU_ALIGNMENT` is the size/alignment of `usize` in
// bytes and `ALU_ALIGNMENT_MASK == ALU_ALIGNMENT - 1`; both are defined
// outside this view - confirm.
#[allow(unused_macros)]
macro_rules! by_unit_check_alu {
    ($name:ident, $unit:ty, $bound:expr, $mask:ident) => {
        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            // Index (in units) of the next unit that has not been examined.
            let mut offset = 0usize;
            // Bitwise OR of everything examined outside the unrolled loop.
            let mut accu = 0usize;
            let unit_size = ::std::mem::size_of::<$unit>();
            let len = buffer.len();
            if len >= ALU_ALIGNMENT / unit_size {
                // The most common reason to return `false` is for the first code
                // unit to fail the test, so check that first.
                if buffer[0] >= $bound {
                    return false;
                }
                let src = buffer.as_ptr();
                // Number of units to step over before `src.add(offset)` is
                // aligned to `ALU_ALIGNMENT`.
                let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK))
                    & ALU_ALIGNMENT_MASK)
                    / unit_size;
                // Only take the word-at-a-time path if at least one whole
                // word remains after the unaligned prefix.
                if until_alignment + ALU_ALIGNMENT / unit_size <= len {
                    if until_alignment != 0 {
                        // Unaligned prefix: examine unit by unit.
                        accu |= buffer[offset] as usize;
                        offset += 1;
                        until_alignment -= 1;
                        while until_alignment != 0 {
                            accu |= buffer[offset] as usize;
                            offset += 1;
                            until_alignment -= 1;
                        }
                        // Early exit if the prefix already failed.
                        if accu >= $bound {
                            return false;
                        }
                    }
                    let len_minus_stride = len - ALU_ALIGNMENT / unit_size;
                    if offset + (4 * (ALU_ALIGNMENT / unit_size)) <= len {
                        let len_minus_unroll = len - (4 * (ALU_ALIGNMENT / unit_size));
                        // Unrolled loop: four words per iteration, checked
                        // immediately so that long failing input exits early.
                        // The reads stay in bounds because
                        // `offset + 4 words <= len` holds on every entry to
                        // the loop body.
                        loop {
                            let unroll_accu = unsafe { *(src.add(offset) as *const usize) }
                                | unsafe {
                                    *(src.add(offset + (ALU_ALIGNMENT / unit_size)) as *const usize)
                                }
                                | unsafe {
                                    *(src.add(offset + (2 * (ALU_ALIGNMENT / unit_size)))
                                        as *const usize)
                                }
                                | unsafe {
                                    *(src.add(offset + (3 * (ALU_ALIGNMENT / unit_size)))
                                        as *const usize)
                                };
                            if unroll_accu & $mask != 0 {
                                return false;
                            }
                            offset += 4 * (ALU_ALIGNMENT / unit_size);
                            if offset > len_minus_unroll {
                                break;
                            }
                        }
                    }
                    // Remaining whole words (fewer than four): accumulate
                    // and defer the check to the end of the function.
                    while offset <= len_minus_stride {
                        accu |= unsafe { *(src.add(offset) as *const usize) };
                        offset += ALU_ALIGNMENT / unit_size;
                    }
                }
            }
            // Tail (or the whole buffer when it was too short for the
            // word-at-a-time path): examine unit by unit.
            for &unit in &buffer[offset..] {
                accu |= unit as usize;
            }
            accu & $mask == 0
        }
    };
}
152 
// Generates `fn $name(buffer: &[$unit]) -> bool`, the SIMD counterpart of
// `by_unit_check_alu!`: the aligned middle part of the buffer is read one
// `$simd_ty` vector at a time and tested with `$func`.
//
// Parameters:
// * `$name`    - name of the generated function.
// * `$unit`    - element type of the slice (`u8` or `u16` in this file).
// * `$splat`   - expression producing the initial (all-zero in the visible
//                invocations) vector accumulator.
// * `$simd_ty` - vector type used for the aligned loads.
// * `$bound`   - smallest unit value that fails the check. The invocations
//                visible in this file pass powers of two (0x80, 0x100),
//                which is what makes comparing OR-ed units against it valid.
// * `$func`    - predicate over `$simd_ty`; a `false` result makes the
//                generated function return `false`.
//
// NOTE(review): `SIMD_STRIDE_SIZE` is defined outside this view; the code
// presumably relies on it being the byte size of `$simd_ty` - confirm.
#[allow(unused_macros)]
macro_rules! by_unit_check_simd {
    ($name:ident, $unit:ty, $splat:expr, $simd_ty:ty, $bound:expr, $func:ident) => {
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            // Index (in units) of the next unit that has not been examined.
            let mut offset = 0usize;
            // Bitwise OR of the units examined one by one (prefix and tail).
            let mut accu = 0usize;
            let unit_size = ::std::mem::size_of::<$unit>();
            let len = buffer.len();
            if len >= SIMD_STRIDE_SIZE / unit_size {
                // The most common reason to return `false` is for the first code
                // unit to fail the test, so check that first.
                if buffer[0] >= $bound {
                    return false;
                }
                let src = buffer.as_ptr();
                // Number of units to step over before `src.add(offset)` is
                // aligned to `SIMD_ALIGNMENT`.
                let mut until_alignment = ((SIMD_ALIGNMENT
                    - ((src as usize) & SIMD_ALIGNMENT_MASK))
                    & SIMD_ALIGNMENT_MASK)
                    / unit_size;
                // Only take the vector path if at least one whole stride
                // remains after the unaligned prefix.
                if until_alignment + SIMD_STRIDE_SIZE / unit_size <= len {
                    if until_alignment != 0 {
                        // Unaligned prefix: examine unit by unit.
                        accu |= buffer[offset] as usize;
                        offset += 1;
                        until_alignment -= 1;
                        while until_alignment != 0 {
                            accu |= buffer[offset] as usize;
                            offset += 1;
                            until_alignment -= 1;
                        }
                        // Early exit if the prefix already failed.
                        if accu >= $bound {
                            return false;
                        }
                    }
                    let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
                    if offset + (4 * (SIMD_STRIDE_SIZE / unit_size)) <= len {
                        let len_minus_unroll = len - (4 * (SIMD_STRIDE_SIZE / unit_size));
                        // Unrolled loop: four vectors are OR-ed together and
                        // tested once per iteration. The reads stay in bounds
                        // because `offset + 4 strides <= len` holds on every
                        // entry to the loop body.
                        loop {
                            let unroll_accu = unsafe { *(src.add(offset) as *const $simd_ty) }
                                | unsafe {
                                    *(src.add(offset + (SIMD_STRIDE_SIZE / unit_size))
                                        as *const $simd_ty)
                                }
                                | unsafe {
                                    *(src.add(offset + (2 * (SIMD_STRIDE_SIZE / unit_size)))
                                        as *const $simd_ty)
                                }
                                | unsafe {
                                    *(src.add(offset + (3 * (SIMD_STRIDE_SIZE / unit_size)))
                                        as *const $simd_ty)
                                };
                            if !$func(unroll_accu) {
                                return false;
                            }
                            offset += 4 * (SIMD_STRIDE_SIZE / unit_size);
                            if offset > len_minus_unroll {
                                break;
                            }
                        }
                    }
                    // Remaining whole strides (fewer than four): accumulate
                    // into one vector and test it once.
                    let mut simd_accu = $splat;
                    while offset <= len_minus_stride {
                        simd_accu = simd_accu | unsafe { *(src.add(offset) as *const $simd_ty) };
                        offset += SIMD_STRIDE_SIZE / unit_size;
                    }
                    if !$func(simd_accu) {
                        return false;
                    }
                }
            }
            // Tail (or the whole buffer when it was too short for the
            // vector path): examine unit by unit.
            for &unit in &buffer[offset..] {
                accu |= unit as usize;
            }
            accu < $bound
        }
    };
}
230 
// Selects, at compile time, the implementations of `is_ascii_impl()`,
// `is_basic_latin_impl()`, `is_utf16_latin1_impl()` and
// `utf16_valid_up_to_impl()`: SIMD versions when `simd-accel` is enabled on
// a supported target, otherwise ALU (word-at-a-time / scalar) versions.
cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        use simd_funcs::*;
        use packed_simd::u8x16;
        use packed_simd::u16x8;

        // Alignment, in bytes, that the aligned vector loads in this file
        // are advanced to before the first load.
        const SIMD_ALIGNMENT: usize = 16;

        // `SIMD_ALIGNMENT - 1`: extracts the misalignment of an address.
        const SIMD_ALIGNMENT_MASK: usize = 15;

        // Instantiate the by-unit checks for the SIMD configuration.
        by_unit_check_simd!(is_ascii_impl, u8, u8x16::splat(0), u8x16, 0x80, simd_is_ascii);
        by_unit_check_simd!(is_basic_latin_impl, u16, u16x8::splat(0), u16x8, 0x80, simd_is_basic_latin);
        by_unit_check_simd!(is_utf16_latin1_impl, u16, u16x8::splat(0), u16x8, 0x100, simd_is_latin1);

        // Returns the number of code units at the start of `buffer` that
        // are accepted as valid UTF-16, i.e. the index at which
        // `utf16_valid_up_to_alu()` finds an unpaired surrogate, or
        // `buffer.len()` if it finds none.
        //
        // Strides for which `contains_surrogates()` returns `false` are
        // skipped without further inspection; everything else is delegated
        // to `utf16_valid_up_to_alu()`.
        #[inline(always)]
        fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize {
            // This function is a mess, because it simultaneously tries to do
            // only aligned SIMD (perhaps misguidedly) and needs to deal with
            // the last code unit in a SIMD stride being part of a valid
            // surrogate pair.
            let unit_size = ::std::mem::size_of::<u16>();
            let src = buffer.as_ptr();
            let len = buffer.len();
            let mut offset = 0usize;
            'outer: loop {
                // Code units between `offset` and the next aligned address.
                // Recomputed on every pass because consuming a low surrogate
                // that spills past a boundary leaves `offset` unaligned.
                let until_alignment = ((SIMD_ALIGNMENT - ((unsafe { src.add(offset) } as usize) & SIMD_ALIGNMENT_MASK)) &
                                        SIMD_ALIGNMENT_MASK) / unit_size;
                if until_alignment == 0 {
                    // Already aligned: need one whole stride to continue.
                    if offset + SIMD_STRIDE_SIZE / unit_size > len {
                        break;
                    }
                } else {
                    let offset_plus_until_alignment = offset + until_alignment;
                    // One unit past the alignment boundary is included so
                    // that a high surrogate at the end of the unaligned
                    // prefix can be matched with its low surrogate.
                    let offset_plus_until_alignment_plus_one = offset_plus_until_alignment + 1;
                    if offset_plus_until_alignment_plus_one + SIMD_STRIDE_SIZE / unit_size > len {
                        break;
                    }
                    let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_until_alignment_plus_one]);
                    if up_to < until_alignment {
                        // Unpaired surrogate inside the unaligned prefix.
                        return offset + up_to;
                    }
                    if last_valid_low {
                        // The extra unit was the low half of a valid pair;
                        // it is consumed, which leaves `offset` unaligned,
                        // so start over.
                        offset = offset_plus_until_alignment_plus_one;
                        continue;
                    }
                    // The extra unit is re-examined by the SIMD loop.
                    offset = offset_plus_until_alignment;
                }
                let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
                'inner: loop {
                    let offset_plus_stride = offset + SIMD_STRIDE_SIZE / unit_size;
                    if contains_surrogates(unsafe { *(src.add(offset) as *const u16x8) }) {
                        if offset_plus_stride == len {
                            // Last stride of the buffer: let the final ALU
                            // pass below handle it.
                            break 'outer;
                        }
                        // As above, look one unit past the stride so that a
                        // pair straddling the boundary is seen as a whole.
                        let offset_plus_stride_plus_one = offset_plus_stride + 1;
                        let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_stride_plus_one]);
                        if up_to < SIMD_STRIDE_SIZE / unit_size {
                            // Unpaired surrogate inside this stride.
                            return offset + up_to;
                        }
                        if last_valid_low {
                            // Consumed a low surrogate from the next stride;
                            // `offset` is now unaligned, so realign.
                            offset = offset_plus_stride_plus_one;
                            continue 'outer;
                        }
                    }
                    offset = offset_plus_stride;
                    if offset > len_minus_stride {
                        break 'outer;
                    }
                }
            }
            // Whatever is left is shorter than what the SIMD path requires.
            let (up_to, _) = utf16_valid_up_to_alu(&buffer[offset..]);
            offset + up_to
        }
    } else {
        // Instantiate the by-unit checks for the ALU configuration.
        by_unit_check_alu!(is_ascii_impl, u8, 0x80, ASCII_MASK);
        by_unit_check_alu!(is_basic_latin_impl, u16, 0x80, BASIC_LATIN_MASK);
        by_unit_check_alu!(is_utf16_latin1_impl, u16, 0x100, LATIN1_MASK);

        // Without SIMD, validation is just the scalar routine; its second
        // return value is only needed by the SIMD variant.
        #[inline(always)]
        fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize {
            let (up_to, _) = utf16_valid_up_to_alu(buffer);
            up_to
        }
    }
}
316 
/// Scalar UTF-16 validation.
///
/// The first return value is the index of the first unpaired surrogate in
/// `buffer`, or `buffer.len()` if there is none.
///
/// The second return value is true iff the last code unit of the slice was
/// reached and turned out to be a low surrogate that is part of a valid pair.
#[inline(always)]
fn utf16_valid_up_to_alu(buffer: &[u16]) -> (usize, bool) {
    let mut pos = 0usize;
    // Whether the most recently consumed code unit was the low half of a
    // valid surrogate pair.
    let mut ended_on_pair = false;
    while let Some(&unit) = buffer.get(pos) {
        // Surrogates are exactly the code units whose top five bits are
        // 0b11011 (U+D800..=U+DFFF).
        if unit & 0xF800 != 0xD800 {
            pos += 1;
            ended_on_pair = false;
            continue;
        }
        // High surrogates: U+D800..=U+DBFF; low surrogates: U+DC00..=U+DFFF.
        let is_high = unit & 0xFC00 == 0xD800;
        let followed_by_low = match buffer.get(pos + 1) {
            Some(&second) => second & 0xFC00 == 0xDC00,
            None => false,
        };
        if !(is_high && followed_by_low) {
            // Lone low surrogate, or a high surrogate that is not followed
            // by a low one: report its position without consuming it.
            return (pos, false);
        }
        pos += 2;
        ended_on_pair = true;
    }
    (pos, ended_on_pair)
}
362 
363 cfg_if! {
364     if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        // SIMD variant. Returns `None` when no non-Latin1 content is found
        // in `buffer`, otherwise `Some(index)`.
        //
        // In the scalar parts the index is that of the first byte greater
        // than 0xC3. When a vector stride fails `simd_is_str_latin1()`, the
        // index is instead the first byte at or after the start of that
        // stride that is not a UTF-8 continuation byte, which is not
        // necessarily the offending byte itself.
        #[inline(always)]
        fn is_str_latin1_impl(buffer: &str) -> Option<usize> {
            let mut offset = 0usize;
            let bytes = buffer.as_bytes();
            let len = bytes.len();
            if len >= SIMD_STRIDE_SIZE {
                let src = bytes.as_ptr();
                // Bytes to step over before `src.add(offset)` is aligned.
                let mut until_alignment = (SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
                                           SIMD_ALIGNMENT_MASK;
                if until_alignment + SIMD_STRIDE_SIZE <= len {
                    // Unaligned prefix, byte by byte.
                    while until_alignment != 0 {
                        if bytes[offset] > 0xC3 {
                            return Some(offset);
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - SIMD_STRIDE_SIZE;
                    // Aligned strides; the read stays in bounds because
                    // `offset <= len_minus_stride` holds on every entry.
                    loop {
                        if !simd_is_str_latin1(unsafe { *(src.add(offset) as *const u8x16) }) {
                            // Step over continuation bytes (0b10xx_xxxx) at
                            // the start of the stride before reporting.
                            // TODO: Ensure this compiles away when inlined into `is_str_latin1()`.
                            while bytes[offset] & 0xC0 == 0x80 {
                                offset += 1;
                            }
                            return Some(offset);
                        }
                        offset += SIMD_STRIDE_SIZE;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Tail (or the whole input when it is too short for SIMD).
            for i in offset..len {
                if bytes[i] > 0xC3 {
                    return Some(i);
                }
            }
            None
        }
405     } else {
406         #[inline(always)]
407         fn is_str_latin1_impl(buffer: &str) -> Option<usize> {
408             let mut bytes = buffer.as_bytes();
409             let mut total = 0;
410             loop {
411                 if let Some((byte, offset)) = validate_ascii(bytes) {
412                     total += offset;
413                     if byte > 0xC3 {
414                         return Some(total);
415                     }
416                     bytes = &bytes[offset + 2..];
417                     total += 2;
418                 } else {
419                     return None;
420                 }
421             }
422         }
423     }
424 }
425 
426 #[inline(always)]
is_utf8_latin1_impl(buffer: &[u8]) -> Option<usize>427 fn is_utf8_latin1_impl(buffer: &[u8]) -> Option<usize> {
428     let mut bytes = buffer;
429     let mut total = 0;
430     loop {
431         if let Some((byte, offset)) = validate_ascii(bytes) {
432             total += offset;
433             if in_inclusive_range8(byte, 0xC2, 0xC3) {
434                 let next = offset + 1;
435                 if next == bytes.len() {
436                     return Some(total);
437                 }
438                 if bytes[next] & 0xC0 != 0x80 {
439                     return Some(total);
440                 }
441                 bytes = &bytes[offset + 2..];
442                 total += 2;
443             } else {
444                 return Some(total);
445             }
446         } else {
447             return None;
448         }
449     }
450 }
451 
452 cfg_if! {
453     if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        // SIMD variant. Returns `true` as soon as a code unit (checked with
        // `is_utf16_code_unit_bidi()`) or an aligned vector of eight code
        // units (checked with `is_u16x8_bidi()`) is reported as bidi, and
        // `false` if the whole buffer is scanned without a hit.
        #[inline(always)]
        fn is_utf16_bidi_impl(buffer: &[u16]) -> bool {
            let mut offset = 0usize;
            let len = buffer.len();
            // `/ 2` converts byte counts into `u16` code unit counts.
            if len >= SIMD_STRIDE_SIZE / 2 {
                let src = buffer.as_ptr();
                // Code units to step over before `src.add(offset)` is aligned.
                let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
                                           SIMD_ALIGNMENT_MASK) / 2;
                if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len {
                    // Unaligned prefix, unit by unit.
                    while until_alignment != 0 {
                        if is_utf16_code_unit_bidi(buffer[offset]) {
                            return true;
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
                    // Aligned strides; the read stays in bounds because
                    // `offset <= len_minus_stride` holds on every entry.
                    loop {
                        if is_u16x8_bidi(unsafe { *(src.add(offset) as *const u16x8) }) {
                            return true;
                        }
                        offset += SIMD_STRIDE_SIZE / 2;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Tail (or the whole input when it is too short for SIMD).
            for &u in &buffer[offset..] {
                if is_utf16_code_unit_bidi(u) {
                    return true;
                }
            }
            false
        }
489     } else {
490         #[inline(always)]
491         fn is_utf16_bidi_impl(buffer: &[u16]) -> bool {
492             for &u in buffer {
493                 if is_utf16_code_unit_bidi(u) {
494                     return true;
495                 }
496             }
497             false
498         }
499     }
500 }
501 
502 cfg_if! {
503     if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        // SIMD variant. Classifies `buffer`:
        // * `Latin1Bidi::Latin1` if no code unit above 0xFF is found,
        // * `Latin1Bidi::Bidi` if, from the first non-Latin1 position on,
        //   a bidi code unit / vector is found,
        // * `Latin1Bidi::LeftToRight` otherwise.
        //
        // Bidi detection only starts once non-Latin1 content has been seen;
        // code units before that point are all at most 0xFF.
        #[inline(always)]
        fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
            let mut offset = 0usize;
            let len = buffer.len();
            // `/ 2` converts byte counts into `u16` code unit counts.
            if len >= SIMD_STRIDE_SIZE / 2 {
                let src = buffer.as_ptr();
                // Code units to step over before `src.add(offset)` is aligned.
                let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
                                           SIMD_ALIGNMENT_MASK) / 2;
                if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len {
                    // Unaligned prefix, unit by unit.
                    while until_alignment != 0 {
                        if buffer[offset] > 0xFF {
                            // This transition isn't optimal, since the alignment is
                            // recomputed, but not tweaking further today.
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
                    // Aligned strides: Latin1 check until it first fails.
                    loop {
                        let mut s = unsafe { *(src.add(offset) as *const u16x8) };
                        if !simd_is_latin1(s) {
                            // Non-Latin1 content seen: from here on only the
                            // bidi check matters, starting with the stride
                            // that is already loaded.
                            loop {
                                if is_u16x8_bidi(s) {
                                    return Latin1Bidi::Bidi;
                                }
                                offset += SIMD_STRIDE_SIZE / 2;
                                if offset > len_minus_stride {
                                    // Fewer units than a stride remain.
                                    for &u in &buffer[offset..] {
                                        if is_utf16_code_unit_bidi(u) {
                                            return Latin1Bidi::Bidi;
                                        }
                                    }
                                    return Latin1Bidi::LeftToRight;
                                }
                                s = unsafe { *(src.add(offset) as *const u16x8) };
                            }
                        }
                        offset += SIMD_STRIDE_SIZE / 2;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Tail (or the whole input when it is too short for SIMD): the
            // same two-phase scan, unit by unit, sharing a single iterator
            // so that the bidi phase continues where the Latin1 phase
            // stopped.
            let mut iter = (&buffer[offset..]).iter();
            loop {
                if let Some(&u) = iter.next() {
                    if u > 0xFF {
                        let mut inner_u = u;
                        loop {
                            if is_utf16_code_unit_bidi(inner_u) {
                                return Latin1Bidi::Bidi;
                            }
                            if let Some(&code_unit) = iter.next() {
                                inner_u = code_unit;
                            } else {
                                return Latin1Bidi::LeftToRight;
                            }
                        }
                    }
                } else {
                    return Latin1Bidi::Latin1;
                }
            }
        }
573     } else {
        // ALU variant. Classifies `buffer`:
        // * `Latin1Bidi::Latin1` if no code unit above 0xFF is found,
        // * `Latin1Bidi::Bidi` if, from the first non-Latin1 position on,
        //   a bidi code unit is found,
        // * `Latin1Bidi::LeftToRight` otherwise.
        //
        // The Latin1 phase reads the aligned middle of the buffer one
        // `usize` at a time and tests it against `LATIN1_MASK`.
        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
        #[inline(always)]
        fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
            let mut offset = 0usize;
            let len = buffer.len();
            // `/ 2` converts byte counts into `u16` code unit counts.
            if len >= ALU_ALIGNMENT / 2 {
                let src = buffer.as_ptr();
                // Code units to step over before `src.add(offset)` is aligned.
                let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK)) &
                                           ALU_ALIGNMENT_MASK) / 2;
                if until_alignment + ALU_ALIGNMENT / 2 <= len {
                    // Unaligned prefix, unit by unit.
                    while until_alignment != 0 {
                        if buffer[offset] > 0xFF {
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - ALU_ALIGNMENT / 2;
                    // Aligned words; the read stays in bounds because
                    // `offset <= len_minus_stride` holds on every entry.
                    loop {
                        if unsafe { *(src.add(offset) as *const usize) } & LATIN1_MASK != 0 {
                            // Some unit in this word is above 0xFF: hand the
                            // rest (starting with this word) to the bidi
                            // check.
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += ALU_ALIGNMENT / 2;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Tail (or the whole input when it is too short for the word
            // path): two-phase scan, unit by unit, sharing a single iterator
            // so that the bidi phase continues where the Latin1 phase
            // stopped.
            let mut iter = (&buffer[offset..]).iter();
            loop {
                if let Some(&u) = iter.next() {
                    if u > 0xFF {
                        let mut inner_u = u;
                        loop {
                            if is_utf16_code_unit_bidi(inner_u) {
                                return Latin1Bidi::Bidi;
                            }
                            if let Some(&code_unit) = iter.next() {
                                inner_u = code_unit;
                            } else {
                                return Latin1Bidi::LeftToRight;
                            }
                        }
                    }
                } else {
                    return Latin1Bidi::Latin1;
                }
            }
        }
630     }
631 }
632 
/// Checks whether the buffer is all-ASCII.
///
/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
/// is not guaranteed to fail fast.)
///
/// This is a thin public wrapper around `is_ascii_impl()`, whose SIMD or
/// ALU implementation is selected at compile time.
pub fn is_ascii(buffer: &[u8]) -> bool {
    is_ascii_impl(buffer)
}
640 
/// Checks whether the buffer is all-Basic Latin (i.e. UTF-16 representing
/// only ASCII characters).
///
/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
/// is not guaranteed to fail fast.)
///
/// This is a thin public wrapper around `is_basic_latin_impl()`, whose SIMD
/// or ALU implementation is selected at compile time.
pub fn is_basic_latin(buffer: &[u16]) -> bool {
    is_basic_latin_impl(buffer)
}
649 
/// Checks whether the buffer is valid UTF-8 representing only code points
/// less than or equal to U+00FF.
///
/// Fails fast. (I.e. returns before having read the whole buffer if UTF-8
/// invalidity or code points above U+00FF are discovered.)
///
/// Returns `true` exactly when `is_utf8_latin1_impl()` reports no failure
/// position (i.e. returns `None`).
pub fn is_utf8_latin1(buffer: &[u8]) -> bool {
    is_utf8_latin1_impl(buffer).is_none()
}
658 
/// Checks whether the buffer represents only code points less than or equal
/// to U+00FF.
///
/// Fails fast. (I.e. returns before having read the whole buffer if code
/// points above U+00FF are discovered.)
///
/// Returns `true` exactly when `is_str_latin1_impl()` reports no failure
/// position (i.e. returns `None`).
pub fn is_str_latin1(buffer: &str) -> bool {
    is_str_latin1_impl(buffer).is_none()
}
667 
/// Checks whether the buffer represents only code points less than or equal
/// to U+00FF.
///
/// May read the entire buffer even if it isn't all-Latin1. (I.e. the function
/// is not guaranteed to fail fast.)
///
/// This is a thin public wrapper around `is_utf16_latin1_impl()`, whose SIMD
/// or ALU implementation is selected at compile time.
pub fn is_utf16_latin1(buffer: &[u16]) -> bool {
    is_utf16_latin1_impl(buffer)
}
676 
677 /// Checks whether a potentially-invalid UTF-8 buffer contains code points
678 /// that trigger right-to-left processing.
679 ///
680 /// The check is done on a Unicode block basis without regard to assigned
681 /// vs. unassigned code points in the block. Hebrew presentation forms in
682 /// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. they are treated as right-to-left). Additionally,
684 /// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
685 /// for. Control characters that are technically bidi controls but do not
686 /// cause right-to-left behavior without the presence of right-to-left
687 /// characters or right-to-left controls are not checked for. As a special
688 /// case, U+FEFF is excluded from Arabic Presentation Forms-B.
689 ///
690 /// Returns `true` if the input is invalid UTF-8 or the input contains an
691 /// RTL character. Returns `false` if the input is valid UTF-8 and contains
692 /// no RTL characters.
693 #[cfg_attr(feature = "cargo-clippy", allow(collapsible_if, cyclomatic_complexity))]
694 #[inline]
is_utf8_bidi(buffer: &[u8]) -> bool695 pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
696     // As of rustc 1.25.0-nightly (73ac5d6a8 2018-01-11), this is faster
697     // than UTF-8 validation followed by `is_str_bidi()` for German,
698     // Russian and Japanese. However, this is considerably slower for Thai.
699     // Chances are that the compiler makes some branch predictions that are
700     // unfortunate for Thai. Not spending the time to manually optimize
701     // further at this time, since it's unclear if this variant even has
702     // use cases. However, this is worth revisiting once Rust gets the
703     // ability to annotate relative priorities of match arms.
704 
705     // U+058F: D6 8F
706     // U+0590: D6 90
707     // U+08FF: E0 A3 BF
708     // U+0900: E0 A4 80
709     //
710     // U+200F: E2 80 8F
711     // U+202B: E2 80 AB
712     // U+202E: E2 80 AE
713     // U+2067: E2 81 A7
714     //
715     // U+FB1C: EF AC 9C
716     // U+FB1D: EF AC 9D
717     // U+FDFF: EF B7 BF
718     // U+FE00: EF B8 80
719     //
720     // U+FE6F: EF B9 AF
721     // U+FE70: EF B9 B0
722     // U+FEFE: EF BB BE
723     // U+FEFF: EF BB BF
724     //
725     // U+107FF: F0 90 9F BF
726     // U+10800: F0 90 A0 80
727     // U+10FFF: F0 90 BF BF
728     // U+11000: F0 91 80 80
729     //
730     // U+1E7FF: F0 9E 9F BF
731     // U+1E800: F0 9E A0 80
732     // U+1EFFF: F0 9E BF BF
733     // U+1F000: F0 9F 80 80
734     let mut src = buffer;
735     'outer: loop {
736         if let Some((mut byte, mut read)) = validate_ascii(src) {
737             // Check for the longest sequence to avoid checking twice for the
738             // multi-byte sequences.
739             if read + 4 <= src.len() {
740                 'inner: loop {
741                     // At this point, `byte` is not included in `read`.
742                     match byte {
743                         0...0x7F => {
744                             // ASCII: go back to SIMD.
745                             read += 1;
746                             src = &src[read..];
747                             continue 'outer;
748                         }
749                         0xC2...0xD5 => {
750                             // Two-byte
751                             let second = unsafe { *(src.get_unchecked(read + 1)) };
752                             if !in_inclusive_range8(second, 0x80, 0xBF) {
753                                 return true;
754                             }
755                             read += 2;
756                         }
757                         0xD6 => {
758                             // Two-byte
759                             let second = unsafe { *(src.get_unchecked(read + 1)) };
760                             if !in_inclusive_range8(second, 0x80, 0xBF) {
761                                 return true;
762                             }
763                             // XXX consider folding the above and below checks
764                             if second > 0x8F {
765                                 return true;
766                             }
767                             read += 2;
768                         }
769                         // two-byte starting with 0xD7 and above is bidi
770                         0xE1 | 0xE3...0xEC | 0xEE => {
771                             // Three-byte normal
772                             let second = unsafe { *(src.get_unchecked(read + 1)) };
773                             let third = unsafe { *(src.get_unchecked(read + 2)) };
774                             if ((UTF8_DATA.table[usize::from(second)]
775                                 & unsafe {
776                                     *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
777                                 })
778                                 | (third >> 6))
779                                 != 2
780                             {
781                                 return true;
782                             }
783                             read += 3;
784                         }
785                         0xE2 => {
786                             // Three-byte normal, potentially bidi
787                             let second = unsafe { *(src.get_unchecked(read + 1)) };
788                             let third = unsafe { *(src.get_unchecked(read + 2)) };
789                             if ((UTF8_DATA.table[usize::from(second)]
790                                 & unsafe {
791                                     *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
792                                 })
793                                 | (third >> 6))
794                                 != 2
795                             {
796                                 return true;
797                             }
798                             if second == 0x80 {
799                                 if third == 0x8F || third == 0xAB || third == 0xAE {
800                                     return true;
801                                 }
802                             } else if second == 0x81 {
803                                 if third == 0xA7 {
804                                     return true;
805                                 }
806                             }
807                             read += 3;
808                         }
809                         0xEF => {
810                             // Three-byte normal, potentially bidi
811                             let second = unsafe { *(src.get_unchecked(read + 1)) };
812                             let third = unsafe { *(src.get_unchecked(read + 2)) };
813                             if ((UTF8_DATA.table[usize::from(second)]
814                                 & unsafe {
815                                     *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
816                                 })
817                                 | (third >> 6))
818                                 != 2
819                             {
820                                 return true;
821                             }
822                             if in_inclusive_range8(second, 0xAC, 0xB7) {
823                                 if second == 0xAC {
824                                     if third > 0x9C {
825                                         return true;
826                                     }
827                                 } else {
828                                     return true;
829                                 }
830                             } else if in_inclusive_range8(second, 0xB9, 0xBB) {
831                                 if second == 0xB9 {
832                                     if third > 0xAF {
833                                         return true;
834                                     }
835                                 } else if second == 0xBB {
836                                     if third != 0xBF {
837                                         return true;
838                                     }
839                                 } else {
840                                     return true;
841                                 }
842                             }
843                             read += 3;
844                         }
845                         0xE0 => {
846                             // Three-byte special lower bound, potentially bidi
847                             let second = unsafe { *(src.get_unchecked(read + 1)) };
848                             let third = unsafe { *(src.get_unchecked(read + 2)) };
849                             if ((UTF8_DATA.table[usize::from(second)]
850                                 & unsafe {
851                                     *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
852                                 })
853                                 | (third >> 6))
854                                 != 2
855                             {
856                                 return true;
857                             }
858                             // XXX can this be folded into the above validity check
859                             if second < 0xA4 {
860                                 return true;
861                             }
862                             read += 3;
863                         }
864                         0xED => {
865                             // Three-byte special upper bound
866                             let second = unsafe { *(src.get_unchecked(read + 1)) };
867                             let third = unsafe { *(src.get_unchecked(read + 2)) };
868                             if ((UTF8_DATA.table[usize::from(second)]
869                                 & unsafe {
870                                     *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
871                                 })
872                                 | (third >> 6))
873                                 != 2
874                             {
875                                 return true;
876                             }
877                             read += 3;
878                         }
879                         0xF1...0xF4 => {
880                             // Four-byte normal
881                             let second = unsafe { *(src.get_unchecked(read + 1)) };
882                             let third = unsafe { *(src.get_unchecked(read + 2)) };
883                             let fourth = unsafe { *(src.get_unchecked(read + 3)) };
884                             if (u16::from(
885                                 UTF8_DATA.table[usize::from(second)]
886                                     & unsafe {
887                                         *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
888                                     },
889                             ) | u16::from(third >> 6)
890                                 | (u16::from(fourth & 0xC0) << 2))
891                                 != 0x202
892                             {
893                                 return true;
894                             }
895                             read += 4;
896                         }
897                         0xF0 => {
898                             // Four-byte special lower bound, potentially bidi
899                             let second = unsafe { *(src.get_unchecked(read + 1)) };
900                             let third = unsafe { *(src.get_unchecked(read + 2)) };
901                             let fourth = unsafe { *(src.get_unchecked(read + 3)) };
902                             if (u16::from(
903                                 UTF8_DATA.table[usize::from(second)]
904                                     & unsafe {
905                                         *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
906                                     },
907                             ) | u16::from(third >> 6)
908                                 | (u16::from(fourth & 0xC0) << 2))
909                                 != 0x202
910                             {
911                                 return true;
912                             }
913                             if unsafe { unlikely(second == 0x90 || second == 0x9E) } {
914                                 let third = src[read + 2];
915                                 if third >= 0xA0 {
916                                     return true;
917                                 }
918                             }
919                             read += 4;
920                         }
921                         _ => {
922                             // Invalid lead or bidi-only lead
923                             return true;
924                         }
925                     }
926                     if read + 4 > src.len() {
927                         if read == src.len() {
928                             return false;
929                         }
930                         byte = src[read];
931                         break 'inner;
932                     }
933                     byte = src[read];
934                     continue 'inner;
935                 }
936             }
937             // We can't have a complete 4-byte sequence, but we could still have
938             // a complete shorter sequence.
939 
940             // At this point, `byte` is not included in `read`.
941             match byte {
942                 0...0x7F => {
943                     // ASCII: go back to SIMD.
944                     read += 1;
945                     src = &src[read..];
946                     continue 'outer;
947                 }
948                 0xC2...0xD5 => {
949                     // Two-byte
950                     let new_read = read + 2;
951                     if new_read > src.len() {
952                         return true;
953                     }
954                     let second = unsafe { *(src.get_unchecked(read + 1)) };
955                     if !in_inclusive_range8(second, 0x80, 0xBF) {
956                         return true;
957                     }
958                     read = new_read;
959                     // We need to deal with the case where we came here with 3 bytes
960                     // left, so we need to take a look at the last one.
961                     src = &src[read..];
962                     continue 'outer;
963                 }
964                 0xD6 => {
965                     // Two-byte, potentially bidi
966                     let new_read = read + 2;
967                     if new_read > src.len() {
968                         return true;
969                     }
970                     let second = unsafe { *(src.get_unchecked(read + 1)) };
971                     if !in_inclusive_range8(second, 0x80, 0xBF) {
972                         return true;
973                     }
974                     // XXX consider folding the above and below checks
975                     if second > 0x8F {
976                         return true;
977                     }
978                     read = new_read;
979                     // We need to deal with the case where we came here with 3 bytes
980                     // left, so we need to take a look at the last one.
981                     src = &src[read..];
982                     continue 'outer;
983                 }
984                 // two-byte starting with 0xD7 and above is bidi
985                 0xE1 | 0xE3...0xEC | 0xEE => {
986                     // Three-byte normal
987                     let new_read = read + 3;
988                     if new_read > src.len() {
989                         return true;
990                     }
991                     let second = unsafe { *(src.get_unchecked(read + 1)) };
992                     let third = unsafe { *(src.get_unchecked(read + 2)) };
993                     if ((UTF8_DATA.table[usize::from(second)]
994                         & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
995                         | (third >> 6))
996                         != 2
997                     {
998                         return true;
999                     }
1000                 }
1001                 0xE2 => {
1002                     // Three-byte normal, potentially bidi
1003                     let new_read = read + 3;
1004                     if new_read > src.len() {
1005                         return true;
1006                     }
1007                     let second = unsafe { *(src.get_unchecked(read + 1)) };
1008                     let third = unsafe { *(src.get_unchecked(read + 2)) };
1009                     if ((UTF8_DATA.table[usize::from(second)]
1010                         & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1011                         | (third >> 6))
1012                         != 2
1013                     {
1014                         return true;
1015                     }
1016                     if second == 0x80 {
1017                         if third == 0x8F || third == 0xAB || third == 0xAE {
1018                             return true;
1019                         }
1020                     } else if second == 0x81 {
1021                         if third == 0xA7 {
1022                             return true;
1023                         }
1024                     }
1025                 }
1026                 0xEF => {
1027                     // Three-byte normal, potentially bidi
1028                     let new_read = read + 3;
1029                     if new_read > src.len() {
1030                         return true;
1031                     }
1032                     let second = unsafe { *(src.get_unchecked(read + 1)) };
1033                     let third = unsafe { *(src.get_unchecked(read + 2)) };
1034                     if ((UTF8_DATA.table[usize::from(second)]
1035                         & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1036                         | (third >> 6))
1037                         != 2
1038                     {
1039                         return true;
1040                     }
1041                     if in_inclusive_range8(second, 0xAC, 0xB7) {
1042                         if second == 0xAC {
1043                             if third > 0x9C {
1044                                 return true;
1045                             }
1046                         } else {
1047                             return true;
1048                         }
1049                     } else if in_inclusive_range8(second, 0xB9, 0xBB) {
1050                         if second == 0xB9 {
1051                             if third > 0xAF {
1052                                 return true;
1053                             }
1054                         } else if second == 0xBB {
1055                             if third != 0xBF {
1056                                 return true;
1057                             }
1058                         } else {
1059                             return true;
1060                         }
1061                     }
1062                 }
1063                 0xE0 => {
1064                     // Three-byte special lower bound, potentially bidi
1065                     let new_read = read + 3;
1066                     if new_read > src.len() {
1067                         return true;
1068                     }
1069                     let second = unsafe { *(src.get_unchecked(read + 1)) };
1070                     let third = unsafe { *(src.get_unchecked(read + 2)) };
1071                     if ((UTF8_DATA.table[usize::from(second)]
1072                         & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1073                         | (third >> 6))
1074                         != 2
1075                     {
1076                         return true;
1077                     }
1078                     // XXX can this be folded into the above validity check
1079                     if second < 0xA4 {
1080                         return true;
1081                     }
1082                 }
1083                 0xED => {
1084                     // Three-byte special upper bound
1085                     let new_read = read + 3;
1086                     if new_read > src.len() {
1087                         return true;
1088                     }
1089                     let second = unsafe { *(src.get_unchecked(read + 1)) };
1090                     let third = unsafe { *(src.get_unchecked(read + 2)) };
1091                     if ((UTF8_DATA.table[usize::from(second)]
1092                         & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1093                         | (third >> 6))
1094                         != 2
1095                     {
1096                         return true;
1097                     }
1098                 }
1099                 _ => {
1100                     // Invalid lead, 4-byte lead or 2-byte bidi-only lead
1101                     return true;
1102                 }
1103             }
1104             return false;
1105         } else {
1106             return false;
1107         }
1108     }
1109 }
1110 
1111 /// Checks whether a valid UTF-8 buffer contains code points that trigger
1112 /// right-to-left processing.
1113 ///
1114 /// The check is done on a Unicode block basis without regard to assigned
1115 /// vs. unassigned code points in the block. Hebrew presentation forms in
1116 /// the Alphabetic Presentation Forms block are treated as if they formed
1117 /// a block on their own (i.e. it treated as right-to-left). Additionally,
1118 /// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
1119 /// for. Control characters that are technically bidi controls but do not
1120 /// cause right-to-left behavior without the presence of right-to-left
1121 /// characters or right-to-left controls are not checked for. As a special
1122 /// case, U+FEFF is excluded from Arabic Presentation Forms-B.
1123 #[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
1124 #[inline]
is_str_bidi(buffer: &str) -> bool1125 pub fn is_str_bidi(buffer: &str) -> bool {
1126     // U+058F: D6 8F
1127     // U+0590: D6 90
1128     // U+08FF: E0 A3 BF
1129     // U+0900: E0 A4 80
1130     //
1131     // U+200F: E2 80 8F
1132     // U+202B: E2 80 AB
1133     // U+202E: E2 80 AE
1134     // U+2067: E2 81 A7
1135     //
1136     // U+FB1C: EF AC 9C
1137     // U+FB1D: EF AC 9D
1138     // U+FDFF: EF B7 BF
1139     // U+FE00: EF B8 80
1140     //
1141     // U+FE6F: EF B9 AF
1142     // U+FE70: EF B9 B0
1143     // U+FEFE: EF BB BE
1144     // U+FEFF: EF BB BF
1145     //
1146     // U+107FF: F0 90 9F BF
1147     // U+10800: F0 90 A0 80
1148     // U+10FFF: F0 90 BF BF
1149     // U+11000: F0 91 80 80
1150     //
1151     // U+1E7FF: F0 9E 9F BF
1152     // U+1E800: F0 9E A0 80
1153     // U+1EFFF: F0 9E BF BF
1154     // U+1F000: F0 9F 80 80
1155     let mut bytes = buffer.as_bytes();
1156     'outer: loop {
1157         // TODO: Instead of just validating ASCII using SIMD, use SIMD
1158         // to check for non-ASCII lead bytes, too, to quickly conclude
1159         // that the vector consist entirely of CJK and below-Hebrew
1160         // code points.
1161         // Unfortunately, scripts above Arabic but below CJK share
1162         // lead bytes with RTL.
1163         if let Some((mut byte, mut read)) = validate_ascii(bytes) {
1164             'inner: loop {
1165                 // At this point, `byte` is not included in `read`.
1166                 if byte < 0xE0 {
1167                     if byte >= 0x80 {
1168                         // Two-byte
1169                         // Adding `unlikely` here improved throughput on
1170                         // Russian plain text by 33%!
1171                         if unsafe { unlikely(byte >= 0xD6) } {
1172                             if byte == 0xD6 {
1173                                 let second = bytes[read + 1];
1174                                 if second > 0x8F {
1175                                     return true;
1176                                 }
1177                             } else {
1178                                 return true;
1179                             }
1180                         }
1181                         read += 2;
1182                     } else {
1183                         // ASCII: write and go back to SIMD.
1184                         read += 1;
1185                         // Intuitively, we should go back to the outer loop only
1186                         // if byte is 0x30 or above, so as to avoid trashing on
1187                         // ASCII space, comma and period in non-Latin context.
1188                         // However, the extra branch seems to cost more than it's
1189                         // worth.
1190                         bytes = &bytes[read..];
1191                         continue 'outer;
1192                     }
1193                 } else if byte < 0xF0 {
1194                     // Three-byte
1195                     if unsafe { unlikely(!in_inclusive_range8(byte, 0xE3, 0xEE) && byte != 0xE1) } {
1196                         let second = bytes[read + 1];
1197                         if byte == 0xE0 {
1198                             if second < 0xA4 {
1199                                 return true;
1200                             }
1201                         } else if byte == 0xE2 {
1202                             let third = bytes[read + 2];
1203                             if second == 0x80 {
1204                                 if third == 0x8F || third == 0xAB || third == 0xAE {
1205                                     return true;
1206                                 }
1207                             } else if second == 0x81 {
1208                                 if third == 0xA7 {
1209                                     return true;
1210                                 }
1211                             }
1212                         } else {
1213                             debug_assert_eq!(byte, 0xEF);
1214                             if in_inclusive_range8(second, 0xAC, 0xB7) {
1215                                 if second == 0xAC {
1216                                     let third = bytes[read + 2];
1217                                     if third > 0x9C {
1218                                         return true;
1219                                     }
1220                                 } else {
1221                                     return true;
1222                                 }
1223                             } else if in_inclusive_range8(second, 0xB9, 0xBB) {
1224                                 if second == 0xB9 {
1225                                     let third = bytes[read + 2];
1226                                     if third > 0xAF {
1227                                         return true;
1228                                     }
1229                                 } else if second == 0xBB {
1230                                     let third = bytes[read + 2];
1231                                     if third != 0xBF {
1232                                         return true;
1233                                     }
1234                                 } else {
1235                                     return true;
1236                                 }
1237                             }
1238                         }
1239                     }
1240                     read += 3;
1241                 } else {
1242                     // Four-byte
1243                     let second = bytes[read + 1];
1244                     if unsafe { unlikely(byte == 0xF0 && (second == 0x90 || second == 0x9E)) } {
1245                         let third = bytes[read + 2];
1246                         if third >= 0xA0 {
1247                             return true;
1248                         }
1249                     }
1250                     read += 4;
1251                 }
1252                 // The comparison is always < or == and never >, but including
1253                 // > here to let the compiler assume that < is true if this
1254                 // comparison is false.
1255                 if read >= bytes.len() {
1256                     return false;
1257                 }
1258                 byte = bytes[read];
1259                 continue 'inner;
1260             }
1261         } else {
1262             return false;
1263         }
1264     }
1265 }
1266 
1267 /// Checks whether a UTF-16 buffer contains code points that trigger
1268 /// right-to-left processing.
1269 ///
1270 /// The check is done on a Unicode block basis without regard to assigned
1271 /// vs. unassigned code points in the block. Hebrew presentation forms in
1272 /// the Alphabetic Presentation Forms block are treated as if they formed
1273 /// a block on their own (i.e. it treated as right-to-left). Additionally,
1274 /// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
1275 /// for. Control characters that are technically bidi controls but do not
1276 /// cause right-to-left behavior without the presence of right-to-left
1277 /// characters or right-to-left controls are not checked for. As a special
1278 /// case, U+FEFF is excluded from Arabic Presentation Forms-B.
1279 ///
1280 /// Returns `true` if the input contains an RTL character or an unpaired
1281 /// high surrogate that could be the high half of an RTL character.
1282 /// Returns `false` if the input contains neither RTL characters nor
1283 /// unpaired high surrogates that could be higher halves of RTL characters.
is_utf16_bidi(buffer: &[u16]) -> bool1284 pub fn is_utf16_bidi(buffer: &[u16]) -> bool {
1285     is_utf16_bidi_impl(buffer)
1286 }
1287 
1288 /// Checks whether a scalar value triggers right-to-left processing.
1289 ///
1290 /// The check is done on a Unicode block basis without regard to assigned
1291 /// vs. unassigned code points in the block. Hebrew presentation forms in
1292 /// the Alphabetic Presentation Forms block are treated as if they formed
1293 /// a block on their own (i.e. it treated as right-to-left). Additionally,
1294 /// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
1295 /// for. Control characters that are technically bidi controls but do not
1296 /// cause right-to-left behavior without the presence of right-to-left
1297 /// characters or right-to-left controls are not checked for. As a special
1298 /// case, U+FEFF is excluded from Arabic Presentation Forms-B.
1299 #[inline(always)]
is_char_bidi(c: char) -> bool1300 pub fn is_char_bidi(c: char) -> bool {
1301     // Controls:
1302     // Every control with RIGHT-TO-LEFT in its name in
1303     // https://www.unicode.org/charts/PDF/U2000.pdf
1304     // U+200F RLM
1305     // U+202B RLE
1306     // U+202E RLO
1307     // U+2067 RLI
1308     //
1309     // BMP RTL:
1310     // https://www.unicode.org/roadmaps/bmp/
1311     // U+0590...U+08FF
1312     // U+FB1D...U+FDFF Hebrew presentation forms and
1313     //                 Arabic Presentation Forms A
1314     // U+FE70...U+FEFE Arabic Presentation Forms B (excl. BOM)
1315     //
1316     // Supplementary RTL:
1317     // https://www.unicode.org/roadmaps/smp/
1318     // U+10800...U+10FFF (Lead surrogate U+D802 or U+D803)
1319     // U+1E800...U+1EFFF (Lead surrogate U+D83A or U+D83B)
1320     let code_point = u32::from(c);
1321     if code_point < 0x0590 {
1322         // Below Hebrew
1323         return false;
1324     }
1325     if in_range32(code_point, 0x0900, 0xFB1D) {
1326         // Above Arabic Extended-A and below Hebrew presentation forms
1327         if in_inclusive_range32(code_point, 0x200F, 0x2067) {
1328             // In the range that contains the RTL controls
1329             return code_point == 0x200F
1330                 || code_point == 0x202B
1331                 || code_point == 0x202E
1332                 || code_point == 0x2067;
1333         }
1334         return false;
1335     }
1336     if code_point > 0x1EFFF {
1337         // Above second astral RTL. (Emoji is here.)
1338         return false;
1339     }
1340     if in_range32(code_point, 0x11000, 0x1E800) {
1341         // Between astral RTL blocks
1342         return false;
1343     }
1344     if in_range32(code_point, 0xFEFF, 0x10800) {
1345         // Above Arabic Presentations Forms B (excl. BOM) and below first
1346         // astral RTL
1347         return false;
1348     }
1349     if in_range32(code_point, 0xFE00, 0xFE70) {
1350         // Between Arabic Presentations Forms
1351         return false;
1352     }
1353     true
1354 }
1355 
/// Tells whether a single UTF-16 code unit calls for right-to-left
/// processing.
///
/// The decision is made per Unicode block; whether a given code point in
/// the block is actually assigned is not considered. The Hebrew presentation
/// forms inside the Alphabetic Presentation Forms block are handled as if
/// they were a right-to-left block of their own. The four RIGHT-TO-LEFT FOO
/// controls in General Punctuation are also reported. Controls that are
/// technically bidi controls but cannot cause right-to-left behavior without
/// right-to-left characters or right-to-left controls being present are not
/// reported. U+FEFF is deliberately excluded from Arabic Presentation
/// Forms-B.
///
/// The supplementary-plane right-to-left blocks can be recognized from the
/// high surrogate alone, so this function answers `true` for those high
/// surrogates. That makes it usable on supplementary-plane text without
/// first combining surrogate pairs into scalar values, at the price of
/// reporting such a high surrogate as right-to-left even when it is
/// unpaired.
#[inline(always)]
pub fn is_utf16_code_unit_bidi(u: u16) -> bool {
    // Cheap early exit for everything below the Hebrew block.
    if u < 0x0590 {
        return false;
    }
    match u {
        // U+0590...U+08FF: Hebrew up to and including Arabic Extended-A.
        0x0590..=0x08FF => true,
        // The four RIGHT-TO-LEFT controls in General Punctuation.
        0x200F | 0x202B | 0x202E | 0x2067 => true,
        // High surrogates of U+10800...U+10FFF.
        0xD802 | 0xD803 => true,
        // High surrogates of U+1E800...U+1EFFF.
        0xD83A | 0xD83B => true,
        // Hebrew presentation forms and Arabic Presentation Forms-A.
        0xFB1D..=0xFDFF => true,
        // Arabic Presentation Forms-B, stopping short of U+FEFF (the BOM).
        0xFE70..=0xFEFE => true,
        // Everything else, including the gaps between the ranges above
        // (emoji high surrogates live in one of those gaps).
        _ => false,
    }
}
1407 
1408 /// Checks whether a potentially invalid UTF-8 buffer contains code points
1409 /// that trigger right-to-left processing or is all-Latin1.
1410 ///
1411 /// Possibly more efficient than performing the checks separately.
1412 ///
1413 /// Returns `Latin1Bidi::Latin1` if `is_utf8_latin1()` would return `true`.
1414 /// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf8_bidi()` would return
1415 /// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
check_utf8_for_latin1_and_bidi(buffer: &[u8]) -> Latin1Bidi1416 pub fn check_utf8_for_latin1_and_bidi(buffer: &[u8]) -> Latin1Bidi {
1417     if let Some(offset) = is_utf8_latin1_impl(buffer) {
1418         if is_utf8_bidi(&buffer[offset..]) {
1419             Latin1Bidi::Bidi
1420         } else {
1421             Latin1Bidi::LeftToRight
1422         }
1423     } else {
1424         Latin1Bidi::Latin1
1425     }
1426 }
1427 
1428 /// Checks whether a valid UTF-8 buffer contains code points
1429 /// that trigger right-to-left processing or is all-Latin1.
1430 ///
1431 /// Possibly more efficient than performing the checks separately.
1432 ///
1433 /// Returns `Latin1Bidi::Latin1` if `is_str_latin1()` would return `true`.
1434 /// Otherwise, returns `Latin1Bidi::Bidi` if `is_str_bidi()` would return
1435 /// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
check_str_for_latin1_and_bidi(buffer: &str) -> Latin1Bidi1436 pub fn check_str_for_latin1_and_bidi(buffer: &str) -> Latin1Bidi {
1437     // The transition from the latin1 check to the bidi check isn't
1438     // optimal but not tweaking it to perfection today.
1439     if let Some(offset) = is_str_latin1_impl(buffer) {
1440         if is_str_bidi(&buffer[offset..]) {
1441             Latin1Bidi::Bidi
1442         } else {
1443             Latin1Bidi::LeftToRight
1444         }
1445     } else {
1446         Latin1Bidi::Latin1
1447     }
1448 }
1449 
/// Checks whether a potentially invalid UTF-16 buffer contains code points
/// that trigger right-to-left processing or is all-Latin1.
///
/// Possibly more efficient than performing the checks separately.
///
/// Returns `Latin1Bidi::Latin1` if `is_utf16_latin1()` would return `true`.
/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf16_bidi()` would return
/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
pub fn check_utf16_for_latin1_and_bidi(buffer: &[u16]) -> Latin1Bidi {
    // This public entry point only forwards to
    // `check_utf16_for_latin1_and_bidi_impl`, which does the actual work.
    check_utf16_for_latin1_and_bidi_impl(buffer)
}
1461 
1462 /// Converts potentially-invalid UTF-8 to valid UTF-16 with errors replaced
1463 /// with the REPLACEMENT CHARACTER.
1464 ///
1465 /// The length of the destination buffer must be at least the length of the
1466 /// source buffer _plus one_.
1467 ///
1468 /// Returns the number of `u16`s written.
1469 ///
1470 /// # Panics
1471 ///
1472 /// Panics if the destination buffer is shorter than stated above.
convert_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> usize1473 pub fn convert_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> usize {
1474     // TODO: Can the requirement for dst to be at least one unit longer
1475     // be eliminated?
1476     assert!(dst.len() > src.len());
1477     let mut decoder = Utf8Decoder::new_inner();
1478     let mut total_read = 0usize;
1479     let mut total_written = 0usize;
1480     loop {
1481         let (result, read, written) =
1482             decoder.decode_to_utf16_raw(&src[total_read..], &mut dst[total_written..], true);
1483         total_read += read;
1484         total_written += written;
1485         match result {
1486             DecoderResult::InputEmpty => {
1487                 return total_written;
1488             }
1489             DecoderResult::OutputFull => {
1490                 unreachable!("The assert at the top of the function should have caught this.");
1491             }
1492             DecoderResult::Malformed(_, _) => {
1493                 // There should always be space for the U+FFFD, because
1494                 // otherwise we'd have gotten OutputFull already.
1495                 dst[total_written] = 0xFFFD;
1496                 total_written += 1;
1497             }
1498         }
1499     }
1500 }
1501 
/// Converts valid UTF-8 to valid UTF-16.
///
/// The length of the destination buffer must be at least the length of the
/// source buffer.
///
/// Returns the number of `u16`s written.
///
/// The conversion alternates between two modes: a bulk ASCII copy
/// (`ascii_to_basic_latin`) and a scalar loop that decodes one multi-byte
/// sequence at a time. Because `src` is a `&str`, no validation is
/// performed; the scalar loop trusts that every lead byte is followed by
/// the right number of continuation bytes.
///
/// # Panics
///
/// Panics if the destination buffer is shorter than stated above.
pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize {
    assert!(
        dst.len() >= src.len(),
        "Destination must not be shorter than the source."
    );
    let bytes = src.as_bytes();
    // Invariant maintained below: `written <= read`, because every branch
    // advances `written` by no more than it advances `read`. Together with
    // the assert above this keeps `written` within `dst`.
    let mut read = 0;
    let mut written = 0;
    'outer: loop {
        // Bulk-copy ASCII. The code below treats `None` as "everything that
        // was left got copied" and `Some((non_ascii, consumed))` as
        // "`consumed` units were copied and `non_ascii` is the byte that
        // stopped the copy".
        let mut byte = {
            let src_remaining = &bytes[read..];
            let dst_remaining = &mut dst[written..];
            let length = src_remaining.len();
            // SAFETY (as far as visible here): both pointers come from live
            // slices, and `dst_remaining` is at least `length` units long
            // because `dst.len() >= src.len()` and `written <= read`.
            // NOTE(review): confirm against the contract of
            // `ascii_to_basic_latin`.
            match unsafe {
                ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
            } {
                None => {
                    written += length;
                    return written;
                }
                Some((non_ascii, consumed)) => {
                    read += consumed;
                    written += consumed;
                    non_ascii
                }
            }
        };
        'inner: loop {
            // At this point, `byte` is not included in `read`.
            //
            // SAFETY for every `get_unchecked` / `get_unchecked_mut` in this
            // loop: the reads rely on `src` being valid UTF-8 (guaranteed by
            // `&str`), so the continuation bytes of the sequence starting at
            // `read` exist; the writes rely on `written <= read` and
            // `dst.len() >= src.len()`.
            if byte < 0xE0 {
                if byte >= 0x80 {
                    // Two-byte
                    let second = unsafe { *(bytes.get_unchecked(read + 1)) };
                    // 5 payload bits from the lead, 6 from the trail.
                    let point = ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F);
                    unsafe { *(dst.get_unchecked_mut(written)) = point };
                    read += 2;
                    written += 1;
                } else {
                    // ASCII: write and go back to SIMD.
                    unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
                    read += 1;
                    written += 1;
                    // Intuitively, we should go back to the outer loop only
                    // if byte is 0x30 or above, so as to avoid thrashing on
                    // ASCII space, comma and period in non-Latin context.
                    // However, the extra branch seems to cost more than it's
                    // worth.
                    continue 'outer;
                }
            } else if byte < 0xF0 {
                // Three-byte
                let second = unsafe { *(bytes.get_unchecked(read + 1)) };
                let third = unsafe { *(bytes.get_unchecked(read + 2)) };
                // 4 payload bits from the lead, 6 from each trail.
                let point = ((u16::from(byte) & 0xF) << 12)
                    | ((u16::from(second) & 0x3F) << 6)
                    | (u16::from(third) & 0x3F);
                unsafe { *(dst.get_unchecked_mut(written)) = point };
                read += 3;
                written += 1;
            } else {
                // Four-byte
                let second = unsafe { *(bytes.get_unchecked(read + 1)) };
                let third = unsafe { *(bytes.get_unchecked(read + 2)) };
                let fourth = unsafe { *(bytes.get_unchecked(read + 3)) };
                // 3 payload bits from the lead, 6 from each trail.
                let point = ((u32::from(byte) & 0x7) << 18)
                    | ((u32::from(second) & 0x3F) << 12)
                    | ((u32::from(third) & 0x3F) << 6)
                    | (u32::from(fourth) & 0x3F);
                // Surrogate pair. 0xD7C0 is 0xD800 - (0x10000 >> 10), which
                // folds the subtraction of 0x10000 into the high-surrogate
                // base.
                unsafe { *(dst.get_unchecked_mut(written)) = (0xD7C0 + (point >> 10)) as u16 };
                unsafe {
                    *(dst.get_unchecked_mut(written + 1)) = (0xDC00 + (point & 0x3FF)) as u16
                };
                read += 4;
                written += 2;
            }
            // The comparison is always < or == and never >, but including
            // > here to let the compiler assume that < is true if this
            // comparison is false.
            if read >= src.len() {
                return written;
            }
            // Stay in the scalar loop for the next byte; only an ASCII byte
            // sends control back to the bulk copy.
            byte = bytes[read];
            continue 'inner;
        }
    }
}
1598 
1599 /// Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error.
1600 ///
1601 /// The length of the destination buffer must be at least the length of the
1602 /// source buffer.
1603 ///
1604 /// Returns the number of `u16`s written or `None` if the input was invalid.
1605 ///
1606 /// When the input was invalid, some output may have been written.
1607 ///
1608 /// # Panics
1609 ///
1610 /// Panics if the destination buffer is shorter than stated above.
convert_utf8_to_utf16_without_replacement(src: &[u8], dst: &mut [u16]) -> Option<usize>1611 pub fn convert_utf8_to_utf16_without_replacement(src: &[u8], dst: &mut [u16]) -> Option<usize> {
1612     assert!(
1613         dst.len() >= src.len(),
1614         "Destination must not be shorter than the source."
1615     );
1616     let (read, written) = convert_utf8_to_utf16_up_to_invalid(src, dst);
1617     if read == src.len() {
1618         return Some(written);
1619     }
1620     None
1621 }
1622 
/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
/// with the REPLACEMENT CHARACTER with potentially insufficient output
/// space.
///
/// Returns the number of code units read and the number of bytes written.
///
/// Guarantees that the bytes in the destination beyond the number of
/// bytes claimed as written by the second item of the return tuple
/// are left unmodified.
///
/// Not all code units are read if there isn't enough output space.
///
/// Note that this method isn't designed for general streamability but for
/// not allocating memory for the worst case up front. Specifically,
/// if the input starts with or ends with an unpaired surrogate, those are
/// replaced with the REPLACEMENT CHARACTER.
///
/// Matches the semantics of `TextEncoder.encodeInto()` from the
/// Encoding Standard.
///
/// # Safety
///
/// If you want to convert into a `&mut str`, use
/// `convert_utf16_to_str_partial()` instead of using this function
/// together with the `unsafe` method `as_bytes_mut()` on `&mut str`.
#[inline(always)]
pub fn convert_utf16_to_utf8_partial(src: &[u16], dst: &mut [u8]) -> (usize, usize) {
    // The two functions called below are marked `inline(never)` to make
    // transitions from the hot part (first function) into the cold part
    // (second function) go through a return and another call to discourage
    // the CPU from speculating from the hot code into the cold code.
    // Letting the transitions be mere intra-function jumps, even to
    // basic blocks out-of-lined to the end of the function would wipe
    // away a quarter of Arabic encode performance on Haswell!
    let (read, written) = convert_utf16_to_utf8_partial_inner(src, dst);
    // Expected case: the first function consumed the whole input.
    if unsafe { likely(read == src.len()) } {
        return (read, written);
    }
    // Otherwise hand whatever is left of both buffers to the second
    // function and add its progress to the totals.
    let (tail_read, tail_written) =
        convert_utf16_to_utf8_partial_tail(&src[read..], &mut dst[written..]);
    (read + tail_read, written + tail_written)
}
1665 
1666 /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1667 /// with the REPLACEMENT CHARACTER.
1668 ///
1669 /// The length of the destination buffer must be at least the length of the
1670 /// source buffer times three.
1671 ///
1672 /// Returns the number of bytes written.
1673 ///
1674 /// # Panics
1675 ///
1676 /// Panics if the destination buffer is shorter than stated above.
1677 ///
1678 /// # Safety
1679 ///
1680 /// If you want to convert into a `&mut str`, use `convert_utf16_to_str()`
1681 /// instead of using this function together with the `unsafe` method
1682 /// `as_bytes_mut()` on `&mut str`.
1683 #[inline(always)]
convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize1684 pub fn convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize {
1685     assert!(dst.len() >= src.len() * 3);
1686     let (read, written) = convert_utf16_to_utf8_partial(src, dst);
1687     debug_assert_eq!(read, src.len());
1688     written
1689 }
1690 
1691 /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1692 /// with the REPLACEMENT CHARACTER such that the validity of the output is
1693 /// signaled using the Rust type system with potentially insufficient output
1694 /// space.
1695 ///
1696 /// Returns the number of code units read and the number of bytes written.
1697 ///
1698 /// Not all code units are read if there isn't enough output space.
1699 ///
1700 /// Note  that this method isn't designed for general streamability but for
1701 /// not allocating memory for the worst case up front. Specifically,
1702 /// if the input starts with or ends with an unpaired surrogate, those are
1703 /// replaced with the REPLACEMENT CHARACTER.
convert_utf16_to_str_partial(src: &[u16], dst: &mut str) -> (usize, usize)1704 pub fn convert_utf16_to_str_partial(src: &[u16], dst: &mut str) -> (usize, usize) {
1705     let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
1706     let (read, written) = convert_utf16_to_utf8_partial(src, bytes);
1707     let len = bytes.len();
1708     let mut trail = written;
1709     while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
1710         bytes[trail] = 0;
1711         trail += 1;
1712     }
1713     (read, written)
1714 }
1715 
1716 /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1717 /// with the REPLACEMENT CHARACTER such that the validity of the output is
1718 /// signaled using the Rust type system.
1719 ///
1720 /// The length of the destination buffer must be at least the length of the
1721 /// source buffer times three.
1722 ///
1723 /// Returns the number of bytes written.
1724 ///
1725 /// # Panics
1726 ///
1727 /// Panics if the destination buffer is shorter than stated above.
1728 #[inline(always)]
convert_utf16_to_str(src: &[u16], dst: &mut str) -> usize1729 pub fn convert_utf16_to_str(src: &[u16], dst: &mut str) -> usize {
1730     assert!(dst.len() >= src.len() * 3);
1731     let (read, written) = convert_utf16_to_str_partial(src, dst);
1732     debug_assert_eq!(read, src.len());
1733     written
1734 }
1735 
/// Converts bytes whose unsigned value is interpreted as Unicode code point
/// (i.e. U+0000 to U+00FF, inclusive) to UTF-16.
///
/// The length of the destination buffer must be at least the length of the
/// source buffer.
///
/// The number of `u16`s written equals the length of the source buffer.
///
/// # Panics
///
/// Panics if the destination buffer is shorter than stated above.
pub fn convert_latin1_to_utf16(src: &[u8], dst: &mut [u16]) {
    assert!(
        dst.len() >= src.len(),
        "Destination must not be shorter than the source."
    );
    // TODO: On aarch64, the safe version autovectorizes to the same unpacking
    // instructions as this code, but, yet, the autovectorized version is
    // faster.
    //
    // SAFETY (as far as visible here): both pointers come from live slices
    // and the assert above guarantees that `dst` holds at least `src.len()`
    // units, which is the count passed. NOTE(review): confirm against the
    // contract of `unpack_latin1`.
    unsafe {
        unpack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len());
    }
}
1759 
1760 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1761 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient
1762 /// output space.
1763 ///
1764 /// Returns the number of bytes read and the number of bytes written.
1765 ///
1766 /// If the output isn't large enough, not all input is consumed.
1767 ///
1768 /// # Safety
1769 ///
1770 /// If you want to convert into a `&mut str`, use
1771 /// `convert_utf16_to_str_partial()` instead of using this function
1772 /// together with the `unsafe` method `as_bytes_mut()` on `&mut str`.
convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usize)1773 pub fn convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usize) {
1774     let src_len = src.len();
1775     let src_ptr = src.as_ptr();
1776     let dst_ptr = dst.as_mut_ptr();
1777     let dst_len = dst.len();
1778     let mut total_read = 0usize;
1779     let mut total_written = 0usize;
1780     loop {
1781         // src can't advance more than dst
1782         let src_left = src_len - total_read;
1783         let dst_left = dst_len - total_written;
1784         let min_left = ::std::cmp::min(src_left, dst_left);
1785         if let Some((non_ascii, consumed)) = unsafe {
1786             ascii_to_ascii(
1787                 src_ptr.add(total_read),
1788                 dst_ptr.add(total_written),
1789                 min_left,
1790             )
1791         } {
1792             total_read += consumed;
1793             total_written += consumed;
1794             if total_written.checked_add(2).unwrap() > dst_len {
1795                 return (total_read, total_written);
1796             }
1797 
1798             total_read += 1; // consume `non_ascii`
1799 
1800             dst[total_written] = (non_ascii >> 6) | 0xC0;
1801             total_written += 1;
1802             dst[total_written] = (non_ascii & 0x3F) | 0x80;
1803             total_written += 1;
1804             continue;
1805         }
1806         return (total_read + min_left, total_written + min_left);
1807     }
1808 }
1809 
1810 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1811 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
1812 ///
1813 /// The length of the destination buffer must be at least the length of the
1814 /// source buffer times two.
1815 ///
1816 /// Returns the number of bytes written.
1817 ///
1818 /// # Panics
1819 ///
1820 /// Panics if the destination buffer is shorter than stated above.
1821 ///
1822 /// # Safety
1823 ///
1824 /// Note that this function may write garbage beyond the number of bytes
1825 /// indicated by the return value, so using a `&mut str` interpreted as
1826 /// `&mut [u8]` as the destination is not safe. If you want to convert into
1827 /// a `&mut str`, use `convert_utf16_to_str()` instead of this function.
1828 #[inline]
convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize1829 pub fn convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize {
1830     assert!(
1831         dst.len() >= src.len() * 2,
1832         "Destination must not be shorter than the source times two."
1833     );
1834     let (read, written) = convert_latin1_to_utf8_partial(src, dst);
1835     debug_assert_eq!(read, src.len());
1836     written
1837 }
1838 
1839 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1840 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the
1841 /// output is signaled using the Rust type system with potentially insufficient
1842 /// output space.
1843 ///
1844 /// Returns the number of bytes read and the number of bytes written.
1845 ///
1846 /// If the output isn't large enough, not all input is consumed.
1847 #[inline]
convert_latin1_to_str_partial(src: &[u8], dst: &mut str) -> (usize, usize)1848 pub fn convert_latin1_to_str_partial(src: &[u8], dst: &mut str) -> (usize, usize) {
1849     let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
1850     let (read, written) = convert_latin1_to_utf8_partial(src, bytes);
1851     let len = bytes.len();
1852     let mut trail = written;
1853     let max = ::std::cmp::min(len, trail + MAX_STRIDE_SIZE);
1854     while trail < max {
1855         bytes[trail] = 0;
1856         trail += 1;
1857     }
1858     while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
1859         bytes[trail] = 0;
1860         trail += 1;
1861     }
1862     (read, written)
1863 }
1864 
1865 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1866 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the
1867 /// output is signaled using the Rust type system.
1868 ///
1869 /// The length of the destination buffer must be at least the length of the
1870 /// source buffer times two.
1871 ///
1872 /// Returns the number of bytes written.
1873 ///
1874 /// # Panics
1875 ///
1876 /// Panics if the destination buffer is shorter than stated above.
1877 #[inline]
convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize1878 pub fn convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize {
1879     assert!(
1880         dst.len() >= src.len() * 2,
1881         "Destination must not be shorter than the source times two."
1882     );
1883     let (read, written) = convert_latin1_to_str_partial(src, dst);
1884     debug_assert_eq!(read, src.len());
1885     written
1886 }
1887 
1888 /// If the input is valid UTF-8 representing only Unicode code points from
1889 /// U+0000 to U+00FF, inclusive, converts the input into output that
1890 /// represents the value of each code point as the unsigned byte value of
1891 /// each output byte.
1892 ///
1893 /// If the input does not fulfill the condition stated above, this function
1894 /// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
1895 /// does something that is memory-safe without any promises about any
1896 /// properties of the output. In particular, callers shouldn't assume the
1897 /// output to be the same across crate versions or CPU architectures and
1898 /// should not assume that non-ASCII input can't map to ASCII output.
1899 ///
1900 /// The length of the destination buffer must be at least the length of the
1901 /// source buffer.
1902 ///
1903 /// Returns the number of bytes written.
1904 ///
1905 /// # Panics
1906 ///
1907 /// Panics if the destination buffer is shorter than stated above.
1908 ///
1909 /// If debug assertions are enabled (and not fuzzing) and the input is
1910 /// not in the range U+0000 to U+00FF, inclusive.
convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize1911 pub fn convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize {
1912     assert!(
1913         dst.len() >= src.len(),
1914         "Destination must not be shorter than the source."
1915     );
1916     non_fuzz_debug_assert!(is_utf8_latin1(src));
1917     let src_len = src.len();
1918     let src_ptr = src.as_ptr();
1919     let dst_ptr = dst.as_mut_ptr();
1920     let mut total_read = 0usize;
1921     let mut total_written = 0usize;
1922     loop {
1923         // dst can't advance more than src
1924         let src_left = src_len - total_read;
1925         if let Some((non_ascii, consumed)) = unsafe {
1926             ascii_to_ascii(
1927                 src_ptr.add(total_read),
1928                 dst_ptr.add(total_written),
1929                 src_left,
1930             )
1931         } {
1932             total_read += consumed + 1;
1933             total_written += consumed;
1934 
1935             if total_read == src_len {
1936                 return total_written;
1937             }
1938 
1939             let trail = src[total_read];
1940             total_read += 1;
1941 
1942             dst[total_written] = ((non_ascii & 0x1F) << 6) | (trail & 0x3F);
1943             total_written += 1;
1944             continue;
1945         }
1946         return total_written + src_left;
1947     }
1948 }
1949 
/// If the input is valid UTF-16 representing only Unicode code points from
/// U+0000 to U+00FF, inclusive, converts the input into output that
/// represents the value of each code point as the unsigned byte value of
/// each output byte.
///
/// If the input does not fulfill the condition stated above, does something
/// that is memory-safe without any promises about any properties of the
/// output and will probably assert in debug builds in future versions.
/// In particular, callers shouldn't assume the output to be the same across
/// crate versions or CPU architectures and should not assume that non-ASCII
/// input can't map to ASCII output.
///
/// The length of the destination buffer must be at least the length of the
/// source buffer.
///
/// The number of bytes written equals the length of the source buffer.
///
/// # Panics
///
/// Panics if the destination buffer is shorter than stated above.
///
/// (Probably in future versions if debug assertions are enabled (and not
/// fuzzing) and the input is not in the range U+0000 to U+00FF, inclusive.)
pub fn convert_utf16_to_latin1_lossy(src: &[u16], dst: &mut [u8]) {
    assert!(
        dst.len() >= src.len(),
        "Destination must not be shorter than the source."
    );
    // Not enabled yet; this is the debug assertion that the doc comment
    // above announces for future versions:
    // non_fuzz_debug_assert!(is_utf16_latin1(src));
    //
    // SAFETY (as far as visible here): both pointers come from live slices
    // and the assert above guarantees that `dst` holds at least `src.len()`
    // bytes, which is the count passed. NOTE(review): confirm against the
    // contract of `pack_latin1`.
    unsafe {
        pack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len());
    }
}
1983 
1984 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1985 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
1986 ///
1987 /// Borrows if input is ASCII-only. Performs a single heap allocation
1988 /// otherwise.
decode_latin1<'a>(bytes: &'a [u8]) -> Cow<'a, str>1989 pub fn decode_latin1<'a>(bytes: &'a [u8]) -> Cow<'a, str> {
1990     let up_to = ascii_valid_up_to(bytes);
1991     // >= makes later things optimize better than ==
1992     if up_to >= bytes.len() {
1993         debug_assert_eq!(up_to, bytes.len());
1994         let s: &str = unsafe { ::std::str::from_utf8_unchecked(bytes) };
1995         return Cow::Borrowed(s);
1996     }
1997     let (head, tail) = bytes.split_at(up_to);
1998     let capacity = head.len() + tail.len() * 2;
1999     let mut vec = Vec::with_capacity(capacity);
2000     unsafe {
2001         vec.set_len(capacity);
2002     }
2003     (&mut vec[..up_to]).copy_from_slice(head);
2004     let written = convert_latin1_to_utf8(tail, &mut vec[up_to..]);
2005     vec.truncate(up_to + written);
2006     Cow::Owned(unsafe { String::from_utf8_unchecked(vec) })
2007 }
2008 
2009 /// If the input is valid UTF-8 representing only Unicode code points from
2010 /// U+0000 to U+00FF, inclusive, converts the input into output that
2011 /// represents the value of each code point as the unsigned byte value of
2012 /// each output byte.
2013 ///
2014 /// If the input does not fulfill the condition stated above, this function
2015 /// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
2016 /// does something that is memory-safe without any promises about any
2017 /// properties of the output. In particular, callers shouldn't assume the
2018 /// output to be the same across crate versions or CPU architectures and
2019 /// should not assume that non-ASCII input can't map to ASCII output.
2020 ///
2021 /// Borrows if input is ASCII-only. Performs a single heap allocation
2022 /// otherwise.
encode_latin1_lossy<'a>(string: &'a str) -> Cow<'a, [u8]>2023 pub fn encode_latin1_lossy<'a>(string: &'a str) -> Cow<'a, [u8]> {
2024     let bytes = string.as_bytes();
2025     let up_to = ascii_valid_up_to(bytes);
2026     // >= makes later things optimize better than ==
2027     if up_to >= bytes.len() {
2028         debug_assert_eq!(up_to, bytes.len());
2029         return Cow::Borrowed(bytes);
2030     }
2031     let (head, tail) = bytes.split_at(up_to);
2032     let capacity = bytes.len();
2033     let mut vec = Vec::with_capacity(capacity);
2034     unsafe {
2035         vec.set_len(capacity);
2036     }
2037     (&mut vec[..up_to]).copy_from_slice(head);
2038     let written = convert_utf8_to_latin1_lossy(tail, &mut vec[up_to..]);
2039     vec.truncate(up_to + written);
2040     Cow::Owned(vec)
2041 }
2042 
/// Returns the index of the first unpaired surrogate or, if the input is
/// valid UTF-16 in its entirety, the length of the input.
pub fn utf16_valid_up_to(buffer: &[u16]) -> usize {
    // This public entry point only forwards to `utf16_valid_up_to_impl`,
    // which does the actual scan.
    utf16_valid_up_to_impl(buffer)
}
2048 
2049 /// Returns the index of first byte that starts an invalid byte
2050 /// sequence or a non-Latin1 byte sequence, or the length of the
2051 /// string if there are neither.
utf8_latin1_up_to(buffer: &[u8]) -> usize2052 pub fn utf8_latin1_up_to(buffer: &[u8]) -> usize {
2053     is_utf8_latin1_impl(buffer).unwrap_or(buffer.len())
2054 }
2055 
2056 /// Returns the index of first byte that starts a non-Latin1 byte
2057 /// sequence, or the length of the string if there are none.
str_latin1_up_to(buffer: &str) -> usize2058 pub fn str_latin1_up_to(buffer: &str) -> usize {
2059     is_str_latin1_impl(buffer).unwrap_or(buffer.len())
2060 }
2061 
2062 /// Replaces unpaired surrogates in the input with the REPLACEMENT CHARACTER.
2063 #[inline]
ensure_utf16_validity(buffer: &mut [u16])2064 pub fn ensure_utf16_validity(buffer: &mut [u16]) {
2065     let mut offset = 0;
2066     loop {
2067         offset += utf16_valid_up_to(&buffer[offset..]);
2068         if offset == buffer.len() {
2069             return;
2070         }
2071         buffer[offset] = 0xFFFD;
2072         offset += 1;
2073     }
2074 }
2075 
2076 /// Copies ASCII from source to destination up to the first non-ASCII byte
2077 /// (or the end of the input if it is ASCII in its entirety).
2078 ///
2079 /// The length of the destination buffer must be at least the length of the
2080 /// source buffer.
2081 ///
2082 /// Returns the number of bytes written.
2083 ///
2084 /// # Panics
2085 ///
2086 /// Panics if the destination buffer is shorter than stated above.
copy_ascii_to_ascii(src: &[u8], dst: &mut [u8]) -> usize2087 pub fn copy_ascii_to_ascii(src: &[u8], dst: &mut [u8]) -> usize {
2088     assert!(
2089         dst.len() >= src.len(),
2090         "Destination must not be shorter than the source."
2091     );
2092     if let Some((_, consumed)) =
2093         unsafe { ascii_to_ascii(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2094     {
2095         consumed
2096     } else {
2097         src.len()
2098     }
2099 }
2100 
2101 /// Copies ASCII from source to destination zero-extending it to UTF-16 up to
2102 /// the first non-ASCII byte (or the end of the input if it is ASCII in its
2103 /// entirety).
2104 ///
2105 /// The length of the destination buffer must be at least the length of the
2106 /// source buffer.
2107 ///
2108 /// Returns the number of `u16`s written.
2109 ///
2110 /// # Panics
2111 ///
2112 /// Panics if the destination buffer is shorter than stated above.
copy_ascii_to_basic_latin(src: &[u8], dst: &mut [u16]) -> usize2113 pub fn copy_ascii_to_basic_latin(src: &[u8], dst: &mut [u16]) -> usize {
2114     assert!(
2115         dst.len() >= src.len(),
2116         "Destination must not be shorter than the source."
2117     );
2118     if let Some((_, consumed)) =
2119         unsafe { ascii_to_basic_latin(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2120     {
2121         consumed
2122     } else {
2123         src.len()
2124     }
2125 }
2126 
2127 /// Copies Basic Latin from source to destination narrowing it to ASCII up to
2128 /// the first non-Basic Latin code unit (or the end of the input if it is
2129 /// Basic Latin in its entirety).
2130 ///
2131 /// The length of the destination buffer must be at least the length of the
2132 /// source buffer.
2133 ///
2134 /// Returns the number of bytes written.
2135 ///
2136 /// # Panics
2137 ///
2138 /// Panics if the destination buffer is shorter than stated above.
copy_basic_latin_to_ascii(src: &[u16], dst: &mut [u8]) -> usize2139 pub fn copy_basic_latin_to_ascii(src: &[u16], dst: &mut [u8]) -> usize {
2140     assert!(
2141         dst.len() >= src.len(),
2142         "Destination must not be shorter than the source."
2143     );
2144     if let Some((_, consumed)) =
2145         unsafe { basic_latin_to_ascii(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2146     {
2147         consumed
2148     } else {
2149         src.len()
2150     }
2151 }
2152 
2153 // Any copyright to the test code below this comment is dedicated to the
2154 // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
2155 
2156 #[cfg(test)]
2157 mod tests {
2158     use super::*;
2159 
2160     #[test]
test_is_ascii_success()2161     fn test_is_ascii_success() {
2162         let mut src: Vec<u8> = Vec::with_capacity(128);
2163         src.resize(128, 0);
2164         for i in 0..src.len() {
2165             src[i] = i as u8;
2166         }
2167         for i in 0..src.len() {
2168             assert!(is_ascii(&src[i..]));
2169         }
2170     }
2171 
2172     #[test]
test_is_ascii_fail()2173     fn test_is_ascii_fail() {
2174         let mut src: Vec<u8> = Vec::with_capacity(128);
2175         src.resize(128, 0);
2176         for i in 0..src.len() {
2177             src[i] = i as u8;
2178         }
2179         for i in 0..src.len() {
2180             let tail = &mut src[i..];
2181             for j in 0..tail.len() {
2182                 tail[j] = 0xA0;
2183                 assert!(!is_ascii(tail));
2184             }
2185         }
2186     }
2187 
2188     #[test]
test_is_basic_latin_success()2189     fn test_is_basic_latin_success() {
2190         let mut src: Vec<u16> = Vec::with_capacity(128);
2191         src.resize(128, 0);
2192         for i in 0..src.len() {
2193             src[i] = i as u16;
2194         }
2195         for i in 0..src.len() {
2196             assert!(is_basic_latin(&src[i..]));
2197         }
2198     }
2199 
2200     #[test]
test_is_basic_latin_fail()2201     fn test_is_basic_latin_fail() {
2202         let mut src: Vec<u16> = Vec::with_capacity(128);
2203         src.resize(128, 0);
2204         for i in 0..src.len() {
2205             src[i] = i as u16;
2206         }
2207         for i in 0..src.len() {
2208             let tail = &mut src[i..];
2209             for j in 0..tail.len() {
2210                 tail[j] = 0xA0;
2211                 assert!(!is_basic_latin(tail));
2212             }
2213         }
2214     }
2215 
2216     #[test]
test_is_utf16_latin1_success()2217     fn test_is_utf16_latin1_success() {
2218         let mut src: Vec<u16> = Vec::with_capacity(256);
2219         src.resize(256, 0);
2220         for i in 0..src.len() {
2221             src[i] = i as u16;
2222         }
2223         for i in 0..src.len() {
2224             assert!(is_utf16_latin1(&src[i..]));
2225             assert_eq!(
2226                 check_utf16_for_latin1_and_bidi(&src[i..]),
2227                 Latin1Bidi::Latin1
2228             );
2229         }
2230     }
2231 
2232     #[test]
test_is_utf16_latin1_fail()2233     fn test_is_utf16_latin1_fail() {
2234         let mut src: Vec<u16> = Vec::with_capacity(256);
2235         src.resize(256, 0);
2236         for i in 0..src.len() {
2237             src[i] = i as u16;
2238         }
2239         for i in 0..src.len() {
2240             let tail = &mut src[i..];
2241             for j in 0..tail.len() {
2242                 tail[j] = 0x100 + j as u16;
2243                 assert!(!is_utf16_latin1(tail));
2244                 assert_ne!(check_utf16_for_latin1_and_bidi(tail), Latin1Bidi::Latin1);
2245             }
2246         }
2247     }
2248 
2249     #[test]
test_is_str_latin1_success()2250     fn test_is_str_latin1_success() {
2251         let mut src: Vec<u16> = Vec::with_capacity(256);
2252         src.resize(256, 0);
2253         for i in 0..src.len() {
2254             src[i] = i as u16;
2255         }
2256         for i in 0..src.len() {
2257             let s = String::from_utf16(&src[i..]).unwrap();
2258             assert!(is_str_latin1(&s[..]));
2259             assert_eq!(check_str_for_latin1_and_bidi(&s[..]), Latin1Bidi::Latin1);
2260         }
2261     }
2262 
2263     #[test]
test_is_str_latin1_fail()2264     fn test_is_str_latin1_fail() {
2265         let mut src: Vec<u16> = Vec::with_capacity(256);
2266         src.resize(256, 0);
2267         for i in 0..src.len() {
2268             src[i] = i as u16;
2269         }
2270         for i in 0..src.len() {
2271             let tail = &mut src[i..];
2272             for j in 0..tail.len() {
2273                 tail[j] = 0x100 + j as u16;
2274                 let s = String::from_utf16(tail).unwrap();
2275                 assert!(!is_str_latin1(&s[..]));
2276                 assert_ne!(check_str_for_latin1_and_bidi(&s[..]), Latin1Bidi::Latin1);
2277             }
2278         }
2279     }
2280 
2281     #[test]
test_is_utf8_latin1_success()2282     fn test_is_utf8_latin1_success() {
2283         let mut src: Vec<u16> = Vec::with_capacity(256);
2284         src.resize(256, 0);
2285         for i in 0..src.len() {
2286             src[i] = i as u16;
2287         }
2288         for i in 0..src.len() {
2289             let s = String::from_utf16(&src[i..]).unwrap();
2290             assert!(is_utf8_latin1(s.as_bytes()));
2291             assert_eq!(
2292                 check_utf8_for_latin1_and_bidi(s.as_bytes()),
2293                 Latin1Bidi::Latin1
2294             );
2295         }
2296     }
2297 
2298     #[test]
test_is_utf8_latin1_fail()2299     fn test_is_utf8_latin1_fail() {
2300         let mut src: Vec<u16> = Vec::with_capacity(256);
2301         src.resize(256, 0);
2302         for i in 0..src.len() {
2303             src[i] = i as u16;
2304         }
2305         for i in 0..src.len() {
2306             let tail = &mut src[i..];
2307             for j in 0..tail.len() {
2308                 tail[j] = 0x100 + j as u16;
2309                 let s = String::from_utf16(tail).unwrap();
2310                 assert!(!is_utf8_latin1(s.as_bytes()));
2311                 assert_ne!(
2312                     check_utf8_for_latin1_and_bidi(s.as_bytes()),
2313                     Latin1Bidi::Latin1
2314                 );
2315             }
2316         }
2317     }
2318 
2319     #[test]
test_is_utf8_latin1_invalid()2320     fn test_is_utf8_latin1_invalid() {
2321         assert!(!is_utf8_latin1(b"\xC3"));
2322         assert!(!is_utf8_latin1(b"a\xC3"));
2323         assert!(!is_utf8_latin1(b"\xFF"));
2324         assert!(!is_utf8_latin1(b"a\xFF"));
2325         assert!(!is_utf8_latin1(b"\xC3\xFF"));
2326         assert!(!is_utf8_latin1(b"a\xC3\xFF"));
2327     }
2328 
2329     #[test]
test_convert_utf8_to_utf16()2330     fn test_convert_utf8_to_utf16() {
2331         let src = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
2332         let mut dst: Vec<u16> = Vec::with_capacity(src.len() + 1);
2333         dst.resize(src.len() + 1, 0);
2334         let len = convert_utf8_to_utf16(src.as_bytes(), &mut dst[..]);
2335         dst.truncate(len);
2336         let reference: Vec<u16> = src.encode_utf16().collect();
2337         assert_eq!(dst, reference);
2338     }
2339 
2340     #[test]
test_convert_str_to_utf16()2341     fn test_convert_str_to_utf16() {
2342         let src = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
2343         let mut dst: Vec<u16> = Vec::with_capacity(src.len());
2344         dst.resize(src.len(), 0);
2345         let len = convert_str_to_utf16(src, &mut dst[..]);
2346         dst.truncate(len);
2347         let reference: Vec<u16> = src.encode_utf16().collect();
2348         assert_eq!(dst, reference);
2349     }
2350 
2351     #[test]
test_convert_utf16_to_utf8_partial()2352     fn test_convert_utf16_to_utf8_partial() {
2353         let reference = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
2354         let src: Vec<u16> = reference.encode_utf16().collect();
2355         let mut dst: Vec<u8> = Vec::with_capacity(src.len() * 3 + 1);
2356         dst.resize(src.len() * 3 + 1, 0);
2357         let (read, written) = convert_utf16_to_utf8_partial(&src[..], &mut dst[..24]);
2358         let len = written + convert_utf16_to_utf8(&src[read..], &mut dst[written..]);
2359         dst.truncate(len);
2360         assert_eq!(dst, reference.as_bytes());
2361     }
2362 
2363     #[test]
test_convert_utf16_to_utf8()2364     fn test_convert_utf16_to_utf8() {
2365         let reference = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
2366         let src: Vec<u16> = reference.encode_utf16().collect();
2367         let mut dst: Vec<u8> = Vec::with_capacity(src.len() * 3 + 1);
2368         dst.resize(src.len() * 3 + 1, 0);
2369         let len = convert_utf16_to_utf8(&src[..], &mut dst[..]);
2370         dst.truncate(len);
2371         assert_eq!(dst, reference.as_bytes());
2372     }
2373 
2374     #[test]
test_convert_latin1_to_utf16()2375     fn test_convert_latin1_to_utf16() {
2376         let mut src: Vec<u8> = Vec::with_capacity(256);
2377         src.resize(256, 0);
2378         let mut reference: Vec<u16> = Vec::with_capacity(256);
2379         reference.resize(256, 0);
2380         for i in 0..256 {
2381             src[i] = i as u8;
2382             reference[i] = i as u16;
2383         }
2384         let mut dst: Vec<u16> = Vec::with_capacity(src.len());
2385         dst.resize(src.len(), 0);
2386         convert_latin1_to_utf16(&src[..], &mut dst[..]);
2387         assert_eq!(dst, reference);
2388     }
2389 
2390     #[test]
test_convert_latin1_to_utf8_partial()2391     fn test_convert_latin1_to_utf8_partial() {
2392         let mut dst = [0u8, 2];
2393         let (read, written) = convert_latin1_to_utf8_partial(b"a\xFF", &mut dst[..]);
2394         assert_eq!(read, 1);
2395         assert_eq!(written, 1);
2396     }
2397 
2398     #[test]
test_convert_latin1_to_utf8()2399     fn test_convert_latin1_to_utf8() {
2400         let mut src: Vec<u8> = Vec::with_capacity(256);
2401         src.resize(256, 0);
2402         let mut reference: Vec<u16> = Vec::with_capacity(256);
2403         reference.resize(256, 0);
2404         for i in 0..256 {
2405             src[i] = i as u8;
2406             reference[i] = i as u16;
2407         }
2408         let s = String::from_utf16(&reference[..]).unwrap();
2409         let mut dst: Vec<u8> = Vec::with_capacity(src.len() * 2);
2410         dst.resize(src.len() * 2, 0);
2411         let len = convert_latin1_to_utf8(&src[..], &mut dst[..]);
2412         dst.truncate(len);
2413         assert_eq!(&dst[..], s.as_bytes());
2414     }
2415 
2416     #[test]
test_convert_utf8_to_latin1_lossy()2417     fn test_convert_utf8_to_latin1_lossy() {
2418         let mut reference: Vec<u8> = Vec::with_capacity(256);
2419         reference.resize(256, 0);
2420         let mut src16: Vec<u16> = Vec::with_capacity(256);
2421         src16.resize(256, 0);
2422         for i in 0..256 {
2423             src16[i] = i as u16;
2424             reference[i] = i as u8;
2425         }
2426         let src = String::from_utf16(&src16[..]).unwrap();
2427         let mut dst: Vec<u8> = Vec::with_capacity(src.len());
2428         dst.resize(src.len(), 0);
2429         let len = convert_utf8_to_latin1_lossy(src.as_bytes(), &mut dst[..]);
2430         dst.truncate(len);
2431         assert_eq!(dst, reference);
2432     }
2433 
2434     #[cfg(all(debug_assertions, not(fuzzing)))]
2435     #[test]
2436     #[should_panic]
test_convert_utf8_to_latin1_lossy_panics()2437     fn test_convert_utf8_to_latin1_lossy_panics() {
2438         let mut dst = [0u8; 16];
2439         let _ = convert_utf8_to_latin1_lossy("\u{100}".as_bytes(), &mut dst[..]);
2440     }
2441 
2442     #[test]
test_convert_utf16_to_latin1_lossy()2443     fn test_convert_utf16_to_latin1_lossy() {
2444         let mut src: Vec<u16> = Vec::with_capacity(256);
2445         src.resize(256, 0);
2446         let mut reference: Vec<u8> = Vec::with_capacity(256);
2447         reference.resize(256, 0);
2448         for i in 0..256 {
2449             src[i] = i as u16;
2450             reference[i] = i as u8;
2451         }
2452         let mut dst: Vec<u8> = Vec::with_capacity(src.len());
2453         dst.resize(src.len(), 0);
2454         convert_utf16_to_latin1_lossy(&src[..], &mut dst[..]);
2455         assert_eq!(dst, reference);
2456     }
2457 
2458     #[test]
2459     // #[should_panic]
test_convert_utf16_to_latin1_lossy_panics()2460     fn test_convert_utf16_to_latin1_lossy_panics() {
2461         let mut dst = [0u8; 16];
2462         let _ = convert_utf16_to_latin1_lossy(&[0x0100u16], &mut dst[..]);
2463     }
2464 
2465     #[test]
test_utf16_valid_up_to()2466     fn test_utf16_valid_up_to() {
2467         let valid = vec![
2468             0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0x2603u16,
2469             0xD83Du16, 0xDCA9u16, 0x00B6u16,
2470         ];
2471         assert_eq!(utf16_valid_up_to(&valid[..]), 16);
2472         let lone_high = vec![
2473             0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2474             0x2603u16, 0xD83Du16, 0x00B6u16,
2475         ];
2476         assert_eq!(utf16_valid_up_to(&lone_high[..]), 14);
2477         let lone_low = vec![
2478             0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2479             0x2603u16, 0xDCA9u16, 0x00B6u16,
2480         ];
2481         assert_eq!(utf16_valid_up_to(&lone_low[..]), 14);
2482         let lone_high_at_end = vec![
2483             0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2484             0x2603u16, 0x00B6u16, 0xD83Du16,
2485         ];
2486         assert_eq!(utf16_valid_up_to(&lone_high_at_end[..]), 15);
2487     }
2488 
2489     #[test]
test_ensure_utf16_validity()2490     fn test_ensure_utf16_validity() {
2491         let mut src = vec![
2492             0u16, 0xD83Du16, 0u16, 0u16, 0u16, 0xD83Du16, 0xDCA9u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2493             0u16, 0xDCA9u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2494             0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2495         ];
2496         let reference = vec![
2497             0u16, 0xFFFDu16, 0u16, 0u16, 0u16, 0xD83Du16, 0xDCA9u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2498             0u16, 0xFFFDu16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2499             0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2500         ];
2501         ensure_utf16_validity(&mut src[..]);
2502         assert_eq!(src, reference);
2503     }
2504 
2505     #[test]
test_is_char_bidi()2506     fn test_is_char_bidi() {
2507         assert!(!is_char_bidi('a'));
2508         assert!(!is_char_bidi('\u{03B1}'));
2509         assert!(!is_char_bidi('\u{3041}'));
2510         assert!(!is_char_bidi('\u{1F4A9}'));
2511         assert!(!is_char_bidi('\u{FE00}'));
2512         assert!(!is_char_bidi('\u{202C}'));
2513         assert!(!is_char_bidi('\u{FEFF}'));
2514         assert!(is_char_bidi('\u{0590}'));
2515         assert!(is_char_bidi('\u{08FF}'));
2516         assert!(is_char_bidi('\u{061C}'));
2517         assert!(is_char_bidi('\u{FB50}'));
2518         assert!(is_char_bidi('\u{FDFF}'));
2519         assert!(is_char_bidi('\u{FE70}'));
2520         assert!(is_char_bidi('\u{FEFE}'));
2521         assert!(is_char_bidi('\u{200F}'));
2522         assert!(is_char_bidi('\u{202B}'));
2523         assert!(is_char_bidi('\u{202E}'));
2524         assert!(is_char_bidi('\u{2067}'));
2525         assert!(is_char_bidi('\u{10800}'));
2526         assert!(is_char_bidi('\u{10FFF}'));
2527         assert!(is_char_bidi('\u{1E800}'));
2528         assert!(is_char_bidi('\u{1EFFF}'));
2529     }
2530 
2531     #[test]
test_is_utf16_code_unit_bidi()2532     fn test_is_utf16_code_unit_bidi() {
2533         assert!(!is_utf16_code_unit_bidi(0x0062));
2534         assert!(!is_utf16_code_unit_bidi(0x03B1));
2535         assert!(!is_utf16_code_unit_bidi(0x3041));
2536         assert!(!is_utf16_code_unit_bidi(0xD801));
2537         assert!(!is_utf16_code_unit_bidi(0xFE00));
2538         assert!(!is_utf16_code_unit_bidi(0x202C));
2539         assert!(!is_utf16_code_unit_bidi(0xFEFF));
2540         assert!(is_utf16_code_unit_bidi(0x0590));
2541         assert!(is_utf16_code_unit_bidi(0x08FF));
2542         assert!(is_utf16_code_unit_bidi(0x061C));
2543         assert!(is_utf16_code_unit_bidi(0xFB1D));
2544         assert!(is_utf16_code_unit_bidi(0xFB50));
2545         assert!(is_utf16_code_unit_bidi(0xFDFF));
2546         assert!(is_utf16_code_unit_bidi(0xFE70));
2547         assert!(is_utf16_code_unit_bidi(0xFEFE));
2548         assert!(is_utf16_code_unit_bidi(0x200F));
2549         assert!(is_utf16_code_unit_bidi(0x202B));
2550         assert!(is_utf16_code_unit_bidi(0x202E));
2551         assert!(is_utf16_code_unit_bidi(0x2067));
2552         assert!(is_utf16_code_unit_bidi(0xD802));
2553         assert!(is_utf16_code_unit_bidi(0xD803));
2554         assert!(is_utf16_code_unit_bidi(0xD83A));
2555         assert!(is_utf16_code_unit_bidi(0xD83B));
2556     }
2557 
2558     #[test]
test_is_str_bidi()2559     fn test_is_str_bidi() {
2560         assert!(!is_str_bidi("abcdefghijklmnopaabcdefghijklmnop"));
2561         assert!(!is_str_bidi("abcdefghijklmnop\u{03B1}abcdefghijklmnop"));
2562         assert!(!is_str_bidi("abcdefghijklmnop\u{3041}abcdefghijklmnop"));
2563         assert!(!is_str_bidi("abcdefghijklmnop\u{1F4A9}abcdefghijklmnop"));
2564         assert!(!is_str_bidi("abcdefghijklmnop\u{FE00}abcdefghijklmnop"));
2565         assert!(!is_str_bidi("abcdefghijklmnop\u{202C}abcdefghijklmnop"));
2566         assert!(!is_str_bidi("abcdefghijklmnop\u{FEFF}abcdefghijklmnop"));
2567         assert!(is_str_bidi("abcdefghijklmnop\u{0590}abcdefghijklmnop"));
2568         assert!(is_str_bidi("abcdefghijklmnop\u{08FF}abcdefghijklmnop"));
2569         assert!(is_str_bidi("abcdefghijklmnop\u{061C}abcdefghijklmnop"));
2570         assert!(is_str_bidi("abcdefghijklmnop\u{FB50}abcdefghijklmnop"));
2571         assert!(is_str_bidi("abcdefghijklmnop\u{FDFF}abcdefghijklmnop"));
2572         assert!(is_str_bidi("abcdefghijklmnop\u{FE70}abcdefghijklmnop"));
2573         assert!(is_str_bidi("abcdefghijklmnop\u{FEFE}abcdefghijklmnop"));
2574         assert!(is_str_bidi("abcdefghijklmnop\u{200F}abcdefghijklmnop"));
2575         assert!(is_str_bidi("abcdefghijklmnop\u{202B}abcdefghijklmnop"));
2576         assert!(is_str_bidi("abcdefghijklmnop\u{202E}abcdefghijklmnop"));
2577         assert!(is_str_bidi("abcdefghijklmnop\u{2067}abcdefghijklmnop"));
2578         assert!(is_str_bidi("abcdefghijklmnop\u{10800}abcdefghijklmnop"));
2579         assert!(is_str_bidi("abcdefghijklmnop\u{10FFF}abcdefghijklmnop"));
2580         assert!(is_str_bidi("abcdefghijklmnop\u{1E800}abcdefghijklmnop"));
2581         assert!(is_str_bidi("abcdefghijklmnop\u{1EFFF}abcdefghijklmnop"));
2582     }
2583 
2584     #[test]
test_is_utf8_bidi()2585     fn test_is_utf8_bidi() {
2586         assert!(!is_utf8_bidi(
2587             "abcdefghijklmnopaabcdefghijklmnop".as_bytes()
2588         ));
2589         assert!(!is_utf8_bidi(
2590             "abcdefghijklmnop\u{03B1}abcdefghijklmnop".as_bytes()
2591         ));
2592         assert!(!is_utf8_bidi(
2593             "abcdefghijklmnop\u{3041}abcdefghijklmnop".as_bytes()
2594         ));
2595         assert!(!is_utf8_bidi(
2596             "abcdefghijklmnop\u{1F4A9}abcdefghijklmnop".as_bytes()
2597         ));
2598         assert!(!is_utf8_bidi(
2599             "abcdefghijklmnop\u{FE00}abcdefghijklmnop".as_bytes()
2600         ));
2601         assert!(!is_utf8_bidi(
2602             "abcdefghijklmnop\u{202C}abcdefghijklmnop".as_bytes()
2603         ));
2604         assert!(!is_utf8_bidi(
2605             "abcdefghijklmnop\u{FEFF}abcdefghijklmnop".as_bytes()
2606         ));
2607         assert!(is_utf8_bidi(
2608             "abcdefghijklmnop\u{0590}abcdefghijklmnop".as_bytes()
2609         ));
2610         assert!(is_utf8_bidi(
2611             "abcdefghijklmnop\u{08FF}abcdefghijklmnop".as_bytes()
2612         ));
2613         assert!(is_utf8_bidi(
2614             "abcdefghijklmnop\u{061C}abcdefghijklmnop".as_bytes()
2615         ));
2616         assert!(is_utf8_bidi(
2617             "abcdefghijklmnop\u{FB50}abcdefghijklmnop".as_bytes()
2618         ));
2619         assert!(is_utf8_bidi(
2620             "abcdefghijklmnop\u{FDFF}abcdefghijklmnop".as_bytes()
2621         ));
2622         assert!(is_utf8_bidi(
2623             "abcdefghijklmnop\u{FE70}abcdefghijklmnop".as_bytes()
2624         ));
2625         assert!(is_utf8_bidi(
2626             "abcdefghijklmnop\u{FEFE}abcdefghijklmnop".as_bytes()
2627         ));
2628         assert!(is_utf8_bidi(
2629             "abcdefghijklmnop\u{200F}abcdefghijklmnop".as_bytes()
2630         ));
2631         assert!(is_utf8_bidi(
2632             "abcdefghijklmnop\u{202B}abcdefghijklmnop".as_bytes()
2633         ));
2634         assert!(is_utf8_bidi(
2635             "abcdefghijklmnop\u{202E}abcdefghijklmnop".as_bytes()
2636         ));
2637         assert!(is_utf8_bidi(
2638             "abcdefghijklmnop\u{2067}abcdefghijklmnop".as_bytes()
2639         ));
2640         assert!(is_utf8_bidi(
2641             "abcdefghijklmnop\u{10800}abcdefghijklmnop".as_bytes()
2642         ));
2643         assert!(is_utf8_bidi(
2644             "abcdefghijklmnop\u{10FFF}abcdefghijklmnop".as_bytes()
2645         ));
2646         assert!(is_utf8_bidi(
2647             "abcdefghijklmnop\u{1E800}abcdefghijklmnop".as_bytes()
2648         ));
2649         assert!(is_utf8_bidi(
2650             "abcdefghijklmnop\u{1EFFF}abcdefghijklmnop".as_bytes()
2651         ));
2652     }
2653 
2654     #[test]
test_is_utf16_bidi()2655     fn test_is_utf16_bidi() {
2656         assert!(!is_utf16_bidi(&[
2657             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0062, 0x62, 0x63, 0x64, 0x65, 0x66,
2658             0x67, 0x68, 0x69,
2659         ]));
2660         assert!(!is_utf16_bidi(&[
2661             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x03B1, 0x62, 0x63, 0x64, 0x65, 0x66,
2662             0x67, 0x68, 0x69,
2663         ]));
2664         assert!(!is_utf16_bidi(&[
2665             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x3041, 0x62, 0x63, 0x64, 0x65, 0x66,
2666             0x67, 0x68, 0x69,
2667         ]));
2668         assert!(!is_utf16_bidi(&[
2669             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD801, 0x62, 0x63, 0x64, 0x65, 0x66,
2670             0x67, 0x68, 0x69,
2671         ]));
2672         assert!(!is_utf16_bidi(&[
2673             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE00, 0x62, 0x63, 0x64, 0x65, 0x66,
2674             0x67, 0x68, 0x69,
2675         ]));
2676         assert!(!is_utf16_bidi(&[
2677             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202C, 0x62, 0x63, 0x64, 0x65, 0x66,
2678             0x67, 0x68, 0x69,
2679         ]));
2680         assert!(!is_utf16_bidi(&[
2681             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFF, 0x62, 0x63, 0x64, 0x65, 0x66,
2682             0x67, 0x68, 0x69,
2683         ]));
2684         assert!(is_utf16_bidi(&[
2685             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x62, 0x63, 0x64, 0x65, 0x66,
2686             0x67, 0x68, 0x69,
2687         ]));
2688         assert!(is_utf16_bidi(&[
2689             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x08FF, 0x62, 0x63, 0x64, 0x65, 0x66,
2690             0x67, 0x68, 0x69,
2691         ]));
2692         assert!(is_utf16_bidi(&[
2693             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x061C, 0x62, 0x63, 0x64, 0x65, 0x66,
2694             0x67, 0x68, 0x69,
2695         ]));
2696         assert!(is_utf16_bidi(&[
2697             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB1D, 0x62, 0x63, 0x64, 0x65, 0x66,
2698             0x67, 0x68, 0x69,
2699         ]));
2700         assert!(is_utf16_bidi(&[
2701             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB50, 0x62, 0x63, 0x64, 0x65, 0x66,
2702             0x67, 0x68, 0x69,
2703         ]));
2704         assert!(is_utf16_bidi(&[
2705             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFDFF, 0x62, 0x63, 0x64, 0x65, 0x66,
2706             0x67, 0x68, 0x69,
2707         ]));
2708         assert!(is_utf16_bidi(&[
2709             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE70, 0x62, 0x63, 0x64, 0x65, 0x66,
2710             0x67, 0x68, 0x69,
2711         ]));
2712         assert!(is_utf16_bidi(&[
2713             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFE, 0x62, 0x63, 0x64, 0x65, 0x66,
2714             0x67, 0x68, 0x69,
2715         ]));
2716         assert!(is_utf16_bidi(&[
2717             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x200F, 0x62, 0x63, 0x64, 0x65, 0x66,
2718             0x67, 0x68, 0x69,
2719         ]));
2720         assert!(is_utf16_bidi(&[
2721             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202B, 0x62, 0x63, 0x64, 0x65, 0x66,
2722             0x67, 0x68, 0x69,
2723         ]));
2724         assert!(is_utf16_bidi(&[
2725             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202E, 0x62, 0x63, 0x64, 0x65, 0x66,
2726             0x67, 0x68, 0x69,
2727         ]));
2728         assert!(is_utf16_bidi(&[
2729             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x2067, 0x62, 0x63, 0x64, 0x65, 0x66,
2730             0x67, 0x68, 0x69,
2731         ]));
2732         assert!(is_utf16_bidi(&[
2733             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD802, 0x62, 0x63, 0x64, 0x65, 0x66,
2734             0x67, 0x68, 0x69,
2735         ]));
2736         assert!(is_utf16_bidi(&[
2737             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD803, 0x62, 0x63, 0x64, 0x65, 0x66,
2738             0x67, 0x68, 0x69,
2739         ]));
2740         assert!(is_utf16_bidi(&[
2741             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83A, 0x62, 0x63, 0x64, 0x65, 0x66,
2742             0x67, 0x68, 0x69,
2743         ]));
2744         assert!(is_utf16_bidi(&[
2745             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83B, 0x62, 0x63, 0x64, 0x65, 0x66,
2746             0x67, 0x68, 0x69,
2747         ]));
2748 
2749         assert!(is_utf16_bidi(&[
2750             0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, 0x64, 0x65,
2751             0x66, 0x67, 0x68, 0x69,
2752         ]));
2753     }
2754 
2755     #[test]
test_check_str_for_latin1_and_bidi()2756     fn test_check_str_for_latin1_and_bidi() {
2757         assert_ne!(
2758             check_str_for_latin1_and_bidi("abcdefghijklmnopaabcdefghijklmnop"),
2759             Latin1Bidi::Bidi
2760         );
2761         assert_ne!(
2762             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{03B1}abcdefghijklmnop"),
2763             Latin1Bidi::Bidi
2764         );
2765         assert_ne!(
2766             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{3041}abcdefghijklmnop"),
2767             Latin1Bidi::Bidi
2768         );
2769         assert_ne!(
2770             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{1F4A9}abcdefghijklmnop"),
2771             Latin1Bidi::Bidi
2772         );
2773         assert_ne!(
2774             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FE00}abcdefghijklmnop"),
2775             Latin1Bidi::Bidi
2776         );
2777         assert_ne!(
2778             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{202C}abcdefghijklmnop"),
2779             Latin1Bidi::Bidi
2780         );
2781         assert_ne!(
2782             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FEFF}abcdefghijklmnop"),
2783             Latin1Bidi::Bidi
2784         );
2785         assert_eq!(
2786             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{0590}abcdefghijklmnop"),
2787             Latin1Bidi::Bidi
2788         );
2789         assert_eq!(
2790             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{08FF}abcdefghijklmnop"),
2791             Latin1Bidi::Bidi
2792         );
2793         assert_eq!(
2794             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{061C}abcdefghijklmnop"),
2795             Latin1Bidi::Bidi
2796         );
2797         assert_eq!(
2798             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FB50}abcdefghijklmnop"),
2799             Latin1Bidi::Bidi
2800         );
2801         assert_eq!(
2802             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FDFF}abcdefghijklmnop"),
2803             Latin1Bidi::Bidi
2804         );
2805         assert_eq!(
2806             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FE70}abcdefghijklmnop"),
2807             Latin1Bidi::Bidi
2808         );
2809         assert_eq!(
2810             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FEFE}abcdefghijklmnop"),
2811             Latin1Bidi::Bidi
2812         );
2813         assert_eq!(
2814             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{200F}abcdefghijklmnop"),
2815             Latin1Bidi::Bidi
2816         );
2817         assert_eq!(
2818             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{202B}abcdefghijklmnop"),
2819             Latin1Bidi::Bidi
2820         );
2821         assert_eq!(
2822             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{202E}abcdefghijklmnop"),
2823             Latin1Bidi::Bidi
2824         );
2825         assert_eq!(
2826             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{2067}abcdefghijklmnop"),
2827             Latin1Bidi::Bidi
2828         );
2829         assert_eq!(
2830             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{10800}abcdefghijklmnop"),
2831             Latin1Bidi::Bidi
2832         );
2833         assert_eq!(
2834             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{10FFF}abcdefghijklmnop"),
2835             Latin1Bidi::Bidi
2836         );
2837         assert_eq!(
2838             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{1E800}abcdefghijklmnop"),
2839             Latin1Bidi::Bidi
2840         );
2841         assert_eq!(
2842             check_str_for_latin1_and_bidi("abcdefghijklmnop\u{1EFFF}abcdefghijklmnop"),
2843             Latin1Bidi::Bidi
2844         );
2845     }
2846 
2847     #[test]
test_check_utf8_for_latin1_and_bidi()2848     fn test_check_utf8_for_latin1_and_bidi() {
2849         assert_ne!(
2850             check_utf8_for_latin1_and_bidi("abcdefghijklmnopaabcdefghijklmnop".as_bytes()),
2851             Latin1Bidi::Bidi
2852         );
2853         assert_ne!(
2854             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{03B1}abcdefghijklmnop".as_bytes()),
2855             Latin1Bidi::Bidi
2856         );
2857         assert_ne!(
2858             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{3041}abcdefghijklmnop".as_bytes()),
2859             Latin1Bidi::Bidi
2860         );
2861         assert_ne!(
2862             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{1F4A9}abcdefghijklmnop".as_bytes()),
2863             Latin1Bidi::Bidi
2864         );
2865         assert_ne!(
2866             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FE00}abcdefghijklmnop".as_bytes()),
2867             Latin1Bidi::Bidi
2868         );
2869         assert_ne!(
2870             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{202C}abcdefghijklmnop".as_bytes()),
2871             Latin1Bidi::Bidi
2872         );
2873         assert_ne!(
2874             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FEFF}abcdefghijklmnop".as_bytes()),
2875             Latin1Bidi::Bidi
2876         );
2877         assert_eq!(
2878             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{0590}abcdefghijklmnop".as_bytes()),
2879             Latin1Bidi::Bidi
2880         );
2881         assert_eq!(
2882             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{08FF}abcdefghijklmnop".as_bytes()),
2883             Latin1Bidi::Bidi
2884         );
2885         assert_eq!(
2886             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{061C}abcdefghijklmnop".as_bytes()),
2887             Latin1Bidi::Bidi
2888         );
2889         assert_eq!(
2890             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FB50}abcdefghijklmnop".as_bytes()),
2891             Latin1Bidi::Bidi
2892         );
2893         assert_eq!(
2894             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FDFF}abcdefghijklmnop".as_bytes()),
2895             Latin1Bidi::Bidi
2896         );
2897         assert_eq!(
2898             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FE70}abcdefghijklmnop".as_bytes()),
2899             Latin1Bidi::Bidi
2900         );
2901         assert_eq!(
2902             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FEFE}abcdefghijklmnop".as_bytes()),
2903             Latin1Bidi::Bidi
2904         );
2905         assert_eq!(
2906             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{200F}abcdefghijklmnop".as_bytes()),
2907             Latin1Bidi::Bidi
2908         );
2909         assert_eq!(
2910             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{202B}abcdefghijklmnop".as_bytes()),
2911             Latin1Bidi::Bidi
2912         );
2913         assert_eq!(
2914             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{202E}abcdefghijklmnop".as_bytes()),
2915             Latin1Bidi::Bidi
2916         );
2917         assert_eq!(
2918             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{2067}abcdefghijklmnop".as_bytes()),
2919             Latin1Bidi::Bidi
2920         );
2921         assert_eq!(
2922             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{10800}abcdefghijklmnop".as_bytes()),
2923             Latin1Bidi::Bidi
2924         );
2925         assert_eq!(
2926             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{10FFF}abcdefghijklmnop".as_bytes()),
2927             Latin1Bidi::Bidi
2928         );
2929         assert_eq!(
2930             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{1E800}abcdefghijklmnop".as_bytes()),
2931             Latin1Bidi::Bidi
2932         );
2933         assert_eq!(
2934             check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{1EFFF}abcdefghijklmnop".as_bytes()),
2935             Latin1Bidi::Bidi
2936         );
2937     }
2938 
2939     #[test]
test_check_utf16_for_latin1_and_bidi()2940     fn test_check_utf16_for_latin1_and_bidi() {
2941         assert_ne!(
2942             check_utf16_for_latin1_and_bidi(&[
2943                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0062, 0x62, 0x63, 0x64, 0x65,
2944                 0x66, 0x67, 0x68, 0x69,
2945             ]),
2946             Latin1Bidi::Bidi
2947         );
2948         assert_ne!(
2949             check_utf16_for_latin1_and_bidi(&[
2950                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x03B1, 0x62, 0x63, 0x64, 0x65,
2951                 0x66, 0x67, 0x68, 0x69,
2952             ]),
2953             Latin1Bidi::Bidi
2954         );
2955         assert_ne!(
2956             check_utf16_for_latin1_and_bidi(&[
2957                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x3041, 0x62, 0x63, 0x64, 0x65,
2958                 0x66, 0x67, 0x68, 0x69,
2959             ]),
2960             Latin1Bidi::Bidi
2961         );
2962         assert_ne!(
2963             check_utf16_for_latin1_and_bidi(&[
2964                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD801, 0x62, 0x63, 0x64, 0x65,
2965                 0x66, 0x67, 0x68, 0x69,
2966             ]),
2967             Latin1Bidi::Bidi
2968         );
2969         assert_ne!(
2970             check_utf16_for_latin1_and_bidi(&[
2971                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE00, 0x62, 0x63, 0x64, 0x65,
2972                 0x66, 0x67, 0x68, 0x69,
2973             ]),
2974             Latin1Bidi::Bidi
2975         );
2976         assert_ne!(
2977             check_utf16_for_latin1_and_bidi(&[
2978                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202C, 0x62, 0x63, 0x64, 0x65,
2979                 0x66, 0x67, 0x68, 0x69,
2980             ]),
2981             Latin1Bidi::Bidi
2982         );
2983         assert_ne!(
2984             check_utf16_for_latin1_and_bidi(&[
2985                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFF, 0x62, 0x63, 0x64, 0x65,
2986                 0x66, 0x67, 0x68, 0x69,
2987             ]),
2988             Latin1Bidi::Bidi
2989         );
2990         assert_eq!(
2991             check_utf16_for_latin1_and_bidi(&[
2992                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x62, 0x63, 0x64, 0x65,
2993                 0x66, 0x67, 0x68, 0x69,
2994             ]),
2995             Latin1Bidi::Bidi
2996         );
2997         assert_eq!(
2998             check_utf16_for_latin1_and_bidi(&[
2999                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x08FF, 0x62, 0x63, 0x64, 0x65,
3000                 0x66, 0x67, 0x68, 0x69,
3001             ]),
3002             Latin1Bidi::Bidi
3003         );
3004         assert_eq!(
3005             check_utf16_for_latin1_and_bidi(&[
3006                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x061C, 0x62, 0x63, 0x64, 0x65,
3007                 0x66, 0x67, 0x68, 0x69,
3008             ]),
3009             Latin1Bidi::Bidi
3010         );
3011         assert_eq!(
3012             check_utf16_for_latin1_and_bidi(&[
3013                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB1D, 0x62, 0x63, 0x64, 0x65,
3014                 0x66, 0x67, 0x68, 0x69,
3015             ]),
3016             Latin1Bidi::Bidi
3017         );
3018         assert_eq!(
3019             check_utf16_for_latin1_and_bidi(&[
3020                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB50, 0x62, 0x63, 0x64, 0x65,
3021                 0x66, 0x67, 0x68, 0x69,
3022             ]),
3023             Latin1Bidi::Bidi
3024         );
3025         assert_eq!(
3026             check_utf16_for_latin1_and_bidi(&[
3027                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFDFF, 0x62, 0x63, 0x64, 0x65,
3028                 0x66, 0x67, 0x68, 0x69,
3029             ]),
3030             Latin1Bidi::Bidi
3031         );
3032         assert_eq!(
3033             check_utf16_for_latin1_and_bidi(&[
3034                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE70, 0x62, 0x63, 0x64, 0x65,
3035                 0x66, 0x67, 0x68, 0x69,
3036             ]),
3037             Latin1Bidi::Bidi
3038         );
3039         assert_eq!(
3040             check_utf16_for_latin1_and_bidi(&[
3041                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFE, 0x62, 0x63, 0x64, 0x65,
3042                 0x66, 0x67, 0x68, 0x69,
3043             ]),
3044             Latin1Bidi::Bidi
3045         );
3046         assert_eq!(
3047             check_utf16_for_latin1_and_bidi(&[
3048                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x200F, 0x62, 0x63, 0x64, 0x65,
3049                 0x66, 0x67, 0x68, 0x69,
3050             ]),
3051             Latin1Bidi::Bidi
3052         );
3053         assert_eq!(
3054             check_utf16_for_latin1_and_bidi(&[
3055                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202B, 0x62, 0x63, 0x64, 0x65,
3056                 0x66, 0x67, 0x68, 0x69,
3057             ]),
3058             Latin1Bidi::Bidi
3059         );
3060         assert_eq!(
3061             check_utf16_for_latin1_and_bidi(&[
3062                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202E, 0x62, 0x63, 0x64, 0x65,
3063                 0x66, 0x67, 0x68, 0x69,
3064             ]),
3065             Latin1Bidi::Bidi
3066         );
3067         assert_eq!(
3068             check_utf16_for_latin1_and_bidi(&[
3069                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x2067, 0x62, 0x63, 0x64, 0x65,
3070                 0x66, 0x67, 0x68, 0x69,
3071             ]),
3072             Latin1Bidi::Bidi
3073         );
3074         assert_eq!(
3075             check_utf16_for_latin1_and_bidi(&[
3076                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD802, 0x62, 0x63, 0x64, 0x65,
3077                 0x66, 0x67, 0x68, 0x69,
3078             ]),
3079             Latin1Bidi::Bidi
3080         );
3081         assert_eq!(
3082             check_utf16_for_latin1_and_bidi(&[
3083                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD803, 0x62, 0x63, 0x64, 0x65,
3084                 0x66, 0x67, 0x68, 0x69,
3085             ]),
3086             Latin1Bidi::Bidi
3087         );
3088         assert_eq!(
3089             check_utf16_for_latin1_and_bidi(&[
3090                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83A, 0x62, 0x63, 0x64, 0x65,
3091                 0x66, 0x67, 0x68, 0x69,
3092             ]),
3093             Latin1Bidi::Bidi
3094         );
3095         assert_eq!(
3096             check_utf16_for_latin1_and_bidi(&[
3097                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83B, 0x62, 0x63, 0x64, 0x65,
3098                 0x66, 0x67, 0x68, 0x69,
3099             ]),
3100             Latin1Bidi::Bidi
3101         );
3102 
3103         assert_eq!(
3104             check_utf16_for_latin1_and_bidi(&[
3105                 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, 0x64,
3106                 0x65, 0x66, 0x67, 0x68, 0x69,
3107             ]),
3108             Latin1Bidi::Bidi
3109         );
3110     }
3111 
3112     #[inline(always)]
reference_is_char_bidi(c: char) -> bool3113     pub fn reference_is_char_bidi(c: char) -> bool {
3114         match c {
3115             '\u{0590}'...'\u{08FF}'
3116             | '\u{FB1D}'...'\u{FDFF}'
3117             | '\u{FE70}'...'\u{FEFE}'
3118             | '\u{10800}'...'\u{10FFF}'
3119             | '\u{1E800}'...'\u{1EFFF}'
3120             | '\u{200F}'
3121             | '\u{202B}'
3122             | '\u{202E}'
3123             | '\u{2067}' => true,
3124             _ => false,
3125         }
3126     }
3127 
3128     #[inline(always)]
reference_is_utf16_code_unit_bidi(u: u16) -> bool3129     pub fn reference_is_utf16_code_unit_bidi(u: u16) -> bool {
3130         match u {
3131             0x0590...0x08FF
3132             | 0xFB1D...0xFDFF
3133             | 0xFE70...0xFEFE
3134             | 0xD802
3135             | 0xD803
3136             | 0xD83A
3137             | 0xD83B
3138             | 0x200F
3139             | 0x202B
3140             | 0x202E
3141             | 0x2067 => true,
3142             _ => false,
3143         }
3144     }
3145 
3146     #[test]
test_is_char_bidi_thoroughly()3147     fn test_is_char_bidi_thoroughly() {
3148         for i in 0..0xD800u32 {
3149             let c: char = ::std::char::from_u32(i).unwrap();
3150             assert_eq!(is_char_bidi(c), reference_is_char_bidi(c));
3151         }
3152         for i in 0xE000..0x110000u32 {
3153             let c: char = ::std::char::from_u32(i).unwrap();
3154             assert_eq!(is_char_bidi(c), reference_is_char_bidi(c));
3155         }
3156     }
3157 
3158     #[test]
test_is_utf16_code_unit_bidi_thoroughly()3159     fn test_is_utf16_code_unit_bidi_thoroughly() {
3160         for i in 0..0x10000u32 {
3161             let u = i as u16;
3162             assert_eq!(
3163                 is_utf16_code_unit_bidi(u),
3164                 reference_is_utf16_code_unit_bidi(u)
3165             );
3166         }
3167     }
3168 
3169     #[test]
test_is_str_bidi_thoroughly()3170     fn test_is_str_bidi_thoroughly() {
3171         let mut buf = [0; 4];
3172         for i in 0..0xD800u32 {
3173             let c: char = ::std::char::from_u32(i).unwrap();
3174             assert_eq!(
3175                 is_str_bidi(c.encode_utf8(&mut buf[..])),
3176                 reference_is_char_bidi(c)
3177             );
3178         }
3179         for i in 0xE000..0x110000u32 {
3180             let c: char = ::std::char::from_u32(i).unwrap();
3181             assert_eq!(
3182                 is_str_bidi(c.encode_utf8(&mut buf[..])),
3183                 reference_is_char_bidi(c)
3184             );
3185         }
3186     }
3187 
3188     #[test]
test_is_utf8_bidi_thoroughly()3189     fn test_is_utf8_bidi_thoroughly() {
3190         let mut buf = [0; 8];
3191         for i in 0..0xD800u32 {
3192             let c: char = ::std::char::from_u32(i).unwrap();
3193             let expect = reference_is_char_bidi(c);
3194             {
3195                 let len = {
3196                     let bytes = c.encode_utf8(&mut buf[..]).as_bytes();
3197                     assert_eq!(is_utf8_bidi(bytes), expect);
3198                     bytes.len()
3199                 };
3200                 {
3201                     let tail = &mut buf[len..];
3202                     for b in tail.iter_mut() {
3203                         *b = 0;
3204                     }
3205                 }
3206             }
3207             assert_eq!(is_utf8_bidi(&buf[..]), expect);
3208         }
3209         for i in 0xE000..0x110000u32 {
3210             let c: char = ::std::char::from_u32(i).unwrap();
3211             let expect = reference_is_char_bidi(c);
3212             {
3213                 let len = {
3214                     let bytes = c.encode_utf8(&mut buf[..]).as_bytes();
3215                     assert_eq!(is_utf8_bidi(bytes), expect);
3216                     bytes.len()
3217                 };
3218                 {
3219                     let tail = &mut buf[len..];
3220                     for b in tail.iter_mut() {
3221                         *b = 0;
3222                     }
3223                 }
3224             }
3225             assert_eq!(is_utf8_bidi(&buf[..]), expect);
3226         }
3227     }
3228 
3229     #[test]
test_is_utf16_bidi_thoroughly()3230     fn test_is_utf16_bidi_thoroughly() {
3231         let mut buf = [0; 32];
3232         for i in 0..0x10000u32 {
3233             let u = i as u16;
3234             buf[15] = u;
3235             assert_eq!(
3236                 is_utf16_bidi(&buf[..]),
3237                 reference_is_utf16_code_unit_bidi(u)
3238             );
3239         }
3240     }
3241 
3242     #[test]
test_is_utf8_bidi_edge_cases()3243     fn test_is_utf8_bidi_edge_cases() {
3244         assert!(!is_utf8_bidi(b"\xD5\xBF\x61"));
3245         assert!(!is_utf8_bidi(b"\xD6\x80\x61"));
3246         assert!(!is_utf8_bidi(b"abc"));
3247         assert!(is_utf8_bidi(b"\xD5\xBF\xC2"));
3248         assert!(is_utf8_bidi(b"\xD6\x80\xC2"));
3249         assert!(is_utf8_bidi(b"ab\xC2"));
3250     }
3251 
3252     #[test]
test_decode_latin1()3253     fn test_decode_latin1() {
3254         match decode_latin1(b"ab") {
3255             Cow::Borrowed(s) => {
3256                 assert_eq!(s, "ab");
3257             }
3258             Cow::Owned(_) => {
3259                 unreachable!("Should have borrowed");
3260             }
3261         }
3262         assert_eq!(decode_latin1(b"a\xE4"), "a\u{E4}");
3263     }
3264 
3265     #[test]
test_encode_latin1_lossy()3266     fn test_encode_latin1_lossy() {
3267         match encode_latin1_lossy("ab") {
3268             Cow::Borrowed(s) => {
3269                 assert_eq!(s, b"ab");
3270             }
3271             Cow::Owned(_) => {
3272                 unreachable!("Should have borrowed");
3273             }
3274         }
3275         assert_eq!(encode_latin1_lossy("a\u{E4}"), &(b"a\xE4")[..]);
3276     }
3277 
3278     #[test]
test_convert_utf8_to_utf16_without_replacement()3279     fn test_convert_utf8_to_utf16_without_replacement() {
3280         let mut buf = [0u16; 5];
3281         assert_eq!(
3282             convert_utf8_to_utf16_without_replacement(b"ab", &mut buf[..2]),
3283             Some(2)
3284         );
3285         assert_eq!(buf[0], u16::from(b'a'));
3286         assert_eq!(buf[1], u16::from(b'b'));
3287         assert_eq!(buf[2], 0);
3288         assert_eq!(
3289             convert_utf8_to_utf16_without_replacement(b"\xC3\xA4c", &mut buf[..3]),
3290             Some(2)
3291         );
3292         assert_eq!(buf[0], 0xE4);
3293         assert_eq!(buf[1], u16::from(b'c'));
3294         assert_eq!(buf[2], 0);
3295         assert_eq!(
3296             convert_utf8_to_utf16_without_replacement(b"\xE2\x98\x83", &mut buf[..3]),
3297             Some(1)
3298         );
3299         assert_eq!(buf[0], 0x2603);
3300         assert_eq!(buf[1], u16::from(b'c'));
3301         assert_eq!(buf[2], 0);
3302         assert_eq!(
3303             convert_utf8_to_utf16_without_replacement(b"\xE2\x98\x83d", &mut buf[..4]),
3304             Some(2)
3305         );
3306         assert_eq!(buf[0], 0x2603);
3307         assert_eq!(buf[1], u16::from(b'd'));
3308         assert_eq!(buf[2], 0);
3309         assert_eq!(
3310             convert_utf8_to_utf16_without_replacement(b"\xE2\x98\x83\xC3\xA4", &mut buf[..5]),
3311             Some(2)
3312         );
3313         assert_eq!(buf[0], 0x2603);
3314         assert_eq!(buf[1], 0xE4);
3315         assert_eq!(buf[2], 0);
3316         assert_eq!(
3317             convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93\x8E", &mut buf[..4]),
3318             Some(2)
3319         );
3320         assert_eq!(buf[0], 0xD83D);
3321         assert_eq!(buf[1], 0xDCCE);
3322         assert_eq!(buf[2], 0);
3323         assert_eq!(
3324             convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93\x8Ee", &mut buf[..5]),
3325             Some(3)
3326         );
3327         assert_eq!(buf[0], 0xD83D);
3328         assert_eq!(buf[1], 0xDCCE);
3329         assert_eq!(buf[2], u16::from(b'e'));
3330         assert_eq!(
3331             convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93", &mut buf[..5]),
3332             None
3333         );
3334     }
3335 }
3336