1 // Copyright Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9
10 //! Functions for converting between different in-RAM representations of text
11 //! and for quickly checking if the Unicode Bidirectional Algorithm can be
12 //! avoided.
13 //!
14 //! By using slices for output, the functions here seek to enable by-register
15 //! (ALU register or SIMD register as available) operations in order to
16 //! outperform iterator-based conversions available in the Rust standard
17 //! library.
18 //!
19 //! _Note:_ "Latin1" in this module refers to the Unicode range from U+0000 to
20 //! U+00FF, inclusive, and does not refer to the windows-1252 range. This
21 //! in-memory encoding is sometimes used as a storage optimization of text
22 //! when UTF-16 indexing and length semantics are exposed.
23 //!
//! The FFI bindings for this module are in the
25 //! [encoding_c_mem crate](https://github.com/hsivonen/encoding_c_mem).
26
27 #[cfg(feature = "alloc")]
28 use alloc::borrow::Cow;
29 #[cfg(feature = "alloc")]
30 use alloc::string::String;
31 #[cfg(feature = "alloc")]
32 use alloc::vec::Vec;
33
34 use super::in_inclusive_range16;
35 use super::in_inclusive_range32;
36 use super::in_inclusive_range8;
37 use super::in_range16;
38 use super::in_range32;
39 use super::DecoderResult;
40 use crate::ascii::*;
41 use crate::utf_8::*;
42
/// Forwards its arguments to `debug_assert!`, except that the assertion is
/// skipped when the crate is compiled with `--cfg fuzzing`.
macro_rules! non_fuzz_debug_assert {
    ($($tokens:tt)*) => {
        if cfg!(not(fuzzing)) {
            debug_assert!($($tokens)*);
        }
    };
}
46
// Branch-prediction hints. Both configurations expose `likely` and `unlikely`
// as `unsafe` functions taking and returning `bool`, so call sites are written
// the same way regardless of which branch is compiled.
cfg_if! {
    if #[cfg(feature = "simd-accel")] {
        // With `simd-accel`, use the compiler intrinsics directly.
        use ::core::intrinsics::likely;
        use ::core::intrinsics::unlikely;
    } else {
        // Without `simd-accel`, fall back to identity functions that return
        // their argument unchanged.
        #[inline(always)]
        // Unsafe to match the intrinsic, which is needlessly unsafe.
        unsafe fn likely(b: bool) -> bool {
            b
        }
        #[inline(always)]
        // Unsafe to match the intrinsic, which is needlessly unsafe.
        unsafe fn unlikely(b: bool) -> bool {
            b
        }
    }
}
64
/// Classification of text as Latin1 (all code points are below U+0100),
/// left-to-right with some non-Latin1 characters or as containing at least
/// some right-to-left characters.
///
/// The enum is fieldless and `#[repr(C)]` with explicit discriminants, so it
/// is `Copy`: a classification can be passed by value and still be used
/// afterwards.
#[must_use]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[repr(C)]
pub enum Latin1Bidi {
    /// Every character is below U+0100.
    Latin1 = 0,
    /// There is at least one character that's U+0100 or higher, but there
    /// are no right-to-left characters.
    LeftToRight = 1,
    /// There is at least one right-to-left character.
    Bidi = 2,
}
80
// Word-wide mask with the high byte of every 16-bit lane set. ANDing it with
// a `usize` that holds packed `u16` code units yields non-zero iff at least
// one of those units is above 0xFF.
//
// The literal is spelled as 64 bits; `as` truncates, so works on 32-bit, too.
#[allow(dead_code)]
const LATIN1_MASK: usize = 0xFF00_FF00_FF00_FF00u64 as usize;
84
// Generates `fn $name(buffer: &[$unit]) -> bool`, which reports whether no
// code unit of `buffer` has a bit of `$mask` set. The bulk of the buffer is
// read one `usize` at a time.
//
// `$bound` is compared against individual code units (and against the OR of
// several of them), `$mask` against whole `usize` words; the caller has to
// pass a matching pair.
//
// NOTE(review): `ALU_ALIGNMENT` and `ALU_ALIGNMENT_MASK` are defined outside
// this view. The pointer casts below read as if `ALU_ALIGNMENT` equals
// `size_of::<usize>()` and `ALU_ALIGNMENT_MASK` equals `ALU_ALIGNMENT - 1`
// -- confirm against their definitions.
#[allow(unused_macros)]
macro_rules! by_unit_check_alu {
    ($name:ident, $unit:ty, $bound:expr, $mask:ident) => {
        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            // Index (in code units) of the next unit to examine.
            let mut offset = 0usize;
            // Bitwise OR of everything examined outside the unrolled loop.
            let mut accu = 0usize;
            let unit_size = ::core::mem::size_of::<$unit>();
            let len = buffer.len();
            if len >= ALU_ALIGNMENT / unit_size {
                // The most common reason to return `false` is for the first code
                // unit to fail the test, so check that first.
                if buffer[0] >= $bound {
                    return false;
                }
                let src = buffer.as_ptr();
                // Number of code units between `src` and the next address that
                // is a multiple of `ALU_ALIGNMENT` (zero if already aligned).
                let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK))
                    & ALU_ALIGNMENT_MASK)
                    / unit_size;
                // Only take the word-wide path if at least one whole word
                // remains after the alignment prefix.
                if until_alignment + ALU_ALIGNMENT / unit_size <= len {
                    if until_alignment != 0 {
                        // Accumulate the unaligned prefix unit by unit and
                        // test it once at the end.
                        accu |= buffer[offset] as usize;
                        offset += 1;
                        until_alignment -= 1;
                        while until_alignment != 0 {
                            accu |= buffer[offset] as usize;
                            offset += 1;
                            until_alignment -= 1;
                        }
                        if accu >= $bound {
                            return false;
                        }
                    }
                    // Largest offset at which one more word-sized read still
                    // ends within the buffer.
                    let len_minus_stride = len - ALU_ALIGNMENT / unit_size;
                    if offset + (4 * (ALU_ALIGNMENT / unit_size)) <= len {
                        // Unrolled loop: OR four consecutive words together and
                        // test them at once. The condition above and the
                        // `offset > len_minus_unroll` check below keep all four
                        // reads within `buffer`.
                        let len_minus_unroll = len - (4 * (ALU_ALIGNMENT / unit_size));
                        loop {
                            let unroll_accu = unsafe { *(src.add(offset) as *const usize) }
                                | unsafe {
                                    *(src.add(offset + (ALU_ALIGNMENT / unit_size)) as *const usize)
                                }
                                | unsafe {
                                    *(src.add(offset + (2 * (ALU_ALIGNMENT / unit_size)))
                                        as *const usize)
                                }
                                | unsafe {
                                    *(src.add(offset + (3 * (ALU_ALIGNMENT / unit_size)))
                                        as *const usize)
                                };
                            if unroll_accu & $mask != 0 {
                                return false;
                            }
                            offset += 4 * (ALU_ALIGNMENT / unit_size);
                            if offset > len_minus_unroll {
                                break;
                            }
                        }
                    }
                    // Remaining whole words, one at a time; these are folded
                    // into `accu` and only tested at the very end.
                    while offset <= len_minus_stride {
                        accu |= unsafe { *(src.add(offset) as *const usize) };
                        offset += ALU_ALIGNMENT / unit_size;
                    }
                }
            }
            // Tail (or the whole buffer, if the word-wide path was not taken).
            for &unit in &buffer[offset..] {
                accu |= unit as usize;
            }
            accu & $mask == 0
        }
    };
}
157
// Generates `fn $name(buffer: &[$unit]) -> bool`, which reports whether every
// code unit of `buffer` passes the check: individual units are compared
// against `$bound`, whole vectors of type `$simd_ty` are tested with `$func`.
// `$splat` is the all-zero start value of the vector accumulator.
//
// NOTE(review): `SIMD_STRIDE_SIZE` is defined outside this view. The casts to
// `*const $simd_ty` read as if it equals the byte size of `$simd_ty` and the
// alignment arithmetic as if `SIMD_ALIGNMENT_MASK == SIMD_ALIGNMENT - 1` --
// confirm against the definitions.
#[allow(unused_macros)]
macro_rules! by_unit_check_simd {
    ($name:ident, $unit:ty, $splat:expr, $simd_ty:ty, $bound:expr, $func:ident) => {
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            // Index (in code units) of the next unit to examine.
            let mut offset = 0usize;
            // Bitwise OR of the units examined one at a time.
            let mut accu = 0usize;
            let unit_size = ::core::mem::size_of::<$unit>();
            let len = buffer.len();
            if len >= SIMD_STRIDE_SIZE / unit_size {
                // The most common reason to return `false` is for the first code
                // unit to fail the test, so check that first.
                if buffer[0] >= $bound {
                    return false;
                }
                let src = buffer.as_ptr();
                // Number of code units between `src` and the next address that
                // is a multiple of `SIMD_ALIGNMENT` (zero if already aligned).
                let mut until_alignment = ((SIMD_ALIGNMENT
                    - ((src as usize) & SIMD_ALIGNMENT_MASK))
                    & SIMD_ALIGNMENT_MASK)
                    / unit_size;
                // Only take the vector path if at least one whole stride
                // remains after the alignment prefix.
                if until_alignment + SIMD_STRIDE_SIZE / unit_size <= len {
                    if until_alignment != 0 {
                        // Accumulate the unaligned prefix unit by unit and
                        // test it once at the end.
                        accu |= buffer[offset] as usize;
                        offset += 1;
                        until_alignment -= 1;
                        while until_alignment != 0 {
                            accu |= buffer[offset] as usize;
                            offset += 1;
                            until_alignment -= 1;
                        }
                        if accu >= $bound {
                            return false;
                        }
                    }
                    // Largest offset at which one more stride-sized read still
                    // ends within the buffer.
                    let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
                    if offset + (4 * (SIMD_STRIDE_SIZE / unit_size)) <= len {
                        // Unrolled loop: OR four consecutive vectors together
                        // and test them at once. The condition above and the
                        // `offset > len_minus_unroll` check below keep all four
                        // reads within `buffer`.
                        let len_minus_unroll = len - (4 * (SIMD_STRIDE_SIZE / unit_size));
                        loop {
                            let unroll_accu = unsafe { *(src.add(offset) as *const $simd_ty) }
                                | unsafe {
                                    *(src.add(offset + (SIMD_STRIDE_SIZE / unit_size))
                                        as *const $simd_ty)
                                }
                                | unsafe {
                                    *(src.add(offset + (2 * (SIMD_STRIDE_SIZE / unit_size)))
                                        as *const $simd_ty)
                                }
                                | unsafe {
                                    *(src.add(offset + (3 * (SIMD_STRIDE_SIZE / unit_size)))
                                        as *const $simd_ty)
                                };
                            if !$func(unroll_accu) {
                                return false;
                            }
                            offset += 4 * (SIMD_STRIDE_SIZE / unit_size);
                            if offset > len_minus_unroll {
                                break;
                            }
                        }
                    }
                    // Remaining whole strides are OR-ed into one vector that
                    // is tested once after the loop.
                    let mut simd_accu = $splat;
                    while offset <= len_minus_stride {
                        simd_accu = simd_accu | unsafe { *(src.add(offset) as *const $simd_ty) };
                        offset += SIMD_STRIDE_SIZE / unit_size;
                    }
                    if !$func(simd_accu) {
                        return false;
                    }
                }
            }
            // Tail (or the whole buffer, if the vector path was not taken).
            for &unit in &buffer[offset..] {
                accu |= unit as usize;
            }
            accu < $bound
        }
    };
}
235
// Instantiates `is_ascii_impl`, `is_basic_latin_impl` and
// `is_utf16_latin1_impl` and defines `utf16_valid_up_to_impl`, choosing
// between the SIMD and the ALU flavour at compile time.
cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        use crate::simd_funcs::*;
        use packed_simd::u8x16;
        use packed_simd::u16x8;

        // Byte alignment required before the vector loads below.
        const SIMD_ALIGNMENT: usize = 16;

        // `SIMD_ALIGNMENT - 1`; used to extract the misalignment of an address.
        const SIMD_ALIGNMENT_MASK: usize = 15;

        by_unit_check_simd!(is_ascii_impl, u8, u8x16::splat(0), u8x16, 0x80, simd_is_ascii);
        by_unit_check_simd!(is_basic_latin_impl, u16, u16x8::splat(0), u16x8, 0x80, simd_is_basic_latin);
        by_unit_check_simd!(is_utf16_latin1_impl, u16, u16x8::splat(0), u16x8, 0x100, simd_is_latin1);

        // Returns the number of code units at the start of `buffer` that
        // passed validation: the index of the first unpaired surrogate found,
        // or `buffer.len()` if there is none.
        #[inline(always)]
        fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize {
            // This function is a mess, because it simultaneously tries to do
            // only aligned SIMD (perhaps misguidedly) and needs to deal with
            // the last code unit in a SIMD stride being part of a valid
            // surrogate pair.
            let unit_size = ::core::mem::size_of::<u16>();
            let src = buffer.as_ptr();
            let len = buffer.len();
            let mut offset = 0usize;
            'outer: loop {
                // Code units from the current position to the next
                // `SIMD_ALIGNMENT`-aligned address (zero if already aligned).
                let until_alignment = ((SIMD_ALIGNMENT - ((unsafe { src.add(offset) } as usize) & SIMD_ALIGNMENT_MASK)) &
                    SIMD_ALIGNMENT_MASK) / unit_size;
                if until_alignment == 0 {
                    // Aligned, but less than a full stride left: finish with
                    // the scalar code after the loop.
                    if offset + SIMD_STRIDE_SIZE / unit_size > len {
                        break;
                    }
                } else {
                    let offset_plus_until_alignment = offset + until_alignment;
                    // One extra unit is handed to the scalar check so that a
                    // high surrogate in the last unaligned position can be
                    // paired with the low surrogate that follows it.
                    let offset_plus_until_alignment_plus_one = offset_plus_until_alignment + 1;
                    if offset_plus_until_alignment_plus_one + SIMD_STRIDE_SIZE / unit_size > len {
                        break;
                    }
                    let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_until_alignment_plus_one]);
                    if up_to < until_alignment {
                        return offset + up_to;
                    }
                    if last_valid_low {
                        // The extra unit was consumed as part of a pair, so the
                        // position is no longer aligned; recompute alignment.
                        offset = offset_plus_until_alignment_plus_one;
                        continue;
                    }
                    offset = offset_plus_until_alignment;
                }
                let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
                loop {
                    let offset_plus_stride = offset + SIMD_STRIDE_SIZE / unit_size;
                    if contains_surrogates(unsafe { *(src.add(offset) as *const u16x8) }) {
                        if offset_plus_stride == len {
                            // Last stride of the buffer: let the scalar code
                            // after the loop decide.
                            break 'outer;
                        }
                        // Re-check this stride plus one following unit with
                        // the scalar code to find out whether the surrogates
                        // are properly paired.
                        let offset_plus_stride_plus_one = offset_plus_stride + 1;
                        let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_stride_plus_one]);
                        if up_to < SIMD_STRIDE_SIZE / unit_size {
                            return offset + up_to;
                        }
                        if last_valid_low {
                            offset = offset_plus_stride_plus_one;
                            continue 'outer;
                        }
                    }
                    offset = offset_plus_stride;
                    if offset > len_minus_stride {
                        break 'outer;
                    }
                }
            }
            // Scalar check of whatever is left.
            let (up_to, _) = utf16_valid_up_to_alu(&buffer[offset..]);
            offset + up_to
        }
    } else {
        by_unit_check_alu!(is_ascii_impl, u8, 0x80, ASCII_MASK);
        by_unit_check_alu!(is_basic_latin_impl, u16, 0x80, BASIC_LATIN_MASK);
        by_unit_check_alu!(is_utf16_latin1_impl, u16, 0x100, LATIN1_MASK);

        // Without SIMD, validation is entirely scalar: return the first
        // element of the pair computed by `utf16_valid_up_to_alu`.
        #[inline(always)]
        fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize {
            let (up_to, _) = utf16_valid_up_to_alu(buffer);
            up_to
        }
    }
}
321
/// Scans `buffer` for the longest prefix that contains no unpaired surrogate.
///
/// The first return value is the index of the first unpaired surrogate, or
/// `buffer.len()` if there is none. The second return value is true iff the
/// last code unit of the slice was reached and turned out to be a low
/// surrogate that is part of a valid pair.
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
#[inline(always)]
fn utf16_valid_up_to_alu(buffer: &[u16]) -> (usize, bool) {
    let mut pos = 0usize;
    // Whether the most recently consumed item was a complete surrogate pair.
    let mut ended_in_pair = false;
    while let Some(&unit) = buffer.get(pos) {
        match unit {
            // High surrogate: valid only if a low surrogate follows directly.
            0xD800..=0xDBFF => {
                let followed_by_low = match buffer.get(pos + 1) {
                    Some(&next) => next >= 0xDC00 && next <= 0xDFFF,
                    None => false,
                };
                if !followed_by_low {
                    return (pos, false);
                }
                pos += 2;
                ended_in_pair = true;
            }
            // Low surrogate without a preceding high surrogate.
            0xDC00..=0xDFFF => return (pos, false),
            // Not a surrogate.
            _ => {
                pos += 1;
                ended_in_pair = false;
            }
        }
    }
    (pos, ended_in_pair)
}
367
368 cfg_if! {
369 if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
370 #[inline(always)]
371 fn is_str_latin1_impl(buffer: &str) -> Option<usize> {
372 let mut offset = 0usize;
373 let bytes = buffer.as_bytes();
374 let len = bytes.len();
375 if len >= SIMD_STRIDE_SIZE {
376 let src = bytes.as_ptr();
377 let mut until_alignment = (SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
378 SIMD_ALIGNMENT_MASK;
379 if until_alignment + SIMD_STRIDE_SIZE <= len {
380 while until_alignment != 0 {
381 if bytes[offset] > 0xC3 {
382 return Some(offset);
383 }
384 offset += 1;
385 until_alignment -= 1;
386 }
387 let len_minus_stride = len - SIMD_STRIDE_SIZE;
388 loop {
389 if !simd_is_str_latin1(unsafe { *(src.add(offset) as *const u8x16) }) {
390 // TODO: Ensure this compiles away when inlined into `is_str_latin1()`.
391 while bytes[offset] & 0xC0 == 0x80 {
392 offset += 1;
393 }
394 return Some(offset);
395 }
396 offset += SIMD_STRIDE_SIZE;
397 if offset > len_minus_stride {
398 break;
399 }
400 }
401 }
402 }
403 for i in offset..len {
404 if bytes[i] > 0xC3 {
405 return Some(i);
406 }
407 }
408 None
409 }
410 } else {
411 #[inline(always)]
412 fn is_str_latin1_impl(buffer: &str) -> Option<usize> {
413 let mut bytes = buffer.as_bytes();
414 let mut total = 0;
415 loop {
416 if let Some((byte, offset)) = validate_ascii(bytes) {
417 total += offset;
418 if byte > 0xC3 {
419 return Some(total);
420 }
421 bytes = &bytes[offset + 2..];
422 total += 2;
423 } else {
424 return None;
425 }
426 }
427 }
428 }
429 }
430
431 #[inline(always)]
is_utf8_latin1_impl(buffer: &[u8]) -> Option<usize>432 fn is_utf8_latin1_impl(buffer: &[u8]) -> Option<usize> {
433 let mut bytes = buffer;
434 let mut total = 0;
435 loop {
436 if let Some((byte, offset)) = validate_ascii(bytes) {
437 total += offset;
438 if in_inclusive_range8(byte, 0xC2, 0xC3) {
439 let next = offset + 1;
440 if next == bytes.len() {
441 return Some(total);
442 }
443 if bytes[next] & 0xC0 != 0x80 {
444 return Some(total);
445 }
446 bytes = &bytes[offset + 2..];
447 total += 2;
448 } else {
449 return Some(total);
450 }
451 } else {
452 return None;
453 }
454 }
455 }
456
457 cfg_if! {
458 if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
459 #[inline(always)]
460 fn is_utf16_bidi_impl(buffer: &[u16]) -> bool {
461 let mut offset = 0usize;
462 let len = buffer.len();
463 if len >= SIMD_STRIDE_SIZE / 2 {
464 let src = buffer.as_ptr();
465 let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
466 SIMD_ALIGNMENT_MASK) / 2;
467 if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len {
468 while until_alignment != 0 {
469 if is_utf16_code_unit_bidi(buffer[offset]) {
470 return true;
471 }
472 offset += 1;
473 until_alignment -= 1;
474 }
475 let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
476 loop {
477 if is_u16x8_bidi(unsafe { *(src.add(offset) as *const u16x8) }) {
478 return true;
479 }
480 offset += SIMD_STRIDE_SIZE / 2;
481 if offset > len_minus_stride {
482 break;
483 }
484 }
485 }
486 }
487 for &u in &buffer[offset..] {
488 if is_utf16_code_unit_bidi(u) {
489 return true;
490 }
491 }
492 false
493 }
494 } else {
495 #[inline(always)]
496 fn is_utf16_bidi_impl(buffer: &[u16]) -> bool {
497 for &u in buffer {
498 if is_utf16_code_unit_bidi(u) {
499 return true;
500 }
501 }
502 false
503 }
504 }
505 }
506
507 cfg_if! {
508 if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
509 #[inline(always)]
510 fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
511 let mut offset = 0usize;
512 let len = buffer.len();
513 if len >= SIMD_STRIDE_SIZE / 2 {
514 let src = buffer.as_ptr();
515 let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
516 SIMD_ALIGNMENT_MASK) / 2;
517 if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len {
518 while until_alignment != 0 {
519 if buffer[offset] > 0xFF {
520 // This transition isn't optimal, since the aligment is recomputing
521 // but not tweaking further today.
522 if is_utf16_bidi_impl(&buffer[offset..]) {
523 return Latin1Bidi::Bidi;
524 }
525 return Latin1Bidi::LeftToRight;
526 }
527 offset += 1;
528 until_alignment -= 1;
529 }
530 let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
531 loop {
532 let mut s = unsafe { *(src.add(offset) as *const u16x8) };
533 if !simd_is_latin1(s) {
534 loop {
535 if is_u16x8_bidi(s) {
536 return Latin1Bidi::Bidi;
537 }
538 offset += SIMD_STRIDE_SIZE / 2;
539 if offset > len_minus_stride {
540 for &u in &buffer[offset..] {
541 if is_utf16_code_unit_bidi(u) {
542 return Latin1Bidi::Bidi;
543 }
544 }
545 return Latin1Bidi::LeftToRight;
546 }
547 s = unsafe { *(src.add(offset) as *const u16x8) };
548 }
549 }
550 offset += SIMD_STRIDE_SIZE / 2;
551 if offset > len_minus_stride {
552 break;
553 }
554 }
555 }
556 }
557 let mut iter = (&buffer[offset..]).iter();
558 loop {
559 if let Some(&u) = iter.next() {
560 if u > 0xFF {
561 let mut inner_u = u;
562 loop {
563 if is_utf16_code_unit_bidi(inner_u) {
564 return Latin1Bidi::Bidi;
565 }
566 if let Some(&code_unit) = iter.next() {
567 inner_u = code_unit;
568 } else {
569 return Latin1Bidi::LeftToRight;
570 }
571 }
572 }
573 } else {
574 return Latin1Bidi::Latin1;
575 }
576 }
577 }
578 } else {
579 #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
580 #[inline(always)]
581 fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
582 let mut offset = 0usize;
583 let len = buffer.len();
584 if len >= ALU_ALIGNMENT / 2 {
585 let src = buffer.as_ptr();
586 let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK)) &
587 ALU_ALIGNMENT_MASK) / 2;
588 if until_alignment + ALU_ALIGNMENT / 2 <= len {
589 while until_alignment != 0 {
590 if buffer[offset] > 0xFF {
591 if is_utf16_bidi_impl(&buffer[offset..]) {
592 return Latin1Bidi::Bidi;
593 }
594 return Latin1Bidi::LeftToRight;
595 }
596 offset += 1;
597 until_alignment -= 1;
598 }
599 let len_minus_stride = len - ALU_ALIGNMENT / 2;
600 loop {
601 if unsafe { *(src.add(offset) as *const usize) } & LATIN1_MASK != 0 {
602 if is_utf16_bidi_impl(&buffer[offset..]) {
603 return Latin1Bidi::Bidi;
604 }
605 return Latin1Bidi::LeftToRight;
606 }
607 offset += ALU_ALIGNMENT / 2;
608 if offset > len_minus_stride {
609 break;
610 }
611 }
612 }
613 }
614 let mut iter = (&buffer[offset..]).iter();
615 loop {
616 if let Some(&u) = iter.next() {
617 if u > 0xFF {
618 let mut inner_u = u;
619 loop {
620 if is_utf16_code_unit_bidi(inner_u) {
621 return Latin1Bidi::Bidi;
622 }
623 if let Some(&code_unit) = iter.next() {
624 inner_u = code_unit;
625 } else {
626 return Latin1Bidi::LeftToRight;
627 }
628 }
629 }
630 } else {
631 return Latin1Bidi::Latin1;
632 }
633 }
634 }
635 }
636 }
637
/// Checks whether the buffer is all-ASCII.
///
/// Returns `true` iff no byte of `buffer` is 0x80 or above.
///
/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_ascii(buffer: &[u8]) -> bool {
    // `is_ascii_impl` is generated by `by_unit_check_simd!` or
    // `by_unit_check_alu!`, depending on the build configuration.
    is_ascii_impl(buffer)
}
645
/// Checks whether the buffer is all-Basic Latin (i.e. UTF-16 representing
/// only ASCII characters).
///
/// Returns `true` iff no code unit of `buffer` is 0x80 or above.
///
/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_basic_latin(buffer: &[u16]) -> bool {
    // `is_basic_latin_impl` is generated by `by_unit_check_simd!` or
    // `by_unit_check_alu!`, depending on the build configuration.
    is_basic_latin_impl(buffer)
}
654
/// Checks whether the buffer is valid UTF-8 representing only code points
/// less than or equal to U+00FF.
///
/// Fails fast. (I.e. returns before having read the whole buffer if UTF-8
/// invalidity or code points above U+00FF are discovered.)
pub fn is_utf8_latin1(buffer: &[u8]) -> bool {
    // The implementation returns `Some(index)` at the first offending byte
    // and `None` when the whole buffer passed.
    is_utf8_latin1_impl(buffer).is_none()
}
663
/// Checks whether the buffer represents only code points less than or equal
/// to U+00FF.
///
/// Fails fast. (I.e. returns before having read the whole buffer if code
/// points above U+00FF are discovered.)
pub fn is_str_latin1(buffer: &str) -> bool {
    // The implementation returns `Some(index)` once a code point above
    // U+00FF has been detected and `None` when the whole string passed.
    is_str_latin1_impl(buffer).is_none()
}
672
/// Checks whether the buffer represents only code points less than or equal
/// to U+00FF.
///
/// Returns `true` iff no code unit of `buffer` is 0x100 or above.
///
/// May read the entire buffer even if it isn't all-Latin1. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_utf16_latin1(buffer: &[u16]) -> bool {
    // `is_utf16_latin1_impl` is generated by `by_unit_check_simd!` or
    // `by_unit_check_alu!`, depending on the build configuration.
    is_utf16_latin1_impl(buffer)
}
681
682 /// Checks whether a potentially-invalid UTF-8 buffer contains code points
683 /// that trigger right-to-left processing.
684 ///
685 /// The check is done on a Unicode block basis without regard to assigned
686 /// vs. unassigned code points in the block. Hebrew presentation forms in
687 /// the Alphabetic Presentation Forms block are treated as if they formed
688 /// a block on their own (i.e. it treated as right-to-left). Additionally,
689 /// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
690 /// for. Control characters that are technically bidi controls but do not
691 /// cause right-to-left behavior without the presence of right-to-left
692 /// characters or right-to-left controls are not checked for. As a special
693 /// case, U+FEFF is excluded from Arabic Presentation Forms-B.
694 ///
695 /// Returns `true` if the input is invalid UTF-8 or the input contains an
696 /// RTL character. Returns `false` if the input is valid UTF-8 and contains
697 /// no RTL characters.
698 #[cfg_attr(feature = "cargo-clippy", allow(collapsible_if, cyclomatic_complexity))]
699 #[inline]
is_utf8_bidi(buffer: &[u8]) -> bool700 pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
701 // As of rustc 1.25.0-nightly (73ac5d6a8 2018-01-11), this is faster
702 // than UTF-8 validation followed by `is_str_bidi()` for German,
703 // Russian and Japanese. However, this is considerably slower for Thai.
704 // Chances are that the compiler makes some branch predictions that are
705 // unfortunate for Thai. Not spending the time to manually optimize
706 // further at this time, since it's unclear if this variant even has
707 // use cases. However, this is worth revisiting once Rust gets the
708 // ability to annotate relative priorities of match arms.
709
710 // U+058F: D6 8F
711 // U+0590: D6 90
712 // U+08FF: E0 A3 BF
713 // U+0900: E0 A4 80
714 //
715 // U+200F: E2 80 8F
716 // U+202B: E2 80 AB
717 // U+202E: E2 80 AE
718 // U+2067: E2 81 A7
719 //
720 // U+FB1C: EF AC 9C
721 // U+FB1D: EF AC 9D
722 // U+FDFF: EF B7 BF
723 // U+FE00: EF B8 80
724 //
725 // U+FE6F: EF B9 AF
726 // U+FE70: EF B9 B0
727 // U+FEFE: EF BB BE
728 // U+FEFF: EF BB BF
729 //
730 // U+107FF: F0 90 9F BF
731 // U+10800: F0 90 A0 80
732 // U+10FFF: F0 90 BF BF
733 // U+11000: F0 91 80 80
734 //
735 // U+1E7FF: F0 9E 9F BF
736 // U+1E800: F0 9E A0 80
737 // U+1EFFF: F0 9E BF BF
738 // U+1F000: F0 9F 80 80
739 let mut src = buffer;
740 'outer: loop {
741 if let Some((mut byte, mut read)) = validate_ascii(src) {
742 // Check for the longest sequence to avoid checking twice for the
743 // multi-byte sequences.
744 if read + 4 <= src.len() {
745 'inner: loop {
746 // At this point, `byte` is not included in `read`.
747 match byte {
748 0..=0x7F => {
749 // ASCII: go back to SIMD.
750 read += 1;
751 src = &src[read..];
752 continue 'outer;
753 }
754 0xC2..=0xD5 => {
755 // Two-byte
756 let second = unsafe { *(src.get_unchecked(read + 1)) };
757 if !in_inclusive_range8(second, 0x80, 0xBF) {
758 return true;
759 }
760 read += 2;
761 }
762 0xD6 => {
763 // Two-byte
764 let second = unsafe { *(src.get_unchecked(read + 1)) };
765 if !in_inclusive_range8(second, 0x80, 0xBF) {
766 return true;
767 }
768 // XXX consider folding the above and below checks
769 if second > 0x8F {
770 return true;
771 }
772 read += 2;
773 }
774 // two-byte starting with 0xD7 and above is bidi
775 0xE1 | 0xE3..=0xEC | 0xEE => {
776 // Three-byte normal
777 let second = unsafe { *(src.get_unchecked(read + 1)) };
778 let third = unsafe { *(src.get_unchecked(read + 2)) };
779 if ((UTF8_DATA.table[usize::from(second)]
780 & unsafe {
781 *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
782 })
783 | (third >> 6))
784 != 2
785 {
786 return true;
787 }
788 read += 3;
789 }
790 0xE2 => {
791 // Three-byte normal, potentially bidi
792 let second = unsafe { *(src.get_unchecked(read + 1)) };
793 let third = unsafe { *(src.get_unchecked(read + 2)) };
794 if ((UTF8_DATA.table[usize::from(second)]
795 & unsafe {
796 *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
797 })
798 | (third >> 6))
799 != 2
800 {
801 return true;
802 }
803 if second == 0x80 {
804 if third == 0x8F || third == 0xAB || third == 0xAE {
805 return true;
806 }
807 } else if second == 0x81 {
808 if third == 0xA7 {
809 return true;
810 }
811 }
812 read += 3;
813 }
814 0xEF => {
815 // Three-byte normal, potentially bidi
816 let second = unsafe { *(src.get_unchecked(read + 1)) };
817 let third = unsafe { *(src.get_unchecked(read + 2)) };
818 if ((UTF8_DATA.table[usize::from(second)]
819 & unsafe {
820 *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
821 })
822 | (third >> 6))
823 != 2
824 {
825 return true;
826 }
827 if in_inclusive_range8(second, 0xAC, 0xB7) {
828 if second == 0xAC {
829 if third > 0x9C {
830 return true;
831 }
832 } else {
833 return true;
834 }
835 } else if in_inclusive_range8(second, 0xB9, 0xBB) {
836 if second == 0xB9 {
837 if third > 0xAF {
838 return true;
839 }
840 } else if second == 0xBB {
841 if third != 0xBF {
842 return true;
843 }
844 } else {
845 return true;
846 }
847 }
848 read += 3;
849 }
850 0xE0 => {
851 // Three-byte special lower bound, potentially bidi
852 let second = unsafe { *(src.get_unchecked(read + 1)) };
853 let third = unsafe { *(src.get_unchecked(read + 2)) };
854 if ((UTF8_DATA.table[usize::from(second)]
855 & unsafe {
856 *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
857 })
858 | (third >> 6))
859 != 2
860 {
861 return true;
862 }
863 // XXX can this be folded into the above validity check
864 if second < 0xA4 {
865 return true;
866 }
867 read += 3;
868 }
869 0xED => {
870 // Three-byte special upper bound
871 let second = unsafe { *(src.get_unchecked(read + 1)) };
872 let third = unsafe { *(src.get_unchecked(read + 2)) };
873 if ((UTF8_DATA.table[usize::from(second)]
874 & unsafe {
875 *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
876 })
877 | (third >> 6))
878 != 2
879 {
880 return true;
881 }
882 read += 3;
883 }
884 0xF1..=0xF4 => {
885 // Four-byte normal
886 let second = unsafe { *(src.get_unchecked(read + 1)) };
887 let third = unsafe { *(src.get_unchecked(read + 2)) };
888 let fourth = unsafe { *(src.get_unchecked(read + 3)) };
889 if (u16::from(
890 UTF8_DATA.table[usize::from(second)]
891 & unsafe {
892 *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
893 },
894 ) | u16::from(third >> 6)
895 | (u16::from(fourth & 0xC0) << 2))
896 != 0x202
897 {
898 return true;
899 }
900 read += 4;
901 }
902 0xF0 => {
903 // Four-byte special lower bound, potentially bidi
904 let second = unsafe { *(src.get_unchecked(read + 1)) };
905 let third = unsafe { *(src.get_unchecked(read + 2)) };
906 let fourth = unsafe { *(src.get_unchecked(read + 3)) };
907 if (u16::from(
908 UTF8_DATA.table[usize::from(second)]
909 & unsafe {
910 *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
911 },
912 ) | u16::from(third >> 6)
913 | (u16::from(fourth & 0xC0) << 2))
914 != 0x202
915 {
916 return true;
917 }
918 if unsafe { unlikely(second == 0x90 || second == 0x9E) } {
919 let third = src[read + 2];
920 if third >= 0xA0 {
921 return true;
922 }
923 }
924 read += 4;
925 }
926 _ => {
927 // Invalid lead or bidi-only lead
928 return true;
929 }
930 }
931 if read + 4 > src.len() {
932 if read == src.len() {
933 return false;
934 }
935 byte = src[read];
936 break 'inner;
937 }
938 byte = src[read];
939 continue 'inner;
940 }
941 }
942 // We can't have a complete 4-byte sequence, but we could still have
943 // a complete shorter sequence.
944
945 // At this point, `byte` is not included in `read`.
946 match byte {
947 0..=0x7F => {
948 // ASCII: go back to SIMD.
949 read += 1;
950 src = &src[read..];
951 continue 'outer;
952 }
953 0xC2..=0xD5 => {
954 // Two-byte
955 let new_read = read + 2;
956 if new_read > src.len() {
957 return true;
958 }
959 let second = unsafe { *(src.get_unchecked(read + 1)) };
960 if !in_inclusive_range8(second, 0x80, 0xBF) {
961 return true;
962 }
963 read = new_read;
964 // We need to deal with the case where we came here with 3 bytes
965 // left, so we need to take a look at the last one.
966 src = &src[read..];
967 continue 'outer;
968 }
969 0xD6 => {
970 // Two-byte, potentially bidi
971 let new_read = read + 2;
972 if new_read > src.len() {
973 return true;
974 }
975 let second = unsafe { *(src.get_unchecked(read + 1)) };
976 if !in_inclusive_range8(second, 0x80, 0xBF) {
977 return true;
978 }
979 // XXX consider folding the above and below checks
980 if second > 0x8F {
981 return true;
982 }
983 read = new_read;
984 // We need to deal with the case where we came here with 3 bytes
985 // left, so we need to take a look at the last one.
986 src = &src[read..];
987 continue 'outer;
988 }
989 // two-byte starting with 0xD7 and above is bidi
990 0xE1 | 0xE3..=0xEC | 0xEE => {
991 // Three-byte normal
992 let new_read = read + 3;
993 if new_read > src.len() {
994 return true;
995 }
996 let second = unsafe { *(src.get_unchecked(read + 1)) };
997 let third = unsafe { *(src.get_unchecked(read + 2)) };
998 if ((UTF8_DATA.table[usize::from(second)]
999 & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1000 | (third >> 6))
1001 != 2
1002 {
1003 return true;
1004 }
1005 }
1006 0xE2 => {
1007 // Three-byte normal, potentially bidi
1008 let new_read = read + 3;
1009 if new_read > src.len() {
1010 return true;
1011 }
1012 let second = unsafe { *(src.get_unchecked(read + 1)) };
1013 let third = unsafe { *(src.get_unchecked(read + 2)) };
1014 if ((UTF8_DATA.table[usize::from(second)]
1015 & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1016 | (third >> 6))
1017 != 2
1018 {
1019 return true;
1020 }
1021 if second == 0x80 {
1022 if third == 0x8F || third == 0xAB || third == 0xAE {
1023 return true;
1024 }
1025 } else if second == 0x81 {
1026 if third == 0xA7 {
1027 return true;
1028 }
1029 }
1030 }
1031 0xEF => {
1032 // Three-byte normal, potentially bidi
1033 let new_read = read + 3;
1034 if new_read > src.len() {
1035 return true;
1036 }
1037 let second = unsafe { *(src.get_unchecked(read + 1)) };
1038 let third = unsafe { *(src.get_unchecked(read + 2)) };
1039 if ((UTF8_DATA.table[usize::from(second)]
1040 & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1041 | (third >> 6))
1042 != 2
1043 {
1044 return true;
1045 }
1046 if in_inclusive_range8(second, 0xAC, 0xB7) {
1047 if second == 0xAC {
1048 if third > 0x9C {
1049 return true;
1050 }
1051 } else {
1052 return true;
1053 }
1054 } else if in_inclusive_range8(second, 0xB9, 0xBB) {
1055 if second == 0xB9 {
1056 if third > 0xAF {
1057 return true;
1058 }
1059 } else if second == 0xBB {
1060 if third != 0xBF {
1061 return true;
1062 }
1063 } else {
1064 return true;
1065 }
1066 }
1067 }
1068 0xE0 => {
1069 // Three-byte special lower bound, potentially bidi
1070 let new_read = read + 3;
1071 if new_read > src.len() {
1072 return true;
1073 }
1074 let second = unsafe { *(src.get_unchecked(read + 1)) };
1075 let third = unsafe { *(src.get_unchecked(read + 2)) };
1076 if ((UTF8_DATA.table[usize::from(second)]
1077 & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1078 | (third >> 6))
1079 != 2
1080 {
1081 return true;
1082 }
1083 // XXX can this be folded into the above validity check
1084 if second < 0xA4 {
1085 return true;
1086 }
1087 }
1088 0xED => {
1089 // Three-byte special upper bound
1090 let new_read = read + 3;
1091 if new_read > src.len() {
1092 return true;
1093 }
1094 let second = unsafe { *(src.get_unchecked(read + 1)) };
1095 let third = unsafe { *(src.get_unchecked(read + 2)) };
1096 if ((UTF8_DATA.table[usize::from(second)]
1097 & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1098 | (third >> 6))
1099 != 2
1100 {
1101 return true;
1102 }
1103 }
1104 _ => {
1105 // Invalid lead, 4-byte lead or 2-byte bidi-only lead
1106 return true;
1107 }
1108 }
1109 return false;
1110 } else {
1111 return false;
1112 }
1113 }
1114 }
1115
1116 /// Checks whether a valid UTF-8 buffer contains code points that trigger
1117 /// right-to-left processing.
1118 ///
1119 /// The check is done on a Unicode block basis without regard to assigned
1120 /// vs. unassigned code points in the block. Hebrew presentation forms in
1121 /// the Alphabetic Presentation Forms block are treated as if they formed
1122 /// a block on their own (i.e. it treated as right-to-left). Additionally,
1123 /// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
1124 /// for. Control characters that are technically bidi controls but do not
1125 /// cause right-to-left behavior without the presence of right-to-left
1126 /// characters or right-to-left controls are not checked for. As a special
1127 /// case, U+FEFF is excluded from Arabic Presentation Forms-B.
1128 #[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
1129 #[inline]
is_str_bidi(buffer: &str) -> bool1130 pub fn is_str_bidi(buffer: &str) -> bool {
1131 // U+058F: D6 8F
1132 // U+0590: D6 90
1133 // U+08FF: E0 A3 BF
1134 // U+0900: E0 A4 80
1135 //
1136 // U+200F: E2 80 8F
1137 // U+202B: E2 80 AB
1138 // U+202E: E2 80 AE
1139 // U+2067: E2 81 A7
1140 //
1141 // U+FB1C: EF AC 9C
1142 // U+FB1D: EF AC 9D
1143 // U+FDFF: EF B7 BF
1144 // U+FE00: EF B8 80
1145 //
1146 // U+FE6F: EF B9 AF
1147 // U+FE70: EF B9 B0
1148 // U+FEFE: EF BB BE
1149 // U+FEFF: EF BB BF
1150 //
1151 // U+107FF: F0 90 9F BF
1152 // U+10800: F0 90 A0 80
1153 // U+10FFF: F0 90 BF BF
1154 // U+11000: F0 91 80 80
1155 //
1156 // U+1E7FF: F0 9E 9F BF
1157 // U+1E800: F0 9E A0 80
1158 // U+1EFFF: F0 9E BF BF
1159 // U+1F000: F0 9F 80 80
1160 let mut bytes = buffer.as_bytes();
1161 'outer: loop {
1162 // TODO: Instead of just validating ASCII using SIMD, use SIMD
1163 // to check for non-ASCII lead bytes, too, to quickly conclude
1164 // that the vector consist entirely of CJK and below-Hebrew
1165 // code points.
1166 // Unfortunately, scripts above Arabic but below CJK share
1167 // lead bytes with RTL.
1168 if let Some((mut byte, mut read)) = validate_ascii(bytes) {
1169 'inner: loop {
1170 // At this point, `byte` is not included in `read`.
1171 if byte < 0xE0 {
1172 if byte >= 0x80 {
1173 // Two-byte
1174 // Adding `unlikely` here improved throughput on
1175 // Russian plain text by 33%!
1176 if unsafe { unlikely(byte >= 0xD6) } {
1177 if byte == 0xD6 {
1178 let second = bytes[read + 1];
1179 if second > 0x8F {
1180 return true;
1181 }
1182 } else {
1183 return true;
1184 }
1185 }
1186 read += 2;
1187 } else {
1188 // ASCII: write and go back to SIMD.
1189 read += 1;
1190 // Intuitively, we should go back to the outer loop only
1191 // if byte is 0x30 or above, so as to avoid trashing on
1192 // ASCII space, comma and period in non-Latin context.
1193 // However, the extra branch seems to cost more than it's
1194 // worth.
1195 bytes = &bytes[read..];
1196 continue 'outer;
1197 }
1198 } else if byte < 0xF0 {
1199 // Three-byte
1200 if unsafe { unlikely(!in_inclusive_range8(byte, 0xE3, 0xEE) && byte != 0xE1) } {
1201 let second = bytes[read + 1];
1202 if byte == 0xE0 {
1203 if second < 0xA4 {
1204 return true;
1205 }
1206 } else if byte == 0xE2 {
1207 let third = bytes[read + 2];
1208 if second == 0x80 {
1209 if third == 0x8F || third == 0xAB || third == 0xAE {
1210 return true;
1211 }
1212 } else if second == 0x81 {
1213 if third == 0xA7 {
1214 return true;
1215 }
1216 }
1217 } else {
1218 debug_assert_eq!(byte, 0xEF);
1219 if in_inclusive_range8(second, 0xAC, 0xB7) {
1220 if second == 0xAC {
1221 let third = bytes[read + 2];
1222 if third > 0x9C {
1223 return true;
1224 }
1225 } else {
1226 return true;
1227 }
1228 } else if in_inclusive_range8(second, 0xB9, 0xBB) {
1229 if second == 0xB9 {
1230 let third = bytes[read + 2];
1231 if third > 0xAF {
1232 return true;
1233 }
1234 } else if second == 0xBB {
1235 let third = bytes[read + 2];
1236 if third != 0xBF {
1237 return true;
1238 }
1239 } else {
1240 return true;
1241 }
1242 }
1243 }
1244 }
1245 read += 3;
1246 } else {
1247 // Four-byte
1248 let second = bytes[read + 1];
1249 if unsafe { unlikely(byte == 0xF0 && (second == 0x90 || second == 0x9E)) } {
1250 let third = bytes[read + 2];
1251 if third >= 0xA0 {
1252 return true;
1253 }
1254 }
1255 read += 4;
1256 }
1257 // The comparison is always < or == and never >, but including
1258 // > here to let the compiler assume that < is true if this
1259 // comparison is false.
1260 if read >= bytes.len() {
1261 return false;
1262 }
1263 byte = bytes[read];
1264 continue 'inner;
1265 }
1266 } else {
1267 return false;
1268 }
1269 }
1270 }
1271
1272 /// Checks whether a UTF-16 buffer contains code points that trigger
1273 /// right-to-left processing.
1274 ///
1275 /// The check is done on a Unicode block basis without regard to assigned
1276 /// vs. unassigned code points in the block. Hebrew presentation forms in
1277 /// the Alphabetic Presentation Forms block are treated as if they formed
1278 /// a block on their own (i.e. it treated as right-to-left). Additionally,
1279 /// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
1280 /// for. Control characters that are technically bidi controls but do not
1281 /// cause right-to-left behavior without the presence of right-to-left
1282 /// characters or right-to-left controls are not checked for. As a special
1283 /// case, U+FEFF is excluded from Arabic Presentation Forms-B.
1284 ///
1285 /// Returns `true` if the input contains an RTL character or an unpaired
1286 /// high surrogate that could be the high half of an RTL character.
1287 /// Returns `false` if the input contains neither RTL characters nor
1288 /// unpaired high surrogates that could be higher halves of RTL characters.
is_utf16_bidi(buffer: &[u16]) -> bool1289 pub fn is_utf16_bidi(buffer: &[u16]) -> bool {
1290 is_utf16_bidi_impl(buffer)
1291 }
1292
/// Checks whether a scalar value triggers right-to-left processing.
///
/// Classification is done per Unicode block and ignores whether the
/// individual code points in a block are assigned. The Hebrew presentation
/// forms in the Alphabetic Presentation Forms block are handled as if they
/// formed a block of their own (i.e. they are treated as right-to-left).
/// Additionally, the four RIGHT-TO-LEFT FOO controls in General Punctuation
/// are detected. Control characters that are technically bidi controls but
/// do not cause right-to-left behavior without the presence of right-to-left
/// characters or right-to-left controls are not checked for. As a special
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
#[inline(always)]
pub fn is_char_bidi(c: char) -> bool {
    let code_point = u32::from(c);
    // Fast path: everything below Hebrew.
    if code_point < 0x0590 {
        return false;
    }
    match code_point {
        // BMP RTL (https://www.unicode.org/roadmaps/bmp/):
        // Hebrew up to and including Arabic Extended-A.
        0x0590..=0x08FF => true,
        // Every control with RIGHT-TO-LEFT in its name in
        // https://www.unicode.org/charts/PDF/U2000.pdf:
        // RLM, RLE, RLO and RLI.
        0x200F | 0x202B | 0x202E | 0x2067 => true,
        // Hebrew presentation forms and Arabic Presentation Forms-A.
        0xFB1D..=0xFDFF => true,
        // Arabic Presentation Forms-B without the BOM (U+FEFF).
        0xFE70..=0xFEFE => true,
        // Supplementary RTL (https://www.unicode.org/roadmaps/smp/):
        // lead surrogates U+D802 and U+D803.
        0x1_0800..=0x1_0FFF => true,
        // Lead surrogates U+D83A and U+D83B.
        0x1_E800..=0x1_EFFF => true,
        // Everything else, including emoji above the second astral range.
        _ => false,
    }
}
1360
/// Checks whether a UTF-16 code unit triggers right-to-left processing.
///
/// Classification is done per Unicode block and ignores whether the
/// individual code points in a block are assigned. The Hebrew presentation
/// forms in the Alphabetic Presentation Forms block are handled as if they
/// formed a block of their own (i.e. they are treated as right-to-left).
/// Additionally, the four RIGHT-TO-LEFT FOO controls in General Punctuation
/// are detected. Control characters that are technically bidi controls but
/// do not cause right-to-left behavior without the presence of right-to-left
/// characters or right-to-left controls are not checked for. As a special
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
///
/// Supplementary-plane right-to-left blocks can be identified from the high
/// surrogate alone, without looking at the low surrogate. This function
/// therefore returns `true` for such high surrogates, which makes it usable
/// on supplementary-plane text without decoding surrogate pairs to scalar
/// values. As a consequence, such high surrogates are reported as
/// right-to-left even when they are actually unpaired.
#[inline(always)]
pub fn is_utf16_code_unit_bidi(u: u16) -> bool {
    // Fast path: everything below Hebrew.
    if u < 0x0590 {
        return false;
    }
    match u {
        // Hebrew up to and including Arabic Extended-A.
        0x0590..=0x08FF => true,
        // RLM, RLE, RLO and RLI.
        0x200F | 0x202B | 0x202E | 0x2067 => true,
        // High surrogates for U+10800...U+10FFF.
        0xD802 | 0xD803 => true,
        // High surrogates for U+1E800...U+1EFFF.
        0xD83A | 0xD83B => true,
        // Hebrew presentation forms and Arabic Presentation Forms-A.
        0xFB1D..=0xFDFF => true,
        // Arabic Presentation Forms-B without the BOM (U+FEFF).
        0xFE70..=0xFEFE => true,
        // Everything else, including the emoji high surrogates.
        _ => false,
    }
}
1412
1413 /// Checks whether a potentially invalid UTF-8 buffer contains code points
1414 /// that trigger right-to-left processing or is all-Latin1.
1415 ///
1416 /// Possibly more efficient than performing the checks separately.
1417 ///
1418 /// Returns `Latin1Bidi::Latin1` if `is_utf8_latin1()` would return `true`.
1419 /// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf8_bidi()` would return
1420 /// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
check_utf8_for_latin1_and_bidi(buffer: &[u8]) -> Latin1Bidi1421 pub fn check_utf8_for_latin1_and_bidi(buffer: &[u8]) -> Latin1Bidi {
1422 if let Some(offset) = is_utf8_latin1_impl(buffer) {
1423 if is_utf8_bidi(&buffer[offset..]) {
1424 Latin1Bidi::Bidi
1425 } else {
1426 Latin1Bidi::LeftToRight
1427 }
1428 } else {
1429 Latin1Bidi::Latin1
1430 }
1431 }
1432
1433 /// Checks whether a valid UTF-8 buffer contains code points
1434 /// that trigger right-to-left processing or is all-Latin1.
1435 ///
1436 /// Possibly more efficient than performing the checks separately.
1437 ///
1438 /// Returns `Latin1Bidi::Latin1` if `is_str_latin1()` would return `true`.
1439 /// Otherwise, returns `Latin1Bidi::Bidi` if `is_str_bidi()` would return
1440 /// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
check_str_for_latin1_and_bidi(buffer: &str) -> Latin1Bidi1441 pub fn check_str_for_latin1_and_bidi(buffer: &str) -> Latin1Bidi {
1442 // The transition from the latin1 check to the bidi check isn't
1443 // optimal but not tweaking it to perfection today.
1444 if let Some(offset) = is_str_latin1_impl(buffer) {
1445 if is_str_bidi(&buffer[offset..]) {
1446 Latin1Bidi::Bidi
1447 } else {
1448 Latin1Bidi::LeftToRight
1449 }
1450 } else {
1451 Latin1Bidi::Latin1
1452 }
1453 }
1454
1455 /// Checks whether a potentially invalid UTF-16 buffer contains code points
1456 /// that trigger right-to-left processing or is all-Latin1.
1457 ///
1458 /// Possibly more efficient than performing the checks separately.
1459 ///
1460 /// Returns `Latin1Bidi::Latin1` if `is_utf16_latin1()` would return `true`.
1461 /// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf16_bidi()` would return
1462 /// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
check_utf16_for_latin1_and_bidi(buffer: &[u16]) -> Latin1Bidi1463 pub fn check_utf16_for_latin1_and_bidi(buffer: &[u16]) -> Latin1Bidi {
1464 check_utf16_for_latin1_and_bidi_impl(buffer)
1465 }
1466
1467 /// Converts potentially-invalid UTF-8 to valid UTF-16 with errors replaced
1468 /// with the REPLACEMENT CHARACTER.
1469 ///
1470 /// The length of the destination buffer must be at least the length of the
1471 /// source buffer _plus one_.
1472 ///
1473 /// Returns the number of `u16`s written.
1474 ///
1475 /// # Panics
1476 ///
1477 /// Panics if the destination buffer is shorter than stated above.
convert_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> usize1478 pub fn convert_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> usize {
1479 // TODO: Can the requirement for dst to be at least one unit longer
1480 // be eliminated?
1481 assert!(dst.len() > src.len());
1482 let mut decoder = Utf8Decoder::new_inner();
1483 let mut total_read = 0usize;
1484 let mut total_written = 0usize;
1485 loop {
1486 let (result, read, written) =
1487 decoder.decode_to_utf16_raw(&src[total_read..], &mut dst[total_written..], true);
1488 total_read += read;
1489 total_written += written;
1490 match result {
1491 DecoderResult::InputEmpty => {
1492 return total_written;
1493 }
1494 DecoderResult::OutputFull => {
1495 unreachable!("The assert at the top of the function should have caught this.");
1496 }
1497 DecoderResult::Malformed(_, _) => {
1498 // There should always be space for the U+FFFD, because
1499 // otherwise we'd have gotten OutputFull already.
1500 dst[total_written] = 0xFFFD;
1501 total_written += 1;
1502 }
1503 }
1504 }
1505 }
1506
1507 /// Converts valid UTF-8 to valid UTF-16.
1508 ///
1509 /// The length of the destination buffer must be at least the length of the
1510 /// source buffer.
1511 ///
1512 /// Returns the number of `u16`s written.
1513 ///
1514 /// # Panics
1515 ///
1516 /// Panics if the destination buffer is shorter than stated above.
convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize1517 pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize {
1518 assert!(
1519 dst.len() >= src.len(),
1520 "Destination must not be shorter than the source."
1521 );
1522 let bytes = src.as_bytes();
1523 let mut read = 0;
1524 let mut written = 0;
1525 'outer: loop {
1526 let mut byte = {
1527 let src_remaining = &bytes[read..];
1528 let dst_remaining = &mut dst[written..];
1529 let length = src_remaining.len();
1530 match unsafe {
1531 ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
1532 } {
1533 None => {
1534 written += length;
1535 return written;
1536 }
1537 Some((non_ascii, consumed)) => {
1538 read += consumed;
1539 written += consumed;
1540 non_ascii
1541 }
1542 }
1543 };
1544 'inner: loop {
1545 // At this point, `byte` is not included in `read`.
1546 if byte < 0xE0 {
1547 if byte >= 0x80 {
1548 // Two-byte
1549 let second = unsafe { *(bytes.get_unchecked(read + 1)) };
1550 let point = ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F);
1551 unsafe { *(dst.get_unchecked_mut(written)) = point };
1552 read += 2;
1553 written += 1;
1554 } else {
1555 // ASCII: write and go back to SIMD.
1556 unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
1557 read += 1;
1558 written += 1;
1559 // Intuitively, we should go back to the outer loop only
1560 // if byte is 0x30 or above, so as to avoid trashing on
1561 // ASCII space, comma and period in non-Latin context.
1562 // However, the extra branch seems to cost more than it's
1563 // worth.
1564 continue 'outer;
1565 }
1566 } else if byte < 0xF0 {
1567 // Three-byte
1568 let second = unsafe { *(bytes.get_unchecked(read + 1)) };
1569 let third = unsafe { *(bytes.get_unchecked(read + 2)) };
1570 let point = ((u16::from(byte) & 0xF) << 12)
1571 | ((u16::from(second) & 0x3F) << 6)
1572 | (u16::from(third) & 0x3F);
1573 unsafe { *(dst.get_unchecked_mut(written)) = point };
1574 read += 3;
1575 written += 1;
1576 } else {
1577 // Four-byte
1578 let second = unsafe { *(bytes.get_unchecked(read + 1)) };
1579 let third = unsafe { *(bytes.get_unchecked(read + 2)) };
1580 let fourth = unsafe { *(bytes.get_unchecked(read + 3)) };
1581 let point = ((u32::from(byte) & 0x7) << 18)
1582 | ((u32::from(second) & 0x3F) << 12)
1583 | ((u32::from(third) & 0x3F) << 6)
1584 | (u32::from(fourth) & 0x3F);
1585 unsafe { *(dst.get_unchecked_mut(written)) = (0xD7C0 + (point >> 10)) as u16 };
1586 unsafe {
1587 *(dst.get_unchecked_mut(written + 1)) = (0xDC00 + (point & 0x3FF)) as u16
1588 };
1589 read += 4;
1590 written += 2;
1591 }
1592 // The comparison is always < or == and never >, but including
1593 // > here to let the compiler assume that < is true if this
1594 // comparison is false.
1595 if read >= src.len() {
1596 return written;
1597 }
1598 byte = bytes[read];
1599 continue 'inner;
1600 }
1601 }
1602 }
1603
1604 /// Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error.
1605 ///
1606 /// The length of the destination buffer must be at least the length of the
1607 /// source buffer.
1608 ///
1609 /// Returns the number of `u16`s written or `None` if the input was invalid.
1610 ///
1611 /// When the input was invalid, some output may have been written.
1612 ///
1613 /// # Panics
1614 ///
1615 /// Panics if the destination buffer is shorter than stated above.
convert_utf8_to_utf16_without_replacement(src: &[u8], dst: &mut [u16]) -> Option<usize>1616 pub fn convert_utf8_to_utf16_without_replacement(src: &[u8], dst: &mut [u16]) -> Option<usize> {
1617 assert!(
1618 dst.len() >= src.len(),
1619 "Destination must not be shorter than the source."
1620 );
1621 let (read, written) = convert_utf8_to_utf16_up_to_invalid(src, dst);
1622 if read == src.len() {
1623 return Some(written);
1624 }
1625 None
1626 }
1627
1628 /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1629 /// with the REPLACEMENT CHARACTER with potentially insufficient output
1630 /// space.
1631 ///
1632 /// Returns the number of code units read and the number of bytes written.
1633 ///
1634 /// Guarantees that the bytes in the destination beyond the number of
1635 /// bytes claimed as written by the second item of the return tuple
1636 /// are left unmodified.
1637 ///
1638 /// Not all code units are read if there isn't enough output space.
1639 ///
1640 /// Note that this method isn't designed for general streamability but for
1641 /// not allocating memory for the worst case up front. Specifically,
1642 /// if the input starts with or ends with an unpaired surrogate, those are
1643 /// replaced with the REPLACEMENT CHARACTER.
1644 ///
1645 /// Matches the semantics of `TextEncoder.encodeInto()` from the
1646 /// Encoding Standard.
1647 ///
1648 /// # Safety
1649 ///
1650 /// If you want to convert into a `&mut str`, use
1651 /// `convert_utf16_to_str_partial()` instead of using this function
1652 /// together with the `unsafe` method `as_bytes_mut()` on `&mut str`.
1653 #[inline(always)]
convert_utf16_to_utf8_partial(src: &[u16], dst: &mut [u8]) -> (usize, usize)1654 pub fn convert_utf16_to_utf8_partial(src: &[u16], dst: &mut [u8]) -> (usize, usize) {
1655 // The two functions called below are marked `inline(never)` to make
1656 // transitions from the hot part (first function) into the cold part
1657 // (second function) go through a return and another call to discouge
1658 // the CPU from speculating from the hot code into the cold code.
1659 // Letting the transitions be mere intra-function jumps, even to
1660 // basic blocks out-of-lined to the end of the function would wipe
1661 // away a quarter of Arabic encode performance on Haswell!
1662 let (read, written) = convert_utf16_to_utf8_partial_inner(src, dst);
1663 if unsafe { likely(read == src.len()) } {
1664 return (read, written);
1665 }
1666 let (tail_read, tail_written) =
1667 convert_utf16_to_utf8_partial_tail(&src[read..], &mut dst[written..]);
1668 (read + tail_read, written + tail_written)
1669 }
1670
1671 /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1672 /// with the REPLACEMENT CHARACTER.
1673 ///
1674 /// The length of the destination buffer must be at least the length of the
1675 /// source buffer times three.
1676 ///
1677 /// Returns the number of bytes written.
1678 ///
1679 /// # Panics
1680 ///
1681 /// Panics if the destination buffer is shorter than stated above.
1682 ///
1683 /// # Safety
1684 ///
1685 /// If you want to convert into a `&mut str`, use `convert_utf16_to_str()`
1686 /// instead of using this function together with the `unsafe` method
1687 /// `as_bytes_mut()` on `&mut str`.
1688 #[inline(always)]
convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize1689 pub fn convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize {
1690 assert!(dst.len() >= src.len() * 3);
1691 let (read, written) = convert_utf16_to_utf8_partial(src, dst);
1692 debug_assert_eq!(read, src.len());
1693 written
1694 }
1695
1696 /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1697 /// with the REPLACEMENT CHARACTER such that the validity of the output is
1698 /// signaled using the Rust type system with potentially insufficient output
1699 /// space.
1700 ///
1701 /// Returns the number of code units read and the number of bytes written.
1702 ///
1703 /// Not all code units are read if there isn't enough output space.
1704 ///
1705 /// Note that this method isn't designed for general streamability but for
1706 /// not allocating memory for the worst case up front. Specifically,
1707 /// if the input starts with or ends with an unpaired surrogate, those are
1708 /// replaced with the REPLACEMENT CHARACTER.
convert_utf16_to_str_partial(src: &[u16], dst: &mut str) -> (usize, usize)1709 pub fn convert_utf16_to_str_partial(src: &[u16], dst: &mut str) -> (usize, usize) {
1710 let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
1711 let (read, written) = convert_utf16_to_utf8_partial(src, bytes);
1712 let len = bytes.len();
1713 let mut trail = written;
1714 while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
1715 bytes[trail] = 0;
1716 trail += 1;
1717 }
1718 (read, written)
1719 }
1720
1721 /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1722 /// with the REPLACEMENT CHARACTER such that the validity of the output is
1723 /// signaled using the Rust type system.
1724 ///
1725 /// The length of the destination buffer must be at least the length of the
1726 /// source buffer times three.
1727 ///
1728 /// Returns the number of bytes written.
1729 ///
1730 /// # Panics
1731 ///
1732 /// Panics if the destination buffer is shorter than stated above.
1733 #[inline(always)]
convert_utf16_to_str(src: &[u16], dst: &mut str) -> usize1734 pub fn convert_utf16_to_str(src: &[u16], dst: &mut str) -> usize {
1735 assert!(dst.len() >= src.len() * 3);
1736 let (read, written) = convert_utf16_to_str_partial(src, dst);
1737 debug_assert_eq!(read, src.len());
1738 written
1739 }
1740
1741 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1742 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-16.
1743 ///
1744 /// The length of the destination buffer must be at least the length of the
1745 /// source buffer.
1746 ///
1747 /// The number of `u16`s written equals the length of the source buffer.
1748 ///
1749 /// # Panics
1750 ///
1751 /// Panics if the destination buffer is shorter than stated above.
convert_latin1_to_utf16(src: &[u8], dst: &mut [u16])1752 pub fn convert_latin1_to_utf16(src: &[u8], dst: &mut [u16]) {
1753 assert!(
1754 dst.len() >= src.len(),
1755 "Destination must not be shorter than the source."
1756 );
1757 // TODO: On aarch64, the safe version autovectorizes to the same unpacking
1758 // instructions and this code, but, yet, the autovectorized version is
1759 // faster.
1760 unsafe {
1761 unpack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len());
1762 }
1763 }
1764
1765 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1766 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient
1767 /// output space.
1768 ///
1769 /// Returns the number of bytes read and the number of bytes written.
1770 ///
1771 /// If the output isn't large enough, not all input is consumed.
1772 ///
1773 /// # Safety
1774 ///
1775 /// If you want to convert into a `&mut str`, use
1776 /// `convert_utf16_to_str_partial()` instead of using this function
1777 /// together with the `unsafe` method `as_bytes_mut()` on `&mut str`.
convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usize)1778 pub fn convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usize) {
1779 let src_len = src.len();
1780 let src_ptr = src.as_ptr();
1781 let dst_ptr = dst.as_mut_ptr();
1782 let dst_len = dst.len();
1783 let mut total_read = 0usize;
1784 let mut total_written = 0usize;
1785 loop {
1786 // src can't advance more than dst
1787 let src_left = src_len - total_read;
1788 let dst_left = dst_len - total_written;
1789 let min_left = ::core::cmp::min(src_left, dst_left);
1790 if let Some((non_ascii, consumed)) = unsafe {
1791 ascii_to_ascii(
1792 src_ptr.add(total_read),
1793 dst_ptr.add(total_written),
1794 min_left,
1795 )
1796 } {
1797 total_read += consumed;
1798 total_written += consumed;
1799 if total_written.checked_add(2).unwrap() > dst_len {
1800 return (total_read, total_written);
1801 }
1802
1803 total_read += 1; // consume `non_ascii`
1804
1805 dst[total_written] = (non_ascii >> 6) | 0xC0;
1806 total_written += 1;
1807 dst[total_written] = (non_ascii & 0x3F) | 0x80;
1808 total_written += 1;
1809 continue;
1810 }
1811 return (total_read + min_left, total_written + min_left);
1812 }
1813 }
1814
1815 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1816 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
1817 ///
1818 /// The length of the destination buffer must be at least the length of the
1819 /// source buffer times two.
1820 ///
1821 /// Returns the number of bytes written.
1822 ///
1823 /// # Panics
1824 ///
1825 /// Panics if the destination buffer is shorter than stated above.
1826 ///
1827 /// # Safety
1828 ///
1829 /// Note that this function may write garbage beyond the number of bytes
1830 /// indicated by the return value, so using a `&mut str` interpreted as
1831 /// `&mut [u8]` as the destination is not safe. If you want to convert into
1832 /// a `&mut str`, use `convert_utf16_to_str()` instead of this function.
1833 #[inline]
convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize1834 pub fn convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize {
1835 assert!(
1836 dst.len() >= src.len() * 2,
1837 "Destination must not be shorter than the source times two."
1838 );
1839 let (read, written) = convert_latin1_to_utf8_partial(src, dst);
1840 debug_assert_eq!(read, src.len());
1841 written
1842 }
1843
1844 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1845 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the
1846 /// output is signaled using the Rust type system with potentially insufficient
1847 /// output space.
1848 ///
1849 /// Returns the number of bytes read and the number of bytes written.
1850 ///
1851 /// If the output isn't large enough, not all input is consumed.
1852 #[inline]
convert_latin1_to_str_partial(src: &[u8], dst: &mut str) -> (usize, usize)1853 pub fn convert_latin1_to_str_partial(src: &[u8], dst: &mut str) -> (usize, usize) {
1854 let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
1855 let (read, written) = convert_latin1_to_utf8_partial(src, bytes);
1856 let len = bytes.len();
1857 let mut trail = written;
1858 let max = ::core::cmp::min(len, trail + MAX_STRIDE_SIZE);
1859 while trail < max {
1860 bytes[trail] = 0;
1861 trail += 1;
1862 }
1863 while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
1864 bytes[trail] = 0;
1865 trail += 1;
1866 }
1867 (read, written)
1868 }
1869
1870 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1871 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the
1872 /// output is signaled using the Rust type system.
1873 ///
1874 /// The length of the destination buffer must be at least the length of the
1875 /// source buffer times two.
1876 ///
1877 /// Returns the number of bytes written.
1878 ///
1879 /// # Panics
1880 ///
1881 /// Panics if the destination buffer is shorter than stated above.
1882 #[inline]
convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize1883 pub fn convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize {
1884 assert!(
1885 dst.len() >= src.len() * 2,
1886 "Destination must not be shorter than the source times two."
1887 );
1888 let (read, written) = convert_latin1_to_str_partial(src, dst);
1889 debug_assert_eq!(read, src.len());
1890 written
1891 }
1892
1893 /// If the input is valid UTF-8 representing only Unicode code points from
1894 /// U+0000 to U+00FF, inclusive, converts the input into output that
1895 /// represents the value of each code point as the unsigned byte value of
1896 /// each output byte.
1897 ///
1898 /// If the input does not fulfill the condition stated above, this function
1899 /// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
1900 /// does something that is memory-safe without any promises about any
1901 /// properties of the output. In particular, callers shouldn't assume the
1902 /// output to be the same across crate versions or CPU architectures and
1903 /// should not assume that non-ASCII input can't map to ASCII output.
1904 ///
1905 /// The length of the destination buffer must be at least the length of the
1906 /// source buffer.
1907 ///
1908 /// Returns the number of bytes written.
1909 ///
1910 /// # Panics
1911 ///
1912 /// Panics if the destination buffer is shorter than stated above.
1913 ///
1914 /// If debug assertions are enabled (and not fuzzing) and the input is
1915 /// not in the range U+0000 to U+00FF, inclusive.
convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize1916 pub fn convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize {
1917 assert!(
1918 dst.len() >= src.len(),
1919 "Destination must not be shorter than the source."
1920 );
1921 non_fuzz_debug_assert!(is_utf8_latin1(src));
1922 let src_len = src.len();
1923 let src_ptr = src.as_ptr();
1924 let dst_ptr = dst.as_mut_ptr();
1925 let mut total_read = 0usize;
1926 let mut total_written = 0usize;
1927 loop {
1928 // dst can't advance more than src
1929 let src_left = src_len - total_read;
1930 if let Some((non_ascii, consumed)) = unsafe {
1931 ascii_to_ascii(
1932 src_ptr.add(total_read),
1933 dst_ptr.add(total_written),
1934 src_left,
1935 )
1936 } {
1937 total_read += consumed + 1;
1938 total_written += consumed;
1939
1940 if total_read == src_len {
1941 return total_written;
1942 }
1943
1944 let trail = src[total_read];
1945 total_read += 1;
1946
1947 dst[total_written] = ((non_ascii & 0x1F) << 6) | (trail & 0x3F);
1948 total_written += 1;
1949 continue;
1950 }
1951 return total_written + src_left;
1952 }
1953 }
1954
1955 /// If the input is valid UTF-16 representing only Unicode code points from
1956 /// U+0000 to U+00FF, inclusive, converts the input into output that
1957 /// represents the value of each code point as the unsigned byte value of
1958 /// each output byte.
1959 ///
1960 /// If the input does not fulfill the condition stated above, does something
1961 /// that is memory-safe without any promises about any properties of the
1962 /// output and will probably assert in debug builds in future versions.
1963 /// In particular, callers shouldn't assume the output to be the same across
1964 /// crate versions or CPU architectures and should not assume that non-ASCII
1965 /// input can't map to ASCII output.
1966 ///
1967 /// The length of the destination buffer must be at least the length of the
1968 /// source buffer.
1969 ///
1970 /// The number of bytes written equals the length of the source buffer.
1971 ///
1972 /// # Panics
1973 ///
1974 /// Panics if the destination buffer is shorter than stated above.
1975 ///
1976 /// (Probably in future versions if debug assertions are enabled (and not
1977 /// fuzzing) and the input is not in the range U+0000 to U+00FF, inclusive.)
convert_utf16_to_latin1_lossy(src: &[u16], dst: &mut [u8])1978 pub fn convert_utf16_to_latin1_lossy(src: &[u16], dst: &mut [u8]) {
1979 assert!(
1980 dst.len() >= src.len(),
1981 "Destination must not be shorter than the source."
1982 );
1983 // non_fuzz_debug_assert!(is_utf16_latin1(src));
1984 unsafe {
1985 pack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len());
1986 }
1987 }
1988
/// Converts bytes whose unsigned value is interpreted as Unicode code point
/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
///
/// Borrows if input is ASCII-only. Performs a single heap allocation
/// otherwise.
///
/// Only available if the `alloc` feature is enabled (enabled by default).
#[cfg(feature = "alloc")]
pub fn decode_latin1<'a>(bytes: &'a [u8]) -> Cow<'a, str> {
    let up_to = ascii_valid_up_to(bytes);
    // >= makes later things optimize better than ==
    if up_to >= bytes.len() {
        debug_assert_eq!(up_to, bytes.len());
        // SAFETY: the whole input was reported as ASCII, and ASCII is valid
        // UTF-8.
        let s: &str = unsafe { ::core::str::from_utf8_unchecked(bytes) };
        return Cow::Borrowed(s);
    }
    let (head, tail) = bytes.split_at(up_to);
    // The conversion below requires room for two output bytes per tail byte;
    // the ASCII head is copied as-is.
    let capacity = head.len() + tail.len() * 2;
    let mut vec: Vec<u8> = Vec::with_capacity(capacity);
    // Copy the ASCII prefix and zero-fill the rest instead of calling
    // `set_len` on uninitialized memory, which would violate the safety
    // contract of `Vec::set_len`. Neither call exceeds the reserved
    // capacity, so this is still a single allocation.
    vec.extend_from_slice(head);
    vec.resize(capacity, 0);
    let written = convert_latin1_to_utf8(tail, &mut vec[up_to..]);
    vec.truncate(up_to + written);
    // SAFETY: the buffer holds the ASCII head followed by the output of
    // `convert_latin1_to_utf8`, which is relied upon to emit valid UTF-8.
    Cow::Owned(unsafe { String::from_utf8_unchecked(vec) })
}
2016
/// If the input is valid UTF-8 representing only Unicode code points from
/// U+0000 to U+00FF, inclusive, converts the input into output that
/// represents the value of each code point as the unsigned byte value of
/// each output byte.
///
/// If the input does not fulfill the condition stated above, this function
/// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
/// does something that is memory-safe without any promises about any
/// properties of the output. In particular, callers shouldn't assume the
/// output to be the same across crate versions or CPU architectures and
/// should not assume that non-ASCII input can't map to ASCII output.
///
/// Borrows if input is ASCII-only. Performs a single heap allocation
/// otherwise.
///
/// Only available if the `alloc` feature is enabled (enabled by default).
#[cfg(feature = "alloc")]
pub fn encode_latin1_lossy<'a>(string: &'a str) -> Cow<'a, [u8]> {
    let bytes = string.as_bytes();
    let up_to = ascii_valid_up_to(bytes);
    // >= makes later things optimize better than ==
    if up_to >= bytes.len() {
        debug_assert_eq!(up_to, bytes.len());
        return Cow::Borrowed(bytes);
    }
    let (head, tail) = bytes.split_at(up_to);
    // The conversion needs a destination at least as long as its source,
    // so the input length is enough for head plus converted tail.
    let capacity = bytes.len();
    let mut vec: Vec<u8> = Vec::with_capacity(capacity);
    // Copy the ASCII prefix and zero-fill the rest instead of calling
    // `set_len` on uninitialized memory, which would violate the safety
    // contract of `Vec::set_len`. Neither call exceeds the reserved
    // capacity, so this is still a single allocation.
    vec.extend_from_slice(head);
    vec.resize(capacity, 0);
    let written = convert_utf8_to_latin1_lossy(tail, &mut vec[up_to..]);
    vec.truncate(up_to + written);
    Cow::Owned(vec)
}
2053
2054 /// Returns the index of the first unpaired surrogate or, if the input is
2055 /// valid UTF-16 in its entirety, the length of the input.
utf16_valid_up_to(buffer: &[u16]) -> usize2056 pub fn utf16_valid_up_to(buffer: &[u16]) -> usize {
2057 utf16_valid_up_to_impl(buffer)
2058 }
2059
2060 /// Returns the index of first byte that starts an invalid byte
2061 /// sequence or a non-Latin1 byte sequence, or the length of the
2062 /// string if there are neither.
utf8_latin1_up_to(buffer: &[u8]) -> usize2063 pub fn utf8_latin1_up_to(buffer: &[u8]) -> usize {
2064 is_utf8_latin1_impl(buffer).unwrap_or(buffer.len())
2065 }
2066
2067 /// Returns the index of first byte that starts a non-Latin1 byte
2068 /// sequence, or the length of the string if there are none.
str_latin1_up_to(buffer: &str) -> usize2069 pub fn str_latin1_up_to(buffer: &str) -> usize {
2070 is_str_latin1_impl(buffer).unwrap_or_else(|| buffer.len())
2071 }
2072
2073 /// Replaces unpaired surrogates in the input with the REPLACEMENT CHARACTER.
2074 #[inline]
ensure_utf16_validity(buffer: &mut [u16])2075 pub fn ensure_utf16_validity(buffer: &mut [u16]) {
2076 let mut offset = 0;
2077 loop {
2078 offset += utf16_valid_up_to(&buffer[offset..]);
2079 if offset == buffer.len() {
2080 return;
2081 }
2082 buffer[offset] = 0xFFFD;
2083 offset += 1;
2084 }
2085 }
2086
2087 /// Copies ASCII from source to destination up to the first non-ASCII byte
2088 /// (or the end of the input if it is ASCII in its entirety).
2089 ///
2090 /// The length of the destination buffer must be at least the length of the
2091 /// source buffer.
2092 ///
2093 /// Returns the number of bytes written.
2094 ///
2095 /// # Panics
2096 ///
2097 /// Panics if the destination buffer is shorter than stated above.
copy_ascii_to_ascii(src: &[u8], dst: &mut [u8]) -> usize2098 pub fn copy_ascii_to_ascii(src: &[u8], dst: &mut [u8]) -> usize {
2099 assert!(
2100 dst.len() >= src.len(),
2101 "Destination must not be shorter than the source."
2102 );
2103 if let Some((_, consumed)) =
2104 unsafe { ascii_to_ascii(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2105 {
2106 consumed
2107 } else {
2108 src.len()
2109 }
2110 }
2111
2112 /// Copies ASCII from source to destination zero-extending it to UTF-16 up to
2113 /// the first non-ASCII byte (or the end of the input if it is ASCII in its
2114 /// entirety).
2115 ///
2116 /// The length of the destination buffer must be at least the length of the
2117 /// source buffer.
2118 ///
2119 /// Returns the number of `u16`s written.
2120 ///
2121 /// # Panics
2122 ///
2123 /// Panics if the destination buffer is shorter than stated above.
copy_ascii_to_basic_latin(src: &[u8], dst: &mut [u16]) -> usize2124 pub fn copy_ascii_to_basic_latin(src: &[u8], dst: &mut [u16]) -> usize {
2125 assert!(
2126 dst.len() >= src.len(),
2127 "Destination must not be shorter than the source."
2128 );
2129 if let Some((_, consumed)) =
2130 unsafe { ascii_to_basic_latin(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2131 {
2132 consumed
2133 } else {
2134 src.len()
2135 }
2136 }
2137
2138 /// Copies Basic Latin from source to destination narrowing it to ASCII up to
2139 /// the first non-Basic Latin code unit (or the end of the input if it is
2140 /// Basic Latin in its entirety).
2141 ///
2142 /// The length of the destination buffer must be at least the length of the
2143 /// source buffer.
2144 ///
2145 /// Returns the number of bytes written.
2146 ///
2147 /// # Panics
2148 ///
2149 /// Panics if the destination buffer is shorter than stated above.
copy_basic_latin_to_ascii(src: &[u16], dst: &mut [u8]) -> usize2150 pub fn copy_basic_latin_to_ascii(src: &[u16], dst: &mut [u8]) -> usize {
2151 assert!(
2152 dst.len() >= src.len(),
2153 "Destination must not be shorter than the source."
2154 );
2155 if let Some((_, consumed)) =
2156 unsafe { basic_latin_to_ascii(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2157 {
2158 consumed
2159 } else {
2160 src.len()
2161 }
2162 }
2163
2164 // Any copyright to the test code below this comment is dedicated to the
2165 // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
2166
2167 #[cfg(all(test, feature = "alloc"))]
2168 mod tests {
2169 use super::*;
2170
#[test]
fn test_is_ascii_success() {
    // All 128 ASCII byte values in ascending order.
    let ascii: Vec<u8> = (0u8..128).collect();
    // Every suffix must be recognized as ASCII.
    for start in 0..ascii.len() {
        assert!(is_ascii(&ascii[start..]));
    }
}
2182
#[test]
fn test_is_ascii_fail() {
    let mut bytes: Vec<u8> = (0u8..128).collect();
    for start in 0..bytes.len() {
        let suffix = &mut bytes[start..];
        // Poison one more position at a time; the writes are never undone,
        // so they accumulate across iterations.
        for pos in 0..suffix.len() {
            suffix[pos] = 0xA0;
            assert!(!is_ascii(suffix));
        }
    }
}
2198
#[test]
fn test_is_basic_latin_success() {
    // All 128 Basic Latin code units in ascending order.
    let basic_latin: Vec<u16> = (0u16..128).collect();
    // Every suffix must be recognized as Basic Latin.
    for start in 0..basic_latin.len() {
        assert!(is_basic_latin(&basic_latin[start..]));
    }
}
2210
#[test]
fn test_is_basic_latin_fail() {
    let mut units: Vec<u16> = (0u16..128).collect();
    for start in 0..units.len() {
        let suffix = &mut units[start..];
        // Poison one more position at a time; the writes are never undone,
        // so they accumulate across iterations.
        for pos in 0..suffix.len() {
            suffix[pos] = 0xA0;
            assert!(!is_basic_latin(suffix));
        }
    }
}
2226
#[test]
fn test_is_utf16_latin1_success() {
    // Every code unit from U+0000 to U+00FF in ascending order.
    let latin1: Vec<u16> = (0u16..256).collect();
    for start in 0..latin1.len() {
        let suffix = &latin1[start..];
        assert!(is_utf16_latin1(suffix));
        assert_eq!(check_utf16_for_latin1_and_bidi(suffix), Latin1Bidi::Latin1);
    }
}
2242
#[test]
fn test_is_utf16_latin1_fail() {
    let len: usize = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow
    let mut units: Vec<u16> = (0..len).map(|i| i as u16).collect();
    for start in 0..units.len() {
        let suffix = &mut units[start..];
        // Overwrite one more position with a code unit above U+00FF each
        // time; the writes accumulate across iterations.
        for pos in 0..suffix.len() {
            suffix[pos] = 0x100 + pos as u16;
            assert!(!is_utf16_latin1(suffix));
            assert_ne!(check_utf16_for_latin1_and_bidi(suffix), Latin1Bidi::Latin1);
        }
    }
}
2260
#[test]
fn test_is_str_latin1_success() {
    let len: usize = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow
    let units: Vec<u16> = (0..len).map(|i| i as u16).collect();
    for start in 0..units.len() {
        // Re-encode each suffix as a `String` and check it.
        let text = String::from_utf16(&units[start..]).unwrap();
        assert!(is_str_latin1(text.as_str()));
        assert_eq!(
            check_str_for_latin1_and_bidi(text.as_str()),
            Latin1Bidi::Latin1
        );
    }
}
2275
#[test]
fn test_is_str_latin1_fail() {
    let len: usize = if cfg!(miri) { 32 } else { 256 }; // Miri is too slow
    let mut units: Vec<u16> = (0..len).map(|i| i as u16).collect();
    for start in 0..units.len() {
        let suffix = &mut units[start..];
        // Overwrite one more position with a code unit above U+00FF each
        // time; the writes accumulate across iterations.
        for pos in 0..suffix.len() {
            suffix[pos] = 0x100 + pos as u16;
            let text = String::from_utf16(&suffix[..]).unwrap();
            assert!(!is_str_latin1(text.as_str()));
            assert_ne!(
                check_str_for_latin1_and_bidi(text.as_str()),
                Latin1Bidi::Latin1
            );
        }
    }
}
2294
#[test]
fn test_is_utf8_latin1_success() {
    let len: usize = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow
    let units: Vec<u16> = (0..len).map(|i| i as u16).collect();
    for start in 0..units.len() {
        // Re-encode each suffix as UTF-8 and check the raw bytes.
        let text = String::from_utf16(&units[start..]).unwrap();
        let utf8 = text.as_bytes();
        assert!(is_utf8_latin1(utf8));
        assert_eq!(check_utf8_for_latin1_and_bidi(utf8), Latin1Bidi::Latin1);
    }
}
2312
#[test]
fn test_is_utf8_latin1_fail() {
    let len: usize = if cfg!(miri) { 32 } else { 256 }; // Miri is too slow
    let mut units: Vec<u16> = (0..len).map(|i| i as u16).collect();
    for start in 0..units.len() {
        let suffix = &mut units[start..];
        // Overwrite one more position with a code unit above U+00FF each
        // time; the writes accumulate across iterations.
        for pos in 0..suffix.len() {
            suffix[pos] = 0x100 + pos as u16;
            let text = String::from_utf16(&suffix[..]).unwrap();
            let utf8 = text.as_bytes();
            assert!(!is_utf8_latin1(utf8));
            assert_ne!(check_utf8_for_latin1_and_bidi(utf8), Latin1Bidi::Latin1);
        }
    }
}
2334
#[test]
fn test_is_utf8_latin1_invalid() {
    // Truncated or malformed sequences, with and without an ASCII prefix.
    let malformed: [&[u8]; 6] = [
        b"\xC3",
        b"a\xC3",
        b"\xFF",
        b"a\xFF",
        b"\xC3\xFF",
        b"a\xC3\xFF",
    ];
    for &bytes in malformed.iter() {
        assert!(!is_utf8_latin1(bytes));
    }
}
2344
#[test]
fn test_convert_utf8_to_utf16() {
    let src = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    // The destination is one code unit longer than the source has bytes.
    let mut dst = vec![0u16; src.len() + 1];
    let written = convert_utf8_to_utf16(src.as_bytes(), &mut dst[..]);
    dst.truncate(written);
    let expected: Vec<u16> = src.encode_utf16().collect();
    assert_eq!(dst, expected);
}
2355
#[test]
fn test_convert_str_to_utf16() {
    let src = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    // The destination has as many code units as the source has bytes.
    let mut dst = vec![0u16; src.len()];
    let written = convert_str_to_utf16(src, &mut dst[..]);
    dst.truncate(written);
    let expected: Vec<u16> = src.encode_utf16().collect();
    assert_eq!(dst, expected);
}
2366
#[test]
fn test_convert_utf16_to_utf8_partial() {
    let expected = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    let utf16: Vec<u16> = expected.encode_utf16().collect();
    let mut out = vec![0u8; utf16.len() * 3 + 1];
    // First pass: convert into a destination limited to 24 bytes.
    let (read, written) = convert_utf16_to_utf8_partial(&utf16[..], &mut out[..24]);
    // Second pass: convert whatever the first pass left unread.
    let total = written + convert_utf16_to_utf8(&utf16[read..], &mut out[written..]);
    out.truncate(total);
    assert_eq!(out, expected.as_bytes());
}
2378
#[test]
fn test_convert_utf16_to_utf8() {
    let expected = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    let utf16: Vec<u16> = expected.encode_utf16().collect();
    let mut out = vec![0u8; utf16.len() * 3 + 1];
    let total = convert_utf16_to_utf8(&utf16[..], &mut out[..]);
    out.truncate(total);
    assert_eq!(out, expected.as_bytes());
}
2389
#[test]
fn test_convert_latin1_to_utf16() {
    // Every Latin1 byte value and the code unit it is expected to become.
    let src: Vec<u8> = (0u16..256).map(|v| v as u8).collect();
    let expected: Vec<u16> = (0u16..256).collect();
    let mut dst = vec![0u16; src.len()];
    convert_latin1_to_utf16(&src[..], &mut dst[..]);
    assert_eq!(dst, expected);
}
2405
#[test]
fn test_convert_latin1_to_utf8_partial() {
    // A zeroed two-byte destination. The previous literal `[0u8, 2]` built
    // the two-element array `[0, 2]`; the length is the same, but the
    // repeat form `[0u8; 2]` states the intent.
    let mut dst = [0u8; 2];
    let (read, written) = convert_latin1_to_utf8_partial(b"a\xFF", &mut dst[..]);
    // Only the first input byte is expected to be consumed and one output
    // byte produced.
    assert_eq!(read, 1);
    assert_eq!(written, 1);
}
2413
#[test]
fn test_convert_latin1_to_utf8() {
    // Every Latin1 byte value, plus the same code points as UTF-16 so the
    // standard library can produce the expected UTF-8.
    let src: Vec<u8> = (0u16..256).map(|v| v as u8).collect();
    let code_units: Vec<u16> = (0u16..256).collect();
    let expected = String::from_utf16(&code_units[..]).unwrap();
    let mut dst = vec![0u8; src.len() * 2];
    let written = convert_latin1_to_utf8(&src[..], &mut dst[..]);
    dst.truncate(written);
    assert_eq!(&dst[..], expected.as_bytes());
}
2431
#[test]
fn test_convert_utf8_to_latin1_lossy() {
    // Expected output: every Latin1 byte value in ascending order.
    let expected: Vec<u8> = (0u16..256).map(|v| v as u8).collect();
    // Input: the same code points, encoded as UTF-8 via the standard library.
    let code_units: Vec<u16> = (0u16..256).collect();
    let src = String::from_utf16(&code_units[..]).unwrap();
    let mut dst = vec![0u8; src.len()];
    let written = convert_utf8_to_latin1_lossy(src.as_bytes(), &mut dst[..]);
    dst.truncate(written);
    assert_eq!(dst, expected);
}
2449
#[cfg(all(debug_assertions, not(fuzzing)))]
#[test]
#[should_panic]
fn test_convert_utf8_to_latin1_lossy_panics() {
    // U+0100 is the first code point outside the Latin1 range.
    let beyond_latin1 = "\u{100}";
    let mut dst = [0u8; 16];
    let _ = convert_utf8_to_latin1_lossy(beyond_latin1.as_bytes(), &mut dst[..]);
}
2457
#[test]
fn test_convert_utf16_to_latin1_lossy() {
    // Every code unit from U+0000 to U+00FF and the byte it should become.
    let src: Vec<u16> = (0u16..256).collect();
    let expected: Vec<u8> = (0u16..256).map(|v| v as u8).collect();
    let mut dst = vec![0u8; src.len()];
    convert_utf16_to_latin1_lossy(&src[..], &mut dst[..]);
    assert_eq!(dst, expected);
}
2473
#[test]
// #[should_panic]
fn test_convert_utf16_to_latin1_lossy_panics() {
    // The panic expectation above is disabled, so this only checks that
    // converting a code unit above U+00FF completes.
    let beyond_latin1 = [0x0100u16];
    let mut dst = [0u8; 16];
    let _ = convert_utf16_to_latin1_lossy(&beyond_latin1, &mut dst[..]);
}
2480
#[test]
fn test_utf16_valid_up_to() {
    // Each case pairs an input buffer with the expected valid prefix length.
    let cases: [(Vec<u16>, usize); 4] = [
        // Valid in its entirety, including a surrogate pair.
        (
            vec![
                0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0x2603u16,
                0xD83Du16, 0xDCA9u16, 0x00B6u16,
            ],
            16,
        ),
        // Lone high surrogate.
        (
            vec![
                0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
                0x2603u16, 0xD83Du16, 0x00B6u16,
            ],
            14,
        ),
        // Lone low surrogate.
        (
            vec![
                0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
                0x2603u16, 0xDCA9u16, 0x00B6u16,
            ],
            14,
        ),
        // Lone high surrogate as the last code unit.
        (
            vec![
                0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
                0x2603u16, 0x00B6u16, 0xD83Du16,
            ],
            15,
        ),
    ];
    for (units, expected) in cases.iter() {
        assert_eq!(utf16_valid_up_to(&units[..]), *expected);
    }
}
2504
#[test]
fn test_ensure_utf16_validity() {
    let mut src = vec![
        0u16, 0xD83Du16, 0u16, 0u16, 0u16, 0xD83Du16, 0xDCA9u16, 0u16, 0u16, 0u16, 0u16, 0u16,
        0u16, 0xDCA9u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
        0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
    ];
    // Expected result: identical to the input except that the lone high
    // surrogate (index 1) and the lone low surrogate (index 13) become
    // U+FFFD; the proper pair at indices 5 and 6 stays untouched.
    let mut expected = src.clone();
    expected[1] = 0xFFFD;
    expected[13] = 0xFFFD;
    ensure_utf16_validity(&mut src[..]);
    assert_eq!(src, expected);
}
2520
#[test]
fn test_is_char_bidi() {
    // Characters that must not be reported as bidi.
    let not_bidi = [
        'a',
        '\u{03B1}',
        '\u{3041}',
        '\u{1F4A9}',
        '\u{FE00}',
        '\u{202C}',
        '\u{FEFF}',
    ];
    for &c in not_bidi.iter() {
        assert!(!is_char_bidi(c));
    }
    // Characters that must be reported as bidi.
    let bidi = [
        '\u{0590}',
        '\u{08FF}',
        '\u{061C}',
        '\u{FB50}',
        '\u{FDFF}',
        '\u{FE70}',
        '\u{FEFE}',
        '\u{200F}',
        '\u{202B}',
        '\u{202E}',
        '\u{2067}',
        '\u{10800}',
        '\u{10FFF}',
        '\u{1E800}',
        '\u{1EFFF}',
    ];
    for &c in bidi.iter() {
        assert!(is_char_bidi(c));
    }
}
2546
#[test]
fn test_is_utf16_code_unit_bidi() {
    // Code units that must not be reported as bidi.
    let not_bidi: [u16; 7] = [0x0062, 0x03B1, 0x3041, 0xD801, 0xFE00, 0x202C, 0xFEFF];
    for &unit in not_bidi.iter() {
        assert!(!is_utf16_code_unit_bidi(unit));
    }
    // Code units that must be reported as bidi.
    let bidi: [u16; 16] = [
        0x0590, 0x08FF, 0x061C, 0xFB1D, 0xFB50, 0xFDFF, 0xFE70, 0xFEFE, 0x200F, 0x202B, 0x202E,
        0x2067, 0xD802, 0xD803, 0xD83A, 0xD83B,
    ];
    for &unit in bidi.iter() {
        assert!(is_utf16_code_unit_bidi(unit));
    }
}
2573
#[test]
fn test_is_str_bidi() {
    // Strings that must not be reported as bidi.
    let not_bidi = [
        "abcdefghijklmnopaabcdefghijklmnop",
        "abcdefghijklmnop\u{03B1}abcdefghijklmnop",
        "abcdefghijklmnop\u{3041}abcdefghijklmnop",
        "abcdefghijklmnop\u{1F4A9}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE00}abcdefghijklmnop",
        "abcdefghijklmnop\u{202C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFF}abcdefghijklmnop",
    ];
    for &text in not_bidi.iter() {
        assert!(!is_str_bidi(text));
    }
    // Strings that must be reported as bidi.
    let bidi = [
        "abcdefghijklmnop\u{0590}abcdefghijklmnop",
        "abcdefghijklmnop\u{08FF}abcdefghijklmnop",
        "abcdefghijklmnop\u{061C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FB50}abcdefghijklmnop",
        "abcdefghijklmnop\u{FDFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE70}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFE}abcdefghijklmnop",
        "abcdefghijklmnop\u{200F}abcdefghijklmnop",
        "abcdefghijklmnop\u{202B}abcdefghijklmnop",
        "abcdefghijklmnop\u{202E}abcdefghijklmnop",
        "abcdefghijklmnop\u{2067}abcdefghijklmnop",
        "abcdefghijklmnop\u{10800}abcdefghijklmnop",
        "abcdefghijklmnop\u{10FFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{1E800}abcdefghijklmnop",
        "abcdefghijklmnop\u{1EFFF}abcdefghijklmnop",
    ];
    for &text in bidi.iter() {
        assert!(is_str_bidi(text));
    }
}
2599
#[test]
fn test_is_utf8_bidi() {
    // Strings whose UTF-8 bytes must not be reported as bidi.
    let not_bidi = [
        "abcdefghijklmnopaabcdefghijklmnop",
        "abcdefghijklmnop\u{03B1}abcdefghijklmnop",
        "abcdefghijklmnop\u{3041}abcdefghijklmnop",
        "abcdefghijklmnop\u{1F4A9}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE00}abcdefghijklmnop",
        "abcdefghijklmnop\u{202C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFF}abcdefghijklmnop",
    ];
    for &text in not_bidi.iter() {
        assert!(!is_utf8_bidi(text.as_bytes()));
    }
    // Strings whose UTF-8 bytes must be reported as bidi.
    let bidi = [
        "abcdefghijklmnop\u{0590}abcdefghijklmnop",
        "abcdefghijklmnop\u{08FF}abcdefghijklmnop",
        "abcdefghijklmnop\u{061C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FB50}abcdefghijklmnop",
        "abcdefghijklmnop\u{FDFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE70}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFE}abcdefghijklmnop",
        "abcdefghijklmnop\u{200F}abcdefghijklmnop",
        "abcdefghijklmnop\u{202B}abcdefghijklmnop",
        "abcdefghijklmnop\u{202E}abcdefghijklmnop",
        "abcdefghijklmnop\u{2067}abcdefghijklmnop",
        "abcdefghijklmnop\u{10800}abcdefghijklmnop",
        "abcdefghijklmnop\u{10FFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{1E800}abcdefghijklmnop",
        "abcdefghijklmnop\u{1EFFF}abcdefghijklmnop",
    ];
    for &text in bidi.iter() {
        assert!(is_utf8_bidi(text.as_bytes()));
    }
}
2669
#[test]
fn test_is_utf16_bidi() {
    // Eight ASCII code units on each side of the code unit under test,
    // which sits at index 8 and is replaced for every case.
    let mut sample: [u16; 17] = [
        0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0000, 0x62, 0x63, 0x64, 0x65, 0x66,
        0x67, 0x68, 0x69,
    ];

    // Code units that must not make the buffer bidi.
    let not_bidi: [u16; 7] = [0x0062, 0x03B1, 0x3041, 0xD801, 0xFE00, 0x202C, 0xFEFF];
    for &unit in not_bidi.iter() {
        sample[8] = unit;
        assert!(!is_utf16_bidi(&sample));
    }

    // Code units that must make the buffer bidi.
    let bidi: [u16; 16] = [
        0x0590, 0x08FF, 0x061C, 0xFB1D, 0xFB50, 0xFDFF, 0xFE70, 0xFEFE, 0x200F, 0x202B, 0x202E,
        0x2067, 0xD802, 0xD803, 0xD83A, 0xD83B,
    ];
    for &unit in bidi.iter() {
        sample[8] = unit;
        assert!(is_utf16_bidi(&sample));
    }

    // A bidi code unit immediately followed by a non-bidi, non-ASCII one.
    assert!(is_utf16_bidi(&[
        0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, 0x64, 0x65,
        0x66, 0x67, 0x68, 0x69,
    ]));
}
2770
#[test]
fn test_check_str_for_latin1_and_bidi() {
    // Strings that must not be classified as bidi.
    let not_bidi = [
        "abcdefghijklmnopaabcdefghijklmnop",
        "abcdefghijklmnop\u{03B1}abcdefghijklmnop",
        "abcdefghijklmnop\u{3041}abcdefghijklmnop",
        "abcdefghijklmnop\u{1F4A9}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE00}abcdefghijklmnop",
        "abcdefghijklmnop\u{202C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFF}abcdefghijklmnop",
    ];
    for &text in not_bidi.iter() {
        assert_ne!(check_str_for_latin1_and_bidi(text), Latin1Bidi::Bidi);
    }
    // Strings that must be classified as bidi.
    let bidi = [
        "abcdefghijklmnop\u{0590}abcdefghijklmnop",
        "abcdefghijklmnop\u{08FF}abcdefghijklmnop",
        "abcdefghijklmnop\u{061C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FB50}abcdefghijklmnop",
        "abcdefghijklmnop\u{FDFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE70}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFE}abcdefghijklmnop",
        "abcdefghijklmnop\u{200F}abcdefghijklmnop",
        "abcdefghijklmnop\u{202B}abcdefghijklmnop",
        "abcdefghijklmnop\u{202E}abcdefghijklmnop",
        "abcdefghijklmnop\u{2067}abcdefghijklmnop",
        "abcdefghijklmnop\u{10800}abcdefghijklmnop",
        "abcdefghijklmnop\u{10FFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{1E800}abcdefghijklmnop",
        "abcdefghijklmnop\u{1EFFF}abcdefghijklmnop",
    ];
    for &text in bidi.iter() {
        assert_eq!(check_str_for_latin1_and_bidi(text), Latin1Bidi::Bidi);
    }
}
2862
#[test]
fn test_check_utf8_for_latin1_and_bidi() {
    // Same inputs as the `&str` flavour of this test, but handed over as
    // raw UTF-8 bytes. Each input places a single character of interest
    // between two runs of sixteen ASCII letters.

    // Characters that must NOT cause the buffer to be classified as bidi.
    let not_bidi: &[&str] = &[
        "abcdefghijklmnopaabcdefghijklmnop",
        "abcdefghijklmnop\u{03B1}abcdefghijklmnop",
        "abcdefghijklmnop\u{3041}abcdefghijklmnop",
        "abcdefghijklmnop\u{1F4A9}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE00}abcdefghijklmnop",
        "abcdefghijklmnop\u{202C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFF}abcdefghijklmnop",
    ];
    for &text in not_bidi {
        assert_ne!(
            check_utf8_for_latin1_and_bidi(text.as_bytes()),
            Latin1Bidi::Bidi,
            "input: {:?}",
            text
        );
    }

    // Characters that MUST cause the buffer to be classified as bidi.
    let bidi: &[&str] = &[
        "abcdefghijklmnop\u{0590}abcdefghijklmnop",
        "abcdefghijklmnop\u{08FF}abcdefghijklmnop",
        "abcdefghijklmnop\u{061C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FB50}abcdefghijklmnop",
        "abcdefghijklmnop\u{FDFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE70}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFE}abcdefghijklmnop",
        "abcdefghijklmnop\u{200F}abcdefghijklmnop",
        "abcdefghijklmnop\u{202B}abcdefghijklmnop",
        "abcdefghijklmnop\u{202E}abcdefghijklmnop",
        "abcdefghijklmnop\u{2067}abcdefghijklmnop",
        "abcdefghijklmnop\u{10800}abcdefghijklmnop",
        "abcdefghijklmnop\u{10FFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{1E800}abcdefghijklmnop",
        "abcdefghijklmnop\u{1EFFF}abcdefghijklmnop",
    ];
    for &text in bidi {
        assert_eq!(
            check_utf8_for_latin1_and_bidi(text.as_bytes()),
            Latin1Bidi::Bidi,
            "input: {:?}",
            text
        );
    }
}
2954
#[test]
fn test_check_utf16_for_latin1_and_bidi() {
    // Template buffer: eight ASCII code units, one slot for the code unit
    // under test, then eight more ASCII code units. Only the slot changes
    // from case to case.
    const SLOT: usize = 8;
    let mut units: [u16; 17] = [
        0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0000, 0x62, 0x63, 0x64, 0x65, 0x66,
        0x67, 0x68, 0x69,
    ];

    // Code units that must NOT yield a bidi classification. 0xD801 is a
    // high surrogate outside the right-to-left astral ranges.
    let not_bidi: [u16; 7] = [0x0062, 0x03B1, 0x3041, 0xD801, 0xFE00, 0x202C, 0xFEFF];
    for &unit in not_bidi.iter() {
        units[SLOT] = unit;
        assert_ne!(
            check_utf16_for_latin1_and_bidi(&units[..]),
            Latin1Bidi::Bidi,
            "code unit: {:#06X}",
            unit
        );
    }

    // Code units that MUST yield a bidi classification. 0xD802/0xD803 and
    // 0xD83A/0xD83B are the high surrogates of U+10800..=U+10FFF and
    // U+1E800..=U+1EFFF respectively; they are tested unpaired here.
    let bidi: [u16; 16] = [
        0x0590, 0x08FF, 0x061C, 0xFB1D, 0xFB50, 0xFDFF, 0xFE70, 0xFEFE, 0x200F, 0x202B, 0x202E,
        0x2067, 0xD802, 0xD803, 0xD83A, 0xD83B,
    ];
    for &unit in bidi.iter() {
        units[SLOT] = unit;
        assert_eq!(
            check_utf16_for_latin1_and_bidi(&units[..]),
            Latin1Bidi::Bidi,
            "code unit: {:#06X}",
            unit
        );
    }

    // A right-to-left code unit immediately followed by a non-Latin1
    // left-to-right one must still be reported as bidi.
    assert_eq!(
        check_utf16_for_latin1_and_bidi(&[
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, 0x64,
            0x65, 0x66, 0x67, 0x68, 0x69,
        ]),
        Latin1Bidi::Bidi
    );
}
3127
/// Straightforward reference predicate used to cross-check the optimized
/// bidi detection: returns `true` when `c` lies in one of the listed
/// right-to-left ranges or is one of the listed right-to-left directional
/// control characters.
#[inline(always)]
pub fn reference_is_char_bidi(c: char) -> bool {
    // Inclusive (first, last) bounds of the ranges treated as right-to-left.
    // The four single-character entries are the individual control
    // characters U+200F, U+202B, U+202E and U+2067.
    const RTL_RANGES: [(char, char); 9] = [
        ('\u{0590}', '\u{08FF}'),
        ('\u{FB1D}', '\u{FDFF}'),
        ('\u{FE70}', '\u{FEFE}'),
        ('\u{10800}', '\u{10FFF}'),
        ('\u{1E800}', '\u{1EFFF}'),
        ('\u{200F}', '\u{200F}'),
        ('\u{202B}', '\u{202B}'),
        ('\u{202E}', '\u{202E}'),
        ('\u{2067}', '\u{2067}'),
    ];
    RTL_RANGES
        .iter()
        .any(|&(first, last)| first <= c && c <= last)
}
3143
/// Straightforward reference predicate for a single UTF-16 code unit:
/// returns `true` when `u` lies in one of the listed BMP right-to-left
/// ranges, is one of the listed directional control characters, or is one
/// of the four listed high surrogates.
#[inline(always)]
pub fn reference_is_utf16_code_unit_bidi(u: u16) -> bool {
    // Inclusive (first, last) bounds of the BMP ranges.
    const RTL_RANGES: [(u16, u16); 3] = [(0x0590, 0x08FF), (0xFB1D, 0xFDFF), (0xFE70, 0xFEFE)];
    // Individual code units: high surrogates 0xD802/0xD803 (U+10800..=U+10FFF)
    // and 0xD83A/0xD83B (U+1E800..=U+1EFFF), followed by the control
    // characters U+200F, U+202B, U+202E and U+2067.
    const RTL_SINGLES: [u16; 8] = [
        0xD802, 0xD803, 0xD83A, 0xD83B, 0x200F, 0x202B, 0x202E, 0x2067,
    ];
    RTL_SINGLES.iter().any(|&single| single == u)
        || RTL_RANGES
            .iter()
            .any(|&(first, last)| first <= u && u <= last)
}
3161
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_char_bidi_thoroughly() {
    // Exhaustively compare against the reference predicate for every value
    // below 0xD800 and every value from 0xE000 up to (excluding) 0x110000,
    // i.e. skipping the surrogate gap for which no `char` exists.
    let scalar_values = (0..0xD800u32).chain(0xE000..0x110000u32);
    for code_point in scalar_values {
        let c: char = ::core::char::from_u32(code_point).unwrap();
        assert_eq!(is_char_bidi(c), reference_is_char_bidi(c));
    }
}
3174
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_utf16_code_unit_bidi_thoroughly() {
    // Exhaustively compare against the reference predicate for all 65536
    // possible code unit values.
    for unit in 0u16..=0xFFFF {
        let actual = is_utf16_code_unit_bidi(unit);
        let expected = reference_is_utf16_code_unit_bidi(unit);
        assert_eq!(actual, expected);
    }
}
3186
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_str_bidi_thoroughly() {
    // Scratch space for the UTF-8 encoding of one character at a time.
    let mut utf8 = [0; 4];
    // Every value below 0xD800 and from 0xE000 up to (excluding) 0x110000,
    // i.e. skipping the surrogate gap for which no `char` exists.
    let scalar_values = (0..0xD800u32).chain(0xE000..0x110000u32);
    for code_point in scalar_values {
        let c: char = ::core::char::from_u32(code_point).unwrap();
        let expected = reference_is_char_bidi(c);
        // A string consisting of just this one character must classify the
        // same way as the character itself.
        let single_char_str = c.encode_utf8(&mut utf8[..]);
        assert_eq!(is_str_bidi(single_char_str), expected);
    }
}
3206
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_utf8_bidi_thoroughly() {
    // Eight-byte scratch buffer; a character occupies its first bytes and
    // the remainder is zero-filled before the second check below.
    let mut scratch = [0; 8];
    // Every value below 0xD800 and from 0xE000 up to (excluding) 0x110000,
    // i.e. skipping the surrogate gap for which no `char` exists.
    let scalar_values = (0..0xD800u32).chain(0xE000..0x110000u32);
    for code_point in scalar_values {
        let c: char = ::core::char::from_u32(code_point).unwrap();
        let expected = reference_is_char_bidi(c);
        let encoded_len = c.len_utf8();

        // First check: exactly the bytes of the encoded character.
        {
            let encoded = c.encode_utf8(&mut scratch[..]).as_bytes();
            assert_eq!(is_utf8_bidi(encoded), expected);
        }

        // Clear whatever a longer, earlier character left behind the
        // current one.
        for stale in scratch[encoded_len..].iter_mut() {
            *stale = 0;
        }

        // Second check: the whole buffer, i.e. the character followed by
        // zero bytes up to a total length of eight.
        assert_eq!(is_utf8_bidi(&scratch[..]), expected);
    }
}
3248
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_utf16_bidi_thoroughly() {
    // 32 zeroed code units; only the one at index 15 is varied.
    const SLOT: usize = 15;
    let mut units = [0u16; 32];
    for unit in 0u16..=0xFFFF {
        units[SLOT] = unit;
        let expected = reference_is_utf16_code_unit_bidi(unit);
        assert_eq!(is_utf16_bidi(&units[..]), expected);
    }
}
3262
#[test]
fn test_is_utf8_bidi_edge_cases() {
    // (input, expected result) pairs. 0xD5 0xBF and 0xD6 0x80 encode U+057F
    // and U+0580, both below the right-to-left range starting at U+0590.
    // Followed by ASCII the input is expected to be non-bidi; the same
    // prefixes followed by a lone 0xC2 lead byte (an incomplete sequence)
    // are expected to be reported as bidi.
    let cases: [(&[u8], bool); 6] = [
        (&b"\xD5\xBF\x61"[..], false),
        (&b"\xD6\x80\x61"[..], false),
        (&b"abc"[..], false),
        (&b"\xD5\xBF\xC2"[..], true),
        (&b"\xD6\x80\xC2"[..], true),
        (&b"ab\xC2"[..], true),
    ];
    for &(bytes, expected) in cases.iter() {
        assert_eq!(is_utf8_bidi(bytes), expected, "input: {:?}", bytes);
    }
}
3272
#[test]
fn test_decode_latin1() {
    // ASCII-only input is expected to come back borrowed rather than as a
    // freshly allocated string.
    if let Cow::Borrowed(borrowed) = decode_latin1(b"ab") {
        assert_eq!(borrowed, "ab");
    } else {
        unreachable!("Should have borrowed");
    }

    // Byte 0xE4 must decode to U+00E4.
    let decoded = decode_latin1(b"a\xE4");
    assert_eq!(decoded, "a\u{E4}");
}
3285
#[test]
fn test_encode_latin1_lossy() {
    // ASCII-only input is expected to come back borrowed rather than as a
    // freshly allocated buffer.
    if let Cow::Borrowed(borrowed) = encode_latin1_lossy("ab") {
        assert_eq!(borrowed, b"ab");
    } else {
        unreachable!("Should have borrowed");
    }

    // U+00E4 must encode to the single byte 0xE4.
    let encoded = encode_latin1_lossy("a\u{E4}");
    assert_eq!(encoded, &(b"a\xE4")[..]);
}
3298
#[test]
fn test_convert_utf8_to_utf16_without_replacement() {
    let mut buf = [0u16; 5];

    // Each case: (UTF-8 input, length of the destination sub-slice,
    // expected return value, expected first three code units of `buf`
    // afterwards).
    //
    // The cases run in order against the SAME buffer, so code units that a
    // case does not overwrite still hold what an earlier case left there
    // (e.g. the 'c' at index 1 in the third case comes from the second).
    let cases: [(&[u8], usize, Option<usize>, [u16; 3]); 7] = [
        (
            &b"ab"[..],
            2,
            Some(2),
            [u16::from(b'a'), u16::from(b'b'), 0],
        ),
        (&b"\xC3\xA4c"[..], 3, Some(2), [0xE4, u16::from(b'c'), 0]),
        (
            &b"\xE2\x98\x83"[..],
            3,
            Some(1),
            [0x2603, u16::from(b'c'), 0],
        ),
        (
            &b"\xE2\x98\x83d"[..],
            4,
            Some(2),
            [0x2603, u16::from(b'd'), 0],
        ),
        (&b"\xE2\x98\x83\xC3\xA4"[..], 5, Some(2), [0x2603, 0xE4, 0]),
        (&b"\xF0\x9F\x93\x8E"[..], 4, Some(2), [0xD83D, 0xDCCE, 0]),
        (
            &b"\xF0\x9F\x93\x8Ee"[..],
            5,
            Some(3),
            [0xD83D, 0xDCCE, u16::from(b'e')],
        ),
    ];
    for &(src, dst_len, expected_written, expected_prefix) in cases.iter() {
        assert_eq!(
            convert_utf8_to_utf16_without_replacement(src, &mut buf[..dst_len]),
            expected_written,
            "input: {:?}",
            src
        );
        assert_eq!(
            &buf[..3],
            &expected_prefix[..],
            "buffer after input: {:?}",
            src
        );
    }

    // A four-byte sequence cut off after three bytes must be rejected.
    assert_eq!(
        convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93", &mut buf[..5]),
        None
    );
}
3356 }
3357