1 // Copyright Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9
10 //! Functions for converting between different in-RAM representations of text
11 //! and for quickly checking if the Unicode Bidirectional Algorithm can be
12 //! avoided.
13 //!
14 //! By using slices for output, the functions here seek to enable by-register
15 //! (ALU register or SIMD register as available) operations in order to
16 //! outperform iterator-based conversions available in the Rust standard
17 //! library.
18 //!
19 //! _Note:_ "Latin1" in this module refers to the Unicode range from U+0000 to
20 //! U+00FF, inclusive, and does not refer to the windows-1252 range. This
21 //! in-memory encoding is sometimes used as a storage optimization of text
22 //! when UTF-16 indexing and length semantics are exposed.
23 //!
//! The FFI bindings for this module are in the
25 //! [encoding_c_mem crate](https://github.com/hsivonen/encoding_c_mem).
26
27 use alloc::borrow::Cow;
28 use alloc::string::String;
29 use alloc::vec::Vec;
30
31 use super::in_inclusive_range16;
32 use super::in_inclusive_range32;
33 use super::in_inclusive_range8;
34 use super::in_range16;
35 use super::in_range32;
36 use super::DecoderResult;
37 use crate::ascii::*;
38 use crate::utf_8::*;
39
// Forwards its arguments to `debug_assert!`, except when the crate is built
// with `--cfg fuzzing`, in which case the assertion is not evaluated.
macro_rules! non_fuzz_debug_assert {
    ($($arg:tt)*) => {
        if !cfg!(fuzzing) {
            debug_assert!($($arg)*);
        }
    };
}
43
44 cfg_if! {
45 if #[cfg(feature = "simd-accel")] {
46 use ::core::intrinsics::likely;
47 use ::core::intrinsics::unlikely;
48 } else {
49 #[inline(always)]
50 // Unsafe to match the intrinsic, which is needlessly unsafe.
51 unsafe fn likely(b: bool) -> bool {
52 b
53 }
54 #[inline(always)]
55 // Unsafe to match the intrinsic, which is needlessly unsafe.
56 unsafe fn unlikely(b: bool) -> bool {
57 b
58 }
59 }
60 }
61
/// Three-way classification of a piece of text: entirely Latin1 (every code
/// point is below U+0100), left-to-right but containing at least one
/// non-Latin1 character, or containing at least one right-to-left character.
///
/// The enum is `#[repr(C)]` with explicit discriminants.
#[repr(C)]
#[derive(Debug, Eq, PartialEq)]
#[must_use]
pub enum Latin1Bidi {
    /// All characters are below U+0100.
    Latin1 = 0,
    /// At least one character is U+0100 or above, and none of the
    /// characters is right-to-left.
    LeftToRight = 1,
    /// At least one character is right-to-left.
    Bidi = 2,
}
77
// Word-sized mask with the high byte of every 16-bit lane set. A `usize`
// read from a `u16` buffer ANDed with this mask is non-zero iff some lane
// is above 0xFF.
//
// `!0usize / 0xFFFF` has the low bit of each 16-bit lane set; multiplying by
// 0xFF00 replicates that pattern into every lane, so the value adapts to the
// pointer width (0xFF00_FF00 on 32-bit, 0xFF00_FF00_FF00_FF00 on 64-bit).
#[allow(dead_code)]
const LATIN1_MASK: usize = (!0usize / 0xFFFF) * 0xFF00;
81
// Generates `fn $name(buffer: &[$unit]) -> bool`, a word-at-a-time (ALU)
// check over a slice of code units.
//
// The generated function returns `false` as soon as one of its checks fails
// and otherwise returns whether the OR of everything it read has no bit in
// common with `$mask`.
//
// Macro parameters:
// * `$name`  - name of the generated function.
// * `$unit`  - code unit type of the slice (`u8` or `u16` in this file).
// * `$bound` - exclusive upper bound used for the per-unit comparisons.
// * `$mask`  - word-sized mask applied to whole-word reads; presumably it
//   selects, in every lane, the bits at or above `$bound` (verify against
//   the mask definitions).
//
// NOTE(review): `ALU_ALIGNMENT` and `ALU_ALIGNMENT_MASK` are defined outside
// this block; the code assumes `ALU_ALIGNMENT` is the size in bytes of the
// `usize` words it reads - confirm.
#[allow(unused_macros)]
macro_rules! by_unit_check_alu {
    ($name:ident, $unit:ty, $bound:expr, $mask:ident) => {
        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            let mut offset = 0usize;
            // OR-accumulator of everything read outside the unrolled loop.
            let mut accu = 0usize;
            let unit_size = ::core::mem::size_of::<$unit>();
            let len = buffer.len();
            // Only attempt word reads when at least one word's worth of
            // units is available.
            if len >= ALU_ALIGNMENT / unit_size {
                // The most common reason to return `false` is for the first code
                // unit to fail the test, so check that first.
                if buffer[0] >= $bound {
                    return false;
                }
                let src = buffer.as_ptr();
                // Number of code units to consume one at a time before `src`
                // reaches an aligned address.
                let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK))
                    & ALU_ALIGNMENT_MASK)
                    / unit_size;
                // Proceed only if a full word still fits after the unaligned
                // prefix.
                if until_alignment + ALU_ALIGNMENT / unit_size <= len {
                    if until_alignment != 0 {
                        accu |= buffer[offset] as usize;
                        offset += 1;
                        until_alignment -= 1;
                        while until_alignment != 0 {
                            accu |= buffer[offset] as usize;
                            offset += 1;
                            until_alignment -= 1;
                        }
                        // Only single units have been ORed in so far, so
                        // `accu` can be compared against the bound directly.
                        if accu >= $bound {
                            return false;
                        }
                    }
                    let len_minus_stride = len - ALU_ALIGNMENT / unit_size;
                    // Unrolled loop: four words per iteration, with an
                    // early exit after each group.
                    if offset + (4 * (ALU_ALIGNMENT / unit_size)) <= len {
                        let len_minus_unroll = len - (4 * (ALU_ALIGNMENT / unit_size));
                        loop {
                            // The four reads stay in bounds because
                            // `offset <= len_minus_unroll` holds on entry to
                            // every iteration.
                            let unroll_accu = unsafe { *(src.add(offset) as *const usize) }
                                | unsafe {
                                    *(src.add(offset + (ALU_ALIGNMENT / unit_size)) as *const usize)
                                }
                                | unsafe {
                                    *(src.add(offset + (2 * (ALU_ALIGNMENT / unit_size)))
                                        as *const usize)
                                }
                                | unsafe {
                                    *(src.add(offset + (3 * (ALU_ALIGNMENT / unit_size)))
                                        as *const usize)
                                };
                            if unroll_accu & $mask != 0 {
                                return false;
                            }
                            offset += 4 * (ALU_ALIGNMENT / unit_size);
                            if offset > len_minus_unroll {
                                break;
                            }
                        }
                    }
                    // Remaining whole words, one at a time. These are only
                    // accumulated; the mask test happens at the very end.
                    while offset <= len_minus_stride {
                        accu |= unsafe { *(src.add(offset) as *const usize) };
                        offset += ALU_ALIGNMENT / unit_size;
                    }
                }
            }
            // Tail (or the whole buffer when the word path was skipped):
            // accumulate unit by unit.
            for &unit in &buffer[offset..] {
                accu |= unit as usize;
            }
            // `accu` may contain whole words at this point, so test it with
            // the mask rather than against the bound.
            accu & $mask == 0
        }
    };
}
154
// Generates `fn $name(buffer: &[$unit]) -> bool`, a SIMD-accelerated check
// over a slice of code units.
//
// The generated function returns `false` as soon as one of its checks fails;
// otherwise it returns whether the OR of the scalar-processed tail units is
// below `$bound`.
//
// Macro parameters:
// * `$name`    - name of the generated function.
// * `$unit`    - code unit type of the slice.
// * `$splat`   - expression yielding the initial (all-zero in this file's
//   invocations) SIMD accumulator.
// * `$simd_ty` - SIMD vector type that aligned strides are read as.
// * `$bound`   - exclusive upper bound used for the scalar comparisons.
// * `$func`    - predicate applied to an OR-ed SIMD vector; the generated
//   code returns `false` when it yields `false`.
//
// NOTE(review): `SIMD_STRIDE_SIZE` is defined outside this block; the code
// assumes it equals the size in bytes of `$simd_ty` - confirm.
#[allow(unused_macros)]
macro_rules! by_unit_check_simd {
    ($name:ident, $unit:ty, $splat:expr, $simd_ty:ty, $bound:expr, $func:ident) => {
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            let mut offset = 0usize;
            // Scalar OR-accumulator for units processed one at a time.
            let mut accu = 0usize;
            let unit_size = ::core::mem::size_of::<$unit>();
            let len = buffer.len();
            // Only attempt SIMD reads when at least one stride's worth of
            // units is available.
            if len >= SIMD_STRIDE_SIZE / unit_size {
                // The most common reason to return `false` is for the first code
                // unit to fail the test, so check that first.
                if buffer[0] >= $bound {
                    return false;
                }
                let src = buffer.as_ptr();
                // Number of code units to consume one at a time before `src`
                // reaches a SIMD-aligned address.
                let mut until_alignment = ((SIMD_ALIGNMENT
                    - ((src as usize) & SIMD_ALIGNMENT_MASK))
                    & SIMD_ALIGNMENT_MASK)
                    / unit_size;
                // Proceed only if a full stride still fits after the
                // unaligned prefix.
                if until_alignment + SIMD_STRIDE_SIZE / unit_size <= len {
                    if until_alignment != 0 {
                        accu |= buffer[offset] as usize;
                        offset += 1;
                        until_alignment -= 1;
                        while until_alignment != 0 {
                            accu |= buffer[offset] as usize;
                            offset += 1;
                            until_alignment -= 1;
                        }
                        if accu >= $bound {
                            return false;
                        }
                    }
                    let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
                    // Unrolled loop: four strides per iteration, with an
                    // early exit after each group.
                    if offset + (4 * (SIMD_STRIDE_SIZE / unit_size)) <= len {
                        let len_minus_unroll = len - (4 * (SIMD_STRIDE_SIZE / unit_size));
                        loop {
                            // The four reads stay in bounds because
                            // `offset <= len_minus_unroll` holds on entry to
                            // every iteration.
                            let unroll_accu = unsafe { *(src.add(offset) as *const $simd_ty) }
                                | unsafe {
                                    *(src.add(offset + (SIMD_STRIDE_SIZE / unit_size))
                                        as *const $simd_ty)
                                }
                                | unsafe {
                                    *(src.add(offset + (2 * (SIMD_STRIDE_SIZE / unit_size)))
                                        as *const $simd_ty)
                                }
                                | unsafe {
                                    *(src.add(offset + (3 * (SIMD_STRIDE_SIZE / unit_size)))
                                        as *const $simd_ty)
                                };
                            if !$func(unroll_accu) {
                                return false;
                            }
                            offset += 4 * (SIMD_STRIDE_SIZE / unit_size);
                            if offset > len_minus_unroll {
                                break;
                            }
                        }
                    }
                    // Remaining whole strides are ORed into a SIMD
                    // accumulator and tested once after the loop.
                    let mut simd_accu = $splat;
                    while offset <= len_minus_stride {
                        simd_accu = simd_accu | unsafe { *(src.add(offset) as *const $simd_ty) };
                        offset += SIMD_STRIDE_SIZE / unit_size;
                    }
                    if !$func(simd_accu) {
                        return false;
                    }
                }
            }
            // Tail (or the whole buffer when the SIMD path was skipped):
            // accumulate unit by unit.
            for &unit in &buffer[offset..] {
                accu |= unit as usize;
            }
            // `accu` only ever holds single units here, so a plain
            // comparison against the bound suffices.
            accu < $bound
        }
    };
}
232
// Selects the SIMD or ALU flavor of the by-unit checks
// (`is_ascii_impl`, `is_basic_latin_impl`, `is_utf16_latin1_impl`) and of
// `utf16_valid_up_to_impl`, depending on the `simd-accel` feature and the
// target.
cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        use crate::simd_funcs::*;
        use packed_simd::u8x16;
        use packed_simd::u16x8;

        // Alignment, in bytes, required for the aligned 128-bit reads below.
        const SIMD_ALIGNMENT: usize = 16;

        // Mask extracting the misalignment of an address relative to
        // `SIMD_ALIGNMENT`.
        const SIMD_ALIGNMENT_MASK: usize = 15;

        by_unit_check_simd!(is_ascii_impl, u8, u8x16::splat(0), u8x16, 0x80, simd_is_ascii);
        by_unit_check_simd!(is_basic_latin_impl, u16, u16x8::splat(0), u16x8, 0x80, simd_is_basic_latin);
        by_unit_check_simd!(is_utf16_latin1_impl, u16, u16x8::splat(0), u16x8, 0x100, simd_is_latin1);

        // Returns the number of leading code units of `buffer` that form
        // valid UTF-16 (i.e. the index of the first unpaired surrogate, or
        // `buffer.len()` if there is none).
        //
        // Strides that `contains_surrogates` (defined outside this block;
        // presumably reports whether any lane is a surrogate) rejects are
        // re-examined by `utf16_valid_up_to_alu`.
        #[inline(always)]
        fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize {
            // This function is a mess, because it simultaneously tries to do
            // only aligned SIMD (perhaps misguidedly) and needs to deal with
            // the last code unit in a SIMD stride being part of a valid
            // surrogate pair.
            let unit_size = ::core::mem::size_of::<u16>();
            let src = buffer.as_ptr();
            let len = buffer.len();
            let mut offset = 0usize;
            'outer: loop {
                // Code units between the current position and the next
                // SIMD-aligned address.
                let until_alignment = ((SIMD_ALIGNMENT - ((unsafe { src.add(offset) } as usize) & SIMD_ALIGNMENT_MASK)) &
                    SIMD_ALIGNMENT_MASK) / unit_size;
                if until_alignment == 0 {
                    // Already aligned: bail out to the scalar tail if a full
                    // stride no longer fits.
                    if offset + SIMD_STRIDE_SIZE / unit_size > len {
                        break;
                    }
                } else {
                    let offset_plus_until_alignment = offset + until_alignment;
                    // One extra code unit past the alignment point is checked
                    // so that a surrogate pair straddling the boundary can be
                    // recognized as valid.
                    let offset_plus_until_alignment_plus_one = offset_plus_until_alignment + 1;
                    if offset_plus_until_alignment_plus_one + SIMD_STRIDE_SIZE / unit_size > len {
                        break;
                    }
                    let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_until_alignment_plus_one]);
                    if up_to < until_alignment {
                        // Unpaired surrogate inside the unaligned prefix.
                        return offset + up_to;
                    }
                    if last_valid_low {
                        // The extra unit was the low half of a valid pair:
                        // skip it and recompute the alignment from there.
                        offset = offset_plus_until_alignment_plus_one;
                        continue;
                    }
                    offset = offset_plus_until_alignment;
                }
                let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
                loop {
                    let offset_plus_stride = offset + SIMD_STRIDE_SIZE / unit_size;
                    if contains_surrogates(unsafe { *(src.add(offset) as *const u16x8) }) {
                        if offset_plus_stride == len {
                            // Last stride of the buffer: let the scalar tail
                            // below validate it.
                            break 'outer;
                        }
                        // Validate this stride plus one extra unit, so a pair
                        // straddling the stride boundary is seen whole.
                        let offset_plus_stride_plus_one = offset_plus_stride + 1;
                        let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_stride_plus_one]);
                        if up_to < SIMD_STRIDE_SIZE / unit_size {
                            return offset + up_to;
                        }
                        if last_valid_low {
                            // The position is now one unit past an aligned
                            // address; go back to realign.
                            offset = offset_plus_stride_plus_one;
                            continue 'outer;
                        }
                    }
                    offset = offset_plus_stride;
                    if offset > len_minus_stride {
                        break 'outer;
                    }
                }
            }
            // Scalar validation of whatever remains.
            let (up_to, _) = utf16_valid_up_to_alu(&buffer[offset..]);
            offset + up_to
        }
    } else {
        by_unit_check_alu!(is_ascii_impl, u8, 0x80, ASCII_MASK);
        by_unit_check_alu!(is_basic_latin_impl, u16, 0x80, BASIC_LATIN_MASK);
        by_unit_check_alu!(is_utf16_latin1_impl, u16, 0x100, LATIN1_MASK);

        // Non-SIMD flavor: delegates to the scalar validator and drops its
        // second return value.
        #[inline(always)]
        fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize {
            let (up_to, _) = utf16_valid_up_to_alu(buffer);
            up_to
        }
    }
}
318
/// Scalar UTF-16 validation.
///
/// Returns `(up_to, ended_on_paired_low)` where `up_to` is the index of the
/// first unpaired surrogate in `buffer`, or `buffer.len()` if there is none.
/// The second value is true iff the last code unit of the slice was reached
/// and turned out to be a low surrogate that is part of a valid pair.
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
#[inline(always)]
fn utf16_valid_up_to_alu(buffer: &[u16]) -> (usize, bool) {
    let total = buffer.len();
    let mut pos = 0usize;
    while pos < total {
        let unit = buffer[pos];
        if unit < 0xD800 || unit > 0xDFFF {
            // Not a surrogate.
            pos += 1;
            continue;
        }
        // A surrogate is acceptable only as a high surrogate immediately
        // followed by a low surrogate.
        let is_high = unit <= 0xDBFF;
        let followed_by_low = match buffer.get(pos + 1) {
            Some(&second) => second >= 0xDC00 && second <= 0xDFFF,
            None => false,
        };
        if !(is_high && followed_by_low) {
            // Unpaired surrogate: report its position without advancing.
            return (pos, false);
        }
        // Step over both halves of the pair.
        pos += 2;
        if pos == total {
            return (pos, true);
        }
    }
    (pos, false)
}
364
// `is_str_latin1_impl` returns `None` when every code point of `buffer` is
// at most U+00FF, and otherwise `Some(index)` of the byte at which a code
// point above U+00FF was detected.
//
// Because the input is a `&str` (valid UTF-8), it is enough to look for a
// byte greater than 0xC3: 0xC3 is the lead byte of the two-byte sequences
// that encode up to U+00FF, and all continuation bytes are below 0xC0.
cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        #[inline(always)]
        fn is_str_latin1_impl(buffer: &str) -> Option<usize> {
            let mut offset = 0usize;
            let bytes = buffer.as_bytes();
            let len = bytes.len();
            if len >= SIMD_STRIDE_SIZE {
                let src = bytes.as_ptr();
                // Bytes to process one at a time before reaching a
                // SIMD-aligned address.
                let mut until_alignment = (SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
                    SIMD_ALIGNMENT_MASK;
                if until_alignment + SIMD_STRIDE_SIZE <= len {
                    while until_alignment != 0 {
                        if bytes[offset] > 0xC3 {
                            return Some(offset);
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - SIMD_STRIDE_SIZE;
                    loop {
                        // `simd_is_str_latin1` is defined outside this block;
                        // presumably it reports whether all 16 bytes are at
                        // most 0xC3 - verify.
                        if !simd_is_str_latin1(unsafe { *(src.add(offset) as *const u8x16) }) {
                            // TODO: Ensure this compiles away when inlined into `is_str_latin1()`.
                            // Skip continuation bytes (0b10xx_xxxx) so that
                            // the returned index is not in the middle of a
                            // sequence that started before this stride.
                            while bytes[offset] & 0xC0 == 0x80 {
                                offset += 1;
                            }
                            return Some(offset);
                        }
                        offset += SIMD_STRIDE_SIZE;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Scalar tail (or the whole buffer when the SIMD path was
            // skipped).
            for i in offset..len {
                if bytes[i] > 0xC3 {
                    return Some(i);
                }
            }
            None
        }
    } else {
        #[inline(always)]
        fn is_str_latin1_impl(buffer: &str) -> Option<usize> {
            let mut bytes = buffer.as_bytes();
            // Number of bytes of `buffer` consumed so far.
            let mut total = 0;
            loop {
                // `validate_ascii` is defined outside this block; from its use
                // here it yields the first non-ASCII byte and its index, or
                // `None` when the rest is all ASCII.
                if let Some((byte, offset)) = validate_ascii(bytes) {
                    total += offset;
                    if byte > 0xC3 {
                        return Some(total);
                    }
                    // The non-ASCII byte is a two-byte lead (at most 0xC3) in
                    // valid UTF-8, so skip it together with its trail byte.
                    bytes = &bytes[offset + 2..];
                    total += 2;
                } else {
                    return None;
                }
            }
        }
    }
}
427
428 #[inline(always)]
is_utf8_latin1_impl(buffer: &[u8]) -> Option<usize>429 fn is_utf8_latin1_impl(buffer: &[u8]) -> Option<usize> {
430 let mut bytes = buffer;
431 let mut total = 0;
432 loop {
433 if let Some((byte, offset)) = validate_ascii(bytes) {
434 total += offset;
435 if in_inclusive_range8(byte, 0xC2, 0xC3) {
436 let next = offset + 1;
437 if next == bytes.len() {
438 return Some(total);
439 }
440 if bytes[next] & 0xC0 != 0x80 {
441 return Some(total);
442 }
443 bytes = &bytes[offset + 2..];
444 total += 2;
445 } else {
446 return Some(total);
447 }
448 } else {
449 return None;
450 }
451 }
452 }
453
// `is_utf16_bidi_impl` returns `true` iff some code unit of `buffer` is
// flagged by `is_utf16_code_unit_bidi` (scalar) or, for aligned strides in
// the SIMD flavor, by `is_u16x8_bidi`. Both predicates are defined outside
// this block.
cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        #[inline(always)]
        fn is_utf16_bidi_impl(buffer: &[u16]) -> bool {
            let mut offset = 0usize;
            let len = buffer.len();
            // `SIMD_STRIDE_SIZE / 2` is the stride length in `u16` units.
            if len >= SIMD_STRIDE_SIZE / 2 {
                let src = buffer.as_ptr();
                // Code units to check one at a time before reaching a
                // SIMD-aligned address.
                let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
                    SIMD_ALIGNMENT_MASK) / 2;
                if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len {
                    while until_alignment != 0 {
                        if is_utf16_code_unit_bidi(buffer[offset]) {
                            return true;
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
                    // Aligned strides; each read is in bounds because
                    // `offset <= len_minus_stride` holds on entry.
                    loop {
                        if is_u16x8_bidi(unsafe { *(src.add(offset) as *const u16x8) }) {
                            return true;
                        }
                        offset += SIMD_STRIDE_SIZE / 2;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Scalar tail (or the whole buffer when the SIMD path was
            // skipped).
            for &u in &buffer[offset..] {
                if is_utf16_code_unit_bidi(u) {
                    return true;
                }
            }
            false
        }
    } else {
        // Non-SIMD flavor: plain scan over every code unit.
        #[inline(always)]
        fn is_utf16_bidi_impl(buffer: &[u16]) -> bool {
            for &u in buffer {
                if is_utf16_code_unit_bidi(u) {
                    return true;
                }
            }
            false
        }
    }
}
503
// `check_utf16_for_latin1_and_bidi_impl` classifies `buffer` as:
// * `Latin1Bidi::Latin1` if no code unit is above 0xFF;
// * `Latin1Bidi::Bidi` if, from the point where the first code unit above
//   0xFF was detected onwards, some code unit is flagged by the bidi
//   predicates (`is_utf16_code_unit_bidi` / `is_u16x8_bidi` /
//   `is_utf16_bidi_impl`);
// * `Latin1Bidi::LeftToRight` otherwise.
//
// Both flavors scan in "Latin1 mode" until something above 0xFF shows up and
// then switch to looking only for bidi code units in the rest of the buffer.
cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        #[inline(always)]
        fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
            let mut offset = 0usize;
            let len = buffer.len();
            // `SIMD_STRIDE_SIZE / 2` is the stride length in `u16` units.
            if len >= SIMD_STRIDE_SIZE / 2 {
                let src = buffer.as_ptr();
                // Code units to check one at a time before reaching a
                // SIMD-aligned address.
                let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
                    SIMD_ALIGNMENT_MASK) / 2;
                if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len {
                    while until_alignment != 0 {
                        if buffer[offset] > 0xFF {
                            // This transition isn't optimal, since the alignment is recomputed,
                            // but not tweaking further today.
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
                    loop {
                        let mut s = unsafe { *(src.add(offset) as *const u16x8) };
                        if !simd_is_latin1(s) {
                            // Non-Latin1 found: from here on only look for
                            // bidi, starting with the stride already loaded.
                            loop {
                                if is_u16x8_bidi(s) {
                                    return Latin1Bidi::Bidi;
                                }
                                offset += SIMD_STRIDE_SIZE / 2;
                                if offset > len_minus_stride {
                                    // No full stride left: finish with a
                                    // scalar scan of the tail.
                                    for &u in &buffer[offset..] {
                                        if is_utf16_code_unit_bidi(u) {
                                            return Latin1Bidi::Bidi;
                                        }
                                    }
                                    return Latin1Bidi::LeftToRight;
                                }
                                s = unsafe { *(src.add(offset) as *const u16x8) };
                            }
                        }
                        offset += SIMD_STRIDE_SIZE / 2;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Scalar tail, still in Latin1 mode.
            let mut iter = (&buffer[offset..]).iter();
            loop {
                if let Some(&u) = iter.next() {
                    if u > 0xFF {
                        // Switch to bidi-only checking for this and all
                        // remaining code units.
                        let mut inner_u = u;
                        loop {
                            if is_utf16_code_unit_bidi(inner_u) {
                                return Latin1Bidi::Bidi;
                            }
                            if let Some(&code_unit) = iter.next() {
                                inner_u = code_unit;
                            } else {
                                return Latin1Bidi::LeftToRight;
                            }
                        }
                    }
                } else {
                    return Latin1Bidi::Latin1;
                }
            }
        }
    } else {
        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
        #[inline(always)]
        fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
            let mut offset = 0usize;
            let len = buffer.len();
            // `ALU_ALIGNMENT / 2` is the word length in `u16` units.
            // NOTE(review): `ALU_ALIGNMENT` is defined outside this block;
            // assumed to be the size in bytes of `usize` - confirm.
            if len >= ALU_ALIGNMENT / 2 {
                let src = buffer.as_ptr();
                // Code units to check one at a time before reaching a
                // word-aligned address.
                let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK)) &
                    ALU_ALIGNMENT_MASK) / 2;
                if until_alignment + ALU_ALIGNMENT / 2 <= len {
                    while until_alignment != 0 {
                        if buffer[offset] > 0xFF {
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - ALU_ALIGNMENT / 2;
                    loop {
                        // A set bit under `LATIN1_MASK` means some lane of
                        // this word is above 0xFF; the rest of the buffer,
                        // starting with this word, is then checked for bidi.
                        if unsafe { *(src.add(offset) as *const usize) } & LATIN1_MASK != 0 {
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += ALU_ALIGNMENT / 2;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Scalar tail, still in Latin1 mode.
            let mut iter = (&buffer[offset..]).iter();
            loop {
                if let Some(&u) = iter.next() {
                    if u > 0xFF {
                        // Switch to bidi-only checking for this and all
                        // remaining code units.
                        let mut inner_u = u;
                        loop {
                            if is_utf16_code_unit_bidi(inner_u) {
                                return Latin1Bidi::Bidi;
                            }
                            if let Some(&code_unit) = iter.next() {
                                inner_u = code_unit;
                            } else {
                                return Latin1Bidi::LeftToRight;
                            }
                        }
                    }
                } else {
                    return Latin1Bidi::Latin1;
                }
            }
        }
    }
}
634
/// Checks whether the buffer is all-ASCII.
///
/// Returns `true` if every byte of `buffer` is below 0x80 (including when
/// `buffer` is empty) and `false` otherwise.
///
/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_ascii(buffer: &[u8]) -> bool {
    is_ascii_impl(buffer)
}
642
/// Checks whether the buffer is all-Basic Latin (i.e. UTF-16 representing
/// only ASCII characters).
///
/// Returns `true` if every code unit of `buffer` is below 0x80 (including
/// when `buffer` is empty) and `false` otherwise.
///
/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_basic_latin(buffer: &[u16]) -> bool {
    is_basic_latin_impl(buffer)
}
651
/// Checks whether the buffer is valid UTF-8 representing only code points
/// less than or equal to U+00FF.
///
/// Fails fast. (I.e. returns before having read the whole buffer if UTF-8
/// invalidity or code points above U+00FF are discovered.)
pub fn is_utf8_latin1(buffer: &[u8]) -> bool {
    // The implementation reports the position of the first problem; no
    // reported position means the whole buffer passed.
    is_utf8_latin1_impl(buffer).is_none()
}
660
/// Checks whether the buffer represents only code points less than or equal
/// to U+00FF.
///
/// Fails fast. (I.e. returns before having read the whole buffer if code
/// points above U+00FF are discovered.)
pub fn is_str_latin1(buffer: &str) -> bool {
    // The implementation reports the position of the first code point above
    // U+00FF; no reported position means the whole string passed.
    is_str_latin1_impl(buffer).is_none()
}
669
/// Checks whether the buffer represents only code points less than or equal
/// to U+00FF.
///
/// Returns `true` if every code unit of `buffer` is below 0x100 (including
/// when `buffer` is empty) and `false` otherwise.
///
/// May read the entire buffer even if it isn't all-Latin1. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_utf16_latin1(buffer: &[u16]) -> bool {
    is_utf16_latin1_impl(buffer)
}
678
/// Checks whether a potentially-invalid UTF-8 buffer contains code points
/// that trigger right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. it is treated as right-to-left). Additionally,
/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
/// for. Control characters that are technically bidi controls but do not
/// cause right-to-left behavior without the presence of right-to-left
/// characters or right-to-left controls are not checked for. As a special
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
///
/// Returns `true` if the input is invalid UTF-8 or the input contains an
/// RTL character. Returns `false` if the input is valid UTF-8 and contains
/// no RTL characters.
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if, cyclomatic_complexity))]
#[inline]
pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
    // As of rustc 1.25.0-nightly (73ac5d6a8 2018-01-11), this is faster
    // than UTF-8 validation followed by `is_str_bidi()` for German,
    // Russian and Japanese. However, this is considerably slower for Thai.
    // Chances are that the compiler makes some branch predictions that are
    // unfortunate for Thai. Not spending the time to manually optimize
    // further at this time, since it's unclear if this variant even has
    // use cases. However, this is worth revisiting once Rust gets the
    // ability to annotate relative priorities of match arms.

    // Boundary code points and their UTF-8 encodings, which the byte
    // comparisons below are derived from:
    //
    // U+058F: D6 8F
    // U+0590: D6 90
    // U+08FF: E0 A3 BF
    // U+0900: E0 A4 80
    //
    // U+200F: E2 80 8F
    // U+202B: E2 80 AB
    // U+202E: E2 80 AE
    // U+2067: E2 81 A7
    //
    // U+FB1C: EF AC 9C
    // U+FB1D: EF AC 9D
    // U+FDFF: EF B7 BF
    // U+FE00: EF B8 80
    //
    // U+FE6F: EF B9 AF
    // U+FE70: EF B9 B0
    // U+FEFE: EF BB BE
    // U+FEFF: EF BB BF
    //
    // U+107FF: F0 90 9F BF
    // U+10800: F0 90 A0 80
    // U+10FFF: F0 90 BF BF
    // U+11000: F0 91 80 80
    //
    // U+1E7FF: F0 9E 9F BF
    // U+1E800: F0 9E A0 80
    // U+1EFFF: F0 9E BF BF
    // U+1F000: F0 9F 80 80
    //
    // NOTE(review): the multi-byte validity checks combine entries of
    // `UTF8_DATA.table` (defined outside this function) with the top bits of
    // the trail bytes and compare against a fixed value; the unchecked
    // `byte as usize + 0x80` index assumes the table has at least 0x180
    // entries - confirm against the table definition.
    let mut src = buffer;
    'outer: loop {
        // `validate_ascii` is defined outside this function; from its use
        // here it yields the first non-ASCII byte and its index, or `None`
        // when the rest of the input is ASCII.
        if let Some((mut byte, mut read)) = validate_ascii(src) {
            // Check for the longest sequence to avoid checking twice for the
            // multi-byte sequences.
            // While at least four bytes remain from `read`, the unchecked
            // reads at `read + 1..=read + 3` below are in bounds.
            if read + 4 <= src.len() {
                'inner: loop {
                    // At this point, `byte` is not included in `read`.
                    match byte {
                        0..=0x7F => {
                            // ASCII: go back to SIMD.
                            read += 1;
                            src = &src[read..];
                            continue 'outer;
                        }
                        0xC2..=0xD5 => {
                            // Two-byte
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            if !in_inclusive_range8(second, 0x80, 0xBF) {
                                return true;
                            }
                            read += 2;
                        }
                        0xD6 => {
                            // Two-byte
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            if !in_inclusive_range8(second, 0x80, 0xBF) {
                                return true;
                            }
                            // XXX consider folding the above and below checks
                            // D6 90 is U+0590, the start of the range
                            // treated as right-to-left.
                            if second > 0x8F {
                                return true;
                            }
                            read += 2;
                        }
                        // two-byte starting with 0xD7 and above is bidi
                        0xE1 | 0xE3..=0xEC | 0xEE => {
                            // Three-byte normal
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe {
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            read += 3;
                        }
                        0xE2 => {
                            // Three-byte normal, potentially bidi
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe {
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            // U+200F, U+202B, U+202E and U+2067.
                            if second == 0x80 {
                                if third == 0x8F || third == 0xAB || third == 0xAE {
                                    return true;
                                }
                            } else if second == 0x81 {
                                if third == 0xA7 {
                                    return true;
                                }
                            }
                            read += 3;
                        }
                        0xEF => {
                            // Three-byte normal, potentially bidi
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe {
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            // U+FB1D..=U+FDFF: everything above EF AC 9C up
                            // to and including the EF B7 xx sequences.
                            if in_inclusive_range8(second, 0xAC, 0xB7) {
                                if second == 0xAC {
                                    if third > 0x9C {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            // U+FE70..=U+FEFE: everything above EF B9 AF up
                            // to EF BB BE; EF BB BF (U+FEFF) is excluded.
                            } else if in_inclusive_range8(second, 0xB9, 0xBB) {
                                if second == 0xB9 {
                                    if third > 0xAF {
                                        return true;
                                    }
                                } else if second == 0xBB {
                                    if third != 0xBF {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            }
                            read += 3;
                        }
                        0xE0 => {
                            // Three-byte special lower bound, potentially bidi
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe {
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            // XXX can this be folded into the above validity check
                            // E0 A4 80 is U+0900; anything below it in this
                            // lead's range is treated as right-to-left.
                            if second < 0xA4 {
                                return true;
                            }
                            read += 3;
                        }
                        0xED => {
                            // Three-byte special upper bound
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe {
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            read += 3;
                        }
                        0xF1..=0xF4 => {
                            // Four-byte normal
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            let fourth = unsafe { *(src.get_unchecked(read + 3)) };
                            if (u16::from(
                                UTF8_DATA.table[usize::from(second)]
                                    & unsafe {
                                        *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                    },
                            ) | u16::from(third >> 6)
                                | (u16::from(fourth & 0xC0) << 2))
                                != 0x202
                            {
                                return true;
                            }
                            read += 4;
                        }
                        0xF0 => {
                            // Four-byte special lower bound, potentially bidi
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            let fourth = unsafe { *(src.get_unchecked(read + 3)) };
                            if (u16::from(
                                UTF8_DATA.table[usize::from(second)]
                                    & unsafe {
                                        *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                    },
                            ) | u16::from(third >> 6)
                                | (u16::from(fourth & 0xC0) << 2))
                                != 0x202
                            {
                                return true;
                            }
                            // U+10800..=U+10FFF (F0 90 A0..) and
                            // U+1E800..=U+1EFFF (F0 9E A0..).
                            if unsafe { unlikely(second == 0x90 || second == 0x9E) } {
                                let third = src[read + 2];
                                if third >= 0xA0 {
                                    return true;
                                }
                            }
                            read += 4;
                        }
                        _ => {
                            // Invalid lead or bidi-only lead
                            return true;
                        }
                    }
                    // A multi-byte sequence was consumed. Decide whether the
                    // four-byte fast path can continue.
                    if read + 4 > src.len() {
                        if read == src.len() {
                            // Input exhausted without finding anything.
                            return false;
                        }
                        byte = src[read];
                        break 'inner;
                    }
                    byte = src[read];
                    continue 'inner;
                }
            }
            // We can't have a complete 4-byte sequence, but we could still have
            // a complete shorter sequence.

            // At this point, `byte` is not included in `read`.
            match byte {
                0..=0x7F => {
                    // ASCII: go back to SIMD.
                    read += 1;
                    src = &src[read..];
                    continue 'outer;
                }
                0xC2..=0xD5 => {
                    // Two-byte
                    let new_read = read + 2;
                    if new_read > src.len() {
                        // Truncated sequence.
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    if !in_inclusive_range8(second, 0x80, 0xBF) {
                        return true;
                    }
                    read = new_read;
                    // We need to deal with the case where we came here with 3 bytes
                    // left, so we need to take a look at the last one.
                    src = &src[read..];
                    continue 'outer;
                }
                0xD6 => {
                    // Two-byte, potentially bidi
                    let new_read = read + 2;
                    if new_read > src.len() {
                        // Truncated sequence.
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    if !in_inclusive_range8(second, 0x80, 0xBF) {
                        return true;
                    }
                    // XXX consider folding the above and below checks
                    if second > 0x8F {
                        return true;
                    }
                    read = new_read;
                    // We need to deal with the case where we came here with 3 bytes
                    // left, so we need to take a look at the last one.
                    src = &src[read..];
                    continue 'outer;
                }
                // two-byte starting with 0xD7 and above is bidi
                //
                // In the three-byte arms below, fewer than four bytes remain
                // from `read`, so a complete three-byte sequence ends exactly
                // at the end of the input; a passing arm therefore falls
                // through to the `return false` after the match.
                0xE1 | 0xE3..=0xEC | 0xEE => {
                    // Three-byte normal
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                }
                0xE2 => {
                    // Three-byte normal, potentially bidi
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                    if second == 0x80 {
                        if third == 0x8F || third == 0xAB || third == 0xAE {
                            return true;
                        }
                    } else if second == 0x81 {
                        if third == 0xA7 {
                            return true;
                        }
                    }
                }
                0xEF => {
                    // Three-byte normal, potentially bidi
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                    if in_inclusive_range8(second, 0xAC, 0xB7) {
                        if second == 0xAC {
                            if third > 0x9C {
                                return true;
                            }
                        } else {
                            return true;
                        }
                    } else if in_inclusive_range8(second, 0xB9, 0xBB) {
                        if second == 0xB9 {
                            if third > 0xAF {
                                return true;
                            }
                        } else if second == 0xBB {
                            if third != 0xBF {
                                return true;
                            }
                        } else {
                            return true;
                        }
                    }
                }
                0xE0 => {
                    // Three-byte special lower bound, potentially bidi
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                    // XXX can this be folded into the above validity check
                    if second < 0xA4 {
                        return true;
                    }
                }
                0xED => {
                    // Three-byte special upper bound
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                }
                _ => {
                    // Invalid lead, 4-byte lead or 2-byte bidi-only lead
                    return true;
                }
            }
            // A valid, non-bidi three-byte sequence ended the input.
            return false;
        } else {
            // The remainder is all ASCII.
            return false;
        }
    }
}
1112
/// Checks whether a valid UTF-8 buffer contains code points that trigger
/// right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. it is treated as right-to-left). Additionally,
/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
/// for. Control characters that are technically bidi controls but do not
/// cause right-to-left behavior without the presence of right-to-left
/// characters or right-to-left controls are not checked for. As a special
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
///
/// Returns `true` as soon as a byte sequence in one of the ranges described
/// above is found, and `false` if the end of the buffer is reached without
/// finding one.
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
#[inline]
pub fn is_str_bidi(buffer: &str) -> bool {
    // UTF-8 byte sequences of the code points at the edges of the ranges
    // that are checked below:
    //
    // U+058F: D6 8F
    // U+0590: D6 90
    // U+08FF: E0 A3 BF
    // U+0900: E0 A4 80
    //
    // U+200F: E2 80 8F
    // U+202B: E2 80 AB
    // U+202E: E2 80 AE
    // U+2067: E2 81 A7
    //
    // U+FB1C: EF AC 9C
    // U+FB1D: EF AC 9D
    // U+FDFF: EF B7 BF
    // U+FE00: EF B8 80
    //
    // U+FE6F: EF B9 AF
    // U+FE70: EF B9 B0
    // U+FEFE: EF BB BE
    // U+FEFF: EF BB BF
    //
    // U+107FF: F0 90 9F BF
    // U+10800: F0 90 A0 80
    // U+10FFF: F0 90 BF BF
    // U+11000: F0 91 80 80
    //
    // U+1E7FF: F0 9E 9F BF
    // U+1E800: F0 9E A0 80
    // U+1EFFF: F0 9E BF BF
    // U+1F000: F0 9F 80 80
    let mut bytes = buffer.as_bytes();
    'outer: loop {
        // TODO: Instead of just validating ASCII using SIMD, use SIMD
        // to check for non-ASCII lead bytes, too, to quickly conclude
        // that the vector consist entirely of CJK and below-Hebrew
        // code points.
        // Unfortunately, scripts above Arabic but below CJK share
        // lead bytes with RTL.
        //
        // `validate_ascii` yields `None` when the rest of the buffer is
        // ASCII; otherwise it is used here as yielding the first non-ASCII
        // byte together with its index in `bytes`.
        if let Some((mut byte, mut read)) = validate_ascii(bytes) {
            'inner: loop {
                // At this point, `byte` is not included in `read`.
                // Since the input is a `&str`, the trail bytes implied by a
                // lead byte are present, so the indexing below stays in
                // bounds.
                if byte < 0xE0 {
                    if byte >= 0x80 {
                        // Two-byte
                        // Adding `unlikely` here improved throughput on
                        // Russian plain text by 33%!
                        if unsafe { unlikely(byte >= 0xD6) } {
                            if byte == 0xD6 {
                                // U+0590 is D6 90, so only trail bytes above
                                // 0x8F put a D6-led sequence in RTL territory.
                                let second = bytes[read + 1];
                                if second > 0x8F {
                                    return true;
                                }
                            } else {
                                // Every two-byte lead above D6 encodes a code
                                // point between U+0590 and U+08FF.
                                return true;
                            }
                        }
                        read += 2;
                    } else {
                        // ASCII: nothing to check; skip the byte and go back
                        // to the ASCII fast path of the outer loop.
                        read += 1;
                        // Intuitively, we should go back to the outer loop only
                        // if byte is 0x30 or above, so as to avoid trashing on
                        // ASCII space, comma and period in non-Latin context.
                        // However, the extra branch seems to cost more than it's
                        // worth.
                        bytes = &bytes[read..];
                        continue 'outer;
                    }
                } else if byte < 0xF0 {
                    // Three-byte
                    // Only the leads E0, E2 and EF can start an RTL sequence;
                    // E1 and E3..=EE are skipped without looking at trail
                    // bytes.
                    if unsafe { unlikely(!in_inclusive_range8(byte, 0xE3, 0xEE) && byte != 0xE1) } {
                        let second = bytes[read + 1];
                        if byte == 0xE0 {
                            // E0 A4 80 is U+0900; anything below is still in
                            // the U+0590...U+08FF range.
                            if second < 0xA4 {
                                return true;
                            }
                        } else if byte == 0xE2 {
                            // The four RIGHT-TO-LEFT controls of General
                            // Punctuation.
                            let third = bytes[read + 2];
                            if second == 0x80 {
                                if third == 0x8F || third == 0xAB || third == 0xAE {
                                    return true;
                                }
                            } else if second == 0x81 {
                                if third == 0xA7 {
                                    return true;
                                }
                            }
                        } else {
                            debug_assert_eq!(byte, 0xEF);
                            if in_inclusive_range8(second, 0xAC, 0xB7) {
                                // U+FB1D...U+FDFF: Hebrew presentation forms
                                // and Arabic Presentation Forms-A.
                                if second == 0xAC {
                                    let third = bytes[read + 2];
                                    if third > 0x9C {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            } else if in_inclusive_range8(second, 0xB9, 0xBB) {
                                // U+FE70...U+FEFE: Arabic Presentation
                                // Forms-B without U+FEFF (EF BB BF).
                                if second == 0xB9 {
                                    let third = bytes[read + 2];
                                    if third > 0xAF {
                                        return true;
                                    }
                                } else if second == 0xBB {
                                    let third = bytes[read + 2];
                                    if third != 0xBF {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            }
                        }
                    }
                    read += 3;
                } else {
                    // Four-byte
                    // Only F0 90 and F0 9E can start an astral RTL sequence;
                    // within those, a third byte of A0 or above selects
                    // U+10800...U+10FFF or U+1E800...U+1EFFF.
                    let second = bytes[read + 1];
                    if unsafe { unlikely(byte == 0xF0 && (second == 0x90 || second == 0x9E)) } {
                        let third = bytes[read + 2];
                        if third >= 0xA0 {
                            return true;
                        }
                    }
                    read += 4;
                }
                // The comparison is always < or == and never >, but including
                // > here to let the compiler assume that < is true if this
                // comparison is false.
                if read >= bytes.len() {
                    return false;
                }
                byte = bytes[read];
                continue 'inner;
            }
        } else {
            // The remainder is pure ASCII, which is never right-to-left.
            return false;
        }
    }
}
1268
/// Checks whether a UTF-16 buffer contains code points that trigger
/// right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. it is treated as right-to-left). Additionally,
/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
/// for. Control characters that are technically bidi controls but do not
/// cause right-to-left behavior without the presence of right-to-left
/// characters or right-to-left controls are not checked for. As a special
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
///
/// Returns `true` if the input contains an RTL character or an unpaired
/// high surrogate that could be the high half of an RTL character.
/// Returns `false` if the input contains neither RTL characters nor
/// unpaired high surrogates that could be higher halves of RTL characters.
pub fn is_utf16_bidi(buffer: &[u16]) -> bool {
    // Thin public entry point: the actual scan is done by
    // `is_utf16_bidi_impl`, whose result is returned unchanged.
    is_utf16_bidi_impl(buffer)
}
1289
1290 /// Checks whether a scalar value triggers right-to-left processing.
1291 ///
1292 /// The check is done on a Unicode block basis without regard to assigned
1293 /// vs. unassigned code points in the block. Hebrew presentation forms in
1294 /// the Alphabetic Presentation Forms block are treated as if they formed
1295 /// a block on their own (i.e. it treated as right-to-left). Additionally,
1296 /// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
1297 /// for. Control characters that are technically bidi controls but do not
1298 /// cause right-to-left behavior without the presence of right-to-left
1299 /// characters or right-to-left controls are not checked for. As a special
1300 /// case, U+FEFF is excluded from Arabic Presentation Forms-B.
1301 #[inline(always)]
is_char_bidi(c: char) -> bool1302 pub fn is_char_bidi(c: char) -> bool {
1303 // Controls:
1304 // Every control with RIGHT-TO-LEFT in its name in
1305 // https://www.unicode.org/charts/PDF/U2000.pdf
1306 // U+200F RLM
1307 // U+202B RLE
1308 // U+202E RLO
1309 // U+2067 RLI
1310 //
1311 // BMP RTL:
1312 // https://www.unicode.org/roadmaps/bmp/
1313 // U+0590...U+08FF
1314 // U+FB1D...U+FDFF Hebrew presentation forms and
1315 // Arabic Presentation Forms A
1316 // U+FE70...U+FEFE Arabic Presentation Forms B (excl. BOM)
1317 //
1318 // Supplementary RTL:
1319 // https://www.unicode.org/roadmaps/smp/
1320 // U+10800...U+10FFF (Lead surrogate U+D802 or U+D803)
1321 // U+1E800...U+1EFFF (Lead surrogate U+D83A or U+D83B)
1322 let code_point = u32::from(c);
1323 if code_point < 0x0590 {
1324 // Below Hebrew
1325 return false;
1326 }
1327 if in_range32(code_point, 0x0900, 0xFB1D) {
1328 // Above Arabic Extended-A and below Hebrew presentation forms
1329 if in_inclusive_range32(code_point, 0x200F, 0x2067) {
1330 // In the range that contains the RTL controls
1331 return code_point == 0x200F
1332 || code_point == 0x202B
1333 || code_point == 0x202E
1334 || code_point == 0x2067;
1335 }
1336 return false;
1337 }
1338 if code_point > 0x1EFFF {
1339 // Above second astral RTL. (Emoji is here.)
1340 return false;
1341 }
1342 if in_range32(code_point, 0x11000, 0x1E800) {
1343 // Between astral RTL blocks
1344 return false;
1345 }
1346 if in_range32(code_point, 0xFEFF, 0x10800) {
1347 // Above Arabic Presentations Forms B (excl. BOM) and below first
1348 // astral RTL
1349 return false;
1350 }
1351 if in_range32(code_point, 0xFE00, 0xFE70) {
1352 // Between Arabic Presentations Forms
1353 return false;
1354 }
1355 true
1356 }
1357
1358 /// Checks whether a UTF-16 code unit triggers right-to-left processing.
1359 ///
1360 /// The check is done on a Unicode block basis without regard to assigned
1361 /// vs. unassigned code points in the block. Hebrew presentation forms in
1362 /// the Alphabetic Presentation Forms block are treated as if they formed
1363 /// a block on their own (i.e. it treated as right-to-left). Additionally,
1364 /// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
1365 /// for. Control characters that are technically bidi controls but do not
1366 /// cause right-to-left behavior without the presence of right-to-left
1367 /// characters or right-to-left controls are not checked for. As a special
1368 /// case, U+FEFF is excluded from Arabic Presentation Forms-B.
1369 ///
1370 /// Since supplementary-plane right-to-left blocks are identifiable from the
1371 /// high surrogate without examining the low surrogate, this function returns
1372 /// `true` for such high surrogates making the function suitable for handling
1373 /// supplementary-plane text without decoding surrogate pairs to scalar
1374 /// values. Obviously, such high surrogates are then reported as right-to-left
1375 /// even if actually unpaired.
1376 #[inline(always)]
is_utf16_code_unit_bidi(u: u16) -> bool1377 pub fn is_utf16_code_unit_bidi(u: u16) -> bool {
1378 if u < 0x0590 {
1379 // Below Hebrew
1380 return false;
1381 }
1382 if in_range16(u, 0x0900, 0xD802) {
1383 // Above Arabic Extended-A and below first RTL surrogate
1384 if in_inclusive_range16(u, 0x200F, 0x2067) {
1385 // In the range that contains the RTL controls
1386 return u == 0x200F || u == 0x202B || u == 0x202E || u == 0x2067;
1387 }
1388 return false;
1389 }
1390 if in_range16(u, 0xD83C, 0xFB1D) {
1391 // Between astral RTL high surrogates and Hebrew presentation forms
1392 // (Emoji is here)
1393 return false;
1394 }
1395 if in_range16(u, 0xD804, 0xD83A) {
1396 // Between RTL high surragates
1397 return false;
1398 }
1399 if u > 0xFEFE {
1400 // Above Arabic Presentation Forms (excl. BOM)
1401 return false;
1402 }
1403 if in_range16(u, 0xFE00, 0xFE70) {
1404 // Between Arabic Presentations Forms
1405 return false;
1406 }
1407 true
1408 }
1409
1410 /// Checks whether a potentially invalid UTF-8 buffer contains code points
1411 /// that trigger right-to-left processing or is all-Latin1.
1412 ///
1413 /// Possibly more efficient than performing the checks separately.
1414 ///
1415 /// Returns `Latin1Bidi::Latin1` if `is_utf8_latin1()` would return `true`.
1416 /// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf8_bidi()` would return
1417 /// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
check_utf8_for_latin1_and_bidi(buffer: &[u8]) -> Latin1Bidi1418 pub fn check_utf8_for_latin1_and_bidi(buffer: &[u8]) -> Latin1Bidi {
1419 if let Some(offset) = is_utf8_latin1_impl(buffer) {
1420 if is_utf8_bidi(&buffer[offset..]) {
1421 Latin1Bidi::Bidi
1422 } else {
1423 Latin1Bidi::LeftToRight
1424 }
1425 } else {
1426 Latin1Bidi::Latin1
1427 }
1428 }
1429
1430 /// Checks whether a valid UTF-8 buffer contains code points
1431 /// that trigger right-to-left processing or is all-Latin1.
1432 ///
1433 /// Possibly more efficient than performing the checks separately.
1434 ///
1435 /// Returns `Latin1Bidi::Latin1` if `is_str_latin1()` would return `true`.
1436 /// Otherwise, returns `Latin1Bidi::Bidi` if `is_str_bidi()` would return
1437 /// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
check_str_for_latin1_and_bidi(buffer: &str) -> Latin1Bidi1438 pub fn check_str_for_latin1_and_bidi(buffer: &str) -> Latin1Bidi {
1439 // The transition from the latin1 check to the bidi check isn't
1440 // optimal but not tweaking it to perfection today.
1441 if let Some(offset) = is_str_latin1_impl(buffer) {
1442 if is_str_bidi(&buffer[offset..]) {
1443 Latin1Bidi::Bidi
1444 } else {
1445 Latin1Bidi::LeftToRight
1446 }
1447 } else {
1448 Latin1Bidi::Latin1
1449 }
1450 }
1451
/// Checks whether a potentially invalid UTF-16 buffer contains code points
/// that trigger right-to-left processing or is all-Latin1.
///
/// Possibly more efficient than performing the checks separately.
///
/// Returns `Latin1Bidi::Latin1` if `is_utf16_latin1()` would return `true`.
/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf16_bidi()` would return
/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
pub fn check_utf16_for_latin1_and_bidi(buffer: &[u16]) -> Latin1Bidi {
    // Thin public entry point: the combined check is done by
    // `check_utf16_for_latin1_and_bidi_impl`, whose result is returned
    // unchanged.
    check_utf16_for_latin1_and_bidi_impl(buffer)
}
1463
1464 /// Converts potentially-invalid UTF-8 to valid UTF-16 with errors replaced
1465 /// with the REPLACEMENT CHARACTER.
1466 ///
1467 /// The length of the destination buffer must be at least the length of the
1468 /// source buffer _plus one_.
1469 ///
1470 /// Returns the number of `u16`s written.
1471 ///
1472 /// # Panics
1473 ///
1474 /// Panics if the destination buffer is shorter than stated above.
convert_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> usize1475 pub fn convert_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> usize {
1476 // TODO: Can the requirement for dst to be at least one unit longer
1477 // be eliminated?
1478 assert!(dst.len() > src.len());
1479 let mut decoder = Utf8Decoder::new_inner();
1480 let mut total_read = 0usize;
1481 let mut total_written = 0usize;
1482 loop {
1483 let (result, read, written) =
1484 decoder.decode_to_utf16_raw(&src[total_read..], &mut dst[total_written..], true);
1485 total_read += read;
1486 total_written += written;
1487 match result {
1488 DecoderResult::InputEmpty => {
1489 return total_written;
1490 }
1491 DecoderResult::OutputFull => {
1492 unreachable!("The assert at the top of the function should have caught this.");
1493 }
1494 DecoderResult::Malformed(_, _) => {
1495 // There should always be space for the U+FFFD, because
1496 // otherwise we'd have gotten OutputFull already.
1497 dst[total_written] = 0xFFFD;
1498 total_written += 1;
1499 }
1500 }
1501 }
1502 }
1503
/// Converts valid UTF-8 to valid UTF-16.
///
/// The length of the destination buffer must be at least the length of the
/// source buffer.
///
/// Returns the number of `u16`s written.
///
/// # Panics
///
/// Panics if the destination buffer is shorter than stated above.
pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize {
    assert!(
        dst.len() >= src.len(),
        "Destination must not be shorter than the source."
    );
    let bytes = src.as_bytes();
    // `read` indexes into `bytes`, `written` into `dst`. Every step below
    // advances `written` by no more than it advances `read`, which together
    // with the assert above is what the unchecked writes to `dst` rely on.
    let mut read = 0;
    let mut written = 0;
    'outer: loop {
        // Bulk-convert the leading ASCII run of what is left. Either all of
        // it was ASCII (`None`) and we are done, or we get the first
        // non-ASCII byte and the number of ASCII bytes handled before it.
        let mut byte = {
            let src_remaining = &bytes[read..];
            let dst_remaining = &mut dst[written..];
            let length = src_remaining.len();
            match unsafe {
                ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
            } {
                None => {
                    written += length;
                    return written;
                }
                Some((non_ascii, consumed)) => {
                    read += consumed;
                    written += consumed;
                    non_ascii
                }
            }
        };
        'inner: loop {
            // At this point, `byte` is not included in `read`.
            // The input is a `&str`, so the trail bytes implied by a lead
            // byte are relied upon to be present; that is why they are
            // read with `get_unchecked` and not validated.
            if byte < 0xE0 {
                if byte >= 0x80 {
                    // Two-byte
                    // 5 payload bits from the lead, 6 from the trail.
                    let second = unsafe { *(bytes.get_unchecked(read + 1)) };
                    let point = ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F);
                    unsafe { *(dst.get_unchecked_mut(written)) = point };
                    read += 2;
                    written += 1;
                } else {
                    // ASCII: write and go back to SIMD.
                    unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
                    read += 1;
                    written += 1;
                    // Intuitively, we should go back to the outer loop only
                    // if byte is 0x30 or above, so as to avoid trashing on
                    // ASCII space, comma and period in non-Latin context.
                    // However, the extra branch seems to cost more than it's
                    // worth.
                    continue 'outer;
                }
            } else if byte < 0xF0 {
                // Three-byte
                // 4 payload bits from the lead, 6 from each trail byte.
                let second = unsafe { *(bytes.get_unchecked(read + 1)) };
                let third = unsafe { *(bytes.get_unchecked(read + 2)) };
                let point = ((u16::from(byte) & 0xF) << 12)
                    | ((u16::from(second) & 0x3F) << 6)
                    | (u16::from(third) & 0x3F);
                unsafe { *(dst.get_unchecked_mut(written)) = point };
                read += 3;
                written += 1;
            } else {
                // Four-byte
                // 3 payload bits from the lead, 6 from each trail byte; the
                // result is emitted as a surrogate pair.
                let second = unsafe { *(bytes.get_unchecked(read + 1)) };
                let third = unsafe { *(bytes.get_unchecked(read + 2)) };
                let fourth = unsafe { *(bytes.get_unchecked(read + 3)) };
                let point = ((u32::from(byte) & 0x7) << 18)
                    | ((u32::from(second) & 0x3F) << 12)
                    | ((u32::from(third) & 0x3F) << 6)
                    | (u32::from(fourth) & 0x3F);
                // 0xD7C0 is 0xD800 - (0x10000 >> 10): the subtraction of
                // 0x10000 is folded into the high surrogate base.
                unsafe { *(dst.get_unchecked_mut(written)) = (0xD7C0 + (point >> 10)) as u16 };
                unsafe {
                    *(dst.get_unchecked_mut(written + 1)) = (0xDC00 + (point & 0x3FF)) as u16
                };
                read += 4;
                written += 2;
            }
            // The comparison is always < or == and never >, but including
            // > here to let the compiler assume that < is true if this
            // comparison is false.
            if read >= src.len() {
                return written;
            }
            byte = bytes[read];
            continue 'inner;
        }
    }
}
1600
1601 /// Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error.
1602 ///
1603 /// The length of the destination buffer must be at least the length of the
1604 /// source buffer.
1605 ///
1606 /// Returns the number of `u16`s written or `None` if the input was invalid.
1607 ///
1608 /// When the input was invalid, some output may have been written.
1609 ///
1610 /// # Panics
1611 ///
1612 /// Panics if the destination buffer is shorter than stated above.
convert_utf8_to_utf16_without_replacement(src: &[u8], dst: &mut [u16]) -> Option<usize>1613 pub fn convert_utf8_to_utf16_without_replacement(src: &[u8], dst: &mut [u16]) -> Option<usize> {
1614 assert!(
1615 dst.len() >= src.len(),
1616 "Destination must not be shorter than the source."
1617 );
1618 let (read, written) = convert_utf8_to_utf16_up_to_invalid(src, dst);
1619 if read == src.len() {
1620 return Some(written);
1621 }
1622 None
1623 }
1624
1625 /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1626 /// with the REPLACEMENT CHARACTER with potentially insufficient output
1627 /// space.
1628 ///
1629 /// Returns the number of code units read and the number of bytes written.
1630 ///
1631 /// Guarantees that the bytes in the destination beyond the number of
1632 /// bytes claimed as written by the second item of the return tuple
1633 /// are left unmodified.
1634 ///
1635 /// Not all code units are read if there isn't enough output space.
1636 ///
1637 /// Note that this method isn't designed for general streamability but for
1638 /// not allocating memory for the worst case up front. Specifically,
1639 /// if the input starts with or ends with an unpaired surrogate, those are
1640 /// replaced with the REPLACEMENT CHARACTER.
1641 ///
1642 /// Matches the semantics of `TextEncoder.encodeInto()` from the
1643 /// Encoding Standard.
1644 ///
1645 /// # Safety
1646 ///
1647 /// If you want to convert into a `&mut str`, use
1648 /// `convert_utf16_to_str_partial()` instead of using this function
1649 /// together with the `unsafe` method `as_bytes_mut()` on `&mut str`.
1650 #[inline(always)]
convert_utf16_to_utf8_partial(src: &[u16], dst: &mut [u8]) -> (usize, usize)1651 pub fn convert_utf16_to_utf8_partial(src: &[u16], dst: &mut [u8]) -> (usize, usize) {
1652 // The two functions called below are marked `inline(never)` to make
1653 // transitions from the hot part (first function) into the cold part
1654 // (second function) go through a return and another call to discouge
1655 // the CPU from speculating from the hot code into the cold code.
1656 // Letting the transitions be mere intra-function jumps, even to
1657 // basic blocks out-of-lined to the end of the function would wipe
1658 // away a quarter of Arabic encode performance on Haswell!
1659 let (read, written) = convert_utf16_to_utf8_partial_inner(src, dst);
1660 if unsafe { likely(read == src.len()) } {
1661 return (read, written);
1662 }
1663 let (tail_read, tail_written) =
1664 convert_utf16_to_utf8_partial_tail(&src[read..], &mut dst[written..]);
1665 (read + tail_read, written + tail_written)
1666 }
1667
1668 /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1669 /// with the REPLACEMENT CHARACTER.
1670 ///
1671 /// The length of the destination buffer must be at least the length of the
1672 /// source buffer times three.
1673 ///
1674 /// Returns the number of bytes written.
1675 ///
1676 /// # Panics
1677 ///
1678 /// Panics if the destination buffer is shorter than stated above.
1679 ///
1680 /// # Safety
1681 ///
1682 /// If you want to convert into a `&mut str`, use `convert_utf16_to_str()`
1683 /// instead of using this function together with the `unsafe` method
1684 /// `as_bytes_mut()` on `&mut str`.
1685 #[inline(always)]
convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize1686 pub fn convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize {
1687 assert!(dst.len() >= src.len() * 3);
1688 let (read, written) = convert_utf16_to_utf8_partial(src, dst);
1689 debug_assert_eq!(read, src.len());
1690 written
1691 }
1692
1693 /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1694 /// with the REPLACEMENT CHARACTER such that the validity of the output is
1695 /// signaled using the Rust type system with potentially insufficient output
1696 /// space.
1697 ///
1698 /// Returns the number of code units read and the number of bytes written.
1699 ///
1700 /// Not all code units are read if there isn't enough output space.
1701 ///
1702 /// Note that this method isn't designed for general streamability but for
1703 /// not allocating memory for the worst case up front. Specifically,
1704 /// if the input starts with or ends with an unpaired surrogate, those are
1705 /// replaced with the REPLACEMENT CHARACTER.
convert_utf16_to_str_partial(src: &[u16], dst: &mut str) -> (usize, usize)1706 pub fn convert_utf16_to_str_partial(src: &[u16], dst: &mut str) -> (usize, usize) {
1707 let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
1708 let (read, written) = convert_utf16_to_utf8_partial(src, bytes);
1709 let len = bytes.len();
1710 let mut trail = written;
1711 while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
1712 bytes[trail] = 0;
1713 trail += 1;
1714 }
1715 (read, written)
1716 }
1717
1718 /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1719 /// with the REPLACEMENT CHARACTER such that the validity of the output is
1720 /// signaled using the Rust type system.
1721 ///
1722 /// The length of the destination buffer must be at least the length of the
1723 /// source buffer times three.
1724 ///
1725 /// Returns the number of bytes written.
1726 ///
1727 /// # Panics
1728 ///
1729 /// Panics if the destination buffer is shorter than stated above.
1730 #[inline(always)]
convert_utf16_to_str(src: &[u16], dst: &mut str) -> usize1731 pub fn convert_utf16_to_str(src: &[u16], dst: &mut str) -> usize {
1732 assert!(dst.len() >= src.len() * 3);
1733 let (read, written) = convert_utf16_to_str_partial(src, dst);
1734 debug_assert_eq!(read, src.len());
1735 written
1736 }
1737
1738 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1739 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-16.
1740 ///
1741 /// The length of the destination buffer must be at least the length of the
1742 /// source buffer.
1743 ///
1744 /// The number of `u16`s written equals the length of the source buffer.
1745 ///
1746 /// # Panics
1747 ///
1748 /// Panics if the destination buffer is shorter than stated above.
convert_latin1_to_utf16(src: &[u8], dst: &mut [u16])1749 pub fn convert_latin1_to_utf16(src: &[u8], dst: &mut [u16]) {
1750 assert!(
1751 dst.len() >= src.len(),
1752 "Destination must not be shorter than the source."
1753 );
1754 // TODO: On aarch64, the safe version autovectorizes to the same unpacking
1755 // instructions and this code, but, yet, the autovectorized version is
1756 // faster.
1757 unsafe {
1758 unpack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len());
1759 }
1760 }
1761
1762 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1763 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient
1764 /// output space.
1765 ///
1766 /// Returns the number of bytes read and the number of bytes written.
1767 ///
1768 /// If the output isn't large enough, not all input is consumed.
1769 ///
1770 /// # Safety
1771 ///
1772 /// If you want to convert into a `&mut str`, use
1773 /// `convert_utf16_to_str_partial()` instead of using this function
1774 /// together with the `unsafe` method `as_bytes_mut()` on `&mut str`.
convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usize)1775 pub fn convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usize) {
1776 let src_len = src.len();
1777 let src_ptr = src.as_ptr();
1778 let dst_ptr = dst.as_mut_ptr();
1779 let dst_len = dst.len();
1780 let mut total_read = 0usize;
1781 let mut total_written = 0usize;
1782 loop {
1783 // src can't advance more than dst
1784 let src_left = src_len - total_read;
1785 let dst_left = dst_len - total_written;
1786 let min_left = ::core::cmp::min(src_left, dst_left);
1787 if let Some((non_ascii, consumed)) = unsafe {
1788 ascii_to_ascii(
1789 src_ptr.add(total_read),
1790 dst_ptr.add(total_written),
1791 min_left,
1792 )
1793 } {
1794 total_read += consumed;
1795 total_written += consumed;
1796 if total_written.checked_add(2).unwrap() > dst_len {
1797 return (total_read, total_written);
1798 }
1799
1800 total_read += 1; // consume `non_ascii`
1801
1802 dst[total_written] = (non_ascii >> 6) | 0xC0;
1803 total_written += 1;
1804 dst[total_written] = (non_ascii & 0x3F) | 0x80;
1805 total_written += 1;
1806 continue;
1807 }
1808 return (total_read + min_left, total_written + min_left);
1809 }
1810 }
1811
1812 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1813 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
1814 ///
1815 /// The length of the destination buffer must be at least the length of the
1816 /// source buffer times two.
1817 ///
1818 /// Returns the number of bytes written.
1819 ///
1820 /// # Panics
1821 ///
1822 /// Panics if the destination buffer is shorter than stated above.
1823 ///
1824 /// # Safety
1825 ///
1826 /// Note that this function may write garbage beyond the number of bytes
1827 /// indicated by the return value, so using a `&mut str` interpreted as
1828 /// `&mut [u8]` as the destination is not safe. If you want to convert into
1829 /// a `&mut str`, use `convert_utf16_to_str()` instead of this function.
1830 #[inline]
convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize1831 pub fn convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize {
1832 assert!(
1833 dst.len() >= src.len() * 2,
1834 "Destination must not be shorter than the source times two."
1835 );
1836 let (read, written) = convert_latin1_to_utf8_partial(src, dst);
1837 debug_assert_eq!(read, src.len());
1838 written
1839 }
1840
1841 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1842 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the
1843 /// output is signaled using the Rust type system with potentially insufficient
1844 /// output space.
1845 ///
1846 /// Returns the number of bytes read and the number of bytes written.
1847 ///
1848 /// If the output isn't large enough, not all input is consumed.
1849 #[inline]
convert_latin1_to_str_partial(src: &[u8], dst: &mut str) -> (usize, usize)1850 pub fn convert_latin1_to_str_partial(src: &[u8], dst: &mut str) -> (usize, usize) {
1851 let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
1852 let (read, written) = convert_latin1_to_utf8_partial(src, bytes);
1853 let len = bytes.len();
1854 let mut trail = written;
1855 let max = ::core::cmp::min(len, trail + MAX_STRIDE_SIZE);
1856 while trail < max {
1857 bytes[trail] = 0;
1858 trail += 1;
1859 }
1860 while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
1861 bytes[trail] = 0;
1862 trail += 1;
1863 }
1864 (read, written)
1865 }
1866
1867 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1868 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the
1869 /// output is signaled using the Rust type system.
1870 ///
1871 /// The length of the destination buffer must be at least the length of the
1872 /// source buffer times two.
1873 ///
1874 /// Returns the number of bytes written.
1875 ///
1876 /// # Panics
1877 ///
1878 /// Panics if the destination buffer is shorter than stated above.
1879 #[inline]
convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize1880 pub fn convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize {
1881 assert!(
1882 dst.len() >= src.len() * 2,
1883 "Destination must not be shorter than the source times two."
1884 );
1885 let (read, written) = convert_latin1_to_str_partial(src, dst);
1886 debug_assert_eq!(read, src.len());
1887 written
1888 }
1889
1890 /// If the input is valid UTF-8 representing only Unicode code points from
1891 /// U+0000 to U+00FF, inclusive, converts the input into output that
1892 /// represents the value of each code point as the unsigned byte value of
1893 /// each output byte.
1894 ///
1895 /// If the input does not fulfill the condition stated above, this function
1896 /// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
1897 /// does something that is memory-safe without any promises about any
1898 /// properties of the output. In particular, callers shouldn't assume the
1899 /// output to be the same across crate versions or CPU architectures and
1900 /// should not assume that non-ASCII input can't map to ASCII output.
1901 ///
1902 /// The length of the destination buffer must be at least the length of the
1903 /// source buffer.
1904 ///
1905 /// Returns the number of bytes written.
1906 ///
1907 /// # Panics
1908 ///
1909 /// Panics if the destination buffer is shorter than stated above.
1910 ///
1911 /// If debug assertions are enabled (and not fuzzing) and the input is
1912 /// not in the range U+0000 to U+00FF, inclusive.
convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize1913 pub fn convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize {
1914 assert!(
1915 dst.len() >= src.len(),
1916 "Destination must not be shorter than the source."
1917 );
1918 non_fuzz_debug_assert!(is_utf8_latin1(src));
1919 let src_len = src.len();
1920 let src_ptr = src.as_ptr();
1921 let dst_ptr = dst.as_mut_ptr();
1922 let mut total_read = 0usize;
1923 let mut total_written = 0usize;
1924 loop {
1925 // dst can't advance more than src
1926 let src_left = src_len - total_read;
1927 if let Some((non_ascii, consumed)) = unsafe {
1928 ascii_to_ascii(
1929 src_ptr.add(total_read),
1930 dst_ptr.add(total_written),
1931 src_left,
1932 )
1933 } {
1934 total_read += consumed + 1;
1935 total_written += consumed;
1936
1937 if total_read == src_len {
1938 return total_written;
1939 }
1940
1941 let trail = src[total_read];
1942 total_read += 1;
1943
1944 dst[total_written] = ((non_ascii & 0x1F) << 6) | (trail & 0x3F);
1945 total_written += 1;
1946 continue;
1947 }
1948 return total_written + src_left;
1949 }
1950 }
1951
1952 /// If the input is valid UTF-16 representing only Unicode code points from
1953 /// U+0000 to U+00FF, inclusive, converts the input into output that
1954 /// represents the value of each code point as the unsigned byte value of
1955 /// each output byte.
1956 ///
1957 /// If the input does not fulfill the condition stated above, does something
1958 /// that is memory-safe without any promises about any properties of the
1959 /// output and will probably assert in debug builds in future versions.
1960 /// In particular, callers shouldn't assume the output to be the same across
1961 /// crate versions or CPU architectures and should not assume that non-ASCII
1962 /// input can't map to ASCII output.
1963 ///
1964 /// The length of the destination buffer must be at least the length of the
1965 /// source buffer.
1966 ///
1967 /// The number of bytes written equals the length of the source buffer.
1968 ///
1969 /// # Panics
1970 ///
1971 /// Panics if the destination buffer is shorter than stated above.
1972 ///
1973 /// (Probably in future versions if debug assertions are enabled (and not
1974 /// fuzzing) and the input is not in the range U+0000 to U+00FF, inclusive.)
convert_utf16_to_latin1_lossy(src: &[u16], dst: &mut [u8])1975 pub fn convert_utf16_to_latin1_lossy(src: &[u16], dst: &mut [u8]) {
1976 assert!(
1977 dst.len() >= src.len(),
1978 "Destination must not be shorter than the source."
1979 );
1980 // non_fuzz_debug_assert!(is_utf16_latin1(src));
1981 unsafe {
1982 pack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len());
1983 }
1984 }
1985
1986 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1987 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
1988 ///
1989 /// Borrows if input is ASCII-only. Performs a single heap allocation
1990 /// otherwise.
decode_latin1<'a>(bytes: &'a [u8]) -> Cow<'a, str>1991 pub fn decode_latin1<'a>(bytes: &'a [u8]) -> Cow<'a, str> {
1992 let up_to = ascii_valid_up_to(bytes);
1993 // >= makes later things optimize better than ==
1994 if up_to >= bytes.len() {
1995 debug_assert_eq!(up_to, bytes.len());
1996 let s: &str = unsafe { ::core::str::from_utf8_unchecked(bytes) };
1997 return Cow::Borrowed(s);
1998 }
1999 let (head, tail) = bytes.split_at(up_to);
2000 let capacity = head.len() + tail.len() * 2;
2001 let mut vec = Vec::with_capacity(capacity);
2002 unsafe {
2003 vec.set_len(capacity);
2004 }
2005 (&mut vec[..up_to]).copy_from_slice(head);
2006 let written = convert_latin1_to_utf8(tail, &mut vec[up_to..]);
2007 vec.truncate(up_to + written);
2008 Cow::Owned(unsafe { String::from_utf8_unchecked(vec) })
2009 }
2010
2011 /// If the input is valid UTF-8 representing only Unicode code points from
2012 /// U+0000 to U+00FF, inclusive, converts the input into output that
2013 /// represents the value of each code point as the unsigned byte value of
2014 /// each output byte.
2015 ///
2016 /// If the input does not fulfill the condition stated above, this function
2017 /// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
2018 /// does something that is memory-safe without any promises about any
2019 /// properties of the output. In particular, callers shouldn't assume the
2020 /// output to be the same across crate versions or CPU architectures and
2021 /// should not assume that non-ASCII input can't map to ASCII output.
2022 ///
2023 /// Borrows if input is ASCII-only. Performs a single heap allocation
2024 /// otherwise.
encode_latin1_lossy<'a>(string: &'a str) -> Cow<'a, [u8]>2025 pub fn encode_latin1_lossy<'a>(string: &'a str) -> Cow<'a, [u8]> {
2026 let bytes = string.as_bytes();
2027 let up_to = ascii_valid_up_to(bytes);
2028 // >= makes later things optimize better than ==
2029 if up_to >= bytes.len() {
2030 debug_assert_eq!(up_to, bytes.len());
2031 return Cow::Borrowed(bytes);
2032 }
2033 let (head, tail) = bytes.split_at(up_to);
2034 let capacity = bytes.len();
2035 let mut vec = Vec::with_capacity(capacity);
2036 unsafe {
2037 vec.set_len(capacity);
2038 }
2039 (&mut vec[..up_to]).copy_from_slice(head);
2040 let written = convert_utf8_to_latin1_lossy(tail, &mut vec[up_to..]);
2041 vec.truncate(up_to + written);
2042 Cow::Owned(vec)
2043 }
2044
/// Returns the index of the first unpaired surrogate or, if the input is
/// valid UTF-16 in its entirety, the length of the input.
///
/// This is the public entry point; the scan itself is delegated to
/// `utf16_valid_up_to_impl`.
pub fn utf16_valid_up_to(buffer: &[u16]) -> usize {
    utf16_valid_up_to_impl(buffer)
}
2050
2051 /// Returns the index of first byte that starts an invalid byte
2052 /// sequence or a non-Latin1 byte sequence, or the length of the
2053 /// string if there are neither.
utf8_latin1_up_to(buffer: &[u8]) -> usize2054 pub fn utf8_latin1_up_to(buffer: &[u8]) -> usize {
2055 is_utf8_latin1_impl(buffer).unwrap_or(buffer.len())
2056 }
2057
2058 /// Returns the index of first byte that starts a non-Latin1 byte
2059 /// sequence, or the length of the string if there are none.
str_latin1_up_to(buffer: &str) -> usize2060 pub fn str_latin1_up_to(buffer: &str) -> usize {
2061 is_str_latin1_impl(buffer).unwrap_or(buffer.len())
2062 }
2063
2064 /// Replaces unpaired surrogates in the input with the REPLACEMENT CHARACTER.
2065 #[inline]
ensure_utf16_validity(buffer: &mut [u16])2066 pub fn ensure_utf16_validity(buffer: &mut [u16]) {
2067 let mut offset = 0;
2068 loop {
2069 offset += utf16_valid_up_to(&buffer[offset..]);
2070 if offset == buffer.len() {
2071 return;
2072 }
2073 buffer[offset] = 0xFFFD;
2074 offset += 1;
2075 }
2076 }
2077
2078 /// Copies ASCII from source to destination up to the first non-ASCII byte
2079 /// (or the end of the input if it is ASCII in its entirety).
2080 ///
2081 /// The length of the destination buffer must be at least the length of the
2082 /// source buffer.
2083 ///
2084 /// Returns the number of bytes written.
2085 ///
2086 /// # Panics
2087 ///
2088 /// Panics if the destination buffer is shorter than stated above.
copy_ascii_to_ascii(src: &[u8], dst: &mut [u8]) -> usize2089 pub fn copy_ascii_to_ascii(src: &[u8], dst: &mut [u8]) -> usize {
2090 assert!(
2091 dst.len() >= src.len(),
2092 "Destination must not be shorter than the source."
2093 );
2094 if let Some((_, consumed)) =
2095 unsafe { ascii_to_ascii(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2096 {
2097 consumed
2098 } else {
2099 src.len()
2100 }
2101 }
2102
2103 /// Copies ASCII from source to destination zero-extending it to UTF-16 up to
2104 /// the first non-ASCII byte (or the end of the input if it is ASCII in its
2105 /// entirety).
2106 ///
2107 /// The length of the destination buffer must be at least the length of the
2108 /// source buffer.
2109 ///
2110 /// Returns the number of `u16`s written.
2111 ///
2112 /// # Panics
2113 ///
2114 /// Panics if the destination buffer is shorter than stated above.
copy_ascii_to_basic_latin(src: &[u8], dst: &mut [u16]) -> usize2115 pub fn copy_ascii_to_basic_latin(src: &[u8], dst: &mut [u16]) -> usize {
2116 assert!(
2117 dst.len() >= src.len(),
2118 "Destination must not be shorter than the source."
2119 );
2120 if let Some((_, consumed)) =
2121 unsafe { ascii_to_basic_latin(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2122 {
2123 consumed
2124 } else {
2125 src.len()
2126 }
2127 }
2128
2129 /// Copies Basic Latin from source to destination narrowing it to ASCII up to
2130 /// the first non-Basic Latin code unit (or the end of the input if it is
2131 /// Basic Latin in its entirety).
2132 ///
2133 /// The length of the destination buffer must be at least the length of the
2134 /// source buffer.
2135 ///
2136 /// Returns the number of bytes written.
2137 ///
2138 /// # Panics
2139 ///
2140 /// Panics if the destination buffer is shorter than stated above.
copy_basic_latin_to_ascii(src: &[u16], dst: &mut [u8]) -> usize2141 pub fn copy_basic_latin_to_ascii(src: &[u16], dst: &mut [u8]) -> usize {
2142 assert!(
2143 dst.len() >= src.len(),
2144 "Destination must not be shorter than the source."
2145 );
2146 if let Some((_, consumed)) =
2147 unsafe { basic_latin_to_ascii(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2148 {
2149 consumed
2150 } else {
2151 src.len()
2152 }
2153 }
2154
2155 // Any copyright to the test code below this comment is dedicated to the
2156 // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
2157
2158 #[cfg(test)]
2159 mod tests {
2160 use super::*;
2161
// Every suffix of the full ASCII range must be reported as ASCII.
#[test]
fn test_is_ascii_success() {
    let ascii: Vec<u8> = (0..128u8).collect();
    for start in 0..ascii.len() {
        assert!(is_ascii(&ascii[start..]));
    }
}
2173
// A non-ASCII byte must be detected wherever it sits within a suffix.
#[test]
fn test_is_ascii_fail() {
    let mut bytes: Vec<u8> = (0..128u8).collect();
    for start in 0..bytes.len() {
        let suffix = &mut bytes[start..];
        for pos in 0..suffix.len() {
            // Overwrites accumulate: earlier positions stay non-ASCII.
            suffix[pos] = 0xA0;
            assert!(!is_ascii(suffix));
        }
    }
}
2189
// Every suffix of the full Basic Latin range must be reported as such.
#[test]
fn test_is_basic_latin_success() {
    let units: Vec<u16> = (0..128u16).collect();
    for start in 0..units.len() {
        assert!(is_basic_latin(&units[start..]));
    }
}
2201
// A non-Basic Latin code unit must be detected wherever it sits within a
// suffix.
#[test]
fn test_is_basic_latin_fail() {
    let mut units: Vec<u16> = (0..128u16).collect();
    for start in 0..units.len() {
        let suffix = &mut units[start..];
        for pos in 0..suffix.len() {
            // Overwrites accumulate: earlier positions stay non-Basic Latin.
            suffix[pos] = 0xA0;
            assert!(!is_basic_latin(suffix));
        }
    }
}
2217
// Every suffix of U+0000..=U+00FF as UTF-16 must be classified as Latin1 by
// both the plain check and the combined Latin1/bidi check.
#[test]
fn test_is_utf16_latin1_success() {
    let units: Vec<u16> = (0..256u16).collect();
    for start in 0..units.len() {
        let suffix = &units[start..];
        assert!(is_utf16_latin1(suffix));
        assert_eq!(check_utf16_for_latin1_and_bidi(suffix), Latin1Bidi::Latin1);
    }
}
2233
// A code unit above U+00FF must be detected wherever it sits within a
// suffix.
#[test]
fn test_is_utf16_latin1_fail() {
    let len: usize = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow
    let mut units: Vec<u16> = (0..len).map(|i| i as u16).collect();
    for start in 0..units.len() {
        let suffix = &mut units[start..];
        for pos in 0..suffix.len() {
            suffix[pos] = 0x100 + pos as u16;
            assert!(!is_utf16_latin1(suffix));
            assert_ne!(check_utf16_for_latin1_and_bidi(suffix), Latin1Bidi::Latin1);
        }
    }
}
2251
// Every suffix of the Latin1 range, converted to `String`, must be
// classified as Latin1.
#[test]
fn test_is_str_latin1_success() {
    let len: usize = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow
    let units: Vec<u16> = (0..len).map(|i| i as u16).collect();
    for start in 0..units.len() {
        let text = String::from_utf16(&units[start..]).unwrap();
        assert!(is_str_latin1(text.as_str()));
        assert_eq!(
            check_str_for_latin1_and_bidi(text.as_str()),
            Latin1Bidi::Latin1
        );
    }
}
2266
// A code point above U+00FF must be detected wherever it sits within the
// string.
#[test]
fn test_is_str_latin1_fail() {
    let len: usize = if cfg!(miri) { 32 } else { 256 }; // Miri is too slow
    let mut units: Vec<u16> = (0..len).map(|i| i as u16).collect();
    for start in 0..units.len() {
        let suffix = &mut units[start..];
        for pos in 0..suffix.len() {
            suffix[pos] = 0x100 + pos as u16;
            let text = String::from_utf16(suffix).unwrap();
            assert!(!is_str_latin1(text.as_str()));
            assert_ne!(
                check_str_for_latin1_and_bidi(text.as_str()),
                Latin1Bidi::Latin1
            );
        }
    }
}
2285
// Every suffix of the Latin1 range, encoded as UTF-8 bytes, must be
// classified as Latin1.
#[test]
fn test_is_utf8_latin1_success() {
    let len: usize = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow
    let units: Vec<u16> = (0..len).map(|i| i as u16).collect();
    for start in 0..units.len() {
        let text = String::from_utf16(&units[start..]).unwrap();
        let utf8 = text.as_bytes();
        assert!(is_utf8_latin1(utf8));
        assert_eq!(check_utf8_for_latin1_and_bidi(utf8), Latin1Bidi::Latin1);
    }
}
2303
// A code point above U+00FF must be detected wherever it sits within the
// UTF-8 input.
#[test]
fn test_is_utf8_latin1_fail() {
    let len: usize = if cfg!(miri) { 32 } else { 256 }; // Miri is too slow
    let mut units: Vec<u16> = (0..len).map(|i| i as u16).collect();
    for start in 0..units.len() {
        let suffix = &mut units[start..];
        for pos in 0..suffix.len() {
            suffix[pos] = 0x100 + pos as u16;
            let text = String::from_utf16(suffix).unwrap();
            let utf8 = text.as_bytes();
            assert!(!is_utf8_latin1(utf8));
            assert_ne!(check_utf8_for_latin1_and_bidi(utf8), Latin1Bidi::Latin1);
        }
    }
}
2325
// Malformed UTF-8 must never be classified as Latin1.
#[test]
fn test_is_utf8_latin1_invalid() {
    let malformed: [&[u8]; 6] = [
        b"\xC3",
        b"a\xC3",
        b"\xFF",
        b"a\xFF",
        b"\xC3\xFF",
        b"a\xC3\xFF",
    ];
    for &input in malformed.iter() {
        assert!(!is_utf8_latin1(input));
    }
}
2335
// Byte-slice input: the result must match the standard library's UTF-16
// encoding of the same text.
#[test]
fn test_convert_utf8_to_utf16() {
    let text = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    // This entry point is given one code unit more than the input length.
    let mut output = vec![0u16; text.len() + 1];
    let written = convert_utf8_to_utf16(text.as_bytes(), &mut output[..]);
    output.truncate(written);
    let expected: Vec<u16> = text.encode_utf16().collect();
    assert_eq!(output, expected);
}
2346
// `&str` input: the result must match the standard library's UTF-16
// encoding of the same text.
#[test]
fn test_convert_str_to_utf16() {
    let text = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    let mut output = vec![0u16; text.len()];
    let written = convert_str_to_utf16(text, &mut output[..]);
    output.truncate(written);
    let expected: Vec<u16> = text.encode_utf16().collect();
    assert_eq!(output, expected);
}
2357
// Converts in two steps: first into a deliberately short 24-byte window,
// then the rest with the non-partial function, and checks the combined
// output.
#[test]
fn test_convert_utf16_to_utf8_partial() {
    let expected = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    let units: Vec<u16> = expected.encode_utf16().collect();
    let mut output = vec![0u8; units.len() * 3 + 1];
    let (read, written) = convert_utf16_to_utf8_partial(&units[..], &mut output[..24]);
    let rest = convert_utf16_to_utf8(&units[read..], &mut output[written..]);
    output.truncate(written + rest);
    assert_eq!(output, expected.as_bytes());
}
2369
// Round trip: UTF-16 produced by the standard library must convert back to
// the original UTF-8 bytes.
#[test]
fn test_convert_utf16_to_utf8() {
    let expected = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    let units: Vec<u16> = expected.encode_utf16().collect();
    let mut output = vec![0u8; units.len() * 3 + 1];
    let written = convert_utf16_to_utf8(&units[..], &mut output[..]);
    output.truncate(written);
    assert_eq!(output, expected.as_bytes());
}
2380
// Each of the 256 byte values must map to the UTF-16 code unit of the same
// numeric value.
#[test]
fn test_convert_latin1_to_utf16() {
    let latin1: Vec<u8> = (0..=255u8).collect();
    let expected: Vec<u16> = (0..256u16).collect();
    let mut output = vec![0u16; latin1.len()];
    convert_latin1_to_utf16(&latin1[..], &mut output[..]);
    assert_eq!(output, expected);
}
2396
// With a two-byte destination, only the ASCII byte is expected to be
// converted: one byte read, one byte written.
#[test]
fn test_convert_latin1_to_utf8_partial() {
    // `[0u8; 2]` (two zeroed bytes) instead of the earlier `[0u8, 2]`, which
    // had the same length but a stray `2` as its second element.
    let mut dst = [0u8; 2];
    let (read, written) = convert_latin1_to_utf8_partial(b"a\xFF", &mut dst[..]);
    assert_eq!(read, 1);
    assert_eq!(written, 1);
}
2404
// All 256 byte values must convert to the same UTF-8 the standard library
// produces for U+0000..=U+00FF.
#[test]
fn test_convert_latin1_to_utf8() {
    let latin1: Vec<u8> = (0..=255u8).collect();
    let code_points: Vec<u16> = (0..256u16).collect();
    let expected = String::from_utf16(&code_points[..]).unwrap();
    let mut output = vec![0u8; latin1.len() * 2];
    let written = convert_latin1_to_utf8(&latin1[..], &mut output[..]);
    output.truncate(written);
    assert_eq!(&output[..], expected.as_bytes());
}
2422
// UTF-8 for U+0000..=U+00FF must convert to the 256 byte values in order.
#[test]
fn test_convert_utf8_to_latin1_lossy() {
    let expected: Vec<u8> = (0..=255u8).collect();
    let code_points: Vec<u16> = (0..256u16).collect();
    let text = String::from_utf16(&code_points[..]).unwrap();
    let mut output = vec![0u8; text.len()];
    let written = convert_utf8_to_latin1_lossy(text.as_bytes(), &mut output[..]);
    output.truncate(written);
    assert_eq!(output, expected);
}
2440
// Only built when the debug assertion inside the conversion is active:
// input above U+00FF must make it panic.
#[cfg(all(debug_assertions, not(fuzzing)))]
#[test]
#[should_panic]
fn test_convert_utf8_to_latin1_lossy_panics() {
    let out_of_range = "\u{100}";
    let mut output = [0u8; 16];
    let _ = convert_utf8_to_latin1_lossy(out_of_range.as_bytes(), &mut output[..]);
}
2448
// UTF-16 for U+0000..=U+00FF must convert to the 256 byte values in order.
#[test]
fn test_convert_utf16_to_latin1_lossy() {
    let units: Vec<u16> = (0..256u16).collect();
    let expected: Vec<u8> = (0..=255u8).collect();
    let mut output = vec![0u8; units.len()];
    convert_utf16_to_latin1_lossy(&units[..], &mut output[..]);
    assert_eq!(output, expected);
}
2464
// Despite the name, this currently only checks that out-of-range input is
// handled without panicking; `#[should_panic]` stays disabled.
#[test]
// #[should_panic]
fn test_convert_utf16_to_latin1_lossy_panics() {
    let out_of_range = [0x0100u16];
    let mut output = [0u8; 16];
    convert_utf16_to_latin1_lossy(&out_of_range[..], &mut output[..]);
}
2471
// Checks the reported index for fully valid input, a lone high surrogate, a
// lone low surrogate, and a lone high surrogate in the last position.
#[test]
fn test_utf16_valid_up_to() {
    // Every input is 16 code units long: zero padding followed by the tail
    // that matters.
    let padded = |tail: &[u16]| -> [u16; 16] {
        let mut units = [0u16; 16];
        units[16 - tail.len()..].copy_from_slice(tail);
        units
    };

    let valid = padded(&[0x2603, 0xD83D, 0xDCA9, 0x00B6]);
    assert_eq!(utf16_valid_up_to(&valid[..]), 16);

    let lone_high = padded(&[0x2603, 0xD83D, 0x00B6]);
    assert_eq!(utf16_valid_up_to(&lone_high[..]), 14);

    let lone_low = padded(&[0x2603, 0xDCA9, 0x00B6]);
    assert_eq!(utf16_valid_up_to(&lone_low[..]), 14);

    let lone_high_at_end = padded(&[0x2603, 0x00B6, 0xD83D]);
    assert_eq!(utf16_valid_up_to(&lone_high_at_end[..]), 15);
}
2495
// Unpaired surrogates must be replaced with U+FFFD while a proper surrogate
// pair is left untouched.
#[test]
fn test_ensure_utf16_validity() {
    let mut units = [0u16; 31];
    units[1] = 0xD83D; // lone high surrogate
    units[5] = 0xD83D; // high half of a valid pair
    units[6] = 0xDCA9; // low half of a valid pair
    units[13] = 0xDCA9; // lone low surrogate

    // Same as the input, except that the two lone surrogates are replaced.
    let mut expected = units;
    expected[1] = 0xFFFD;
    expected[13] = 0xFFFD;

    ensure_utf16_validity(&mut units[..]);
    assert_eq!(units, expected);
}
2511
// Table-driven check of `is_char_bidi` over characters expected to be
// non-bidi and characters expected to be bidi.
#[test]
fn test_is_char_bidi() {
    let not_bidi = [
        'a',
        '\u{03B1}',
        '\u{3041}',
        '\u{1F4A9}',
        '\u{FE00}',
        '\u{202C}',
        '\u{FEFF}',
    ];
    for &c in not_bidi.iter() {
        assert!(!is_char_bidi(c));
    }

    let bidi = [
        '\u{0590}',
        '\u{08FF}',
        '\u{061C}',
        '\u{FB50}',
        '\u{FDFF}',
        '\u{FE70}',
        '\u{FEFE}',
        '\u{200F}',
        '\u{202B}',
        '\u{202E}',
        '\u{2067}',
        '\u{10800}',
        '\u{10FFF}',
        '\u{1E800}',
        '\u{1EFFF}',
    ];
    for &c in bidi.iter() {
        assert!(is_char_bidi(c));
    }
}
2537
// Table-driven check of `is_utf16_code_unit_bidi` over code units expected
// to be non-bidi and code units expected to be bidi.
#[test]
fn test_is_utf16_code_unit_bidi() {
    let not_bidi: [u16; 7] = [0x0062, 0x03B1, 0x3041, 0xD801, 0xFE00, 0x202C, 0xFEFF];
    for &unit in not_bidi.iter() {
        assert!(!is_utf16_code_unit_bidi(unit));
    }

    let bidi: [u16; 16] = [
        0x0590, 0x08FF, 0x061C, 0xFB1D, 0xFB50, 0xFDFF, 0xFE70, 0xFEFE, 0x200F, 0x202B, 0x202E,
        0x2067, 0xD802, 0xD803, 0xD83A, 0xD83B,
    ];
    for &unit in bidi.iter() {
        assert!(is_utf16_code_unit_bidi(unit));
    }
}
2564
// Table-driven check of `is_str_bidi`: each character under test is placed
// between two runs of "abcdefghijklmnop".
#[test]
fn test_is_str_bidi() {
    let embed = |c: char| -> String {
        let mut text = String::from("abcdefghijklmnop");
        text.push(c);
        text.push_str("abcdefghijklmnop");
        text
    };

    let not_bidi = [
        'a',
        '\u{03B1}',
        '\u{3041}',
        '\u{1F4A9}',
        '\u{FE00}',
        '\u{202C}',
        '\u{FEFF}',
    ];
    for &c in not_bidi.iter() {
        assert!(!is_str_bidi(embed(c).as_str()));
    }

    let bidi = [
        '\u{0590}',
        '\u{08FF}',
        '\u{061C}',
        '\u{FB50}',
        '\u{FDFF}',
        '\u{FE70}',
        '\u{FEFE}',
        '\u{200F}',
        '\u{202B}',
        '\u{202E}',
        '\u{2067}',
        '\u{10800}',
        '\u{10FFF}',
        '\u{1E800}',
        '\u{1EFFF}',
    ];
    for &c in bidi.iter() {
        assert!(is_str_bidi(embed(c).as_str()));
    }
}
2590
// Table-driven check of `is_utf8_bidi`: each character under test is placed
// between two runs of "abcdefghijklmnop" and passed as UTF-8 bytes.
#[test]
fn test_is_utf8_bidi() {
    let embed = |c: char| -> String {
        let mut text = String::from("abcdefghijklmnop");
        text.push(c);
        text.push_str("abcdefghijklmnop");
        text
    };

    let not_bidi = [
        'a',
        '\u{03B1}',
        '\u{3041}',
        '\u{1F4A9}',
        '\u{FE00}',
        '\u{202C}',
        '\u{FEFF}',
    ];
    for &c in not_bidi.iter() {
        assert!(!is_utf8_bidi(embed(c).as_bytes()));
    }

    let bidi = [
        '\u{0590}',
        '\u{08FF}',
        '\u{061C}',
        '\u{FB50}',
        '\u{FDFF}',
        '\u{FE70}',
        '\u{FEFE}',
        '\u{200F}',
        '\u{202B}',
        '\u{202E}',
        '\u{2067}',
        '\u{10800}',
        '\u{10FFF}',
        '\u{1E800}',
        '\u{1EFFF}',
    ];
    for &c in bidi.iter() {
        assert!(is_utf8_bidi(embed(c).as_bytes()));
    }
}
2660
// Table-driven check of `is_utf16_bidi`: each code unit under test is placed
// between two runs of eight Basic Latin code units.
#[test]
fn test_is_utf16_bidi() {
    let around = |unit: u16| -> [u16; 17] {
        [
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, unit, 0x62, 0x63, 0x64, 0x65, 0x66,
            0x67, 0x68, 0x69,
        ]
    };

    let not_bidi: [u16; 7] = [0x0062, 0x03B1, 0x3041, 0xD801, 0xFE00, 0x202C, 0xFEFF];
    for &unit in not_bidi.iter() {
        assert!(!is_utf16_bidi(&around(unit)[..]));
    }

    let bidi: [u16; 16] = [
        0x0590, 0x08FF, 0x061C, 0xFB1D, 0xFB50, 0xFDFF, 0xFE70, 0xFEFE, 0x200F, 0x202B, 0x202E,
        0x2067, 0xD802, 0xD803, 0xD83A, 0xD83B,
    ];
    for &unit in bidi.iter() {
        assert!(is_utf16_bidi(&around(unit)[..]));
    }

    // A bidi code unit directly followed by a non-bidi, non-Latin one.
    assert!(is_utf16_bidi(&[
        0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, 0x64, 0x65,
        0x66, 0x67, 0x68, 0x69,
    ]));
}
2761
// Table-driven check of `check_str_for_latin1_and_bidi`: each character
// under test is placed between two runs of "abcdefghijklmnop" and the result
// is compared against `Latin1Bidi::Bidi`.
#[test]
fn test_check_str_for_latin1_and_bidi() {
    let embed = |c: char| -> String {
        let mut text = String::from("abcdefghijklmnop");
        text.push(c);
        text.push_str("abcdefghijklmnop");
        text
    };

    let not_bidi = [
        'a',
        '\u{03B1}',
        '\u{3041}',
        '\u{1F4A9}',
        '\u{FE00}',
        '\u{202C}',
        '\u{FEFF}',
    ];
    for &c in not_bidi.iter() {
        assert_ne!(
            check_str_for_latin1_and_bidi(embed(c).as_str()),
            Latin1Bidi::Bidi
        );
    }

    let bidi = [
        '\u{0590}',
        '\u{08FF}',
        '\u{061C}',
        '\u{FB50}',
        '\u{FDFF}',
        '\u{FE70}',
        '\u{FEFE}',
        '\u{200F}',
        '\u{202B}',
        '\u{202E}',
        '\u{2067}',
        '\u{10800}',
        '\u{10FFF}',
        '\u{1E800}',
        '\u{1EFFF}',
    ];
    for &c in bidi.iter() {
        assert_eq!(
            check_str_for_latin1_and_bidi(embed(c).as_str()),
            Latin1Bidi::Bidi
        );
    }
}
2853
/// Checks which valid UTF-8 byte inputs `check_utf8_for_latin1_and_bidi`
/// classifies as `Latin1Bidi::Bidi`.
///
/// Every input is the UTF-8 encoding of a single probe character placed
/// between two 16-character ASCII runs. The probes cover both ends of each
/// right-to-left range and the individual right-to-left controls, plus a
/// handful of code points that must not be reported as bidi.
#[test]
fn test_check_utf8_for_latin1_and_bidi() {
    // Probes that must NOT yield `Bidi`: ASCII, Greek, Hiragana, an astral
    // emoji, U+FE00, U+202C and U+FEFF (the last sits right after the
    // U+FE70..=U+FEFE range).
    let not_bidi: [&str; 7] = [
        "abcdefghijklmnopaabcdefghijklmnop",
        "abcdefghijklmnop\u{03B1}abcdefghijklmnop",
        "abcdefghijklmnop\u{3041}abcdefghijklmnop",
        "abcdefghijklmnop\u{1F4A9}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE00}abcdefghijklmnop",
        "abcdefghijklmnop\u{202C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFF}abcdefghijklmnop",
    ];
    // Probes that MUST yield `Bidi`. U+FB1D is the first code point of the
    // U+FB1D..=U+FDFF range and is probed here just like in the UTF-16 test.
    let bidi: [&str; 16] = [
        "abcdefghijklmnop\u{0590}abcdefghijklmnop",
        "abcdefghijklmnop\u{08FF}abcdefghijklmnop",
        "abcdefghijklmnop\u{061C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FB1D}abcdefghijklmnop",
        "abcdefghijklmnop\u{FB50}abcdefghijklmnop",
        "abcdefghijklmnop\u{FDFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE70}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFE}abcdefghijklmnop",
        "abcdefghijklmnop\u{200F}abcdefghijklmnop",
        "abcdefghijklmnop\u{202B}abcdefghijklmnop",
        "abcdefghijklmnop\u{202E}abcdefghijklmnop",
        "abcdefghijklmnop\u{2067}abcdefghijklmnop",
        "abcdefghijklmnop\u{10800}abcdefghijklmnop",
        "abcdefghijklmnop\u{10FFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{1E800}abcdefghijklmnop",
        "abcdefghijklmnop\u{1EFFF}abcdefghijklmnop",
    ];

    for &input in not_bidi.iter() {
        assert_ne!(
            check_utf8_for_latin1_and_bidi(input.as_bytes()),
            Latin1Bidi::Bidi,
            "input unexpectedly classified as bidi: {:?}",
            input
        );
    }
    for &input in bidi.iter() {
        assert_eq!(
            check_utf8_for_latin1_and_bidi(input.as_bytes()),
            Latin1Bidi::Bidi,
            "input not classified as bidi: {:?}",
            input
        );
    }
}
2945
/// Checks which UTF-16 inputs `check_utf16_for_latin1_and_bidi` classifies as
/// `Latin1Bidi::Bidi`.
///
/// Each probe code unit is placed between two runs of eight ASCII code units
/// (`0x62..=0x69`). Supplementary-plane ranges are probed with a lone high
/// surrogate: 0xD802/0xD803 lead U+10800..=U+10FFF and 0xD83A/0xD83B lead
/// U+1E800..=U+1EFFF, whereas 0xD801 leads a range that is not right-to-left.
#[test]
fn test_check_utf16_for_latin1_and_bidi() {
    // Classifies the 17-unit buffer "bcdefghi" + probe + "bcdefghi".
    let classify = |probe: u16| {
        check_utf16_for_latin1_and_bidi(&[
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, probe, 0x62, 0x63, 0x64, 0x65, 0x66,
            0x67, 0x68, 0x69,
        ])
    };

    // Probes that must not be reported as bidi.
    assert_ne!(classify(0x0062), Latin1Bidi::Bidi);
    assert_ne!(classify(0x03B1), Latin1Bidi::Bidi);
    assert_ne!(classify(0x3041), Latin1Bidi::Bidi);
    assert_ne!(classify(0xD801), Latin1Bidi::Bidi);
    assert_ne!(classify(0xFE00), Latin1Bidi::Bidi);
    assert_ne!(classify(0x202C), Latin1Bidi::Bidi);
    assert_ne!(classify(0xFEFF), Latin1Bidi::Bidi);

    // Range boundaries and a code unit inside the first range.
    assert_eq!(classify(0x0590), Latin1Bidi::Bidi);
    assert_eq!(classify(0x08FF), Latin1Bidi::Bidi);
    assert_eq!(classify(0x061C), Latin1Bidi::Bidi);
    assert_eq!(classify(0xFB1D), Latin1Bidi::Bidi);
    assert_eq!(classify(0xFB50), Latin1Bidi::Bidi);
    assert_eq!(classify(0xFDFF), Latin1Bidi::Bidi);
    assert_eq!(classify(0xFE70), Latin1Bidi::Bidi);
    assert_eq!(classify(0xFEFE), Latin1Bidi::Bidi);

    // Individual right-to-left controls.
    assert_eq!(classify(0x200F), Latin1Bidi::Bidi);
    assert_eq!(classify(0x202B), Latin1Bidi::Bidi);
    assert_eq!(classify(0x202E), Latin1Bidi::Bidi);
    assert_eq!(classify(0x2067), Latin1Bidi::Bidi);

    // High surrogates that lead the right-to-left supplementary ranges.
    assert_eq!(classify(0xD802), Latin1Bidi::Bidi);
    assert_eq!(classify(0xD803), Latin1Bidi::Bidi);
    assert_eq!(classify(0xD83A), Latin1Bidi::Bidi);
    assert_eq!(classify(0xD83B), Latin1Bidi::Bidi);

    // A right-to-left code unit immediately followed by a non-Latin1,
    // left-to-right one must still be reported as bidi.
    let rtl_then_ltr: [u16; 18] = [
        0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, 0x64, 0x65,
        0x66, 0x67, 0x68, 0x69,
    ];
    assert_eq!(
        check_utf16_for_latin1_and_bidi(&rtl_then_ltr),
        Latin1Bidi::Bidi
    );
}
3118
/// Straightforward reference predicate used by the exhaustive tests: returns
/// `true` when `c` lies in one of the right-to-left ranges listed below or is
/// one of the four listed right-to-left controls, and `false` otherwise.
#[inline(always)]
pub fn reference_is_char_bidi(c: char) -> bool {
    // Inclusive (first, last) code point ranges treated as right-to-left.
    const RTL_RANGES: [(u32, u32); 5] = [
        (0x0590, 0x08FF),
        (0xFB1D, 0xFDFF),
        (0xFE70, 0xFEFE),
        (0x10800, 0x10FFF),
        (0x1E800, 0x1EFFF),
    ];
    // U+200F RLM, U+202B RLE, U+202E RLO, U+2067 RLI.
    const RTL_CONTROLS: [u32; 4] = [0x200F, 0x202B, 0x202E, 0x2067];

    let scalar = u32::from(c);
    RTL_CONTROLS.contains(&scalar)
        || RTL_RANGES
            .iter()
            .any(|&(first, last)| first <= scalar && scalar <= last)
}
3134
/// Straightforward reference predicate used by the exhaustive tests: returns
/// `true` when the UTF-16 code unit `u` lies in one of the listed
/// right-to-left ranges or equals one of the listed individual code units,
/// and `false` otherwise.
#[inline(always)]
pub fn reference_is_utf16_code_unit_bidi(u: u16) -> bool {
    // Inclusive (first, last) BMP ranges treated as right-to-left.
    const RTL_RANGES: [(u16, u16); 3] = [(0x0590, 0x08FF), (0xFB1D, 0xFDFF), (0xFE70, 0xFEFE)];
    // High surrogates: 0xD802/0xD803 lead U+10800..=U+10FFF and
    // 0xD83A/0xD83B lead U+1E800..=U+1EFFF.
    // Controls: U+200F RLM, U+202B RLE, U+202E RLO, U+2067 RLI.
    const RTL_UNITS: [u16; 8] = [
        0xD802, 0xD803, 0xD83A, 0xD83B, 0x200F, 0x202B, 0x202E, 0x2067,
    ];

    RTL_UNITS.contains(&u)
        || RTL_RANGES
            .iter()
            .any(|&(first, last)| first <= u && u <= last)
}
3152
/// Exhaustively compares `is_char_bidi` with `reference_is_char_bidi` for
/// every Unicode scalar value, reporting the offending code point on a
/// mismatch.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_char_bidi_thoroughly() {
    // U+D800..=U+DFFF are surrogates and cannot be represented as `char`,
    // so the scan is the two scalar-value ranges around them.
    for code_point in (0..0xD800u32).chain(0xE000..0x110000u32) {
        let c: char = ::core::char::from_u32(code_point).unwrap();
        assert_eq!(
            is_char_bidi(c),
            reference_is_char_bidi(c),
            "is_char_bidi disagrees with the reference for U+{:04X}",
            code_point
        );
    }
}
3165
/// Exhaustively compares `is_utf16_code_unit_bidi` with
/// `reference_is_utf16_code_unit_bidi` for every 16-bit code unit, reporting
/// the offending code unit on a mismatch.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_utf16_code_unit_bidi_thoroughly() {
    // An inclusive range visits 0xFFFF without needing a wider counter and a
    // truncating cast.
    for u in 0..=0xFFFFu16 {
        assert_eq!(
            is_utf16_code_unit_bidi(u),
            reference_is_utf16_code_unit_bidi(u),
            "is_utf16_code_unit_bidi disagrees with the reference for {:#06X}",
            u
        );
    }
}
3177
/// Exhaustively compares `is_str_bidi` on a one-character string with
/// `reference_is_char_bidi` for every Unicode scalar value, reporting the
/// offending code point on a mismatch.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_str_bidi_thoroughly() {
    // Four bytes hold the longest UTF-8 encoding of a single `char`.
    let mut buf = [0u8; 4];
    // U+D800..=U+DFFF are surrogates and cannot be represented as `char`.
    for code_point in (0..0xD800u32).chain(0xE000..0x110000u32) {
        let c: char = ::core::char::from_u32(code_point).unwrap();
        assert_eq!(
            is_str_bidi(c.encode_utf8(&mut buf[..])),
            reference_is_char_bidi(c),
            "is_str_bidi disagrees with the reference for U+{:04X}",
            code_point
        );
    }
}
3197
/// Exhaustively compares `is_utf8_bidi` with `reference_is_char_bidi` for
/// every Unicode scalar value.
///
/// Each character is checked twice: once as exactly its UTF-8 encoding, and
/// once as that encoding followed by zero bytes filling an 8-byte buffer.
/// A mismatch reports the code point and which of the two inputs failed.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_utf8_bidi_thoroughly() {
    let mut buf = [0u8; 8];
    // U+D800..=U+DFFF are surrogates and cannot be represented as `char`.
    for code_point in (0..0xD800u32).chain(0xE000..0x110000u32) {
        let c: char = ::core::char::from_u32(code_point).unwrap();
        let expect = reference_is_char_bidi(c);

        let len = c.encode_utf8(&mut buf[..]).len();
        assert_eq!(
            is_utf8_bidi(&buf[..len]),
            expect,
            "bare UTF-8 encoding of U+{:04X}",
            code_point
        );

        // Clear whatever a previous, longer encoding left behind the
        // current character before checking the whole buffer.
        for b in buf[len..].iter_mut() {
            *b = 0;
        }
        assert_eq!(
            is_utf8_bidi(&buf[..]),
            expect,
            "zero-padded UTF-8 encoding of U+{:04X}",
            code_point
        );
    }
}
3239
/// Exhaustively compares `is_utf16_bidi` with
/// `reference_is_utf16_code_unit_bidi` by placing every possible 16-bit code
/// unit at index 15 of an otherwise all-zero 32-unit buffer, reporting the
/// offending code unit on a mismatch.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_utf16_bidi_thoroughly() {
    let mut buf = [0u16; 32];
    for u in 0..=0xFFFFu16 {
        buf[15] = u;
        assert_eq!(
            is_utf16_bidi(&buf[..]),
            reference_is_utf16_code_unit_bidi(u),
            "is_utf16_bidi disagrees with the reference for code unit {:#06X}",
            u
        );
    }
}
3253
/// Edge cases for `is_utf8_bidi` just below the first right-to-left range
/// and with a truncated sequence at the end of the input.
#[test]
fn test_is_utf8_bidi_edge_cases() {
    // Well-formed inputs: U+057F + 'a', U+0580 + 'a' (both just below
    // U+0590) and plain ASCII. None may be reported as bidi.
    let not_bidi: [&[u8]; 3] = [&b"\xD5\xBF\x61"[..], &b"\xD6\x80\x61"[..], &b"abc"[..]];
    // The same inputs with the last byte replaced by 0xC2, a two-byte lead
    // with no continuation byte. The test expects `true` for all of them.
    // NOTE(review): presumably `is_utf8_bidi` deliberately reports malformed
    // UTF-8 as bidi; confirm against its documentation.
    let bidi: [&[u8]; 3] = [&b"\xD5\xBF\xC2"[..], &b"\xD6\x80\xC2"[..], &b"ab\xC2"[..]];

    for &input in not_bidi.iter() {
        assert!(!is_utf8_bidi(input), "expected false for {:?}", input);
    }
    for &input in bidi.iter() {
        assert!(is_utf8_bidi(input), "expected true for {:?}", input);
    }
}
3263
/// `decode_latin1` must hand back ASCII-only input as a borrow, and must
/// decode byte 0xE4 to U+00E4.
#[test]
fn test_decode_latin1() {
    // ASCII-only input: anything other than `Cow::Borrowed` is a failure.
    if let Cow::Borrowed(s) = decode_latin1(b"ab") {
        assert_eq!(s, "ab");
    } else {
        unreachable!("Should have borrowed");
    }

    // Non-ASCII input: only the decoded text is checked.
    assert_eq!(decode_latin1(b"a\xE4"), "a\u{E4}");
}
3276
/// `encode_latin1_lossy` must hand back ASCII-only input as a borrow, and
/// must encode U+00E4 as the single byte 0xE4.
#[test]
fn test_encode_latin1_lossy() {
    // ASCII-only input: anything other than `Cow::Borrowed` is a failure.
    if let Cow::Borrowed(s) = encode_latin1_lossy("ab") {
        assert_eq!(s, b"ab");
    } else {
        unreachable!("Should have borrowed");
    }

    // Non-ASCII input: only the encoded bytes are checked.
    assert_eq!(encode_latin1_lossy("a\u{E4}"), &(b"a\xE4")[..]);
}
3289
/// Exercises `convert_utf8_to_utf16_without_replacement` on one-, two-,
/// three- and four-byte UTF-8 sequences, then on a truncated four-byte
/// sequence that must yield `None`.
///
/// All conversions share one output buffer and run in order, so code units
/// beyond the reported length still hold what an earlier conversion wrote.
#[test]
fn test_convert_utf8_to_utf16_without_replacement() {
    let mut buf = [0u16; 5];

    // (source bytes, destination length, code units written,
    //  expected `buf[..3]` after the call)
    let cases: [(&[u8], usize, usize, [u16; 3]); 7] = [
        (&b"ab"[..], 2, 2, [u16::from(b'a'), u16::from(b'b'), 0]),
        (&b"\xC3\xA4c"[..], 3, 2, [0xE4, u16::from(b'c'), 0]),
        // Only one unit is written; `buf[1]` keeps the 'c' from the
        // previous case.
        (&b"\xE2\x98\x83"[..], 3, 1, [0x2603, u16::from(b'c'), 0]),
        (&b"\xE2\x98\x83d"[..], 4, 2, [0x2603, u16::from(b'd'), 0]),
        (&b"\xE2\x98\x83\xC3\xA4"[..], 5, 2, [0x2603, 0xE4, 0]),
        // A four-byte sequence becomes a surrogate pair.
        (&b"\xF0\x9F\x93\x8E"[..], 4, 2, [0xD83D, 0xDCCE, 0]),
        (
            &b"\xF0\x9F\x93\x8Ee"[..],
            5,
            3,
            [0xD83D, 0xDCCE, u16::from(b'e')],
        ),
    ];

    for &(src, dst_len, written, head) in cases.iter() {
        assert_eq!(
            convert_utf8_to_utf16_without_replacement(src, &mut buf[..dst_len]),
            Some(written),
            "return value for input {:?}",
            src
        );
        assert_eq!(
            &buf[..3],
            &head[..],
            "buffer contents after input {:?}",
            src
        );
    }

    // A four-byte sequence missing its last byte is rejected.
    assert_eq!(
        convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93", &mut buf[..5]),
        None
    );
}
3347 }
3348