1 // Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9
10 //! Functions for converting between different in-RAM representations of text
11 //! and for quickly checking if the Unicode Bidirectional Algorithm can be
12 //! avoided.
13 //!
14 //! By using slices for output, the functions here seek to enable by-register
15 //! (ALU register or SIMD register as available) operations in order to
16 //! outperform iterator-based conversions available in the Rust standard
17 //! library.
18 //!
19 //! _Note:_ "Latin1" in this module refers to the Unicode range from U+0000 to
20 //! U+00FF, inclusive, and does not refer to the windows-1252 range. This
21 //! in-memory encoding is sometimes used as a storage optimization of text
22 //! when UTF-16 indexing and length semantics are exposed.
23 //!
//! The FFI bindings for this module are in the
//! [encoding_c_mem crate](https://github.com/hsivonen/encoding_c_mem).
26
27 use std::borrow::Cow;
28
29 use super::in_inclusive_range16;
30 use super::in_inclusive_range32;
31 use super::in_inclusive_range8;
32 use super::in_range16;
33 use super::in_range32;
34 use super::DecoderResult;
35 use ascii::*;
36 use utf_8::*;
37
/// Debug assertion that is compiled to a no-op branch when the crate is
/// built with `--cfg fuzzing`.
///
/// Takes exactly the same arguments as `debug_assert!` and forwards them
/// untouched.
macro_rules! non_fuzz_debug_assert {
    ($($tokens:tt)*) => {
        if cfg!(fuzzing) {
            // Deliberately skipped while fuzzing.
        } else {
            debug_assert!($($tokens)*);
        }
    };
}
41
// Branch-prediction hints.
//
// With the `simd-accel` feature the compiler intrinsics `likely` and
// `unlikely` are imported. Without it, identity functions with the same
// names and the same `unsafe` calling convention are defined, so call sites
// can be written once (`unsafe { likely(cond) }`) for both configurations.
cfg_if! {
    if #[cfg(feature = "simd-accel")] {
        use ::std::intrinsics::likely;
        use ::std::intrinsics::unlikely;
    } else {
        // Fallback for the `likely` intrinsic: returns `b` unchanged.
        #[inline(always)]
        // Unsafe to match the intrinsic, which is needlessly unsafe.
        unsafe fn likely(b: bool) -> bool {
            b
        }
        // Fallback for the `unlikely` intrinsic: returns `b` unchanged.
        #[inline(always)]
        // Unsafe to match the intrinsic, which is needlessly unsafe.
        unsafe fn unlikely(b: bool) -> bool {
            b
        }
    }
}
59
/// Classification of text as Latin1 (all code points are below U+0100),
/// left-to-right with some non-Latin1 characters or as containing at least
/// some right-to-left characters.
///
/// The enum is `#[repr(C)]` with explicit discriminants, so the numeric value
/// of each variant is fixed (presumably so that it can be passed across the
/// FFI boundary mentioned in the module documentation).
///
/// The type is `#[must_use]`: discarding a classification result produces a
/// compiler warning.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
#[repr(C)]
pub enum Latin1Bidi {
    /// Every character is below U+0100.
    Latin1 = 0,
    /// There is at least one character that's U+0100 or higher, but there
    /// are no right-to-left characters.
    LeftToRight = 1,
    /// There is at least one right-to-left character.
    Bidi = 2,
}
75
// Word-sized mask with the high byte of every 16-bit lane set. When a
// machine word read from a `u16` buffer ANDed with this mask is non-zero, at
// least one code unit in that word is above 0xFF.
//
// `as` truncates, so works on 32-bit, too.
//
// Only the non-SIMD code paths in this section reference the constant, hence
// the `dead_code` allowance.
#[allow(dead_code)]
const LATIN1_MASK: usize = 0xFF00_FF00_FF00_FF00u64 as usize;
79
// Generates `fn $name(buffer: &[$unit]) -> bool`, which returns `true` iff
// every unit of `buffer` is below `$bound`, reading the buffer one aligned
// machine word (`usize`) at a time where possible.
//
// Parameters:
// * `$name`  - name of the generated function.
// * `$unit`  - element type of the buffer (instantiated with `u8` and `u16`).
// * `$bound` - exclusive upper bound for a single unit.
// * `$mask`  - identifier of a `usize` constant; it is expected to have set,
//              in every unit-sized lane, exactly the bits that a unit below
//              `$bound` never has (see `LATIN1_MASK` for the 0x100 case).
//
// `ALU_ALIGNMENT` and `ALU_ALIGNMENT_MASK` are not defined in this section;
// presumably they come from the glob-imported `ascii` module.
//
// The function is not fail-fast: except for a few early exits it ORs the
// data into accumulators and tests them afterwards.
#[allow(unused_macros)]
macro_rules! by_unit_check_alu {
    ($name:ident, $unit:ty, $bound:expr, $mask:ident) => {
        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            let mut offset = 0usize;
            // OR of everything read outside the unrolled loop. It mixes
            // single units and whole words, which is why the final test uses
            // `$mask` rather than a comparison with `$bound`.
            let mut accu = 0usize;
            let unit_size = ::std::mem::size_of::<$unit>();
            let len = buffer.len();
            // Word-at-a-time processing only pays off if at least one whole
            // word fits.
            if len >= ALU_ALIGNMENT / unit_size {
                // The most common reason to return `false` is for the first code
                // unit to fail the test, so check that first.
                if buffer[0] >= $bound {
                    return false;
                }
                let src = buffer.as_ptr();
                // Number of units to step over before `src` is word-aligned
                // (zero when it already is).
                let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK))
                    & ALU_ALIGNMENT_MASK)
                    / unit_size;
                // Proceed only if a full word remains after the alignment prefix.
                if until_alignment + ALU_ALIGNMENT / unit_size <= len {
                    if until_alignment != 0 {
                        // Unit-by-unit prefix up to the alignment boundary.
                        accu |= buffer[offset] as usize;
                        offset += 1;
                        until_alignment -= 1;
                        while until_alignment != 0 {
                            accu |= buffer[offset] as usize;
                            offset += 1;
                            until_alignment -= 1;
                        }
                        // So far `accu` holds only single units, so a plain
                        // comparison with the bound is valid here.
                        if accu >= $bound {
                            return false;
                        }
                    }
                    // Largest offset at which a whole word can still be read.
                    let len_minus_stride = len - ALU_ALIGNMENT / unit_size;
                    // Unrolled loop: four aligned words per iteration.
                    if offset + (4 * (ALU_ALIGNMENT / unit_size)) <= len {
                        let len_minus_unroll = len - (4 * (ALU_ALIGNMENT / unit_size));
                        loop {
                            // The reads stay in bounds because
                            // `offset <= len_minus_unroll` holds at this point,
                            // and they are aligned because `offset` was advanced
                            // to the alignment boundary above.
                            let unroll_accu = unsafe { *(src.add(offset) as *const usize) }
                                | unsafe {
                                    *(src.add(offset + (ALU_ALIGNMENT / unit_size)) as *const usize)
                                }
                                | unsafe {
                                    *(src.add(offset + (2 * (ALU_ALIGNMENT / unit_size)))
                                        as *const usize)
                                }
                                | unsafe {
                                    *(src.add(offset + (3 * (ALU_ALIGNMENT / unit_size)))
                                        as *const usize)
                                };
                            if unroll_accu & $mask != 0 {
                                return false;
                            }
                            offset += 4 * (ALU_ALIGNMENT / unit_size);
                            if offset > len_minus_unroll {
                                break;
                            }
                        }
                    }
                    // Remaining whole words, one at a time, without early exit.
                    while offset <= len_minus_stride {
                        accu |= unsafe { *(src.add(offset) as *const usize) };
                        offset += ALU_ALIGNMENT / unit_size;
                    }
                }
            }
            // Tail (or the whole buffer when it was too short for word reads).
            for &unit in &buffer[offset..] {
                accu |= unit as usize;
            }
            // `&` binds tighter than `==` in Rust: this is `(accu & $mask) == 0`.
            accu & $mask == 0
        }
    };
}
152
// Generates `fn $name(buffer: &[$unit]) -> bool`, which returns `true` iff
// every unit of `buffer` is below `$bound`, reading the buffer one aligned
// SIMD vector at a time where possible.
//
// Parameters:
// * `$name`    - name of the generated function.
// * `$unit`    - element type of the buffer (instantiated with `u8` and `u16`).
// * `$splat`   - expression producing an all-zero `$simd_ty` accumulator.
// * `$simd_ty` - SIMD vector type that the buffer is read as.
// * `$bound`   - exclusive upper bound for a single unit.
// * `$func`    - predicate on `$simd_ty`; it is called on the OR of several
//                vectors and is expected to return `true` iff all lanes pass.
//
// `SIMD_STRIDE_SIZE` is not defined in this section; presumably it is the
// byte size of one vector and comes from a glob import.
//
// The function is not fail-fast: except for a few early exits it ORs the
// data into accumulators and tests them afterwards.
#[allow(unused_macros)]
macro_rules! by_unit_check_simd {
    ($name:ident, $unit:ty, $splat:expr, $simd_ty:ty, $bound:expr, $func:ident) => {
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            let mut offset = 0usize;
            // OR of the units read one by one (alignment prefix and tail).
            let mut accu = 0usize;
            let unit_size = ::std::mem::size_of::<$unit>();
            let len = buffer.len();
            // Vector processing only pays off if at least one whole vector fits.
            if len >= SIMD_STRIDE_SIZE / unit_size {
                // The most common reason to return `false` is for the first code
                // unit to fail the test, so check that first.
                if buffer[0] >= $bound {
                    return false;
                }
                let src = buffer.as_ptr();
                // Number of units to step over before `src` is SIMD-aligned
                // (zero when it already is).
                let mut until_alignment = ((SIMD_ALIGNMENT
                    - ((src as usize) & SIMD_ALIGNMENT_MASK))
                    & SIMD_ALIGNMENT_MASK)
                    / unit_size;
                // Proceed only if a full vector remains after the alignment prefix.
                if until_alignment + SIMD_STRIDE_SIZE / unit_size <= len {
                    if until_alignment != 0 {
                        // Unit-by-unit prefix up to the alignment boundary.
                        accu |= buffer[offset] as usize;
                        offset += 1;
                        until_alignment -= 1;
                        while until_alignment != 0 {
                            accu |= buffer[offset] as usize;
                            offset += 1;
                            until_alignment -= 1;
                        }
                        if accu >= $bound {
                            return false;
                        }
                    }
                    // Largest offset at which a whole vector can still be read.
                    let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
                    // Unrolled loop: four aligned vectors per iteration.
                    if offset + (4 * (SIMD_STRIDE_SIZE / unit_size)) <= len {
                        let len_minus_unroll = len - (4 * (SIMD_STRIDE_SIZE / unit_size));
                        loop {
                            // The reads stay in bounds because
                            // `offset <= len_minus_unroll` holds at this point,
                            // and they are aligned because `offset` was advanced
                            // to the alignment boundary above.
                            let unroll_accu = unsafe { *(src.add(offset) as *const $simd_ty) }
                                | unsafe {
                                    *(src.add(offset + (SIMD_STRIDE_SIZE / unit_size))
                                        as *const $simd_ty)
                                }
                                | unsafe {
                                    *(src.add(offset + (2 * (SIMD_STRIDE_SIZE / unit_size)))
                                        as *const $simd_ty)
                                }
                                | unsafe {
                                    *(src.add(offset + (3 * (SIMD_STRIDE_SIZE / unit_size)))
                                        as *const $simd_ty)
                                };
                            if !$func(unroll_accu) {
                                return false;
                            }
                            offset += 4 * (SIMD_STRIDE_SIZE / unit_size);
                            if offset > len_minus_unroll {
                                break;
                            }
                        }
                    }
                    // Remaining whole vectors are ORed together and tested once.
                    let mut simd_accu = $splat;
                    while offset <= len_minus_stride {
                        simd_accu = simd_accu | unsafe { *(src.add(offset) as *const $simd_ty) };
                        offset += SIMD_STRIDE_SIZE / unit_size;
                    }
                    if !$func(simd_accu) {
                        return false;
                    }
                }
            }
            // Tail (or the whole buffer when it was too short for vector reads).
            for &unit in &buffer[offset..] {
                accu |= unit as usize;
            }
            // `accu` holds only single units here, so comparing with the bound
            // is sufficient.
            accu < $bound
        }
    };
}
230
// Selects the implementations of `is_ascii_impl`, `is_basic_latin_impl`,
// `is_utf16_latin1_impl` and `utf16_valid_up_to_impl`: SIMD versions when the
// `simd-accel` feature is enabled on a supported target, word-at-a-time (ALU)
// versions otherwise.
cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        use simd_funcs::*;
        use packed_simd::u8x16;
        use packed_simd::u16x8;

        // Required alignment, in bytes, of the vector loads below.
        const SIMD_ALIGNMENT: usize = 16;

        // `SIMD_ALIGNMENT - 1`; extracts the misalignment of an address.
        const SIMD_ALIGNMENT_MASK: usize = 15;

        by_unit_check_simd!(is_ascii_impl, u8, u8x16::splat(0), u8x16, 0x80, simd_is_ascii);
        by_unit_check_simd!(is_basic_latin_impl, u16, u16x8::splat(0), u16x8, 0x80, simd_is_basic_latin);
        by_unit_check_simd!(is_utf16_latin1_impl, u16, u16x8::splat(0), u16x8, 0x100, simd_is_latin1);

        // Returns the length, in code units, of the longest prefix of
        // `buffer` that contains no unpaired surrogate.
        #[inline(always)]
        fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize {
            // This function is a mess, because it simultaneously tries to do
            // only aligned SIMD (perhaps misguidedly) and needs to deal with
            // the last code unit in a SIMD stride being part of a valid
            // surrogate pair.
            let unit_size = ::std::mem::size_of::<u16>();
            let src = buffer.as_ptr();
            let len = buffer.len();
            let mut offset = 0usize;
            'outer: loop {
                // Code units between the current position and the next
                // SIMD-aligned address. Recomputed on every pass because
                // skipping the low half of a surrogate pair can leave
                // `offset` misaligned again.
                let until_alignment = ((SIMD_ALIGNMENT - ((unsafe { src.add(offset) } as usize) & SIMD_ALIGNMENT_MASK)) &
                    SIMD_ALIGNMENT_MASK) / unit_size;
                if until_alignment == 0 {
                    // Aligned already: make sure a whole stride remains.
                    if offset + SIMD_STRIDE_SIZE / unit_size > len {
                        break;
                    }
                } else {
                    let offset_plus_until_alignment = offset + until_alignment;
                    let offset_plus_until_alignment_plus_one = offset_plus_until_alignment + 1;
                    if offset_plus_until_alignment_plus_one + SIMD_STRIDE_SIZE / unit_size > len {
                        break;
                    }
                    // Check the unaligned prefix plus one extra unit, so that a
                    // high surrogate in the last prefix position can be matched
                    // with a low surrogate right after it.
                    let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_until_alignment_plus_one]);
                    if up_to < until_alignment {
                        return offset + up_to;
                    }
                    if last_valid_low {
                        // The extra unit was consumed as the second half of a
                        // pair; resume after it and realign.
                        offset = offset_plus_until_alignment_plus_one;
                        continue;
                    }
                    offset = offset_plus_until_alignment;
                }
                let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
                'inner: loop {
                    let offset_plus_stride = offset + SIMD_STRIDE_SIZE / unit_size;
                    if contains_surrogates(unsafe { *(src.add(offset) as *const u16x8) }) {
                        if offset_plus_stride == len {
                            // No unit after this stride to peek at; let the
                            // scalar code after the loop finish the job.
                            break 'outer;
                        }
                        // Re-check this stride, plus one extra unit, with the
                        // scalar validator.
                        let offset_plus_stride_plus_one = offset_plus_stride + 1;
                        let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_stride_plus_one]);
                        if up_to < SIMD_STRIDE_SIZE / unit_size {
                            return offset + up_to;
                        }
                        if last_valid_low {
                            // The pair straddled the stride boundary; resume
                            // after it and realign.
                            offset = offset_plus_stride_plus_one;
                            continue 'outer;
                        }
                    }
                    offset = offset_plus_stride;
                    if offset > len_minus_stride {
                        break 'outer;
                    }
                }
            }
            // Scalar handling of whatever is left.
            let (up_to, _) = utf16_valid_up_to_alu(&buffer[offset..]);
            offset + up_to
        }
    } else {
        by_unit_check_alu!(is_ascii_impl, u8, 0x80, ASCII_MASK);
        by_unit_check_alu!(is_basic_latin_impl, u16, 0x80, BASIC_LATIN_MASK);
        by_unit_check_alu!(is_utf16_latin1_impl, u16, 0x100, LATIN1_MASK);

        // Returns the length, in code units, of the longest prefix of
        // `buffer` that contains no unpaired surrogate (scalar version).
        #[inline(always)]
        fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize {
            let (up_to, _) = utf16_valid_up_to_alu(buffer);
            up_to
        }
    }
}
316
/// Scans `buffer` for the first unpaired surrogate, one code unit at a time.
///
/// Returns `(valid_len, ended_on_pair)`:
///
/// * `valid_len` is the index of the first unpaired surrogate, or
///   `buffer.len()` if there is none.
/// * `ended_on_pair` is `true` iff the whole slice was consumed and its final
///   code unit was the low half of a valid surrogate pair.
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
#[inline(always)]
fn utf16_valid_up_to_alu(buffer: &[u16]) -> (usize, bool) {
    let total = buffer.len();
    let mut pos = 0usize;
    // Tracks whether the most recently consumed code unit closed a pair.
    let mut ended_on_pair = false;
    while pos < total {
        let unit = buffer[pos];
        if (unit & 0xF800) != 0xD800 {
            // Outside U+D800..=U+DFFF: an ordinary BMP code unit.
            pos += 1;
            ended_on_pair = false;
            continue;
        }
        // A surrogate is acceptable only as a high surrogate that is
        // immediately followed by a low surrogate.
        let is_high = (unit & 0xFC00) == 0xD800;
        let followed_by_low = match buffer.get(pos + 1) {
            Some(&trail) => (trail & 0xFC00) == 0xDC00,
            None => false,
        };
        if !(is_high && followed_by_low) {
            return (pos, false);
        }
        pos += 2;
        ended_on_pair = true;
    }
    (pos, ended_on_pair)
}
362
// `is_str_latin1_impl` returns `None` when `buffer` contains only code
// points up to U+00FF, and `Some(index)` otherwise.
//
// The test relies on `buffer` being valid UTF-8: a byte above 0xC3 can then
// only be a lead byte of a code point above U+00FF (U+00FF is `C3 BF`, and
// continuation bytes never exceed 0xBF).
cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        // SIMD version. When a whole vector fails the check, the returned
        // index is the first non-continuation byte at or after the start of
        // that vector, not necessarily the position of the offending byte.
        #[inline(always)]
        fn is_str_latin1_impl(buffer: &str) -> Option<usize> {
            let mut offset = 0usize;
            let bytes = buffer.as_bytes();
            let len = bytes.len();
            if len >= SIMD_STRIDE_SIZE {
                let src = bytes.as_ptr();
                // Bytes to step over before `src` is SIMD-aligned.
                let mut until_alignment = (SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
                    SIMD_ALIGNMENT_MASK;
                if until_alignment + SIMD_STRIDE_SIZE <= len {
                    // Byte-by-byte prefix up to the alignment boundary.
                    while until_alignment != 0 {
                        if bytes[offset] > 0xC3 {
                            return Some(offset);
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - SIMD_STRIDE_SIZE;
                    loop {
                        if !simd_is_str_latin1(unsafe { *(src.add(offset) as *const u8x16) }) {
                            // Step over continuation bytes so that the returned
                            // index is not in the middle of a sequence.
                            // TODO: Ensure this compiles away when inlined into `is_str_latin1()`.
                            while bytes[offset] & 0xC0 == 0x80 {
                                offset += 1;
                            }
                            return Some(offset);
                        }
                        offset += SIMD_STRIDE_SIZE;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Tail (or the whole string when it was too short for SIMD).
            for i in offset..len {
                if bytes[i] > 0xC3 {
                    return Some(i);
                }
            }
            None
        }
    } else {
        // Scalar version: skips ASCII runs with `validate_ascii` and looks at
        // the first non-ASCII byte after each run. The returned index is the
        // position of the offending lead byte.
        #[inline(always)]
        fn is_str_latin1_impl(buffer: &str) -> Option<usize> {
            let mut bytes = buffer.as_bytes();
            // Number of bytes of `buffer` consumed so far.
            let mut total = 0;
            loop {
                if let Some((byte, offset)) = validate_ascii(bytes) {
                    total += offset;
                    if byte > 0xC3 {
                        return Some(total);
                    }
                    // `byte` is a two-byte lead (0xC2 or 0xC3); the input is
                    // valid UTF-8, so its continuation byte is present.
                    bytes = &bytes[offset + 2..];
                    total += 2;
                } else {
                    return None;
                }
            }
        }
    }
}
425
426 #[inline(always)]
is_utf8_latin1_impl(buffer: &[u8]) -> Option<usize>427 fn is_utf8_latin1_impl(buffer: &[u8]) -> Option<usize> {
428 let mut bytes = buffer;
429 let mut total = 0;
430 loop {
431 if let Some((byte, offset)) = validate_ascii(bytes) {
432 total += offset;
433 if in_inclusive_range8(byte, 0xC2, 0xC3) {
434 let next = offset + 1;
435 if next == bytes.len() {
436 return Some(total);
437 }
438 if bytes[next] & 0xC0 != 0x80 {
439 return Some(total);
440 }
441 bytes = &bytes[offset + 2..];
442 total += 2;
443 } else {
444 return Some(total);
445 }
446 } else {
447 return None;
448 }
449 }
450 }
451
// `is_utf16_bidi_impl` returns `true` iff at least one code unit of `buffer`
// is classified as right-to-left by `is_utf16_code_unit_bidi` (scalar) or
// `is_u16x8_bidi` (vector); neither helper is defined in this section.
cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        // SIMD version: scalar prefix up to alignment, aligned vector loop,
        // scalar tail.
        #[inline(always)]
        fn is_utf16_bidi_impl(buffer: &[u16]) -> bool {
            let mut offset = 0usize;
            let len = buffer.len();
            // `SIMD_STRIDE_SIZE / 2` is the number of `u16` units per vector.
            if len >= SIMD_STRIDE_SIZE / 2 {
                let src = buffer.as_ptr();
                // Code units to step over before `src` is SIMD-aligned.
                let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
                    SIMD_ALIGNMENT_MASK) / 2;
                if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len {
                    while until_alignment != 0 {
                        if is_utf16_code_unit_bidi(buffer[offset]) {
                            return true;
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
                    loop {
                        // Aligned load; in bounds because
                        // `offset <= len_minus_stride` holds here.
                        if is_u16x8_bidi(unsafe { *(src.add(offset) as *const u16x8) }) {
                            return true;
                        }
                        offset += SIMD_STRIDE_SIZE / 2;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Tail (or the whole buffer when it was too short for SIMD).
            for &u in &buffer[offset..] {
                if is_utf16_code_unit_bidi(u) {
                    return true;
                }
            }
            false
        }
    } else {
        // Scalar version: checks every code unit in order.
        #[inline(always)]
        fn is_utf16_bidi_impl(buffer: &[u16]) -> bool {
            for &u in buffer {
                if is_utf16_code_unit_bidi(u) {
                    return true;
                }
            }
            false
        }
    }
}
501
// `check_utf16_for_latin1_and_bidi_impl` classifies `buffer` in one pass:
//
// * `Latin1Bidi::Latin1` if no code unit is above 0xFF;
// * otherwise `Latin1Bidi::Bidi` if a right-to-left code unit occurs at or
//   after the first non-Latin1 position that was detected;
// * otherwise `Latin1Bidi::LeftToRight`.
//
// Code units before that position are at most 0xFF and are therefore never
// passed to the bidi check.
cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        // SIMD version.
        #[inline(always)]
        fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
            let mut offset = 0usize;
            let len = buffer.len();
            // `SIMD_STRIDE_SIZE / 2` is the number of `u16` units per vector.
            if len >= SIMD_STRIDE_SIZE / 2 {
                let src = buffer.as_ptr();
                // Code units to step over before `src` is SIMD-aligned.
                let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
                    SIMD_ALIGNMENT_MASK) / 2;
                if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len {
                    while until_alignment != 0 {
                        if buffer[offset] > 0xFF {
                            // This transition isn't optimal, since the alignment is recomputed,
                            // but not tweaking further today.
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
                    loop {
                        let mut s = unsafe { *(src.add(offset) as *const u16x8) };
                        if !simd_is_latin1(s) {
                            // First non-Latin1 vector found: from here on only
                            // the bidi question remains, so switch to a loop
                            // that checks nothing else.
                            loop {
                                if is_u16x8_bidi(s) {
                                    return Latin1Bidi::Bidi;
                                }
                                offset += SIMD_STRIDE_SIZE / 2;
                                if offset > len_minus_stride {
                                    // Fewer units than a vector remain: finish
                                    // with the scalar check.
                                    for &u in &buffer[offset..] {
                                        if is_utf16_code_unit_bidi(u) {
                                            return Latin1Bidi::Bidi;
                                        }
                                    }
                                    return Latin1Bidi::LeftToRight;
                                }
                                s = unsafe { *(src.add(offset) as *const u16x8) };
                            }
                        }
                        offset += SIMD_STRIDE_SIZE / 2;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Scalar tail: everything before `offset` was Latin1.
            let mut iter = (&buffer[offset..]).iter();
            loop {
                if let Some(&u) = iter.next() {
                    if u > 0xFF {
                        // Not Latin1 any more; check this unit and all
                        // following ones for bidi.
                        let mut inner_u = u;
                        loop {
                            if is_utf16_code_unit_bidi(inner_u) {
                                return Latin1Bidi::Bidi;
                            }
                            if let Some(&code_unit) = iter.next() {
                                inner_u = code_unit;
                            } else {
                                return Latin1Bidi::LeftToRight;
                            }
                        }
                    }
                } else {
                    return Latin1Bidi::Latin1;
                }
            }
        }
    } else {
        // Word-at-a-time (ALU) version.
        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
        #[inline(always)]
        fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
            let mut offset = 0usize;
            let len = buffer.len();
            // `ALU_ALIGNMENT / 2` is the number of `u16` units per word read.
            if len >= ALU_ALIGNMENT / 2 {
                let src = buffer.as_ptr();
                // Code units to step over before `src` is word-aligned.
                let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK)) &
                    ALU_ALIGNMENT_MASK) / 2;
                if until_alignment + ALU_ALIGNMENT / 2 <= len {
                    while until_alignment != 0 {
                        if buffer[offset] > 0xFF {
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - ALU_ALIGNMENT / 2;
                    loop {
                        // A non-zero result means that some code unit in this
                        // word has a bit set in its high byte, i.e. is above 0xFF.
                        if unsafe { *(src.add(offset) as *const usize) } & LATIN1_MASK != 0 {
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += ALU_ALIGNMENT / 2;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Scalar tail: everything before `offset` was Latin1.
            let mut iter = (&buffer[offset..]).iter();
            loop {
                if let Some(&u) = iter.next() {
                    if u > 0xFF {
                        // Not Latin1 any more; check this unit and all
                        // following ones for bidi.
                        let mut inner_u = u;
                        loop {
                            if is_utf16_code_unit_bidi(inner_u) {
                                return Latin1Bidi::Bidi;
                            }
                            if let Some(&code_unit) = iter.next() {
                                inner_u = code_unit;
                            } else {
                                return Latin1Bidi::LeftToRight;
                            }
                        }
                    }
                } else {
                    return Latin1Bidi::Latin1;
                }
            }
        }
    }
}
632
/// Checks whether the buffer is all-ASCII.
///
/// Returns `true` iff every byte in `buffer` is below 0x80.
///
/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_ascii(buffer: &[u8]) -> bool {
    is_ascii_impl(buffer)
}
640
/// Checks whether the buffer is all-Basic Latin (i.e. UTF-16 representing
/// only ASCII characters).
///
/// Returns `true` iff every code unit in `buffer` is below 0x80.
///
/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_basic_latin(buffer: &[u16]) -> bool {
    is_basic_latin_impl(buffer)
}
649
/// Checks whether the buffer is valid UTF-8 representing only code points
/// less than or equal to U+00FF.
///
/// Fails fast. (I.e. returns before having read the whole buffer if UTF-8
/// invalidity or code points above U+00FF are discovered.)
pub fn is_utf8_latin1(buffer: &[u8]) -> bool {
    // The implementation reports the position of the first problem;
    // `None` means that none was found.
    is_utf8_latin1_impl(buffer).is_none()
}
658
/// Checks whether the buffer represents only code points less than or equal
/// to U+00FF.
///
/// Fails fast. (I.e. returns before having read the whole buffer if code
/// points above U+00FF are discovered.)
pub fn is_str_latin1(buffer: &str) -> bool {
    // The implementation reports the position of the first problem;
    // `None` means that none was found.
    is_str_latin1_impl(buffer).is_none()
}
667
/// Checks whether the buffer represents only code points less than or equal
/// to U+00FF.
///
/// Returns `true` iff every code unit in `buffer` is below 0x100.
///
/// May read the entire buffer even if it isn't all-Latin1. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_utf16_latin1(buffer: &[u16]) -> bool {
    is_utf16_latin1_impl(buffer)
}
676
677 /// Checks whether a potentially-invalid UTF-8 buffer contains code points
678 /// that trigger right-to-left processing.
679 ///
680 /// The check is done on a Unicode block basis without regard to assigned
681 /// vs. unassigned code points in the block. Hebrew presentation forms in
682 /// the Alphabetic Presentation Forms block are treated as if they formed
683 /// a block on their own (i.e. it treated as right-to-left). Additionally,
684 /// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
685 /// for. Control characters that are technically bidi controls but do not
686 /// cause right-to-left behavior without the presence of right-to-left
687 /// characters or right-to-left controls are not checked for. As a special
688 /// case, U+FEFF is excluded from Arabic Presentation Forms-B.
689 ///
690 /// Returns `true` if the input is invalid UTF-8 or the input contains an
691 /// RTL character. Returns `false` if the input is valid UTF-8 and contains
692 /// no RTL characters.
693 #[cfg_attr(feature = "cargo-clippy", allow(collapsible_if, cyclomatic_complexity))]
694 #[inline]
is_utf8_bidi(buffer: &[u8]) -> bool695 pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
696 // As of rustc 1.25.0-nightly (73ac5d6a8 2018-01-11), this is faster
697 // than UTF-8 validation followed by `is_str_bidi()` for German,
698 // Russian and Japanese. However, this is considerably slower for Thai.
699 // Chances are that the compiler makes some branch predictions that are
700 // unfortunate for Thai. Not spending the time to manually optimize
701 // further at this time, since it's unclear if this variant even has
702 // use cases. However, this is worth revisiting once Rust gets the
703 // ability to annotate relative priorities of match arms.
704
705 // U+058F: D6 8F
706 // U+0590: D6 90
707 // U+08FF: E0 A3 BF
708 // U+0900: E0 A4 80
709 //
710 // U+200F: E2 80 8F
711 // U+202B: E2 80 AB
712 // U+202E: E2 80 AE
713 // U+2067: E2 81 A7
714 //
715 // U+FB1C: EF AC 9C
716 // U+FB1D: EF AC 9D
717 // U+FDFF: EF B7 BF
718 // U+FE00: EF B8 80
719 //
720 // U+FE6F: EF B9 AF
721 // U+FE70: EF B9 B0
722 // U+FEFE: EF BB BE
723 // U+FEFF: EF BB BF
724 //
725 // U+107FF: F0 90 9F BF
726 // U+10800: F0 90 A0 80
727 // U+10FFF: F0 90 BF BF
728 // U+11000: F0 91 80 80
729 //
730 // U+1E7FF: F0 9E 9F BF
731 // U+1E800: F0 9E A0 80
732 // U+1EFFF: F0 9E BF BF
733 // U+1F000: F0 9F 80 80
734 let mut src = buffer;
735 'outer: loop {
736 if let Some((mut byte, mut read)) = validate_ascii(src) {
737 // Check for the longest sequence to avoid checking twice for the
738 // multi-byte sequences.
739 if read + 4 <= src.len() {
740 'inner: loop {
741 // At this point, `byte` is not included in `read`.
742 match byte {
743 0...0x7F => {
744 // ASCII: go back to SIMD.
745 read += 1;
746 src = &src[read..];
747 continue 'outer;
748 }
749 0xC2...0xD5 => {
750 // Two-byte
751 let second = unsafe { *(src.get_unchecked(read + 1)) };
752 if !in_inclusive_range8(second, 0x80, 0xBF) {
753 return true;
754 }
755 read += 2;
756 }
757 0xD6 => {
758 // Two-byte
759 let second = unsafe { *(src.get_unchecked(read + 1)) };
760 if !in_inclusive_range8(second, 0x80, 0xBF) {
761 return true;
762 }
763 // XXX consider folding the above and below checks
764 if second > 0x8F {
765 return true;
766 }
767 read += 2;
768 }
769 // two-byte starting with 0xD7 and above is bidi
770 0xE1 | 0xE3...0xEC | 0xEE => {
771 // Three-byte normal
772 let second = unsafe { *(src.get_unchecked(read + 1)) };
773 let third = unsafe { *(src.get_unchecked(read + 2)) };
774 if ((UTF8_DATA.table[usize::from(second)]
775 & unsafe {
776 *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
777 })
778 | (third >> 6))
779 != 2
780 {
781 return true;
782 }
783 read += 3;
784 }
785 0xE2 => {
786 // Three-byte normal, potentially bidi
787 let second = unsafe { *(src.get_unchecked(read + 1)) };
788 let third = unsafe { *(src.get_unchecked(read + 2)) };
789 if ((UTF8_DATA.table[usize::from(second)]
790 & unsafe {
791 *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
792 })
793 | (third >> 6))
794 != 2
795 {
796 return true;
797 }
798 if second == 0x80 {
799 if third == 0x8F || third == 0xAB || third == 0xAE {
800 return true;
801 }
802 } else if second == 0x81 {
803 if third == 0xA7 {
804 return true;
805 }
806 }
807 read += 3;
808 }
809 0xEF => {
810 // Three-byte normal, potentially bidi
811 let second = unsafe { *(src.get_unchecked(read + 1)) };
812 let third = unsafe { *(src.get_unchecked(read + 2)) };
813 if ((UTF8_DATA.table[usize::from(second)]
814 & unsafe {
815 *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
816 })
817 | (third >> 6))
818 != 2
819 {
820 return true;
821 }
822 if in_inclusive_range8(second, 0xAC, 0xB7) {
823 if second == 0xAC {
824 if third > 0x9C {
825 return true;
826 }
827 } else {
828 return true;
829 }
830 } else if in_inclusive_range8(second, 0xB9, 0xBB) {
831 if second == 0xB9 {
832 if third > 0xAF {
833 return true;
834 }
835 } else if second == 0xBB {
836 if third != 0xBF {
837 return true;
838 }
839 } else {
840 return true;
841 }
842 }
843 read += 3;
844 }
845 0xE0 => {
846 // Three-byte special lower bound, potentially bidi
847 let second = unsafe { *(src.get_unchecked(read + 1)) };
848 let third = unsafe { *(src.get_unchecked(read + 2)) };
849 if ((UTF8_DATA.table[usize::from(second)]
850 & unsafe {
851 *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
852 })
853 | (third >> 6))
854 != 2
855 {
856 return true;
857 }
858 // XXX can this be folded into the above validity check
859 if second < 0xA4 {
860 return true;
861 }
862 read += 3;
863 }
864 0xED => {
865 // Three-byte special upper bound
866 let second = unsafe { *(src.get_unchecked(read + 1)) };
867 let third = unsafe { *(src.get_unchecked(read + 2)) };
868 if ((UTF8_DATA.table[usize::from(second)]
869 & unsafe {
870 *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
871 })
872 | (third >> 6))
873 != 2
874 {
875 return true;
876 }
877 read += 3;
878 }
879 0xF1...0xF4 => {
880 // Four-byte normal
881 let second = unsafe { *(src.get_unchecked(read + 1)) };
882 let third = unsafe { *(src.get_unchecked(read + 2)) };
883 let fourth = unsafe { *(src.get_unchecked(read + 3)) };
884 if (u16::from(
885 UTF8_DATA.table[usize::from(second)]
886 & unsafe {
887 *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
888 },
889 ) | u16::from(third >> 6)
890 | (u16::from(fourth & 0xC0) << 2))
891 != 0x202
892 {
893 return true;
894 }
895 read += 4;
896 }
897 0xF0 => {
898 // Four-byte special lower bound, potentially bidi
899 let second = unsafe { *(src.get_unchecked(read + 1)) };
900 let third = unsafe { *(src.get_unchecked(read + 2)) };
901 let fourth = unsafe { *(src.get_unchecked(read + 3)) };
902 if (u16::from(
903 UTF8_DATA.table[usize::from(second)]
904 & unsafe {
905 *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
906 },
907 ) | u16::from(third >> 6)
908 | (u16::from(fourth & 0xC0) << 2))
909 != 0x202
910 {
911 return true;
912 }
913 if unsafe { unlikely(second == 0x90 || second == 0x9E) } {
914 let third = src[read + 2];
915 if third >= 0xA0 {
916 return true;
917 }
918 }
919 read += 4;
920 }
921 _ => {
922 // Invalid lead or bidi-only lead
923 return true;
924 }
925 }
926 if read + 4 > src.len() {
927 if read == src.len() {
928 return false;
929 }
930 byte = src[read];
931 break 'inner;
932 }
933 byte = src[read];
934 continue 'inner;
935 }
936 }
937 // We can't have a complete 4-byte sequence, but we could still have
938 // a complete shorter sequence.
939
940 // At this point, `byte` is not included in `read`.
941 match byte {
942 0...0x7F => {
943 // ASCII: go back to SIMD.
944 read += 1;
945 src = &src[read..];
946 continue 'outer;
947 }
948 0xC2...0xD5 => {
949 // Two-byte
950 let new_read = read + 2;
951 if new_read > src.len() {
952 return true;
953 }
954 let second = unsafe { *(src.get_unchecked(read + 1)) };
955 if !in_inclusive_range8(second, 0x80, 0xBF) {
956 return true;
957 }
958 read = new_read;
959 // We need to deal with the case where we came here with 3 bytes
960 // left, so we need to take a look at the last one.
961 src = &src[read..];
962 continue 'outer;
963 }
964 0xD6 => {
965 // Two-byte, potentially bidi
966 let new_read = read + 2;
967 if new_read > src.len() {
968 return true;
969 }
970 let second = unsafe { *(src.get_unchecked(read + 1)) };
971 if !in_inclusive_range8(second, 0x80, 0xBF) {
972 return true;
973 }
974 // XXX consider folding the above and below checks
975 if second > 0x8F {
976 return true;
977 }
978 read = new_read;
979 // We need to deal with the case where we came here with 3 bytes
980 // left, so we need to take a look at the last one.
981 src = &src[read..];
982 continue 'outer;
983 }
984 // two-byte starting with 0xD7 and above is bidi
985 0xE1 | 0xE3...0xEC | 0xEE => {
986 // Three-byte normal
987 let new_read = read + 3;
988 if new_read > src.len() {
989 return true;
990 }
991 let second = unsafe { *(src.get_unchecked(read + 1)) };
992 let third = unsafe { *(src.get_unchecked(read + 2)) };
993 if ((UTF8_DATA.table[usize::from(second)]
994 & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
995 | (third >> 6))
996 != 2
997 {
998 return true;
999 }
1000 }
1001 0xE2 => {
1002 // Three-byte normal, potentially bidi
1003 let new_read = read + 3;
1004 if new_read > src.len() {
1005 return true;
1006 }
1007 let second = unsafe { *(src.get_unchecked(read + 1)) };
1008 let third = unsafe { *(src.get_unchecked(read + 2)) };
1009 if ((UTF8_DATA.table[usize::from(second)]
1010 & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1011 | (third >> 6))
1012 != 2
1013 {
1014 return true;
1015 }
1016 if second == 0x80 {
1017 if third == 0x8F || third == 0xAB || third == 0xAE {
1018 return true;
1019 }
1020 } else if second == 0x81 {
1021 if third == 0xA7 {
1022 return true;
1023 }
1024 }
1025 }
1026 0xEF => {
1027 // Three-byte normal, potentially bidi
1028 let new_read = read + 3;
1029 if new_read > src.len() {
1030 return true;
1031 }
1032 let second = unsafe { *(src.get_unchecked(read + 1)) };
1033 let third = unsafe { *(src.get_unchecked(read + 2)) };
1034 if ((UTF8_DATA.table[usize::from(second)]
1035 & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1036 | (third >> 6))
1037 != 2
1038 {
1039 return true;
1040 }
1041 if in_inclusive_range8(second, 0xAC, 0xB7) {
1042 if second == 0xAC {
1043 if third > 0x9C {
1044 return true;
1045 }
1046 } else {
1047 return true;
1048 }
1049 } else if in_inclusive_range8(second, 0xB9, 0xBB) {
1050 if second == 0xB9 {
1051 if third > 0xAF {
1052 return true;
1053 }
1054 } else if second == 0xBB {
1055 if third != 0xBF {
1056 return true;
1057 }
1058 } else {
1059 return true;
1060 }
1061 }
1062 }
1063 0xE0 => {
1064 // Three-byte special lower bound, potentially bidi
1065 let new_read = read + 3;
1066 if new_read > src.len() {
1067 return true;
1068 }
1069 let second = unsafe { *(src.get_unchecked(read + 1)) };
1070 let third = unsafe { *(src.get_unchecked(read + 2)) };
1071 if ((UTF8_DATA.table[usize::from(second)]
1072 & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1073 | (third >> 6))
1074 != 2
1075 {
1076 return true;
1077 }
1078 // XXX can this be folded into the above validity check
1079 if second < 0xA4 {
1080 return true;
1081 }
1082 }
1083 0xED => {
1084 // Three-byte special upper bound
1085 let new_read = read + 3;
1086 if new_read > src.len() {
1087 return true;
1088 }
1089 let second = unsafe { *(src.get_unchecked(read + 1)) };
1090 let third = unsafe { *(src.get_unchecked(read + 2)) };
1091 if ((UTF8_DATA.table[usize::from(second)]
1092 & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1093 | (third >> 6))
1094 != 2
1095 {
1096 return true;
1097 }
1098 }
1099 _ => {
1100 // Invalid lead, 4-byte lead or 2-byte bidi-only lead
1101 return true;
1102 }
1103 }
1104 return false;
1105 } else {
1106 return false;
1107 }
1108 }
1109 }
1110
/// Checks whether a valid UTF-8 buffer contains code points that trigger
/// right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. it is treated as right-to-left). Additionally,
/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
/// for. Control characters that are technically bidi controls but do not
/// cause right-to-left behavior without the presence of right-to-left
/// characters or right-to-left controls are not checked for. As a special
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
///
/// Returns `true` as soon as a matching sequence is seen and `false` once
/// the whole buffer has been scanned without finding one.
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
#[inline]
pub fn is_str_bidi(buffer: &str) -> bool {
    // Boundary code points of the ranges tested below, with their UTF-8
    // byte sequences. The byte comparisons in the loop are derived from
    // these.
    //
    // U+058F: D6 8F
    // U+0590: D6 90
    // U+08FF: E0 A3 BF
    // U+0900: E0 A4 80
    //
    // U+200F: E2 80 8F
    // U+202B: E2 80 AB
    // U+202E: E2 80 AE
    // U+2067: E2 81 A7
    //
    // U+FB1C: EF AC 9C
    // U+FB1D: EF AC 9D
    // U+FDFF: EF B7 BF
    // U+FE00: EF B8 80
    //
    // U+FE6F: EF B9 AF
    // U+FE70: EF B9 B0
    // U+FEFE: EF BB BE
    // U+FEFF: EF BB BF
    //
    // U+107FF: F0 90 9F BF
    // U+10800: F0 90 A0 80
    // U+10FFF: F0 90 BF BF
    // U+11000: F0 91 80 80
    //
    // U+1E7FF: F0 9E 9F BF
    // U+1E800: F0 9E A0 80
    // U+1EFFF: F0 9E BF BF
    // U+1F000: F0 9F 80 80
    let mut bytes = buffer.as_bytes();
    'outer: loop {
        // TODO: Instead of just validating ASCII using SIMD, use SIMD
        // to check for non-ASCII lead bytes, too, to quickly conclude
        // that the vector consist entirely of CJK and below-Hebrew
        // code points.
        // Unfortunately, scripts above Arabic but below CJK share
        // lead bytes with RTL.
        //
        // NOTE(review): `validate_ascii` is used here as yielding the first
        // non-ASCII byte together with its index in `bytes`, or `None` when
        // `bytes` is all ASCII -- confirm against its definition.
        if let Some((mut byte, mut read)) = validate_ascii(bytes) {
            'inner: loop {
                // At this point, `byte` is not included in `read`.
                if byte < 0xE0 {
                    if byte >= 0x80 {
                        // Two-byte
                        // Adding `unlikely` here improved throughput on
                        // Russian plain text by 33%!
                        if unsafe { unlikely(byte >= 0xD6) } {
                            if byte == 0xD6 {
                                // Lead 0xD6 straddles U+058F / U+0590, so
                                // the second byte decides.
                                let second = bytes[read + 1];
                                if second > 0x8F {
                                    return true;
                                }
                            } else {
                                // Every two-byte lead above 0xD6 is
                                // reported as right-to-left.
                                return true;
                            }
                        }
                        read += 2;
                    } else {
                        // ASCII: skip it and go back to the ASCII scan in
                        // the outer loop.
                        read += 1;
                        // Intuitively, we should go back to the outer loop only
                        // if byte is 0x30 or above, so as to avoid trashing on
                        // ASCII space, comma and period in non-Latin context.
                        // However, the extra branch seems to cost more than it's
                        // worth.
                        bytes = &bytes[read..];
                        continue 'outer;
                    }
                } else if byte < 0xF0 {
                    // Three-byte
                    // Only the leads 0xE0, 0xE2 and 0xEF need a closer look;
                    // 0xE1 and 0xE3..=0xEE are skipped without inspection.
                    if unsafe { unlikely(!in_inclusive_range8(byte, 0xE3, 0xEE) && byte != 0xE1) } {
                        let second = bytes[read + 1];
                        if byte == 0xE0 {
                            // Below E0 A4 80 (U+0900).
                            if second < 0xA4 {
                                return true;
                            }
                        } else if byte == 0xE2 {
                            // The four RIGHT-TO-LEFT controls listed in the
                            // table above.
                            let third = bytes[read + 2];
                            if second == 0x80 {
                                if third == 0x8F || third == 0xAB || third == 0xAE {
                                    return true;
                                }
                            } else if second == 0x81 {
                                if third == 0xA7 {
                                    return true;
                                }
                            }
                        } else {
                            debug_assert_eq!(byte, 0xEF);
                            if in_inclusive_range8(second, 0xAC, 0xB7) {
                                // U+FB1D..=U+FDFF; only the first second-byte
                                // value needs the third byte.
                                if second == 0xAC {
                                    let third = bytes[read + 2];
                                    if third > 0x9C {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            } else if in_inclusive_range8(second, 0xB9, 0xBB) {
                                // U+FE70..=U+FEFE; EF BB BF (U+FEFF) is
                                // deliberately excluded.
                                if second == 0xB9 {
                                    let third = bytes[read + 2];
                                    if third > 0xAF {
                                        return true;
                                    }
                                } else if second == 0xBB {
                                    let third = bytes[read + 2];
                                    if third != 0xBF {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            }
                        }
                    }
                    read += 3;
                } else {
                    // Four-byte
                    let second = bytes[read + 1];
                    // F0 90 and F0 9E prefix the two supplementary ranges in
                    // the table above; within them a third byte of 0xA0 or
                    // more is reported as right-to-left.
                    if unsafe { unlikely(byte == 0xF0 && (second == 0x90 || second == 0x9E)) } {
                        let third = bytes[read + 2];
                        if third >= 0xA0 {
                            return true;
                        }
                    }
                    read += 4;
                }
                // The comparison is always < or == and never >, but including
                // > here to let the compiler assume that < is true if this
                // comparison is false.
                if read >= bytes.len() {
                    return false;
                }
                byte = bytes[read];
                continue 'inner;
            }
        } else {
            // Only ASCII remained, and ASCII never triggers RTL handling.
            return false;
        }
    }
}
1266
/// Checks whether a UTF-16 buffer contains code points that trigger
/// right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. it is treated as right-to-left). Additionally,
/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
/// for. Control characters that are technically bidi controls but do not
/// cause right-to-left behavior without the presence of right-to-left
/// characters or right-to-left controls are not checked for. As a special
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
///
/// Returns `true` if the input contains an RTL character or an unpaired
/// high surrogate that could be the high half of an RTL character.
/// Returns `false` if the input contains neither RTL characters nor
/// unpaired high surrogates that could be higher halves of RTL characters.
pub fn is_utf16_bidi(buffer: &[u16]) -> bool {
    // Public entry point only: all of the work is done by
    // `is_utf16_bidi_impl`, which is defined elsewhere in this file.
    is_utf16_bidi_impl(buffer)
}
1287
1288 /// Checks whether a scalar value triggers right-to-left processing.
1289 ///
1290 /// The check is done on a Unicode block basis without regard to assigned
1291 /// vs. unassigned code points in the block. Hebrew presentation forms in
1292 /// the Alphabetic Presentation Forms block are treated as if they formed
1293 /// a block on their own (i.e. it treated as right-to-left). Additionally,
1294 /// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
1295 /// for. Control characters that are technically bidi controls but do not
1296 /// cause right-to-left behavior without the presence of right-to-left
1297 /// characters or right-to-left controls are not checked for. As a special
1298 /// case, U+FEFF is excluded from Arabic Presentation Forms-B.
1299 #[inline(always)]
is_char_bidi(c: char) -> bool1300 pub fn is_char_bidi(c: char) -> bool {
1301 // Controls:
1302 // Every control with RIGHT-TO-LEFT in its name in
1303 // https://www.unicode.org/charts/PDF/U2000.pdf
1304 // U+200F RLM
1305 // U+202B RLE
1306 // U+202E RLO
1307 // U+2067 RLI
1308 //
1309 // BMP RTL:
1310 // https://www.unicode.org/roadmaps/bmp/
1311 // U+0590...U+08FF
1312 // U+FB1D...U+FDFF Hebrew presentation forms and
1313 // Arabic Presentation Forms A
1314 // U+FE70...U+FEFE Arabic Presentation Forms B (excl. BOM)
1315 //
1316 // Supplementary RTL:
1317 // https://www.unicode.org/roadmaps/smp/
1318 // U+10800...U+10FFF (Lead surrogate U+D802 or U+D803)
1319 // U+1E800...U+1EFFF (Lead surrogate U+D83A or U+D83B)
1320 let code_point = u32::from(c);
1321 if code_point < 0x0590 {
1322 // Below Hebrew
1323 return false;
1324 }
1325 if in_range32(code_point, 0x0900, 0xFB1D) {
1326 // Above Arabic Extended-A and below Hebrew presentation forms
1327 if in_inclusive_range32(code_point, 0x200F, 0x2067) {
1328 // In the range that contains the RTL controls
1329 return code_point == 0x200F
1330 || code_point == 0x202B
1331 || code_point == 0x202E
1332 || code_point == 0x2067;
1333 }
1334 return false;
1335 }
1336 if code_point > 0x1EFFF {
1337 // Above second astral RTL. (Emoji is here.)
1338 return false;
1339 }
1340 if in_range32(code_point, 0x11000, 0x1E800) {
1341 // Between astral RTL blocks
1342 return false;
1343 }
1344 if in_range32(code_point, 0xFEFF, 0x10800) {
1345 // Above Arabic Presentations Forms B (excl. BOM) and below first
1346 // astral RTL
1347 return false;
1348 }
1349 if in_range32(code_point, 0xFE00, 0xFE70) {
1350 // Between Arabic Presentations Forms
1351 return false;
1352 }
1353 true
1354 }
1355
1356 /// Checks whether a UTF-16 code unit triggers right-to-left processing.
1357 ///
1358 /// The check is done on a Unicode block basis without regard to assigned
1359 /// vs. unassigned code points in the block. Hebrew presentation forms in
1360 /// the Alphabetic Presentation Forms block are treated as if they formed
1361 /// a block on their own (i.e. it treated as right-to-left). Additionally,
1362 /// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
1363 /// for. Control characters that are technically bidi controls but do not
1364 /// cause right-to-left behavior without the presence of right-to-left
1365 /// characters or right-to-left controls are not checked for. As a special
1366 /// case, U+FEFF is excluded from Arabic Presentation Forms-B.
1367 ///
1368 /// Since supplementary-plane right-to-left blocks are identifiable from the
1369 /// high surrogate without examining the low surrogate, this function returns
1370 /// `true` for such high surrogates making the function suitable for handling
1371 /// supplementary-plane text without decoding surrogate pairs to scalar
1372 /// values. Obviously, such high surrogates are then reported as right-to-left
1373 /// even if actually unpaired.
1374 #[inline(always)]
is_utf16_code_unit_bidi(u: u16) -> bool1375 pub fn is_utf16_code_unit_bidi(u: u16) -> bool {
1376 if u < 0x0590 {
1377 // Below Hebrew
1378 return false;
1379 }
1380 if in_range16(u, 0x0900, 0xD802) {
1381 // Above Arabic Extended-A and below first RTL surrogate
1382 if in_inclusive_range16(u, 0x200F, 0x2067) {
1383 // In the range that contains the RTL controls
1384 return u == 0x200F || u == 0x202B || u == 0x202E || u == 0x2067;
1385 }
1386 return false;
1387 }
1388 if in_range16(u, 0xD83C, 0xFB1D) {
1389 // Between astral RTL high surrogates and Hebrew presentation forms
1390 // (Emoji is here)
1391 return false;
1392 }
1393 if in_range16(u, 0xD804, 0xD83A) {
1394 // Between RTL high surragates
1395 return false;
1396 }
1397 if u > 0xFEFE {
1398 // Above Arabic Presentation Forms (excl. BOM)
1399 return false;
1400 }
1401 if in_range16(u, 0xFE00, 0xFE70) {
1402 // Between Arabic Presentations Forms
1403 return false;
1404 }
1405 true
1406 }
1407
1408 /// Checks whether a potentially invalid UTF-8 buffer contains code points
1409 /// that trigger right-to-left processing or is all-Latin1.
1410 ///
1411 /// Possibly more efficient than performing the checks separately.
1412 ///
1413 /// Returns `Latin1Bidi::Latin1` if `is_utf8_latin1()` would return `true`.
1414 /// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf8_bidi()` would return
1415 /// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
check_utf8_for_latin1_and_bidi(buffer: &[u8]) -> Latin1Bidi1416 pub fn check_utf8_for_latin1_and_bidi(buffer: &[u8]) -> Latin1Bidi {
1417 if let Some(offset) = is_utf8_latin1_impl(buffer) {
1418 if is_utf8_bidi(&buffer[offset..]) {
1419 Latin1Bidi::Bidi
1420 } else {
1421 Latin1Bidi::LeftToRight
1422 }
1423 } else {
1424 Latin1Bidi::Latin1
1425 }
1426 }
1427
1428 /// Checks whether a valid UTF-8 buffer contains code points
1429 /// that trigger right-to-left processing or is all-Latin1.
1430 ///
1431 /// Possibly more efficient than performing the checks separately.
1432 ///
1433 /// Returns `Latin1Bidi::Latin1` if `is_str_latin1()` would return `true`.
1434 /// Otherwise, returns `Latin1Bidi::Bidi` if `is_str_bidi()` would return
1435 /// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
check_str_for_latin1_and_bidi(buffer: &str) -> Latin1Bidi1436 pub fn check_str_for_latin1_and_bidi(buffer: &str) -> Latin1Bidi {
1437 // The transition from the latin1 check to the bidi check isn't
1438 // optimal but not tweaking it to perfection today.
1439 if let Some(offset) = is_str_latin1_impl(buffer) {
1440 if is_str_bidi(&buffer[offset..]) {
1441 Latin1Bidi::Bidi
1442 } else {
1443 Latin1Bidi::LeftToRight
1444 }
1445 } else {
1446 Latin1Bidi::Latin1
1447 }
1448 }
1449
/// Checks whether a potentially invalid UTF-16 buffer contains code points
/// that trigger right-to-left processing or is all-Latin1.
///
/// Possibly more efficient than performing the checks separately.
///
/// Returns `Latin1Bidi::Latin1` if `is_utf16_latin1()` would return `true`.
/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf16_bidi()` would return
/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
pub fn check_utf16_for_latin1_and_bidi(buffer: &[u16]) -> Latin1Bidi {
    // Public entry point only: all of the work is done by
    // `check_utf16_for_latin1_and_bidi_impl`, defined elsewhere in this file.
    check_utf16_for_latin1_and_bidi_impl(buffer)
}
1461
1462 /// Converts potentially-invalid UTF-8 to valid UTF-16 with errors replaced
1463 /// with the REPLACEMENT CHARACTER.
1464 ///
1465 /// The length of the destination buffer must be at least the length of the
1466 /// source buffer _plus one_.
1467 ///
1468 /// Returns the number of `u16`s written.
1469 ///
1470 /// # Panics
1471 ///
1472 /// Panics if the destination buffer is shorter than stated above.
convert_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> usize1473 pub fn convert_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> usize {
1474 // TODO: Can the requirement for dst to be at least one unit longer
1475 // be eliminated?
1476 assert!(dst.len() > src.len());
1477 let mut decoder = Utf8Decoder::new_inner();
1478 let mut total_read = 0usize;
1479 let mut total_written = 0usize;
1480 loop {
1481 let (result, read, written) =
1482 decoder.decode_to_utf16_raw(&src[total_read..], &mut dst[total_written..], true);
1483 total_read += read;
1484 total_written += written;
1485 match result {
1486 DecoderResult::InputEmpty => {
1487 return total_written;
1488 }
1489 DecoderResult::OutputFull => {
1490 unreachable!("The assert at the top of the function should have caught this.");
1491 }
1492 DecoderResult::Malformed(_, _) => {
1493 // There should always be space for the U+FFFD, because
1494 // otherwise we'd have gotten OutputFull already.
1495 dst[total_written] = 0xFFFD;
1496 total_written += 1;
1497 }
1498 }
1499 }
1500 }
1501
/// Converts valid UTF-8 to valid UTF-16.
///
/// The length of the destination buffer must be at least the length of the
/// source buffer.
///
/// Returns the number of `u16`s written.
///
/// # Panics
///
/// Panics if the destination buffer is shorter than stated above.
pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize {
    assert!(
        dst.len() >= src.len(),
        "Destination must not be shorter than the source."
    );
    let bytes = src.as_bytes();
    // `read` indexes `bytes`, `written` indexes `dst`. Every branch below
    // advances `written` by no more than it advances `read`, which together
    // with the assert above keeps the unchecked writes to `dst` in bounds.
    let mut read = 0;
    let mut written = 0;
    'outer: loop {
        // Bulk-convert the ASCII run at the current position.
        // NOTE(review): `ascii_to_basic_latin` is used here as returning
        // `None` when all `length` bytes were ASCII and were written out,
        // else the first non-ASCII byte plus the count handled before it --
        // confirm against its definition.
        let mut byte = {
            let src_remaining = &bytes[read..];
            let dst_remaining = &mut dst[written..];
            let length = src_remaining.len();
            match unsafe {
                ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
            } {
                None => {
                    written += length;
                    return written;
                }
                Some((non_ascii, consumed)) => {
                    read += consumed;
                    written += consumed;
                    non_ascii
                }
            }
        };
        'inner: loop {
            // At this point, `byte` is not included in `read`.
            //
            // The unchecked reads of the trail bytes below rely on `src`
            // being a `&str`, i.e. valid UTF-8: a lead byte is always
            // followed by the full number of continuation bytes.
            if byte < 0xE0 {
                if byte >= 0x80 {
                    // Two-byte
                    let second = unsafe { *(bytes.get_unchecked(read + 1)) };
                    let point = ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F);
                    unsafe { *(dst.get_unchecked_mut(written)) = point };
                    read += 2;
                    written += 1;
                } else {
                    // ASCII: write and go back to SIMD.
                    unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
                    read += 1;
                    written += 1;
                    // Intuitively, we should go back to the outer loop only
                    // if byte is 0x30 or above, so as to avoid trashing on
                    // ASCII space, comma and period in non-Latin context.
                    // However, the extra branch seems to cost more than it's
                    // worth.
                    continue 'outer;
                }
            } else if byte < 0xF0 {
                // Three-byte
                let second = unsafe { *(bytes.get_unchecked(read + 1)) };
                let third = unsafe { *(bytes.get_unchecked(read + 2)) };
                let point = ((u16::from(byte) & 0xF) << 12)
                    | ((u16::from(second) & 0x3F) << 6)
                    | (u16::from(third) & 0x3F);
                unsafe { *(dst.get_unchecked_mut(written)) = point };
                read += 3;
                written += 1;
            } else {
                // Four-byte
                let second = unsafe { *(bytes.get_unchecked(read + 1)) };
                let third = unsafe { *(bytes.get_unchecked(read + 2)) };
                let fourth = unsafe { *(bytes.get_unchecked(read + 3)) };
                let point = ((u32::from(byte) & 0x7) << 18)
                    | ((u32::from(second) & 0x3F) << 12)
                    | ((u32::from(third) & 0x3F) << 6)
                    | (u32::from(fourth) & 0x3F);
                // Surrogate pair. 0xD7C0 is 0xD800 - (0x10000 >> 10), so the
                // subtraction of 0x10000 is folded into the high-surrogate
                // base.
                unsafe { *(dst.get_unchecked_mut(written)) = (0xD7C0 + (point >> 10)) as u16 };
                unsafe {
                    *(dst.get_unchecked_mut(written + 1)) = (0xDC00 + (point & 0x3FF)) as u16
                };
                read += 4;
                written += 2;
            }
            // The comparison is always < or == and never >, but including
            // > here to let the compiler assume that < is true if this
            // comparison is false.
            if read >= src.len() {
                return written;
            }
            byte = bytes[read];
            continue 'inner;
        }
    }
}
1598
1599 /// Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error.
1600 ///
1601 /// The length of the destination buffer must be at least the length of the
1602 /// source buffer.
1603 ///
1604 /// Returns the number of `u16`s written or `None` if the input was invalid.
1605 ///
1606 /// When the input was invalid, some output may have been written.
1607 ///
1608 /// # Panics
1609 ///
1610 /// Panics if the destination buffer is shorter than stated above.
convert_utf8_to_utf16_without_replacement(src: &[u8], dst: &mut [u16]) -> Option<usize>1611 pub fn convert_utf8_to_utf16_without_replacement(src: &[u8], dst: &mut [u16]) -> Option<usize> {
1612 assert!(
1613 dst.len() >= src.len(),
1614 "Destination must not be shorter than the source."
1615 );
1616 let (read, written) = convert_utf8_to_utf16_up_to_invalid(src, dst);
1617 if read == src.len() {
1618 return Some(written);
1619 }
1620 None
1621 }
1622
1623 /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1624 /// with the REPLACEMENT CHARACTER with potentially insufficient output
1625 /// space.
1626 ///
1627 /// Returns the number of code units read and the number of bytes written.
1628 ///
1629 /// Guarantees that the bytes in the destination beyond the number of
1630 /// bytes claimed as written by the second item of the return tuple
1631 /// are left unmodified.
1632 ///
1633 /// Not all code units are read if there isn't enough output space.
1634 ///
1635 /// Note that this method isn't designed for general streamability but for
1636 /// not allocating memory for the worst case up front. Specifically,
1637 /// if the input starts with or ends with an unpaired surrogate, those are
1638 /// replaced with the REPLACEMENT CHARACTER.
1639 ///
1640 /// Matches the semantics of `TextEncoder.encodeInto()` from the
1641 /// Encoding Standard.
1642 ///
1643 /// # Safety
1644 ///
1645 /// If you want to convert into a `&mut str`, use
1646 /// `convert_utf16_to_str_partial()` instead of using this function
1647 /// together with the `unsafe` method `as_bytes_mut()` on `&mut str`.
1648 #[inline(always)]
convert_utf16_to_utf8_partial(src: &[u16], dst: &mut [u8]) -> (usize, usize)1649 pub fn convert_utf16_to_utf8_partial(src: &[u16], dst: &mut [u8]) -> (usize, usize) {
1650 // The two functions called below are marked `inline(never)` to make
1651 // transitions from the hot part (first function) into the cold part
1652 // (second function) go through a return and another call to discouge
1653 // the CPU from speculating from the hot code into the cold code.
1654 // Letting the transitions be mere intra-function jumps, even to
1655 // basic blocks out-of-lined to the end of the function would wipe
1656 // away a quarter of Arabic encode performance on Haswell!
1657 let (read, written) = convert_utf16_to_utf8_partial_inner(src, dst);
1658 if unsafe { likely(read == src.len()) } {
1659 return (read, written);
1660 }
1661 let (tail_read, tail_written) =
1662 convert_utf16_to_utf8_partial_tail(&src[read..], &mut dst[written..]);
1663 (read + tail_read, written + tail_written)
1664 }
1665
1666 /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1667 /// with the REPLACEMENT CHARACTER.
1668 ///
1669 /// The length of the destination buffer must be at least the length of the
1670 /// source buffer times three.
1671 ///
1672 /// Returns the number of bytes written.
1673 ///
1674 /// # Panics
1675 ///
1676 /// Panics if the destination buffer is shorter than stated above.
1677 ///
1678 /// # Safety
1679 ///
1680 /// If you want to convert into a `&mut str`, use `convert_utf16_to_str()`
1681 /// instead of using this function together with the `unsafe` method
1682 /// `as_bytes_mut()` on `&mut str`.
1683 #[inline(always)]
convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize1684 pub fn convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize {
1685 assert!(dst.len() >= src.len() * 3);
1686 let (read, written) = convert_utf16_to_utf8_partial(src, dst);
1687 debug_assert_eq!(read, src.len());
1688 written
1689 }
1690
1691 /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1692 /// with the REPLACEMENT CHARACTER such that the validity of the output is
1693 /// signaled using the Rust type system with potentially insufficient output
1694 /// space.
1695 ///
1696 /// Returns the number of code units read and the number of bytes written.
1697 ///
1698 /// Not all code units are read if there isn't enough output space.
1699 ///
1700 /// Note that this method isn't designed for general streamability but for
1701 /// not allocating memory for the worst case up front. Specifically,
1702 /// if the input starts with or ends with an unpaired surrogate, those are
1703 /// replaced with the REPLACEMENT CHARACTER.
convert_utf16_to_str_partial(src: &[u16], dst: &mut str) -> (usize, usize)1704 pub fn convert_utf16_to_str_partial(src: &[u16], dst: &mut str) -> (usize, usize) {
1705 let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
1706 let (read, written) = convert_utf16_to_utf8_partial(src, bytes);
1707 let len = bytes.len();
1708 let mut trail = written;
1709 while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
1710 bytes[trail] = 0;
1711 trail += 1;
1712 }
1713 (read, written)
1714 }
1715
1716 /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1717 /// with the REPLACEMENT CHARACTER such that the validity of the output is
1718 /// signaled using the Rust type system.
1719 ///
1720 /// The length of the destination buffer must be at least the length of the
1721 /// source buffer times three.
1722 ///
1723 /// Returns the number of bytes written.
1724 ///
1725 /// # Panics
1726 ///
1727 /// Panics if the destination buffer is shorter than stated above.
1728 #[inline(always)]
convert_utf16_to_str(src: &[u16], dst: &mut str) -> usize1729 pub fn convert_utf16_to_str(src: &[u16], dst: &mut str) -> usize {
1730 assert!(dst.len() >= src.len() * 3);
1731 let (read, written) = convert_utf16_to_str_partial(src, dst);
1732 debug_assert_eq!(read, src.len());
1733 written
1734 }
1735
1736 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1737 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-16.
1738 ///
1739 /// The length of the destination buffer must be at least the length of the
1740 /// source buffer.
1741 ///
1742 /// The number of `u16`s written equals the length of the source buffer.
1743 ///
1744 /// # Panics
1745 ///
1746 /// Panics if the destination buffer is shorter than stated above.
convert_latin1_to_utf16(src: &[u8], dst: &mut [u16])1747 pub fn convert_latin1_to_utf16(src: &[u8], dst: &mut [u16]) {
1748 assert!(
1749 dst.len() >= src.len(),
1750 "Destination must not be shorter than the source."
1751 );
1752 // TODO: On aarch64, the safe version autovectorizes to the same unpacking
1753 // instructions and this code, but, yet, the autovectorized version is
1754 // faster.
1755 unsafe {
1756 unpack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len());
1757 }
1758 }
1759
1760 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1761 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient
1762 /// output space.
1763 ///
1764 /// Returns the number of bytes read and the number of bytes written.
1765 ///
1766 /// If the output isn't large enough, not all input is consumed.
1767 ///
1768 /// # Safety
1769 ///
1770 /// If you want to convert into a `&mut str`, use
1771 /// `convert_utf16_to_str_partial()` instead of using this function
1772 /// together with the `unsafe` method `as_bytes_mut()` on `&mut str`.
convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usize)1773 pub fn convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usize) {
1774 let src_len = src.len();
1775 let src_ptr = src.as_ptr();
1776 let dst_ptr = dst.as_mut_ptr();
1777 let dst_len = dst.len();
1778 let mut total_read = 0usize;
1779 let mut total_written = 0usize;
1780 loop {
1781 // src can't advance more than dst
1782 let src_left = src_len - total_read;
1783 let dst_left = dst_len - total_written;
1784 let min_left = ::std::cmp::min(src_left, dst_left);
1785 if let Some((non_ascii, consumed)) = unsafe {
1786 ascii_to_ascii(
1787 src_ptr.add(total_read),
1788 dst_ptr.add(total_written),
1789 min_left,
1790 )
1791 } {
1792 total_read += consumed;
1793 total_written += consumed;
1794 if total_written.checked_add(2).unwrap() > dst_len {
1795 return (total_read, total_written);
1796 }
1797
1798 total_read += 1; // consume `non_ascii`
1799
1800 dst[total_written] = (non_ascii >> 6) | 0xC0;
1801 total_written += 1;
1802 dst[total_written] = (non_ascii & 0x3F) | 0x80;
1803 total_written += 1;
1804 continue;
1805 }
1806 return (total_read + min_left, total_written + min_left);
1807 }
1808 }
1809
1810 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1811 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
1812 ///
1813 /// The length of the destination buffer must be at least the length of the
1814 /// source buffer times two.
1815 ///
1816 /// Returns the number of bytes written.
1817 ///
1818 /// # Panics
1819 ///
1820 /// Panics if the destination buffer is shorter than stated above.
1821 ///
1822 /// # Safety
1823 ///
1824 /// Note that this function may write garbage beyond the number of bytes
1825 /// indicated by the return value, so using a `&mut str` interpreted as
1826 /// `&mut [u8]` as the destination is not safe. If you want to convert into
1827 /// a `&mut str`, use `convert_utf16_to_str()` instead of this function.
1828 #[inline]
convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize1829 pub fn convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize {
1830 assert!(
1831 dst.len() >= src.len() * 2,
1832 "Destination must not be shorter than the source times two."
1833 );
1834 let (read, written) = convert_latin1_to_utf8_partial(src, dst);
1835 debug_assert_eq!(read, src.len());
1836 written
1837 }
1838
1839 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1840 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the
1841 /// output is signaled using the Rust type system with potentially insufficient
1842 /// output space.
1843 ///
1844 /// Returns the number of bytes read and the number of bytes written.
1845 ///
1846 /// If the output isn't large enough, not all input is consumed.
1847 #[inline]
convert_latin1_to_str_partial(src: &[u8], dst: &mut str) -> (usize, usize)1848 pub fn convert_latin1_to_str_partial(src: &[u8], dst: &mut str) -> (usize, usize) {
1849 let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
1850 let (read, written) = convert_latin1_to_utf8_partial(src, bytes);
1851 let len = bytes.len();
1852 let mut trail = written;
1853 let max = ::std::cmp::min(len, trail + MAX_STRIDE_SIZE);
1854 while trail < max {
1855 bytes[trail] = 0;
1856 trail += 1;
1857 }
1858 while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
1859 bytes[trail] = 0;
1860 trail += 1;
1861 }
1862 (read, written)
1863 }
1864
1865 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1866 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the
1867 /// output is signaled using the Rust type system.
1868 ///
1869 /// The length of the destination buffer must be at least the length of the
1870 /// source buffer times two.
1871 ///
1872 /// Returns the number of bytes written.
1873 ///
1874 /// # Panics
1875 ///
1876 /// Panics if the destination buffer is shorter than stated above.
1877 #[inline]
convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize1878 pub fn convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize {
1879 assert!(
1880 dst.len() >= src.len() * 2,
1881 "Destination must not be shorter than the source times two."
1882 );
1883 let (read, written) = convert_latin1_to_str_partial(src, dst);
1884 debug_assert_eq!(read, src.len());
1885 written
1886 }
1887
1888 /// If the input is valid UTF-8 representing only Unicode code points from
1889 /// U+0000 to U+00FF, inclusive, converts the input into output that
1890 /// represents the value of each code point as the unsigned byte value of
1891 /// each output byte.
1892 ///
1893 /// If the input does not fulfill the condition stated above, this function
1894 /// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
1895 /// does something that is memory-safe without any promises about any
1896 /// properties of the output. In particular, callers shouldn't assume the
1897 /// output to be the same across crate versions or CPU architectures and
1898 /// should not assume that non-ASCII input can't map to ASCII output.
1899 ///
1900 /// The length of the destination buffer must be at least the length of the
1901 /// source buffer.
1902 ///
1903 /// Returns the number of bytes written.
1904 ///
1905 /// # Panics
1906 ///
1907 /// Panics if the destination buffer is shorter than stated above.
1908 ///
1909 /// If debug assertions are enabled (and not fuzzing) and the input is
1910 /// not in the range U+0000 to U+00FF, inclusive.
convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize1911 pub fn convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize {
1912 assert!(
1913 dst.len() >= src.len(),
1914 "Destination must not be shorter than the source."
1915 );
1916 non_fuzz_debug_assert!(is_utf8_latin1(src));
1917 let src_len = src.len();
1918 let src_ptr = src.as_ptr();
1919 let dst_ptr = dst.as_mut_ptr();
1920 let mut total_read = 0usize;
1921 let mut total_written = 0usize;
1922 loop {
1923 // dst can't advance more than src
1924 let src_left = src_len - total_read;
1925 if let Some((non_ascii, consumed)) = unsafe {
1926 ascii_to_ascii(
1927 src_ptr.add(total_read),
1928 dst_ptr.add(total_written),
1929 src_left,
1930 )
1931 } {
1932 total_read += consumed + 1;
1933 total_written += consumed;
1934
1935 if total_read == src_len {
1936 return total_written;
1937 }
1938
1939 let trail = src[total_read];
1940 total_read += 1;
1941
1942 dst[total_written] = ((non_ascii & 0x1F) << 6) | (trail & 0x3F);
1943 total_written += 1;
1944 continue;
1945 }
1946 return total_written + src_left;
1947 }
1948 }
1949
1950 /// If the input is valid UTF-16 representing only Unicode code points from
1951 /// U+0000 to U+00FF, inclusive, converts the input into output that
1952 /// represents the value of each code point as the unsigned byte value of
1953 /// each output byte.
1954 ///
1955 /// If the input does not fulfill the condition stated above, does something
1956 /// that is memory-safe without any promises about any properties of the
1957 /// output and will probably assert in debug builds in future versions.
1958 /// In particular, callers shouldn't assume the output to be the same across
1959 /// crate versions or CPU architectures and should not assume that non-ASCII
1960 /// input can't map to ASCII output.
1961 ///
1962 /// The length of the destination buffer must be at least the length of the
1963 /// source buffer.
1964 ///
1965 /// The number of bytes written equals the length of the source buffer.
1966 ///
1967 /// # Panics
1968 ///
1969 /// Panics if the destination buffer is shorter than stated above.
1970 ///
1971 /// (Probably in future versions if debug assertions are enabled (and not
1972 /// fuzzing) and the input is not in the range U+0000 to U+00FF, inclusive.)
convert_utf16_to_latin1_lossy(src: &[u16], dst: &mut [u8])1973 pub fn convert_utf16_to_latin1_lossy(src: &[u16], dst: &mut [u8]) {
1974 assert!(
1975 dst.len() >= src.len(),
1976 "Destination must not be shorter than the source."
1977 );
1978 // non_fuzz_debug_assert!(is_utf16_latin1(src));
1979 unsafe {
1980 pack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len());
1981 }
1982 }
1983
1984 /// Converts bytes whose unsigned value is interpreted as Unicode code point
1985 /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
1986 ///
1987 /// Borrows if input is ASCII-only. Performs a single heap allocation
1988 /// otherwise.
decode_latin1<'a>(bytes: &'a [u8]) -> Cow<'a, str>1989 pub fn decode_latin1<'a>(bytes: &'a [u8]) -> Cow<'a, str> {
1990 let up_to = ascii_valid_up_to(bytes);
1991 // >= makes later things optimize better than ==
1992 if up_to >= bytes.len() {
1993 debug_assert_eq!(up_to, bytes.len());
1994 let s: &str = unsafe { ::std::str::from_utf8_unchecked(bytes) };
1995 return Cow::Borrowed(s);
1996 }
1997 let (head, tail) = bytes.split_at(up_to);
1998 let capacity = head.len() + tail.len() * 2;
1999 let mut vec = Vec::with_capacity(capacity);
2000 unsafe {
2001 vec.set_len(capacity);
2002 }
2003 (&mut vec[..up_to]).copy_from_slice(head);
2004 let written = convert_latin1_to_utf8(tail, &mut vec[up_to..]);
2005 vec.truncate(up_to + written);
2006 Cow::Owned(unsafe { String::from_utf8_unchecked(vec) })
2007 }
2008
2009 /// If the input is valid UTF-8 representing only Unicode code points from
2010 /// U+0000 to U+00FF, inclusive, converts the input into output that
2011 /// represents the value of each code point as the unsigned byte value of
2012 /// each output byte.
2013 ///
2014 /// If the input does not fulfill the condition stated above, this function
2015 /// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
2016 /// does something that is memory-safe without any promises about any
2017 /// properties of the output. In particular, callers shouldn't assume the
2018 /// output to be the same across crate versions or CPU architectures and
2019 /// should not assume that non-ASCII input can't map to ASCII output.
2020 ///
2021 /// Borrows if input is ASCII-only. Performs a single heap allocation
2022 /// otherwise.
encode_latin1_lossy<'a>(string: &'a str) -> Cow<'a, [u8]>2023 pub fn encode_latin1_lossy<'a>(string: &'a str) -> Cow<'a, [u8]> {
2024 let bytes = string.as_bytes();
2025 let up_to = ascii_valid_up_to(bytes);
2026 // >= makes later things optimize better than ==
2027 if up_to >= bytes.len() {
2028 debug_assert_eq!(up_to, bytes.len());
2029 return Cow::Borrowed(bytes);
2030 }
2031 let (head, tail) = bytes.split_at(up_to);
2032 let capacity = bytes.len();
2033 let mut vec = Vec::with_capacity(capacity);
2034 unsafe {
2035 vec.set_len(capacity);
2036 }
2037 (&mut vec[..up_to]).copy_from_slice(head);
2038 let written = convert_utf8_to_latin1_lossy(tail, &mut vec[up_to..]);
2039 vec.truncate(up_to + written);
2040 Cow::Owned(vec)
2041 }
2042
2043 /// Returns the index of the first unpaired surrogate or, if the input is
2044 /// valid UTF-16 in its entirety, the length of the input.
utf16_valid_up_to(buffer: &[u16]) -> usize2045 pub fn utf16_valid_up_to(buffer: &[u16]) -> usize {
2046 utf16_valid_up_to_impl(buffer)
2047 }
2048
2049 /// Returns the index of first byte that starts an invalid byte
2050 /// sequence or a non-Latin1 byte sequence, or the length of the
2051 /// string if there are neither.
utf8_latin1_up_to(buffer: &[u8]) -> usize2052 pub fn utf8_latin1_up_to(buffer: &[u8]) -> usize {
2053 is_utf8_latin1_impl(buffer).unwrap_or(buffer.len())
2054 }
2055
2056 /// Returns the index of first byte that starts a non-Latin1 byte
2057 /// sequence, or the length of the string if there are none.
str_latin1_up_to(buffer: &str) -> usize2058 pub fn str_latin1_up_to(buffer: &str) -> usize {
2059 is_str_latin1_impl(buffer).unwrap_or(buffer.len())
2060 }
2061
2062 /// Replaces unpaired surrogates in the input with the REPLACEMENT CHARACTER.
2063 #[inline]
ensure_utf16_validity(buffer: &mut [u16])2064 pub fn ensure_utf16_validity(buffer: &mut [u16]) {
2065 let mut offset = 0;
2066 loop {
2067 offset += utf16_valid_up_to(&buffer[offset..]);
2068 if offset == buffer.len() {
2069 return;
2070 }
2071 buffer[offset] = 0xFFFD;
2072 offset += 1;
2073 }
2074 }
2075
2076 /// Copies ASCII from source to destination up to the first non-ASCII byte
2077 /// (or the end of the input if it is ASCII in its entirety).
2078 ///
2079 /// The length of the destination buffer must be at least the length of the
2080 /// source buffer.
2081 ///
2082 /// Returns the number of bytes written.
2083 ///
2084 /// # Panics
2085 ///
2086 /// Panics if the destination buffer is shorter than stated above.
copy_ascii_to_ascii(src: &[u8], dst: &mut [u8]) -> usize2087 pub fn copy_ascii_to_ascii(src: &[u8], dst: &mut [u8]) -> usize {
2088 assert!(
2089 dst.len() >= src.len(),
2090 "Destination must not be shorter than the source."
2091 );
2092 if let Some((_, consumed)) =
2093 unsafe { ascii_to_ascii(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2094 {
2095 consumed
2096 } else {
2097 src.len()
2098 }
2099 }
2100
2101 /// Copies ASCII from source to destination zero-extending it to UTF-16 up to
2102 /// the first non-ASCII byte (or the end of the input if it is ASCII in its
2103 /// entirety).
2104 ///
2105 /// The length of the destination buffer must be at least the length of the
2106 /// source buffer.
2107 ///
2108 /// Returns the number of `u16`s written.
2109 ///
2110 /// # Panics
2111 ///
2112 /// Panics if the destination buffer is shorter than stated above.
copy_ascii_to_basic_latin(src: &[u8], dst: &mut [u16]) -> usize2113 pub fn copy_ascii_to_basic_latin(src: &[u8], dst: &mut [u16]) -> usize {
2114 assert!(
2115 dst.len() >= src.len(),
2116 "Destination must not be shorter than the source."
2117 );
2118 if let Some((_, consumed)) =
2119 unsafe { ascii_to_basic_latin(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2120 {
2121 consumed
2122 } else {
2123 src.len()
2124 }
2125 }
2126
2127 /// Copies Basic Latin from source to destination narrowing it to ASCII up to
2128 /// the first non-Basic Latin code unit (or the end of the input if it is
2129 /// Basic Latin in its entirety).
2130 ///
2131 /// The length of the destination buffer must be at least the length of the
2132 /// source buffer.
2133 ///
2134 /// Returns the number of bytes written.
2135 ///
2136 /// # Panics
2137 ///
2138 /// Panics if the destination buffer is shorter than stated above.
copy_basic_latin_to_ascii(src: &[u16], dst: &mut [u8]) -> usize2139 pub fn copy_basic_latin_to_ascii(src: &[u16], dst: &mut [u8]) -> usize {
2140 assert!(
2141 dst.len() >= src.len(),
2142 "Destination must not be shorter than the source."
2143 );
2144 if let Some((_, consumed)) =
2145 unsafe { basic_latin_to_ascii(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2146 {
2147 consumed
2148 } else {
2149 src.len()
2150 }
2151 }
2152
2153 // Any copyright to the test code below this comment is dedicated to the
2154 // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
2155
2156 #[cfg(test)]
2157 mod tests {
2158 use super::*;
2159
#[test]
fn test_is_ascii_success() {
    // Every suffix of the 0x00..0x80 ramp must be accepted as ASCII.
    let src: Vec<u8> = (0u8..128).collect();
    for start in 0..src.len() {
        assert!(is_ascii(&src[start..]));
    }
}
2171
#[test]
fn test_is_ascii_fail() {
    // Poison each position of each suffix in turn with a non-ASCII byte.
    let mut src: Vec<u8> = (0u8..128).collect();
    for start in 0..src.len() {
        let tail = &mut src[start..];
        for pos in 0..tail.len() {
            tail[pos] = 0xA0;
            assert!(!is_ascii(tail));
        }
    }
}
2187
#[test]
fn test_is_basic_latin_success() {
    // Every suffix of the 0x0000..0x0080 ramp must be accepted.
    let src: Vec<u16> = (0u16..128).collect();
    for start in 0..src.len() {
        assert!(is_basic_latin(&src[start..]));
    }
}
2199
#[test]
fn test_is_basic_latin_fail() {
    // Poison each position of each suffix in turn with a non-Basic Latin unit.
    let mut src: Vec<u16> = (0u16..128).collect();
    for start in 0..src.len() {
        let tail = &mut src[start..];
        for pos in 0..tail.len() {
            tail[pos] = 0xA0;
            assert!(!is_basic_latin(tail));
        }
    }
}
2215
#[test]
fn test_is_utf16_latin1_success() {
    // Every suffix of the 0x0000..0x0100 ramp is Latin1 for both checkers.
    let src: Vec<u16> = (0u16..256).collect();
    for start in 0..src.len() {
        let suffix = &src[start..];
        assert!(is_utf16_latin1(suffix));
        assert_eq!(check_utf16_for_latin1_and_bidi(suffix), Latin1Bidi::Latin1);
    }
}
2231
#[test]
fn test_is_utf16_latin1_fail() {
    // Poison each position of each suffix in turn with a unit at or above
    // U+0100.
    let mut src: Vec<u16> = (0u16..256).collect();
    for start in 0..src.len() {
        let tail = &mut src[start..];
        for pos in 0..tail.len() {
            tail[pos] = 0x100 + pos as u16;
            assert!(!is_utf16_latin1(tail));
            assert_ne!(check_utf16_for_latin1_and_bidi(tail), Latin1Bidi::Latin1);
        }
    }
}
2248
#[test]
fn test_is_str_latin1_success() {
    // Every suffix of U+0000..U+0100, as a `String`, is Latin1 for both
    // checkers.
    let src: Vec<u16> = (0u16..256).collect();
    for start in 0..src.len() {
        let s = String::from_utf16(&src[start..]).unwrap();
        let text = s.as_str();
        assert!(is_str_latin1(text));
        assert_eq!(check_str_for_latin1_and_bidi(text), Latin1Bidi::Latin1);
    }
}
2262
#[test]
fn test_is_str_latin1_fail() {
    // Poison each position of each suffix in turn with a code point at or
    // above U+0100, then check the string form.
    let mut src: Vec<u16> = (0u16..256).collect();
    for start in 0..src.len() {
        let tail = &mut src[start..];
        for pos in 0..tail.len() {
            tail[pos] = 0x100 + pos as u16;
            let s = String::from_utf16(tail).unwrap();
            let text = s.as_str();
            assert!(!is_str_latin1(text));
            assert_ne!(check_str_for_latin1_and_bidi(text), Latin1Bidi::Latin1);
        }
    }
}
2280
#[test]
fn test_is_utf8_latin1_success() {
    // Every suffix of U+0000..U+0100, encoded as UTF-8, is Latin1 for both
    // checkers.
    let src: Vec<u16> = (0u16..256).collect();
    for start in 0..src.len() {
        let s = String::from_utf16(&src[start..]).unwrap();
        let bytes = s.as_bytes();
        assert!(is_utf8_latin1(bytes));
        assert_eq!(check_utf8_for_latin1_and_bidi(bytes), Latin1Bidi::Latin1);
    }
}
2297
#[test]
fn test_is_utf8_latin1_fail() {
    // Poison each position of each suffix in turn with a code point at or
    // above U+0100, then check the UTF-8 form.
    let mut src: Vec<u16> = (0u16..256).collect();
    for start in 0..src.len() {
        let tail = &mut src[start..];
        for pos in 0..tail.len() {
            tail[pos] = 0x100 + pos as u16;
            let s = String::from_utf16(tail).unwrap();
            let bytes = s.as_bytes();
            assert!(!is_utf8_latin1(bytes));
            assert_ne!(check_utf8_for_latin1_and_bidi(bytes), Latin1Bidi::Latin1);
        }
    }
}
2318
#[test]
fn test_is_utf8_latin1_invalid() {
    // None of these byte strings is valid UTF-8, so none may be accepted.
    let inputs: [&[u8]; 6] = [
        b"\xC3",
        b"a\xC3",
        b"\xFF",
        b"a\xFF",
        b"\xC3\xFF",
        b"a\xC3\xFF",
    ];
    for &input in inputs.iter() {
        assert!(!is_utf8_latin1(input));
    }
}
2328
#[test]
fn test_convert_utf8_to_utf16() {
    // Mixed ASCII, astral, BMP and Latin1 content, compared with the
    // standard library's UTF-16 encoder.
    let src = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    let expected: Vec<u16> = src.encode_utf16().collect();
    let mut dst = vec![0u16; src.len() + 1];
    let written = convert_utf8_to_utf16(src.as_bytes(), dst.as_mut_slice());
    dst.truncate(written);
    assert_eq!(dst, expected);
}
2339
#[test]
fn test_convert_str_to_utf16() {
    // Same content as the byte-slice variant, with a destination exactly as
    // long as the source.
    let src = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    let expected: Vec<u16> = src.encode_utf16().collect();
    let mut dst = vec![0u16; src.len()];
    let written = convert_str_to_utf16(src, dst.as_mut_slice());
    dst.truncate(written);
    assert_eq!(dst, expected);
}
2350
#[test]
fn test_convert_utf16_to_utf8_partial() {
    let reference = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    let src: Vec<u16> = reference.encode_utf16().collect();
    let mut dst = vec![0u8; src.len() * 3 + 1];
    // First pass gets only 24 bytes of output; the second pass finishes the
    // job from where the first one stopped.
    let (read, written) = convert_utf16_to_utf8_partial(src.as_slice(), &mut dst[..24]);
    let total = written + convert_utf16_to_utf8(&src[read..], &mut dst[written..]);
    dst.truncate(total);
    assert_eq!(dst, reference.as_bytes());
}
2362
#[test]
fn test_convert_utf16_to_utf8() {
    // Round trip: encode the reference to UTF-16, convert back, compare.
    let reference = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    let src: Vec<u16> = reference.encode_utf16().collect();
    let mut dst = vec![0u8; src.len() * 3 + 1];
    let written = convert_utf16_to_utf8(src.as_slice(), dst.as_mut_slice());
    dst.truncate(written);
    assert_eq!(dst, reference.as_bytes());
}
2373
#[test]
fn test_convert_latin1_to_utf16() {
    // Each of the 256 byte values must widen to the same numeric code unit.
    let reference: Vec<u16> = (0u16..256).collect();
    let src: Vec<u8> = reference.iter().map(|&unit| unit as u8).collect();
    let mut dst = vec![0u16; src.len()];
    convert_latin1_to_utf16(src.as_slice(), dst.as_mut_slice());
    assert_eq!(dst, reference);
}
2389
#[test]
fn test_convert_latin1_to_utf8_partial() {
    // Two-byte destination: `a` takes one byte, which leaves too little room
    // for the two-byte UTF-8 form of 0xFF, so the conversion must stop after
    // one byte read and one byte written.
    // (The array was previously written as `[0u8, 2]`, a comma typo for the
    // repeat expression; the length is 2 either way.)
    let mut dst = [0u8; 2];
    let (read, written) = convert_latin1_to_utf8_partial(b"a\xFF", &mut dst[..]);
    assert_eq!(read, 1);
    assert_eq!(written, 1);
}
2397
#[test]
fn test_convert_latin1_to_utf8() {
    // Expected output comes from the standard library: U+0000..U+0100 built
    // from UTF-16 and viewed as UTF-8 bytes.
    let units: Vec<u16> = (0u16..256).collect();
    let src: Vec<u8> = units.iter().map(|&unit| unit as u8).collect();
    let expected = String::from_utf16(units.as_slice()).unwrap();
    let mut dst = vec![0u8; src.len() * 2];
    let written = convert_latin1_to_utf8(src.as_slice(), dst.as_mut_slice());
    dst.truncate(written);
    assert_eq!(&dst[..], expected.as_bytes());
}
2415
#[test]
fn test_convert_utf8_to_latin1_lossy() {
    // U+0000..U+0100 as UTF-8 must narrow back to the bytes 0x00..=0xFF.
    let src16: Vec<u16> = (0u16..256).collect();
    let reference: Vec<u8> = src16.iter().map(|&unit| unit as u8).collect();
    let src = String::from_utf16(src16.as_slice()).unwrap();
    let mut dst = vec![0u8; src.len()];
    let written = convert_utf8_to_latin1_lossy(src.as_bytes(), dst.as_mut_slice());
    dst.truncate(written);
    assert_eq!(dst, reference);
}
2433
// Only built when debug assertions are on and fuzzing is off.
#[cfg(all(debug_assertions, not(fuzzing)))]
#[test]
#[should_panic]
fn test_convert_utf8_to_latin1_lossy_panics() {
    // U+0100 is the first code point outside the Latin1 range.
    let non_latin1 = "\u{100}";
    let mut dst = [0u8; 16];
    convert_utf8_to_latin1_lossy(non_latin1.as_bytes(), &mut dst[..]);
}
2441
#[test]
fn test_convert_utf16_to_latin1_lossy() {
    // Each code unit 0x0000..0x0100 must narrow to the byte of the same value.
    let src: Vec<u16> = (0u16..256).collect();
    let reference: Vec<u8> = src.iter().map(|&unit| unit as u8).collect();
    let mut dst = vec![0u8; src.len()];
    convert_utf16_to_latin1_lossy(src.as_slice(), dst.as_mut_slice());
    assert_eq!(dst, reference);
}
2457
#[test]
// #[should_panic]
fn test_convert_utf16_to_latin1_lossy_panics() {
    // `should_panic` is disabled above, so for now this only checks that
    // out-of-range input (U+0100) is handled without panicking.
    let non_latin1 = [0x0100u16];
    let mut dst = [0u8; 16];
    convert_utf16_to_latin1_lossy(&non_latin1[..], &mut dst[..]);
}
2464
#[test]
fn test_utf16_valid_up_to() {
    // Builds a 16-unit buffer: a run of zero units followed by `tail`.
    let zero_padded = |zeros: usize, tail: &[u16]| -> Vec<u16> {
        let mut units = vec![0u16; zeros];
        units.extend_from_slice(tail);
        units
    };

    // Properly paired surrogates: the whole buffer is valid.
    let valid = zero_padded(12, &[0x2603, 0xD83D, 0xDCA9, 0x00B6]);
    assert_eq!(utf16_valid_up_to(&valid[..]), 16);

    // High surrogate followed by a non-surrogate.
    let lone_high = zero_padded(13, &[0x2603, 0xD83D, 0x00B6]);
    assert_eq!(utf16_valid_up_to(&lone_high[..]), 14);

    // Low surrogate without a preceding high surrogate.
    let lone_low = zero_padded(13, &[0x2603, 0xDCA9, 0x00B6]);
    assert_eq!(utf16_valid_up_to(&lone_low[..]), 14);

    // High surrogate as the very last unit.
    let lone_high_at_end = zero_padded(13, &[0x2603, 0x00B6, 0xD83D]);
    assert_eq!(utf16_valid_up_to(&lone_high_at_end[..]), 15);
}
2488
#[test]
fn test_ensure_utf16_validity() {
    // 31 zero units with a lone high surrogate at index 1, a proper pair at
    // indices 5-6 and a lone low surrogate at index 13.
    let mut src = vec![0u16; 31];
    src[1] = 0xD83D;
    src[5] = 0xD83D;
    src[6] = 0xDCA9;
    src[13] = 0xDCA9;

    // Only the two unpaired surrogates become U+FFFD; the pair is untouched.
    let mut reference = src.clone();
    reference[1] = 0xFFFD;
    reference[13] = 0xFFFD;

    ensure_utf16_validity(&mut src[..]);
    assert_eq!(src, reference);
}
2504
#[test]
fn test_is_char_bidi() {
    // Characters that must not be classified as bidi.
    let not_bidi = [
        'a',
        '\u{03B1}',
        '\u{3041}',
        '\u{1F4A9}',
        '\u{FE00}',
        '\u{202C}',
        '\u{FEFF}',
    ];
    // Characters that must be classified as bidi.
    let bidi = [
        '\u{0590}',
        '\u{08FF}',
        '\u{061C}',
        '\u{FB50}',
        '\u{FDFF}',
        '\u{FE70}',
        '\u{FEFE}',
        '\u{200F}',
        '\u{202B}',
        '\u{202E}',
        '\u{2067}',
        '\u{10800}',
        '\u{10FFF}',
        '\u{1E800}',
        '\u{1EFFF}',
    ];
    for &c in not_bidi.iter() {
        assert!(!is_char_bidi(c));
    }
    for &c in bidi.iter() {
        assert!(is_char_bidi(c));
    }
}
2530
#[test]
fn test_is_utf16_code_unit_bidi() {
    // Code units that must not be classified as bidi.
    let not_bidi: [u16; 7] = [0x0062, 0x03B1, 0x3041, 0xD801, 0xFE00, 0x202C, 0xFEFF];
    // Code units that must be classified as bidi.
    let bidi: [u16; 16] = [
        0x0590, 0x08FF, 0x061C, 0xFB1D, 0xFB50, 0xFDFF, 0xFE70, 0xFEFE, 0x200F, 0x202B, 0x202E,
        0x2067, 0xD802, 0xD803, 0xD83A, 0xD83B,
    ];
    for &unit in not_bidi.iter() {
        assert!(!is_utf16_code_unit_bidi(unit));
    }
    for &unit in bidi.iter() {
        assert!(is_utf16_code_unit_bidi(unit));
    }
}
2557
#[test]
fn test_is_str_bidi() {
    // Surrounds the probe character with 16 ASCII letters on each side.
    let wrap = |c: char| format!("abcdefghijklmnop{}abcdefghijklmnop", c);
    // Probes that must not make the string bidi.
    let not_bidi = [
        'a',
        '\u{03B1}',
        '\u{3041}',
        '\u{1F4A9}',
        '\u{FE00}',
        '\u{202C}',
        '\u{FEFF}',
    ];
    // Probes that must make the string bidi.
    let bidi = [
        '\u{0590}',
        '\u{08FF}',
        '\u{061C}',
        '\u{FB50}',
        '\u{FDFF}',
        '\u{FE70}',
        '\u{FEFE}',
        '\u{200F}',
        '\u{202B}',
        '\u{202E}',
        '\u{2067}',
        '\u{10800}',
        '\u{10FFF}',
        '\u{1E800}',
        '\u{1EFFF}',
    ];
    for &c in not_bidi.iter() {
        assert!(!is_str_bidi(wrap(c).as_str()));
    }
    for &c in bidi.iter() {
        assert!(is_str_bidi(wrap(c).as_str()));
    }
}
2583
#[test]
fn test_is_utf8_bidi() {
    // Surrounds the probe character with 16 ASCII letters on each side.
    let wrap = |c: char| format!("abcdefghijklmnop{}abcdefghijklmnop", c);
    // Probes that must not make the UTF-8 buffer bidi.
    let not_bidi = [
        'a',
        '\u{03B1}',
        '\u{3041}',
        '\u{1F4A9}',
        '\u{FE00}',
        '\u{202C}',
        '\u{FEFF}',
    ];
    // Probes that must make the UTF-8 buffer bidi.
    let bidi = [
        '\u{0590}',
        '\u{08FF}',
        '\u{061C}',
        '\u{FB50}',
        '\u{FDFF}',
        '\u{FE70}',
        '\u{FEFE}',
        '\u{200F}',
        '\u{202B}',
        '\u{202E}',
        '\u{2067}',
        '\u{10800}',
        '\u{10FFF}',
        '\u{1E800}',
        '\u{1EFFF}',
    ];
    for &c in not_bidi.iter() {
        assert!(!is_utf8_bidi(wrap(c).as_bytes()));
    }
    for &c in bidi.iter() {
        assert!(is_utf8_bidi(wrap(c).as_bytes()));
    }
}
2653
#[test]
fn test_is_utf16_bidi() {
    // Eight Basic Latin units, one probe slot (index 8), then the same eight
    // units again.
    let mut units: [u16; 17] = [
        0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
        0x68, 0x69,
    ];
    // Probes that must not make the buffer bidi.
    let not_bidi: [u16; 7] = [0x0062, 0x03B1, 0x3041, 0xD801, 0xFE00, 0x202C, 0xFEFF];
    // Probes that must make the buffer bidi.
    let bidi: [u16; 16] = [
        0x0590, 0x08FF, 0x061C, 0xFB1D, 0xFB50, 0xFDFF, 0xFE70, 0xFEFE, 0x200F, 0x202B, 0x202E,
        0x2067, 0xD802, 0xD803, 0xD83A, 0xD83B,
    ];
    for &probe in not_bidi.iter() {
        units[8] = probe;
        assert!(!is_utf16_bidi(&units[..]));
    }
    for &probe in bidi.iter() {
        units[8] = probe;
        assert!(is_utf16_bidi(&units[..]));
    }

    // A bidi unit immediately followed by a non-bidi, non-Latin unit.
    let bidi_then_hiragana: [u16; 18] = [
        0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, 0x64, 0x65,
        0x66, 0x67, 0x68, 0x69,
    ];
    assert!(is_utf16_bidi(&bidi_then_hiragana[..]));
}
2754
#[test]
fn test_check_str_for_latin1_and_bidi() {
    // Surrounds the probe character with 16 ASCII letters on each side.
    let wrap = |c: char| format!("abcdefghijklmnop{}abcdefghijklmnop", c);
    // Probes for which the result must be anything but `Bidi`.
    let not_bidi = [
        'a',
        '\u{03B1}',
        '\u{3041}',
        '\u{1F4A9}',
        '\u{FE00}',
        '\u{202C}',
        '\u{FEFF}',
    ];
    // Probes for which the result must be exactly `Bidi`.
    let bidi = [
        '\u{0590}',
        '\u{08FF}',
        '\u{061C}',
        '\u{FB50}',
        '\u{FDFF}',
        '\u{FE70}',
        '\u{FEFE}',
        '\u{200F}',
        '\u{202B}',
        '\u{202E}',
        '\u{2067}',
        '\u{10800}',
        '\u{10FFF}',
        '\u{1E800}',
        '\u{1EFFF}',
    ];
    for &c in not_bidi.iter() {
        assert_ne!(
            check_str_for_latin1_and_bidi(wrap(c).as_str()),
            Latin1Bidi::Bidi
        );
    }
    for &c in bidi.iter() {
        assert_eq!(
            check_str_for_latin1_and_bidi(wrap(c).as_str()),
            Latin1Bidi::Bidi
        );
    }
}
2846
/// Exercises `check_utf8_for_latin1_and_bidi` with the UTF-8 bytes of strings
/// made of a single probe character placed between two runs of sixteen ASCII
/// letters.
#[test]
fn test_check_utf8_for_latin1_and_bidi() {
    // Inputs for which any verdict other than `Latin1Bidi::Bidi` is accepted.
    let not_bidi = [
        "abcdefghijklmnopaabcdefghijklmnop",
        "abcdefghijklmnop\u{03B1}abcdefghijklmnop",
        "abcdefghijklmnop\u{3041}abcdefghijklmnop",
        "abcdefghijklmnop\u{1F4A9}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE00}abcdefghijklmnop",
        "abcdefghijklmnop\u{202C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFF}abcdefghijklmnop",
    ];
    // Inputs for which the verdict must be exactly `Latin1Bidi::Bidi`.
    let bidi = [
        "abcdefghijklmnop\u{0590}abcdefghijklmnop",
        "abcdefghijklmnop\u{08FF}abcdefghijklmnop",
        "abcdefghijklmnop\u{061C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FB50}abcdefghijklmnop",
        "abcdefghijklmnop\u{FDFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE70}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFE}abcdefghijklmnop",
        "abcdefghijklmnop\u{200F}abcdefghijklmnop",
        "abcdefghijklmnop\u{202B}abcdefghijklmnop",
        "abcdefghijklmnop\u{202E}abcdefghijklmnop",
        "abcdefghijklmnop\u{2067}abcdefghijklmnop",
        "abcdefghijklmnop\u{10800}abcdefghijklmnop",
        "abcdefghijklmnop\u{10FFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{1E800}abcdefghijklmnop",
        "abcdefghijklmnop\u{1EFFF}abcdefghijklmnop",
    ];

    for &text in not_bidi.iter() {
        assert_ne!(
            check_utf8_for_latin1_and_bidi(text.as_bytes()),
            Latin1Bidi::Bidi,
            "unexpected Bidi verdict for {:?}",
            text
        );
    }
    for &text in bidi.iter() {
        assert_eq!(
            check_utf8_for_latin1_and_bidi(text.as_bytes()),
            Latin1Bidi::Bidi,
            "expected Bidi verdict for {:?}",
            text
        );
    }
}
2938
/// Exercises `check_utf16_for_latin1_and_bidi` with buffers made of a probe
/// code unit placed between two runs of eight ASCII code units
/// (0x62 through 0x69).
#[test]
fn test_check_utf16_for_latin1_and_bidi() {
    // Probe units for which any verdict other than `Latin1Bidi::Bidi` is
    // accepted.
    let not_bidi: [u16; 7] = [0x0062, 0x03B1, 0x3041, 0xD801, 0xFE00, 0x202C, 0xFEFF];
    // Probe units for which the verdict must be exactly `Latin1Bidi::Bidi`.
    let bidi: [u16; 16] = [
        0x0590, 0x08FF, 0x061C, 0xFB1D, 0xFB50, 0xFDFF, 0xFE70, 0xFEFE, 0x200F, 0x202B, 0x202E,
        0x2067, 0xD802, 0xD803, 0xD83A, 0xD83B,
    ];

    // Index 8 is the probe slot; everything around it stays constant.
    let mut units: [u16; 17] = [
        0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
        0x68, 0x69,
    ];

    for &probe in not_bidi.iter() {
        units[8] = probe;
        assert_ne!(
            check_utf16_for_latin1_and_bidi(&units[..]),
            Latin1Bidi::Bidi,
            "unexpected Bidi verdict for code unit {:#06X}",
            probe
        );
    }
    for &probe in bidi.iter() {
        units[8] = probe;
        assert_eq!(
            check_utf16_for_latin1_and_bidi(&units[..]),
            Latin1Bidi::Bidi,
            "expected Bidi verdict for code unit {:#06X}",
            probe
        );
    }

    // 0x0590 immediately followed by 0x3041 must still yield `Bidi`.
    let two_probes: [u16; 18] = [
        0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, 0x64, 0x65,
        0x66, 0x67, 0x68, 0x69,
    ];
    assert_eq!(
        check_utf16_for_latin1_and_bidi(&two_probes[..]),
        Latin1Bidi::Bidi
    );
}
3111
/// Straightforward reference predicate that the "thorough" tests compare the
/// optimized `char`, `str` and UTF-8 bidi checks against.
///
/// Returns `true` when `c` lies in one of the inclusive ranges
/// U+0590..=U+08FF, U+FB1D..=U+FDFF, U+FE70..=U+FEFE, U+10800..=U+10FFF,
/// U+1E800..=U+1EFFF, or is one of U+200F, U+202B, U+202E, U+2067; returns
/// `false` for every other `char`.
#[inline(always)]
pub fn reference_is_char_bidi(c: char) -> bool {
    // `..=` replaces the deprecated `...` inclusive range pattern, which is
    // rejected outright in newer editions.
    match c {
        '\u{0590}'..='\u{08FF}'
        | '\u{FB1D}'..='\u{FDFF}'
        | '\u{FE70}'..='\u{FEFE}'
        | '\u{10800}'..='\u{10FFF}'
        | '\u{1E800}'..='\u{1EFFF}'
        | '\u{200F}'
        | '\u{202B}'
        | '\u{202E}'
        | '\u{2067}' => true,
        _ => false,
    }
}
3127
/// Straightforward reference predicate that the "thorough" tests compare the
/// optimized UTF-16 bidi checks against, one code unit at a time.
///
/// Returns `true` when `u` lies in one of the inclusive ranges
/// 0x0590..=0x08FF, 0xFB1D..=0xFDFF, 0xFE70..=0xFEFE, or is one of 0x200F,
/// 0x202B, 0x202E, 0x2067, or is one of the surrogates 0xD802, 0xD803,
/// 0xD83A, 0xD83B; returns `false` for every other code unit.
#[inline(always)]
pub fn reference_is_utf16_code_unit_bidi(u: u16) -> bool {
    // `..=` replaces the deprecated `...` inclusive range pattern, which is
    // rejected outright in newer editions.
    match u {
        0x0590..=0x08FF
        | 0xFB1D..=0xFDFF
        | 0xFE70..=0xFEFE
        // High surrogates that lead U+10800..=U+10FFF (0xD802, 0xD803) and
        // U+1E800..=U+1EFFF (0xD83A, 0xD83B).
        | 0xD802
        | 0xD803
        | 0xD83A
        | 0xD83B
        | 0x200F
        | 0x202B
        | 0x202E
        | 0x2067 => true,
        _ => false,
    }
}
3145
/// Compares `is_char_bidi` with `reference_is_char_bidi` for every `char`,
/// i.e. every value in 0..0x110000 except the surrogate gap 0xD800..0xE000.
#[test]
fn test_is_char_bidi_thoroughly() {
    let scalar_values = (0..0xD800u32).chain(0xE000..0x110000u32);
    for c in scalar_values.map(|v| ::std::char::from_u32(v).unwrap()) {
        assert_eq!(is_char_bidi(c), reference_is_char_bidi(c));
    }
}
3157
/// Compares `is_utf16_code_unit_bidi` with
/// `reference_is_utf16_code_unit_bidi` for all 65536 `u16` values.
#[test]
fn test_is_utf16_code_unit_bidi_thoroughly() {
    // Iterate in `u32` so that the exclusive upper bound 0x10000 is
    // representable, then narrow each value back to a code unit.
    for unit in (0..0x10000u32).map(|v| v as u16) {
        let actual = is_utf16_code_unit_bidi(unit);
        let expected = reference_is_utf16_code_unit_bidi(unit);
        assert_eq!(actual, expected);
    }
}
3168
/// Compares `is_str_bidi` on each single-`char` string with
/// `reference_is_char_bidi`, for every `char` (the surrogate gap
/// 0xD800..0xE000 is skipped).
#[test]
fn test_is_str_bidi_thoroughly() {
    // Four bytes is the destination handed to `encode_utf8` for each `char`.
    let mut scratch = [0; 4];
    let scalar_values = (0..0xD800u32).chain(0xE000..0x110000u32);
    for c in scalar_values.map(|v| ::std::char::from_u32(v).unwrap()) {
        let actual = is_str_bidi(c.encode_utf8(&mut scratch[..]));
        assert_eq!(actual, reference_is_char_bidi(c));
    }
}
3187
/// Compares `is_utf8_bidi` with `reference_is_char_bidi` for every `char`
/// (the surrogate gap 0xD800..0xE000 is skipped), both on the exact UTF-8
/// encoding of the `char` and on that encoding padded with zero bytes to a
/// total of eight bytes.
#[test]
fn test_is_utf8_bidi_thoroughly() {
    let mut buf = [0; 8];
    let scalar_values = (0..0xD800u32).chain(0xE000..0x110000u32);
    for c in scalar_values.map(|v| ::std::char::from_u32(v).unwrap()) {
        let expect = reference_is_char_bidi(c);

        // First pass: only the bytes that encode `c`.
        let len = c.encode_utf8(&mut buf[..]).len();
        assert_eq!(is_utf8_bidi(&buf[..len]), expect);

        // Second pass: wipe whatever a previous, longer encoding left behind
        // so that the full buffer is `c` followed by zero bytes.
        for stale in buf[len..].iter_mut() {
            *stale = 0;
        }
        assert_eq!(is_utf8_bidi(&buf[..]), expect);
    }
}
3228
/// Compares `is_utf16_bidi` with `reference_is_utf16_code_unit_bidi` for all
/// 65536 code units, each placed at index 15 of an otherwise zero-filled
/// buffer of 32 code units.
#[test]
fn test_is_utf16_bidi_thoroughly() {
    let mut units = [0; 32];
    for unit in (0..0x10000u32).map(|v| v as u16) {
        units[15] = unit;
        let actual = is_utf16_bidi(&units[..]);
        let expected = reference_is_utf16_code_unit_bidi(unit);
        assert_eq!(actual, expected);
    }
}
3241
/// Spot-checks `is_utf8_bidi` on three-byte inputs: U+057F, U+0580 (both just
/// below U+0590) or plain ASCII, terminated either by an ASCII byte or by a
/// lone 0xC2 lead byte.
#[test]
fn test_is_utf8_bidi_edge_cases() {
    // Terminated by an ASCII byte: expected to be reported as not bidi.
    let not_bidi: [&[u8]; 3] = [&b"\xD5\xBF\x61"[..], &b"\xD6\x80\x61"[..], &b"abc"[..]];
    // Terminated by 0xC2 with no byte after it: expected to be reported as
    // bidi.
    let bidi: [&[u8]; 3] = [&b"\xD5\xBF\xC2"[..], &b"\xD6\x80\xC2"[..], &b"ab\xC2"[..]];

    for &bytes in not_bidi.iter() {
        assert!(!is_utf8_bidi(bytes), "expected non-bidi for {:?}", bytes);
    }
    for &bytes in bidi.iter() {
        assert!(is_utf8_bidi(bytes), "expected bidi for {:?}", bytes);
    }
}
3251
/// Checks that `decode_latin1` hands back a borrowed `Cow` for the input
/// `b"ab"` and turns the byte 0xE4 into U+00E4.
#[test]
fn test_decode_latin1() {
    if let Cow::Borrowed(s) = decode_latin1(b"ab") {
        assert_eq!(s, "ab");
    } else {
        unreachable!("Should have borrowed");
    }
    assert_eq!(decode_latin1(b"a\xE4"), "a\u{E4}");
}
3264
/// Checks that `encode_latin1_lossy` hands back a borrowed `Cow` for the
/// input `"ab"` and turns U+00E4 into the single byte 0xE4.
#[test]
fn test_encode_latin1_lossy() {
    if let Cow::Borrowed(s) = encode_latin1_lossy("ab") {
        assert_eq!(s, b"ab");
    } else {
        unreachable!("Should have borrowed");
    }
    assert_eq!(encode_latin1_lossy("a\u{E4}"), &(b"a\xE4")[..]);
}
3277
/// Runs `convert_utf8_to_utf16_without_replacement` over a sequence of
/// inputs that share one output buffer, checking the returned count and the
/// first three code units of the buffer after every call, and finally checks
/// that a truncated four-byte sequence yields `None`.
#[test]
fn test_convert_utf8_to_utf16_without_replacement() {
    let mut buf = [0u16; 5];

    // (input, destination length, expected return value, expected buf[..3]).
    // The buffer is deliberately not cleared between steps, so units that a
    // step is not expected to write still hold values from earlier steps.
    let steps: [(&[u8], usize, Option<usize>, [u16; 3]); 7] = [
        (
            &b"ab"[..],
            2,
            Some(2),
            [u16::from(b'a'), u16::from(b'b'), 0],
        ),
        (&b"\xC3\xA4c"[..], 3, Some(2), [0xE4, u16::from(b'c'), 0]),
        (
            &b"\xE2\x98\x83"[..],
            3,
            Some(1),
            [0x2603, u16::from(b'c'), 0],
        ),
        (
            &b"\xE2\x98\x83d"[..],
            4,
            Some(2),
            [0x2603, u16::from(b'd'), 0],
        ),
        (&b"\xE2\x98\x83\xC3\xA4"[..], 5, Some(2), [0x2603, 0xE4, 0]),
        (&b"\xF0\x9F\x93\x8E"[..], 4, Some(2), [0xD83D, 0xDCCE, 0]),
        (
            &b"\xF0\x9F\x93\x8Ee"[..],
            5,
            Some(3),
            [0xD83D, 0xDCCE, u16::from(b'e')],
        ),
    ];

    for &(src, dst_len, expected_written, expected_head) in steps.iter() {
        assert_eq!(
            convert_utf8_to_utf16_without_replacement(src, &mut buf[..dst_len]),
            expected_written,
            "return value for input {:?}",
            src
        );
        assert_eq!(
            &buf[..3],
            &expected_head[..],
            "buffer contents after input {:?}",
            src
        );
    }

    // A four-byte sequence missing its last byte.
    assert_eq!(
        convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93", &mut buf[..5]),
        None
    );
}
3335 }
3336