1 // Copyright Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 // It's assumed that in due course Rust will have explicit SIMD but will not
11 // be good at run-time selection of SIMD vs. no-SIMD. In such a future,
12 // x86_64 will always use SSE2 and 32-bit x86 will use SSE2 when compiled with
13 // a Mozilla-shipped rustc. SIMD support and especially detection on ARM is a
14 // mess. Under the circumstances, it seems to make sense to optimize the ALU
15 // case for ARMv7 rather than x86. Annoyingly, I was unable to get useful
// numbers out of the actual ARMv7 CPU I have access to, because (thermal?)
17 // throttling kept interfering. Since Raspberry Pi 3 (ARMv8 core but running
18 // ARMv7 code) produced reproducible performance numbers, that's the ARM
19 // computer that this code ended up being optimized for in the ALU case.
20 // Less popular CPU architectures simply get the approach that was chosen based
21 // on Raspberry Pi 3 measurements. The UTF-16 and UTF-8 ALU cases take
22 // different approaches based on benchmarking on Raspberry Pi 3.
23 
24 #[cfg(all(
25     feature = "simd-accel",
26     any(
27         target_feature = "sse2",
28         all(target_endian = "little", target_arch = "aarch64"),
29         all(target_endian = "little", target_feature = "neon")
30     )
31 ))]
32 use crate::simd_funcs::*;
33 
34 cfg_if! {
35     if #[cfg(feature = "simd-accel")] {
36         #[allow(unused_imports)]
37         use ::core::intrinsics::unlikely;
38         #[allow(unused_imports)]
39         use ::core::intrinsics::likely;
40     } else {
41         #[allow(dead_code)]
42         #[inline(always)]
43         // Unsafe to match the intrinsic, which is needlessly unsafe.
44         unsafe fn unlikely(b: bool) -> bool {
45             b
46         }
47         #[allow(dead_code)]
48         #[inline(always)]
49         // Unsafe to match the intrinsic, which is needlessly unsafe.
50         unsafe fn likely(b: bool) -> bool {
51             b
52         }
53     }
54 }
55 
56 // `as` truncates, so works on 32-bit, too.
57 #[allow(dead_code)]
58 pub const ASCII_MASK: usize = 0x8080_8080_8080_8080u64 as usize;
59 
60 // `as` truncates, so works on 32-bit, too.
61 #[allow(dead_code)]
62 pub const BASIC_LATIN_MASK: usize = 0xFF80_FF80_FF80_FF80u64 as usize;
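
// A minimal illustration (not used by this module; the helper below is
// hypothetical) of how these masks are meant to be applied: AND a
// native-endian word of packed code units with the mask, and a non-zero
// result means at least one unit is out of range. For `ASCII_MASK` that is
// a byte with its high bit set; for `BASIC_LATIN_MASK` it is a u16 with any
// bit above 0x7F set.
//
//     fn word_has_non_ascii(word: usize) -> bool {
//         (word & ASCII_MASK) != 0
//     }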
63 
64 #[allow(unused_macros)]
65 macro_rules! ascii_naive {
66     ($name:ident, $src_unit:ty, $dst_unit:ty) => {
67         #[inline(always)]
68         pub unsafe fn $name(
69             src: *const $src_unit,
70             dst: *mut $dst_unit,
71             len: usize,
72         ) -> Option<($src_unit, usize)> {
73             // Yes, manually omitting the bound check here matters
74             // a lot for perf.
75             for i in 0..len {
76                 let code_unit = *(src.add(i));
77                 if code_unit > 127 {
78                     return Some((code_unit, i));
79                 }
80                 *(dst.add(i)) = code_unit as $dst_unit;
81             }
82             return None;
83         }
84     };
85 }
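
// A hedged sketch (illustration only; the wrapper below is hypothetical and
// not part of this module) of how a function generated by these macros is
// meant to be called: the caller guarantees that `dst` has room for at least
// `src.len()` units, and the return value is the first non-ASCII unit
// together with its index, or `None` if everything was ASCII.
//
//     fn copy_ascii_checked(src: &[u8], dst: &mut [u16]) -> Option<(u8, usize)> {
//         assert!(dst.len() >= src.len());
//         // `ascii_to_basic_latin` is generated further down in this file,
//         // e.g. by `ascii_naive!(ascii_to_basic_latin, u8, u16)`.
//         unsafe { ascii_to_basic_latin(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
//     }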
86 
87 #[allow(unused_macros)]
88 macro_rules! ascii_alu {
89     ($name:ident,
90      $src_unit:ty,
91      $dst_unit:ty,
92      $stride_fn:ident) => {
93         #[cfg_attr(feature = "cargo-clippy", allow(never_loop, cast_ptr_alignment))]
94         #[inline(always)]
95         pub unsafe fn $name(
96             src: *const $src_unit,
97             dst: *mut $dst_unit,
98             len: usize,
99         ) -> Option<($src_unit, usize)> {
100             let mut offset = 0usize;
101             // This loop is only broken out of as a `goto` forward
102             loop {
103                 let mut until_alignment = {
104                     // Check if the other unit aligns if we move the narrower unit
105                     // to alignment.
106                     //               if ::core::mem::size_of::<$src_unit>() == ::core::mem::size_of::<$dst_unit>() {
107                     // ascii_to_ascii
108                     let src_alignment = (src as usize) & ALU_ALIGNMENT_MASK;
109                     let dst_alignment = (dst as usize) & ALU_ALIGNMENT_MASK;
110                     if src_alignment != dst_alignment {
111                         break;
112                     }
113                     (ALU_ALIGNMENT - src_alignment) & ALU_ALIGNMENT_MASK
114                     //               } else if ::core::mem::size_of::<$src_unit>() < ::core::mem::size_of::<$dst_unit>() {
115                     // ascii_to_basic_latin
116                     //                   let src_until_alignment = (ALIGNMENT - ((src as usize) & ALIGNMENT_MASK)) & ALIGNMENT_MASK;
117                     //                   if (dst.add(src_until_alignment) as usize) & ALIGNMENT_MASK != 0 {
118                     //                       break;
119                     //                   }
120                     //                   src_until_alignment
121                     //               } else {
122                     // basic_latin_to_ascii
123                     //                   let dst_until_alignment = (ALIGNMENT - ((dst as usize) & ALIGNMENT_MASK)) & ALIGNMENT_MASK;
124                     //                   if (src.add(dst_until_alignment) as usize) & ALIGNMENT_MASK != 0 {
125                     //                       break;
126                     //                   }
127                     //                   dst_until_alignment
128                     //               }
129                 };
130                 if until_alignment + ALU_STRIDE_SIZE <= len {
131                     // Moving pointers to alignment seems to be a pessimization on
132                     // x86_64 for operations that have UTF-16 as the internal
                    // Unicode representation. However, since it seems to be a win on
                    // ARM (tested ARMv7 code running on ARMv8 [rpi3]), apart from mixed
                    // results when encoding from UTF-16, and since x86 and x86_64 should
                    // be using SSE2 in due course, the move to alignment is kept here.
                    // It would be good to test on more ARM CPUs
138                     // and on real MIPS and POWER hardware.
139                     while until_alignment != 0 {
140                         let code_unit = *(src.add(offset));
141                         if code_unit > 127 {
142                             return Some((code_unit, offset));
143                         }
144                         *(dst.add(offset)) = code_unit as $dst_unit;
145                         offset += 1;
146                         until_alignment -= 1;
147                     }
148                     let len_minus_stride = len - ALU_STRIDE_SIZE;
149                     loop {
150                         if let Some(num_ascii) = $stride_fn(
151                             src.add(offset) as *const usize,
152                             dst.add(offset) as *mut usize,
153                         ) {
154                             offset += num_ascii;
155                             return Some((*(src.add(offset)), offset));
156                         }
157                         offset += ALU_STRIDE_SIZE;
158                         if offset > len_minus_stride {
159                             break;
160                         }
161                     }
162                 }
163                 break;
164             }
165             while offset < len {
166                 let code_unit = *(src.add(offset));
167                 if code_unit > 127 {
168                     return Some((code_unit, offset));
169                 }
170                 *(dst.add(offset)) = code_unit as $dst_unit;
171                 offset += 1;
172             }
173             None
174         }
175     };
176 }
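
// A hedged sketch of the stride-function contract assumed by `ascii_alu!`
// (illustration only, written for the 64-bit little-endian case; the real
// stride functions passed to this macro are defined elsewhere and may
// differ): copy one ALU_STRIDE_SIZE chunk and, if a non-ASCII byte is
// present, return the index of the first one so the caller can report it.
//
//     unsafe fn ascii_to_ascii_stride_sketch(src: *const usize, dst: *mut usize) -> Option<usize> {
//         let word = *src;
//         let second_word = *(src.add(1));
//         // Writing before checking is fine in the identity (u8 -> u8) case:
//         // the caller only treats output up to the returned index as valid.
//         *dst = word;
//         *(dst.add(1)) = second_word;
//         if (word & ASCII_MASK) != 0 {
//             // The lowest set mask bit marks the first non-ASCII byte
//             // (little-endian), so dividing its bit position by 8 gives
//             // the byte index.
//             return Some(((word & ASCII_MASK).trailing_zeros() as usize) >> 3);
//         }
//         if (second_word & ASCII_MASK) != 0 {
//             return Some(ALU_ALIGNMENT + (((second_word & ASCII_MASK).trailing_zeros() as usize) >> 3));
//         }
//         None
//     }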
177 
178 #[allow(unused_macros)]
179 macro_rules! basic_latin_alu {
180     ($name:ident,
181      $src_unit:ty,
182      $dst_unit:ty,
183      $stride_fn:ident) => {
184         #[cfg_attr(
185             feature = "cargo-clippy",
186             allow(never_loop, cast_ptr_alignment, cast_lossless)
187         )]
188         #[inline(always)]
189         pub unsafe fn $name(
190             src: *const $src_unit,
191             dst: *mut $dst_unit,
192             len: usize,
193         ) -> Option<($src_unit, usize)> {
194             let mut offset = 0usize;
195             // This loop is only broken out of as a `goto` forward
196             loop {
197                 let mut until_alignment = {
198                     // Check if the other unit aligns if we move the narrower unit
199                     // to alignment.
200                     //               if ::core::mem::size_of::<$src_unit>() == ::core::mem::size_of::<$dst_unit>() {
201                     // ascii_to_ascii
202                     //                   let src_alignment = (src as usize) & ALIGNMENT_MASK;
203                     //                   let dst_alignment = (dst as usize) & ALIGNMENT_MASK;
204                     //                   if src_alignment != dst_alignment {
205                     //                       break;
206                     //                   }
207                     //                   (ALIGNMENT - src_alignment) & ALIGNMENT_MASK
208                     //               } else
209                     if ::core::mem::size_of::<$src_unit>() < ::core::mem::size_of::<$dst_unit>() {
210                         // ascii_to_basic_latin
211                         let src_until_alignment = (ALU_ALIGNMENT
212                             - ((src as usize) & ALU_ALIGNMENT_MASK))
213                             & ALU_ALIGNMENT_MASK;
214                         if (dst.wrapping_add(src_until_alignment) as usize) & ALU_ALIGNMENT_MASK
215                             != 0
216                         {
217                             break;
218                         }
219                         src_until_alignment
220                     } else {
221                         // basic_latin_to_ascii
222                         let dst_until_alignment = (ALU_ALIGNMENT
223                             - ((dst as usize) & ALU_ALIGNMENT_MASK))
224                             & ALU_ALIGNMENT_MASK;
225                         if (src.wrapping_add(dst_until_alignment) as usize) & ALU_ALIGNMENT_MASK
226                             != 0
227                         {
228                             break;
229                         }
230                         dst_until_alignment
231                     }
232                 };
233                 if until_alignment + ALU_STRIDE_SIZE <= len {
234                     // Moving pointers to alignment seems to be a pessimization on
235                     // x86_64 for operations that have UTF-16 as the internal
                    // Unicode representation. However, since it seems to be a win on
                    // ARM (tested ARMv7 code running on ARMv8 [rpi3]), apart from mixed
                    // results when encoding from UTF-16, and since x86 and x86_64 should
                    // be using SSE2 in due course, the move to alignment is kept here.
                    // It would be good to test on more ARM CPUs
241                     // and on real MIPS and POWER hardware.
242                     while until_alignment != 0 {
243                         let code_unit = *(src.add(offset));
244                         if code_unit > 127 {
245                             return Some((code_unit, offset));
246                         }
247                         *(dst.add(offset)) = code_unit as $dst_unit;
248                         offset += 1;
249                         until_alignment -= 1;
250                     }
251                     let len_minus_stride = len - ALU_STRIDE_SIZE;
252                     loop {
253                         if !$stride_fn(
254                             src.add(offset) as *const usize,
255                             dst.add(offset) as *mut usize,
256                         ) {
257                             break;
258                         }
259                         offset += ALU_STRIDE_SIZE;
260                         if offset > len_minus_stride {
261                             break;
262                         }
263                     }
264                 }
265                 break;
266             }
267             while offset < len {
268                 let code_unit = *(src.add(offset));
269                 if code_unit > 127 {
270                     return Some((code_unit, offset));
271                 }
272                 *(dst.add(offset)) = code_unit as $dst_unit;
273                 offset += 1;
274             }
275             None
276         }
277     };
278 }
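
// The stride functions plugged into `basic_latin_alu!` follow a simpler
// contract than the `ascii_alu!` ones: convert one stride and return `true`,
// or return `false` without locating the offending unit, in which case the
// caller falls back to the scalar tail loop to find and report it. A hedged
// sketch of the unpack (u8 -> u16) direction, assuming the `unpack_alu`
// helper defined later in this file:
//
//     unsafe fn ascii_to_basic_latin_stride_sketch(src: *const usize, dst: *mut usize) -> bool {
//         let word = *src;
//         let second_word = *(src.add(1));
//         if ((word | second_word) & ASCII_MASK) != 0 {
//             return false;
//         }
//         unpack_alu(word, second_word, dst);
//         true
//     }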
279 
280 #[allow(unused_macros)]
281 macro_rules! latin1_alu {
282     ($name:ident, $src_unit:ty, $dst_unit:ty, $stride_fn:ident) => {
283         #[cfg_attr(
284             feature = "cargo-clippy",
285             allow(never_loop, cast_ptr_alignment, cast_lossless)
286         )]
287         #[inline(always)]
288         pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) {
289             let mut offset = 0usize;
290             // This loop is only broken out of as a `goto` forward
291             loop {
292                 let mut until_alignment = {
293                     if ::core::mem::size_of::<$src_unit>() < ::core::mem::size_of::<$dst_unit>() {
294                         // unpack
295                         let src_until_alignment = (ALU_ALIGNMENT
296                             - ((src as usize) & ALU_ALIGNMENT_MASK))
297                             & ALU_ALIGNMENT_MASK;
298                         if (dst.wrapping_add(src_until_alignment) as usize) & ALU_ALIGNMENT_MASK
299                             != 0
300                         {
301                             break;
302                         }
303                         src_until_alignment
304                     } else {
305                         // pack
306                         let dst_until_alignment = (ALU_ALIGNMENT
307                             - ((dst as usize) & ALU_ALIGNMENT_MASK))
308                             & ALU_ALIGNMENT_MASK;
309                         if (src.wrapping_add(dst_until_alignment) as usize) & ALU_ALIGNMENT_MASK
310                             != 0
311                         {
312                             break;
313                         }
314                         dst_until_alignment
315                     }
316                 };
317                 if until_alignment + ALU_STRIDE_SIZE <= len {
318                     while until_alignment != 0 {
319                         let code_unit = *(src.add(offset));
320                         *(dst.add(offset)) = code_unit as $dst_unit;
321                         offset += 1;
322                         until_alignment -= 1;
323                     }
324                     let len_minus_stride = len - ALU_STRIDE_SIZE;
325                     loop {
326                         $stride_fn(
327                             src.add(offset) as *const usize,
328                             dst.add(offset) as *mut usize,
329                         );
330                         offset += ALU_STRIDE_SIZE;
331                         if offset > len_minus_stride {
332                             break;
333                         }
334                     }
335                 }
336                 break;
337             }
338             while offset < len {
339                 let code_unit = *(src.add(offset));
340                 *(dst.add(offset)) = code_unit as $dst_unit;
341                 offset += 1;
342             }
343         }
344     };
345 }
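
// For `latin1_alu!` the stride function has no failure mode: in the unpack
// direction every u8 is valid Latin1, and in the pack direction the u16
// input is assumed to already be in the 0x00..=0xFF range, so the stride
// converts unconditionally and returns nothing. A hedged sketch of the
// unpack direction, again assuming the `unpack_alu` helper defined later in
// this file:
//
//     unsafe fn unpack_latin1_stride_sketch(src: *const usize, dst: *mut usize) {
//         unpack_alu(*src, *(src.add(1)), dst);
//     }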
346 
347 #[allow(unused_macros)]
348 macro_rules! ascii_simd_check_align {
349     (
350         $name:ident,
351         $src_unit:ty,
352         $dst_unit:ty,
353         $stride_both_aligned:ident,
354         $stride_src_aligned:ident,
355         $stride_dst_aligned:ident,
356         $stride_neither_aligned:ident
357     ) => {
358         #[inline(always)]
359         pub unsafe fn $name(
360             src: *const $src_unit,
361             dst: *mut $dst_unit,
362             len: usize,
363         ) -> Option<($src_unit, usize)> {
364             let mut offset = 0usize;
365             if SIMD_STRIDE_SIZE <= len {
366                 let len_minus_stride = len - SIMD_STRIDE_SIZE;
367                 // XXX Should we first process one stride unconditionally as unaligned to
368                 // avoid the cost of the branchiness below if the first stride fails anyway?
369                 // XXX Should we just use unaligned SSE2 access unconditionally? It seems that
370                 // on Haswell, it would make sense to just use unaligned and not bother
371                 // checking. Need to benchmark older architectures before deciding.
372                 let dst_masked = (dst as usize) & SIMD_ALIGNMENT_MASK;
373                 if ((src as usize) & SIMD_ALIGNMENT_MASK) == 0 {
374                     if dst_masked == 0 {
375                         loop {
376                             if !$stride_both_aligned(src.add(offset), dst.add(offset)) {
377                                 break;
378                             }
379                             offset += SIMD_STRIDE_SIZE;
380                             if offset > len_minus_stride {
381                                 break;
382                             }
383                         }
384                     } else {
385                         loop {
386                             if !$stride_src_aligned(src.add(offset), dst.add(offset)) {
387                                 break;
388                             }
389                             offset += SIMD_STRIDE_SIZE;
390                             if offset > len_minus_stride {
391                                 break;
392                             }
393                         }
394                     }
395                 } else {
396                     if dst_masked == 0 {
397                         loop {
398                             if !$stride_dst_aligned(src.add(offset), dst.add(offset)) {
399                                 break;
400                             }
401                             offset += SIMD_STRIDE_SIZE;
402                             if offset > len_minus_stride {
403                                 break;
404                             }
405                         }
406                     } else {
407                         loop {
408                             if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
409                                 break;
410                             }
411                             offset += SIMD_STRIDE_SIZE;
412                             if offset > len_minus_stride {
413                                 break;
414                             }
415                         }
416                     }
417                 }
418             }
419             while offset < len {
420                 let code_unit = *(src.add(offset));
421                 if code_unit > 127 {
422                     return Some((code_unit, offset));
423                 }
424                 *(dst.add(offset)) = code_unit as $dst_unit;
425                 offset += 1;
426             }
427             None
428         }
429     };
430 }
431 
432 #[allow(unused_macros)]
433 macro_rules! ascii_simd_check_align_unrolled {
434     (
435         $name:ident,
436         $src_unit:ty,
437         $dst_unit:ty,
438         $stride_both_aligned:ident,
439         $stride_src_aligned:ident,
440         $stride_neither_aligned:ident,
441         $double_stride_both_aligned:ident,
442         $double_stride_src_aligned:ident
443     ) => {
444         #[inline(always)]
445         pub unsafe fn $name(
446             src: *const $src_unit,
447             dst: *mut $dst_unit,
448             len: usize,
449         ) -> Option<($src_unit, usize)> {
450             let unit_size = ::core::mem::size_of::<$src_unit>();
451             let mut offset = 0usize;
452             // This loop is only broken out of as a goto forward without
453             // actually looping
454             'outer: loop {
455                 if SIMD_STRIDE_SIZE <= len {
456                     // First, process one unaligned
457                     if !$stride_neither_aligned(src, dst) {
458                         break 'outer;
459                     }
460                     offset = SIMD_STRIDE_SIZE;
461 
                    // We have now seen 16 ASCII bytes. Let's guess that
                    // there will be enough more ASCII to justify the extra
                    // expense incurred in the case of non-ASCII.
                    // Use aligned reads for the sake of old microarchitectures.
466                     let until_alignment = ((SIMD_ALIGNMENT
467                         - ((src.add(offset) as usize) & SIMD_ALIGNMENT_MASK))
468                         & SIMD_ALIGNMENT_MASK)
469                         / unit_size;
470                     // This addition won't overflow, because even in the 32-bit PAE case the
471                     // address space holds enough code that the slice length can't be that
472                     // close to address space size.
473                     // offset now equals SIMD_STRIDE_SIZE, hence times 3 below.
474                     if until_alignment + (SIMD_STRIDE_SIZE * 3) <= len {
475                         if until_alignment != 0 {
476                             if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
477                                 break;
478                             }
479                             offset += until_alignment;
480                         }
481                         let len_minus_stride_times_two = len - (SIMD_STRIDE_SIZE * 2);
482                         let dst_masked = (dst.add(offset) as usize) & SIMD_ALIGNMENT_MASK;
483                         if dst_masked == 0 {
484                             loop {
485                                 if let Some(advance) =
486                                     $double_stride_both_aligned(src.add(offset), dst.add(offset))
487                                 {
488                                     offset += advance;
489                                     let code_unit = *(src.add(offset));
490                                     return Some((code_unit, offset));
491                                 }
492                                 offset += SIMD_STRIDE_SIZE * 2;
493                                 if offset > len_minus_stride_times_two {
494                                     break;
495                                 }
496                             }
497                             if offset + SIMD_STRIDE_SIZE <= len {
498                                 if !$stride_both_aligned(src.add(offset), dst.add(offset)) {
499                                     break 'outer;
500                                 }
501                                 offset += SIMD_STRIDE_SIZE;
502                             }
503                         } else {
504                             loop {
505                                 if let Some(advance) =
506                                     $double_stride_src_aligned(src.add(offset), dst.add(offset))
507                                 {
508                                     offset += advance;
509                                     let code_unit = *(src.add(offset));
510                                     return Some((code_unit, offset));
511                                 }
512                                 offset += SIMD_STRIDE_SIZE * 2;
513                                 if offset > len_minus_stride_times_two {
514                                     break;
515                                 }
516                             }
517                             if offset + SIMD_STRIDE_SIZE <= len {
518                                 if !$stride_src_aligned(src.add(offset), dst.add(offset)) {
519                                     break 'outer;
520                                 }
521                                 offset += SIMD_STRIDE_SIZE;
522                             }
523                         }
524                     } else {
525                         // At most two iterations, so unroll
526                         if offset + SIMD_STRIDE_SIZE <= len {
527                             if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
528                                 break;
529                             }
530                             offset += SIMD_STRIDE_SIZE;
531                             if offset + SIMD_STRIDE_SIZE <= len {
532                                 if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
533                                     break;
534                                 }
535                                 offset += SIMD_STRIDE_SIZE;
536                             }
537                         }
538                     }
539                 }
540                 break 'outer;
541             }
542             while offset < len {
543                 let code_unit = *(src.add(offset));
544                 if code_unit > 127 {
545                     return Some((code_unit, offset));
546                 }
547                 *(dst.add(offset)) = code_unit as $dst_unit;
548                 offset += 1;
549             }
550             None
551         }
552     };
553 }
554 
555 #[allow(unused_macros)]
556 macro_rules! latin1_simd_check_align {
557     (
558         $name:ident,
559         $src_unit:ty,
560         $dst_unit:ty,
561         $stride_both_aligned:ident,
562         $stride_src_aligned:ident,
563         $stride_dst_aligned:ident,
564         $stride_neither_aligned:ident
565     ) => {
566         #[inline(always)]
567         pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) {
568             let mut offset = 0usize;
569             if SIMD_STRIDE_SIZE <= len {
570                 let len_minus_stride = len - SIMD_STRIDE_SIZE;
571                 let dst_masked = (dst as usize) & SIMD_ALIGNMENT_MASK;
572                 if ((src as usize) & SIMD_ALIGNMENT_MASK) == 0 {
573                     if dst_masked == 0 {
574                         loop {
575                             $stride_both_aligned(src.add(offset), dst.add(offset));
576                             offset += SIMD_STRIDE_SIZE;
577                             if offset > len_minus_stride {
578                                 break;
579                             }
580                         }
581                     } else {
582                         loop {
583                             $stride_src_aligned(src.add(offset), dst.add(offset));
584                             offset += SIMD_STRIDE_SIZE;
585                             if offset > len_minus_stride {
586                                 break;
587                             }
588                         }
589                     }
590                 } else {
591                     if dst_masked == 0 {
592                         loop {
593                             $stride_dst_aligned(src.add(offset), dst.add(offset));
594                             offset += SIMD_STRIDE_SIZE;
595                             if offset > len_minus_stride {
596                                 break;
597                             }
598                         }
599                     } else {
600                         loop {
601                             $stride_neither_aligned(src.add(offset), dst.add(offset));
602                             offset += SIMD_STRIDE_SIZE;
603                             if offset > len_minus_stride {
604                                 break;
605                             }
606                         }
607                     }
608                 }
609             }
610             while offset < len {
611                 let code_unit = *(src.add(offset));
612                 *(dst.add(offset)) = code_unit as $dst_unit;
613                 offset += 1;
614             }
615         }
616     };
617 }
618 
619 #[allow(unused_macros)]
620 macro_rules! latin1_simd_check_align_unrolled {
621     (
622         $name:ident,
623         $src_unit:ty,
624         $dst_unit:ty,
625         $stride_both_aligned:ident,
626         $stride_src_aligned:ident,
627         $stride_dst_aligned:ident,
628         $stride_neither_aligned:ident
629     ) => {
630         #[inline(always)]
631         pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) {
632             let unit_size = ::core::mem::size_of::<$src_unit>();
633             let mut offset = 0usize;
634             if SIMD_STRIDE_SIZE <= len {
635                 let mut until_alignment = ((SIMD_STRIDE_SIZE
636                     - ((src as usize) & SIMD_ALIGNMENT_MASK))
637                     & SIMD_ALIGNMENT_MASK)
638                     / unit_size;
639                 while until_alignment != 0 {
640                     *(dst.add(offset)) = *(src.add(offset)) as $dst_unit;
641                     offset += 1;
642                     until_alignment -= 1;
643                 }
644                 let len_minus_stride = len - SIMD_STRIDE_SIZE;
645                 if offset + SIMD_STRIDE_SIZE * 2 <= len {
646                     let len_minus_stride_times_two = len_minus_stride - SIMD_STRIDE_SIZE;
647                     if (dst.add(offset) as usize) & SIMD_ALIGNMENT_MASK == 0 {
648                         loop {
649                             $stride_both_aligned(src.add(offset), dst.add(offset));
650                             offset += SIMD_STRIDE_SIZE;
651                             $stride_both_aligned(src.add(offset), dst.add(offset));
652                             offset += SIMD_STRIDE_SIZE;
653                             if offset > len_minus_stride_times_two {
654                                 break;
655                             }
656                         }
657                     } else {
658                         loop {
659                             $stride_src_aligned(src.add(offset), dst.add(offset));
660                             offset += SIMD_STRIDE_SIZE;
661                             $stride_src_aligned(src.add(offset), dst.add(offset));
662                             offset += SIMD_STRIDE_SIZE;
663                             if offset > len_minus_stride_times_two {
664                                 break;
665                             }
666                         }
667                     }
668                 }
669                 if offset < len_minus_stride {
670                     $stride_src_aligned(src.add(offset), dst.add(offset));
671                     offset += SIMD_STRIDE_SIZE;
672                 }
673             }
674             while offset < len {
675                 let code_unit = *(src.add(offset));
676                 // On x86_64, this loop autovectorizes but in the pack
677                 // case there are instructions whose purpose is to make sure
678                 // each u16 in the vector is truncated before packing. However,
679                 // since we don't care about saturating behavior of SSE2 packing
680                 // when the input isn't Latin1, those instructions are useless.
681                 // Unfortunately, using the `assume` intrinsic to lie to the
                // optimizer doesn't make LLVM omit the truncation that we
                // don't need. Possibly this loop could be manually optimized
                // to do the sort of thing that LLVM does but without
                // ANDing the read vectors of u16 with a constant that discards
686                 // the high half of each u16. As far as I can tell, the
687                 // optimization assumes that doing a SIMD read past the end of
688                 // the array is OK.
689                 *(dst.add(offset)) = code_unit as $dst_unit;
690                 offset += 1;
691             }
692         }
693     };
694 }
695 
696 #[allow(unused_macros)]
697 macro_rules! ascii_simd_unalign {
698     ($name:ident, $src_unit:ty, $dst_unit:ty, $stride_neither_aligned:ident) => {
699         #[inline(always)]
700         pub unsafe fn $name(
701             src: *const $src_unit,
702             dst: *mut $dst_unit,
703             len: usize,
704         ) -> Option<($src_unit, usize)> {
705             let mut offset = 0usize;
706             if SIMD_STRIDE_SIZE <= len {
707                 let len_minus_stride = len - SIMD_STRIDE_SIZE;
708                 loop {
709                     if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
710                         break;
711                     }
712                     offset += SIMD_STRIDE_SIZE;
713                     if offset > len_minus_stride {
714                         break;
715                     }
716                 }
717             }
718             while offset < len {
719                 let code_unit = *(src.add(offset));
720                 if code_unit > 127 {
721                     return Some((code_unit, offset));
722                 }
723                 *(dst.add(offset)) = code_unit as $dst_unit;
724                 offset += 1;
725             }
726             None
727         }
728     };
729 }
730 
731 #[allow(unused_macros)]
732 macro_rules! latin1_simd_unalign {
733     ($name:ident, $src_unit:ty, $dst_unit:ty, $stride_neither_aligned:ident) => {
734         #[inline(always)]
735         pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) {
736             let mut offset = 0usize;
737             if SIMD_STRIDE_SIZE <= len {
738                 let len_minus_stride = len - SIMD_STRIDE_SIZE;
739                 loop {
740                     $stride_neither_aligned(src.add(offset), dst.add(offset));
741                     offset += SIMD_STRIDE_SIZE;
742                     if offset > len_minus_stride {
743                         break;
744                     }
745                 }
746             }
747             while offset < len {
748                 let code_unit = *(src.add(offset));
749                 *(dst.add(offset)) = code_unit as $dst_unit;
750                 offset += 1;
751             }
752         }
753     };
754 }
755 
756 #[allow(unused_macros)]
757 macro_rules! ascii_to_ascii_simd_stride {
758     ($name:ident, $load:ident, $store:ident) => {
759         #[inline(always)]
760         pub unsafe fn $name(src: *const u8, dst: *mut u8) -> bool {
761             let simd = $load(src);
762             if !simd_is_ascii(simd) {
763                 return false;
764             }
765             $store(dst, simd);
766             true
767         }
768     };
769 }
770 
771 #[allow(unused_macros)]
772 macro_rules! ascii_to_ascii_simd_double_stride {
773     ($name:ident, $store:ident) => {
774         #[inline(always)]
775         pub unsafe fn $name(src: *const u8, dst: *mut u8) -> Option<usize> {
776             let first = load16_aligned(src);
777             let second = load16_aligned(src.add(SIMD_STRIDE_SIZE));
778             $store(dst, first);
779             if unlikely(!simd_is_ascii(first | second)) {
780                 let mask_first = mask_ascii(first);
781                 if mask_first != 0 {
782                     return Some(mask_first.trailing_zeros() as usize);
783                 }
784                 $store(dst.add(SIMD_STRIDE_SIZE), second);
785                 let mask_second = mask_ascii(second);
786                 return Some(SIMD_STRIDE_SIZE + mask_second.trailing_zeros() as usize);
787             }
788             $store(dst.add(SIMD_STRIDE_SIZE), second);
789             None
790         }
791     };
792 }
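
// Note on the double-stride return contract above (and its Basic Latin
// variant below): `Some(i)` is the byte index, relative to `src`, of the
// first non-ASCII byte within the 2 * SIMD_STRIDE_SIZE window, and `dst` has
// been written at least up to that index. This relies on `mask_ascii` (from
// `simd_funcs`) producing one mask bit per input byte, so `trailing_zeros`
// recovers the byte index of the first high bit.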
793 
794 #[allow(unused_macros)]
795 macro_rules! ascii_to_basic_latin_simd_stride {
796     ($name:ident, $load:ident, $store:ident) => {
797         #[inline(always)]
798         pub unsafe fn $name(src: *const u8, dst: *mut u16) -> bool {
799             let simd = $load(src);
800             if !simd_is_ascii(simd) {
801                 return false;
802             }
803             let (first, second) = simd_unpack(simd);
804             $store(dst, first);
805             $store(dst.add(8), second);
806             true
807         }
808     };
809 }
810 
811 #[allow(unused_macros)]
812 macro_rules! ascii_to_basic_latin_simd_double_stride {
813     ($name:ident, $store:ident) => {
814         #[inline(always)]
815         pub unsafe fn $name(src: *const u8, dst: *mut u16) -> Option<usize> {
816             let first = load16_aligned(src);
817             let second = load16_aligned(src.add(SIMD_STRIDE_SIZE));
818             let (a, b) = simd_unpack(first);
819             $store(dst, a);
820             $store(dst.add(SIMD_STRIDE_SIZE / 2), b);
821             if unlikely(!simd_is_ascii(first | second)) {
822                 let mask_first = mask_ascii(first);
823                 if mask_first != 0 {
824                     return Some(mask_first.trailing_zeros() as usize);
825                 }
826                 let (c, d) = simd_unpack(second);
827                 $store(dst.add(SIMD_STRIDE_SIZE), c);
828                 $store(dst.add(SIMD_STRIDE_SIZE + (SIMD_STRIDE_SIZE / 2)), d);
829                 let mask_second = mask_ascii(second);
830                 return Some(SIMD_STRIDE_SIZE + mask_second.trailing_zeros() as usize);
831             }
832             let (c, d) = simd_unpack(second);
833             $store(dst.add(SIMD_STRIDE_SIZE), c);
834             $store(dst.add(SIMD_STRIDE_SIZE + (SIMD_STRIDE_SIZE / 2)), d);
835             None
836         }
837     };
838 }
839 
840 #[allow(unused_macros)]
841 macro_rules! unpack_simd_stride {
842     ($name:ident, $load:ident, $store:ident) => {
843         #[inline(always)]
844         pub unsafe fn $name(src: *const u8, dst: *mut u16) {
845             let simd = $load(src);
846             let (first, second) = simd_unpack(simd);
847             $store(dst, first);
848             $store(dst.add(8), second);
849         }
850     };
851 }
852 
853 #[allow(unused_macros)]
854 macro_rules! basic_latin_to_ascii_simd_stride {
855     ($name:ident, $load:ident, $store:ident) => {
856         #[inline(always)]
857         pub unsafe fn $name(src: *const u16, dst: *mut u8) -> bool {
858             let first = $load(src);
859             let second = $load(src.add(8));
860             if simd_is_basic_latin(first | second) {
861                 $store(dst, simd_pack(first, second));
862                 true
863             } else {
864                 false
865             }
866         }
867     };
868 }
869 
870 #[allow(unused_macros)]
871 macro_rules! pack_simd_stride {
872     ($name:ident, $load:ident, $store:ident) => {
873         #[inline(always)]
874         pub unsafe fn $name(src: *const u16, dst: *mut u8) {
875             let first = $load(src);
876             let second = $load(src.add(8));
877             $store(dst, simd_pack(first, second));
878         }
879     };
880 }
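
// The `$load`/`$store` identifiers handed to the stride macros above are
// expected to be the lane-width-specific helpers from `simd_funcs` that the
// `cfg_if!` block below passes in: `load16_*`/`store16_*` move a full vector
// of sixteen u8 lanes, while `load8_*`/`store8_*` move eight u16 lanes, each
// in aligned and unaligned flavors.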
881 
882 cfg_if! {
883     if #[cfg(all(feature = "simd-accel", target_endian = "little", target_arch = "aarch64"))] {
884         // SIMD with the same instructions for aligned and unaligned loads and stores
885 
886         pub const SIMD_STRIDE_SIZE: usize = 16;
887 
888         pub const MAX_STRIDE_SIZE: usize = 16;
889 
890 //        pub const ALIGNMENT: usize = 8;
891 
892         pub const ALU_STRIDE_SIZE: usize = 16;
893 
894         pub const ALU_ALIGNMENT: usize = 8;
895 
896         pub const ALU_ALIGNMENT_MASK: usize = 7;
897 
898         ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_neither_aligned, load16_unaligned, store16_unaligned);
899 
900         ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_neither_aligned, load16_unaligned, store8_unaligned);
901         unpack_simd_stride!(unpack_stride_neither_aligned, load16_unaligned, store8_unaligned);
902 
903         basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_neither_aligned, load8_unaligned, store16_unaligned);
904         pack_simd_stride!(pack_stride_neither_aligned, load8_unaligned, store16_unaligned);
905 
906         ascii_simd_unalign!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride_neither_aligned);
907         ascii_simd_unalign!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_neither_aligned);
908         ascii_simd_unalign!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_neither_aligned);
909         latin1_simd_unalign!(unpack_latin1, u8, u16, unpack_stride_neither_aligned);
910         latin1_simd_unalign!(pack_latin1, u16, u8, pack_stride_neither_aligned);
911     } else if #[cfg(all(feature = "simd-accel", target_endian = "little", target_feature = "neon"))] {
912         // SIMD with different instructions for aligned and unaligned loads and stores.
913         //
914         // Newer microarchitectures are not supposed to have a performance difference between
915         // aligned and unaligned SSE2 loads and stores when the address is actually aligned,
916         // but the benchmark results I see don't agree.
917 
918         pub const SIMD_STRIDE_SIZE: usize = 16;
919 
920         pub const MAX_STRIDE_SIZE: usize = 16;
921 
922         pub const SIMD_ALIGNMENT_MASK: usize = 15;
923 
924         ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_both_aligned, load16_aligned, store16_aligned);
925         ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_src_aligned, load16_aligned, store16_unaligned);
926         ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_dst_aligned, load16_unaligned, store16_aligned);
927         ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_neither_aligned, load16_unaligned, store16_unaligned);
928 
929         ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_both_aligned, load16_aligned, store8_aligned);
930         ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_src_aligned, load16_aligned, store8_unaligned);
931         ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_dst_aligned, load16_unaligned, store8_aligned);
932         ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_neither_aligned, load16_unaligned, store8_unaligned);
933 
934         unpack_simd_stride!(unpack_stride_both_aligned, load16_aligned, store8_aligned);
935         unpack_simd_stride!(unpack_stride_src_aligned, load16_aligned, store8_unaligned);
936         unpack_simd_stride!(unpack_stride_dst_aligned, load16_unaligned, store8_aligned);
937         unpack_simd_stride!(unpack_stride_neither_aligned, load16_unaligned, store8_unaligned);
938 
939         basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_both_aligned, load8_aligned, store16_aligned);
940         basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_src_aligned, load8_aligned, store16_unaligned);
941         basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_dst_aligned, load8_unaligned, store16_aligned);
942         basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_neither_aligned, load8_unaligned, store16_unaligned);
943 
944         pack_simd_stride!(pack_stride_both_aligned, load8_aligned, store16_aligned);
945         pack_simd_stride!(pack_stride_src_aligned, load8_aligned, store16_unaligned);
946         pack_simd_stride!(pack_stride_dst_aligned, load8_unaligned, store16_aligned);
947         pack_simd_stride!(pack_stride_neither_aligned, load8_unaligned, store16_unaligned);
948 
949         ascii_simd_check_align!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride_both_aligned, ascii_to_ascii_stride_src_aligned, ascii_to_ascii_stride_dst_aligned, ascii_to_ascii_stride_neither_aligned);
950         ascii_simd_check_align!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_both_aligned, ascii_to_basic_latin_stride_src_aligned, ascii_to_basic_latin_stride_dst_aligned, ascii_to_basic_latin_stride_neither_aligned);
951         ascii_simd_check_align!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_both_aligned, basic_latin_to_ascii_stride_src_aligned, basic_latin_to_ascii_stride_dst_aligned, basic_latin_to_ascii_stride_neither_aligned);
952         latin1_simd_check_align!(unpack_latin1, u8, u16, unpack_stride_both_aligned, unpack_stride_src_aligned, unpack_stride_dst_aligned, unpack_stride_neither_aligned);
953         latin1_simd_check_align!(pack_latin1, u16, u8, pack_stride_both_aligned, pack_stride_src_aligned, pack_stride_dst_aligned, pack_stride_neither_aligned);
954     } else if #[cfg(all(feature = "simd-accel", target_feature = "sse2"))] {
955         // SIMD with different instructions for aligned and unaligned loads and stores.
956         //
957         // Newer microarchitectures are not supposed to have a performance difference between
958         // aligned and unaligned SSE2 loads and stores when the address is actually aligned,
959         // but the benchmark results I see don't agree.
960 
961         pub const SIMD_STRIDE_SIZE: usize = 16;
962 
963         pub const SIMD_ALIGNMENT: usize = 16;
964 
965         pub const MAX_STRIDE_SIZE: usize = 16;
966 
967         pub const SIMD_ALIGNMENT_MASK: usize = 15;
968 
969         ascii_to_ascii_simd_double_stride!(ascii_to_ascii_simd_double_stride_both_aligned, store16_aligned);
970         ascii_to_ascii_simd_double_stride!(ascii_to_ascii_simd_double_stride_src_aligned, store16_unaligned);
971 
972         ascii_to_basic_latin_simd_double_stride!(ascii_to_basic_latin_simd_double_stride_both_aligned, store8_aligned);
973         ascii_to_basic_latin_simd_double_stride!(ascii_to_basic_latin_simd_double_stride_src_aligned, store8_unaligned);
974 
975         ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_both_aligned, load16_aligned, store16_aligned);
976         ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_src_aligned, load16_aligned, store16_unaligned);
977         ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_neither_aligned, load16_unaligned, store16_unaligned);
978 
979         ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_both_aligned, load16_aligned, store8_aligned);
980         ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_src_aligned, load16_aligned, store8_unaligned);
981         ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_neither_aligned, load16_unaligned, store8_unaligned);
982 
983         unpack_simd_stride!(unpack_stride_both_aligned, load16_aligned, store8_aligned);
984         unpack_simd_stride!(unpack_stride_src_aligned, load16_aligned, store8_unaligned);
985 
986         basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_both_aligned, load8_aligned, store16_aligned);
987         basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_src_aligned, load8_aligned, store16_unaligned);
988         basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_dst_aligned, load8_unaligned, store16_aligned);
989         basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_neither_aligned, load8_unaligned, store16_unaligned);
990 
991         pack_simd_stride!(pack_stride_both_aligned, load8_aligned, store16_aligned);
992         pack_simd_stride!(pack_stride_src_aligned, load8_aligned, store16_unaligned);
993 
994         ascii_simd_check_align_unrolled!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride_both_aligned, ascii_to_ascii_stride_src_aligned, ascii_to_ascii_stride_neither_aligned, ascii_to_ascii_simd_double_stride_both_aligned, ascii_to_ascii_simd_double_stride_src_aligned);
995         ascii_simd_check_align_unrolled!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_both_aligned, ascii_to_basic_latin_stride_src_aligned, ascii_to_basic_latin_stride_neither_aligned, ascii_to_basic_latin_simd_double_stride_both_aligned, ascii_to_basic_latin_simd_double_stride_src_aligned);
996 
997         ascii_simd_check_align!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_both_aligned, basic_latin_to_ascii_stride_src_aligned, basic_latin_to_ascii_stride_dst_aligned, basic_latin_to_ascii_stride_neither_aligned);
998         latin1_simd_check_align_unrolled!(unpack_latin1, u8, u16, unpack_stride_both_aligned, unpack_stride_src_aligned, unpack_stride_dst_aligned, unpack_stride_neither_aligned);
999         latin1_simd_check_align_unrolled!(pack_latin1, u16, u8, pack_stride_both_aligned, pack_stride_src_aligned, pack_stride_dst_aligned, pack_stride_neither_aligned);
1000     } else if #[cfg(all(target_endian = "little", target_pointer_width = "64"))] {
1001         // Aligned ALU word, little-endian, 64-bit
1002 
1003         pub const ALU_STRIDE_SIZE: usize = 16;
1004 
1005         pub const MAX_STRIDE_SIZE: usize = 16;
1006 
1007         pub const ALU_ALIGNMENT: usize = 8;
1008 
1009         pub const ALU_ALIGNMENT_MASK: usize = 7;
1010 
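        // `unpack_alu` widens 16 Latin1/ASCII bytes (two little-endian usize
        // words) into 16 u16 code units (four usize words): byte i of the
        // input becomes the low byte of u16 lane i with the high byte zeroed.
        // `pack_alu` is the inverse; it masks with 0x00FF, so the high byte
        // of each u16 lane is simply discarded (callers have already checked
        // the Basic Latin / Latin1 range where that matters). Worked example
        // (little-endian): the word for b"abcdefgh" is 0x6867_6665_6463_6261,
        // and `unpack_alu` turns it into 0x0064_0063_0062_0061 and
        // 0x0068_0067_0066_0065, i.e. 'a'..'h' as u16.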
1011         #[inline(always)]
1012         unsafe fn unpack_alu(word: usize, second_word: usize, dst: *mut usize) {
1013             let first = ((0x0000_0000_FF00_0000usize & word) << 24) |
1014                         ((0x0000_0000_00FF_0000usize & word) << 16) |
1015                         ((0x0000_0000_0000_FF00usize & word) << 8) |
1016                         (0x0000_0000_0000_00FFusize & word);
1017             let second = ((0xFF00_0000_0000_0000usize & word) >> 8) |
1018                          ((0x00FF_0000_0000_0000usize & word) >> 16) |
1019                          ((0x0000_FF00_0000_0000usize & word) >> 24) |
1020                          ((0x0000_00FF_0000_0000usize & word) >> 32);
1021             let third = ((0x0000_0000_FF00_0000usize & second_word) << 24) |
1022                         ((0x0000_0000_00FF_0000usize & second_word) << 16) |
1023                         ((0x0000_0000_0000_FF00usize & second_word) << 8) |
1024                         (0x0000_0000_0000_00FFusize & second_word);
1025             let fourth = ((0xFF00_0000_0000_0000usize & second_word) >> 8) |
1026                          ((0x00FF_0000_0000_0000usize & second_word) >> 16) |
1027                          ((0x0000_FF00_0000_0000usize & second_word) >> 24) |
1028                          ((0x0000_00FF_0000_0000usize & second_word) >> 32);
1029             *dst = first;
1030             *(dst.add(1)) = second;
1031             *(dst.add(2)) = third;
1032             *(dst.add(3)) = fourth;
1033         }
1034 
1035         #[inline(always)]
1036         unsafe fn pack_alu(first: usize, second: usize, third: usize, fourth: usize, dst: *mut usize) {
1037             let word = ((0x00FF_0000_0000_0000usize & second) << 8) |
1038                        ((0x0000_00FF_0000_0000usize & second) << 16) |
1039                        ((0x0000_0000_00FF_0000usize & second) << 24) |
1040                        ((0x0000_0000_0000_00FFusize & second) << 32) |
1041                        ((0x00FF_0000_0000_0000usize & first) >> 24) |
1042                        ((0x0000_00FF_0000_0000usize & first) >> 16) |
1043                        ((0x0000_0000_00FF_0000usize & first) >> 8) |
1044                        (0x0000_0000_0000_00FFusize & first);
1045             let second_word = ((0x00FF_0000_0000_0000usize & fourth) << 8) |
1046                               ((0x0000_00FF_0000_0000usize & fourth) << 16) |
1047                               ((0x0000_0000_00FF_0000usize & fourth) << 24) |
1048                               ((0x0000_0000_0000_00FFusize & fourth) << 32) |
1049                               ((0x00FF_0000_0000_0000usize & third) >> 24) |
1050                               ((0x0000_00FF_0000_0000usize & third) >> 16) |
1051                               ((0x0000_0000_00FF_0000usize & third) >> 8) |
1052                               (0x0000_0000_0000_00FFusize & third);
1053             *dst = word;
1054             *(dst.add(1)) = second_word;
1055         }
1056     } else if #[cfg(all(target_endian = "little", target_pointer_width = "32"))] {
1057         // Aligned ALU word, little-endian, 32-bit
1058 
1059         pub const ALU_STRIDE_SIZE: usize = 8;
1060 
1061         pub const MAX_STRIDE_SIZE: usize = 8;
1062 
1063         pub const ALU_ALIGNMENT: usize = 4;
1064 
1065         pub const ALU_ALIGNMENT_MASK: usize = 3;
1066 
1067         #[inline(always)]
1068         unsafe fn unpack_alu(word: usize, second_word: usize, dst: *mut usize) {
1069             let first = ((0x0000_FF00usize & word) << 8) |
1070                         (0x0000_00FFusize & word);
1071             let second = ((0xFF00_0000usize & word) >> 8) |
1072                          ((0x00FF_0000usize & word) >> 16);
1073             let third = ((0x0000_FF00usize & second_word) << 8) |
1074                         (0x0000_00FFusize & second_word);
1075             let fourth = ((0xFF00_0000usize & second_word) >> 8) |
1076                          ((0x00FF_0000usize & second_word) >> 16);
1077             *dst = first;
1078             *(dst.add(1)) = second;
1079             *(dst.add(2)) = third;
1080             *(dst.add(3)) = fourth;
1081         }
1082 
1083         #[inline(always)]
1084         unsafe fn pack_alu(first: usize, second: usize, third: usize, fourth: usize, dst: *mut usize) {
1085             let word = ((0x00FF_0000usize & second) << 8) |
1086                        ((0x0000_00FFusize & second) << 16) |
1087                        ((0x00FF_0000usize & first) >> 8) |
1088                        (0x0000_00FFusize & first);
1089             let second_word = ((0x00FF_0000usize & fourth) << 8) |
1090                               ((0x0000_00FFusize & fourth) << 16) |
1091                               ((0x00FF_0000usize & third) >> 8) |
1092                               (0x0000_00FFusize & third);
1093             *dst = word;
1094             *(dst.add(1)) = second_word;
1095         }
1096     } else if #[cfg(all(target_endian = "big", target_pointer_width = "64"))] {
1097         // Aligned ALU word, big-endian, 64-bit
1098 
1099         pub const ALU_STRIDE_SIZE: usize = 16;
1100 
1101         pub const MAX_STRIDE_SIZE: usize = 16;
1102 
1103         pub const ALU_ALIGNMENT: usize = 8;
1104 
1105         pub const ALU_ALIGNMENT_MASK: usize = 7;
1106 
1107         #[inline(always)]
1108         unsafe fn unpack_alu(word: usize, second_word: usize, dst: *mut usize) {
1109             let first = ((0xFF00_0000_0000_0000usize & word) >> 8) |
1110                          ((0x00FF_0000_0000_0000usize & word) >> 16) |
1111                          ((0x0000_FF00_0000_0000usize & word) >> 24) |
1112                          ((0x0000_00FF_0000_0000usize & word) >> 32);
1113             let second = ((0x0000_0000_FF00_0000usize & word) << 24) |
1114                         ((0x0000_0000_00FF_0000usize & word) << 16) |
1115                         ((0x0000_0000_0000_FF00usize & word) << 8) |
1116                         (0x0000_0000_0000_00FFusize & word);
1117             let third = ((0xFF00_0000_0000_0000usize & second_word) >> 8) |
1118                          ((0x00FF_0000_0000_0000usize & second_word) >> 16) |
1119                          ((0x0000_FF00_0000_0000usize & second_word) >> 24) |
1120                          ((0x0000_00FF_0000_0000usize & second_word) >> 32);
1121             let fourth = ((0x0000_0000_FF00_0000usize & second_word) << 24) |
1122                         ((0x0000_0000_00FF_0000usize & second_word) << 16) |
1123                         ((0x0000_0000_0000_FF00usize & second_word) << 8) |
1124                         (0x0000_0000_0000_00FFusize & second_word);
1125             *dst = first;
1126             *(dst.add(1)) = second;
1127             *(dst.add(2)) = third;
1128             *(dst.add(3)) = fourth;
1129         }
1130 
1131         #[inline(always)]
1132         unsafe fn pack_alu(first: usize, second: usize, third: usize, fourth: usize, dst: *mut usize) {
            let word = ((0x00FF_0000_0000_0000usize & first) << 8) |
                       ((0x0000_00FF_0000_0000usize & first) << 16) |
                       ((0x0000_0000_00FF_0000usize & first) << 24) |
                       ((0x0000_0000_0000_00FFusize & first) << 32) |
                       ((0x00FF_0000_0000_0000usize & second) >> 24) |
                       ((0x0000_00FF_0000_0000usize & second) >> 16) |
                       ((0x0000_0000_00FF_0000usize & second) >> 8) |
                       (0x0000_0000_0000_00FFusize & second);
            let second_word = ((0x00FF_0000_0000_0000usize & third) << 8) |
                              ((0x0000_00FF_0000_0000usize & third) << 16) |
                              ((0x0000_0000_00FF_0000usize & third) << 24) |
                              ((0x0000_0000_0000_00FFusize & third) << 32) |
                              ((0x00FF_0000_0000_0000usize & fourth) >> 24) |
                              ((0x0000_00FF_0000_0000usize & fourth) >> 16) |
                              ((0x0000_0000_00FF_0000usize & fourth) >> 8) |
                              (0x0000_0000_0000_00FFusize & fourth);
1149             *dst = word;
1150             *(dst.add(1)) = second_word;
1151         }
    } else if #[cfg(all(target_endian = "big", target_pointer_width = "32"))] {
        // Aligned ALU word, big-endian, 32-bit

        pub const ALU_STRIDE_SIZE: usize = 8;

        pub const MAX_STRIDE_SIZE: usize = 8;

        pub const ALU_ALIGNMENT: usize = 4;

        pub const ALU_ALIGNMENT_MASK: usize = 3;

        #[inline(always)]
        unsafe fn unpack_alu(word: usize, second_word: usize, dst: *mut usize) {
            let first = ((0xFF00_0000usize & word) >> 8) |
                        ((0x00FF_0000usize & word) >> 16);
            let second = ((0x0000_FF00usize & word) << 8) |
                         (0x0000_00FFusize & word);
            let third = ((0xFF00_0000usize & second_word) >> 8) |
                        ((0x00FF_0000usize & second_word) >> 16);
            let fourth = ((0x0000_FF00usize & second_word) << 8) |
                         (0x0000_00FFusize & second_word);
            *dst = first;
            *(dst.add(1)) = second;
            *(dst.add(2)) = third;
            *(dst.add(3)) = fourth;
        }

        #[inline(always)]
        unsafe fn pack_alu(first: usize, second: usize, third: usize, fourth: usize, dst: *mut usize) {
            let word = ((0x00FF_0000usize & first) << 8) |
                       ((0x0000_00FFusize & first) << 16) |
                       ((0x00FF_0000usize & second) >> 8) |
                       (0x0000_00FFusize & second);
            let second_word = ((0x00FF_0000usize & third) << 8) |
                              ((0x0000_00FFusize & third) << 16) |
                              ((0x00FF_0000usize & fourth) >> 8) |
                              (0x0000_00FFusize & fourth);
            *dst = word;
            *(dst.add(1)) = second_word;
        }
    } else {
        ascii_naive!(ascii_to_ascii, u8, u8);
        ascii_naive!(ascii_to_basic_latin, u8, u16);
        ascii_naive!(basic_latin_to_ascii, u16, u8);
    }
}

cfg_if! {
    if #[cfg(target_endian = "little")] {
        #[allow(dead_code)]
        #[inline(always)]
        fn count_zeros(word: usize) -> u32 {
            word.trailing_zeros()
        }
    } else {
        #[allow(dead_code)]
        #[inline(always)]
        fn count_zeros(word: usize) -> u32 {
            word.leading_zeros()
        }
    }
}
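
// Worked example of how `count_zeros` is used below: if the bytes in memory
// order are [0x61, 0x62, 0xC3, ...], the loaded word ANDed with ASCII_MASK has
// a bit set for the 0xC3 byte at bit 23 from the least significant end on
// little-endian, and with 16 zero bits above it on big-endian. Either way,
// `count_zeros(masked) >> 3` recovers 2, the number of ASCII bytes before the
// first non-ASCII byte.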

cfg_if! {
    if #[cfg(all(feature = "simd-accel", target_endian = "little", target_arch = "disabled"))] {
        #[inline(always)]
        pub fn validate_ascii(slice: &[u8]) -> Option<(u8, usize)> {
            let src = slice.as_ptr();
            let len = slice.len();
            let mut offset = 0usize;
            if SIMD_STRIDE_SIZE <= len {
                let len_minus_stride = len - SIMD_STRIDE_SIZE;
                loop {
                    let simd = unsafe { load16_unaligned(src.add(offset)) };
                    if !simd_is_ascii(simd) {
                        break;
                    }
                    offset += SIMD_STRIDE_SIZE;
                    if offset > len_minus_stride {
                        break;
                    }
                }
            }
            while offset < len {
                let code_unit = slice[offset];
                if code_unit > 127 {
                    return Some((code_unit, offset));
                }
                offset += 1;
            }
            None
        }
    } else if #[cfg(all(feature = "simd-accel", target_feature = "sse2"))] {
        #[inline(always)]
        pub fn validate_ascii(slice: &[u8]) -> Option<(u8, usize)> {
            let src = slice.as_ptr();
            let len = slice.len();
            let mut offset = 0usize;
            if SIMD_STRIDE_SIZE <= len {
                // First, process one unaligned vector
                let simd = unsafe { load16_unaligned(src) };
                let mask = mask_ascii(simd);
                if mask != 0 {
                    offset = mask.trailing_zeros() as usize;
                    let non_ascii = unsafe { *src.add(offset) };
                    return Some((non_ascii, offset));
                }
                offset = SIMD_STRIDE_SIZE;

                // We have now seen 16 ASCII bytes. Let's guess that
                // there will be enough more to justify more expense
                // in the case of non-ASCII.
                // Use aligned reads for the sake of old microarchitectures.
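                // (For example, assuming SIMD_ALIGNMENT is 16, if `src.add(offset)`
                // ends in 0x9, the formula below yields 7, the number of bytes to
                // advance so that subsequent loads are 16-byte aligned.)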
                let until_alignment = unsafe { (SIMD_ALIGNMENT - ((src.add(offset) as usize) & SIMD_ALIGNMENT_MASK)) & SIMD_ALIGNMENT_MASK };
                // This addition won't overflow, because even in the 32-bit PAE case
                // the program's own code occupies enough of the address space that
                // the slice length can't come close to the address space size.
                // offset now equals SIMD_STRIDE_SIZE, hence times 3 below.
                if until_alignment + (SIMD_STRIDE_SIZE * 3) <= len {
                    if until_alignment != 0 {
                        let simd = unsafe { load16_unaligned(src.add(offset)) };
                        let mask = mask_ascii(simd);
                        if mask != 0 {
                            offset += mask.trailing_zeros() as usize;
                            let non_ascii = unsafe { *src.add(offset) };
                            return Some((non_ascii, offset));
                        }
                        offset += until_alignment;
                    }
                    let len_minus_stride_times_two = len - (SIMD_STRIDE_SIZE * 2);
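                    // The loop below checks two aligned vectors per iteration and
                    // ORs them together first, so the expected all-ASCII case pays
                    // for only one ASCII test per 32 bytes; the per-vector masks
                    // are computed only after a non-ASCII byte has been detected.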
                    loop {
                        let first = unsafe { load16_aligned(src.add(offset)) };
                        let second = unsafe { load16_aligned(src.add(offset + SIMD_STRIDE_SIZE)) };
                        if !simd_is_ascii(first | second) {
                            let mask_first = mask_ascii(first);
                            if mask_first != 0 {
                                offset += mask_first.trailing_zeros() as usize;
                            } else {
                                let mask_second = mask_ascii(second);
                                offset += SIMD_STRIDE_SIZE + mask_second.trailing_zeros() as usize;
                            }
                            let non_ascii = unsafe { *src.add(offset) };
                            return Some((non_ascii, offset));
                        }
                        offset += SIMD_STRIDE_SIZE * 2;
                        if offset > len_minus_stride_times_two {
                            break;
                        }
                    }
                    if offset + SIMD_STRIDE_SIZE <= len {
                        let simd = unsafe { load16_aligned(src.add(offset)) };
                        let mask = mask_ascii(simd);
                        if mask != 0 {
                            offset += mask.trailing_zeros() as usize;
                            let non_ascii = unsafe { *src.add(offset) };
                            return Some((non_ascii, offset));
                        }
                        offset += SIMD_STRIDE_SIZE;
                    }
                } else {
                    // At most two iterations, so unroll
                    if offset + SIMD_STRIDE_SIZE <= len {
                        let simd = unsafe { load16_unaligned(src.add(offset)) };
                        let mask = mask_ascii(simd);
                        if mask != 0 {
                            offset += mask.trailing_zeros() as usize;
                            let non_ascii = unsafe { *src.add(offset) };
                            return Some((non_ascii, offset));
                        }
                        offset += SIMD_STRIDE_SIZE;
                        if offset + SIMD_STRIDE_SIZE <= len {
                            let simd = unsafe { load16_unaligned(src.add(offset)) };
                            let mask = mask_ascii(simd);
                            if mask != 0 {
                                offset += mask.trailing_zeros() as usize;
                                let non_ascii = unsafe { *src.add(offset) };
                                return Some((non_ascii, offset));
                            }
                            offset += SIMD_STRIDE_SIZE;
                        }
                    }
                }
            }
            while offset < len {
                let code_unit = unsafe { *(src.add(offset)) };
                if code_unit > 127 {
                    return Some((code_unit, offset));
                }
                offset += 1;
            }
            None
        }
    } else {
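        // Without SIMD, validate two aligned ALU words (one stride) at a time by
        // masking with ASCII_MASK; `find_non_ascii` then turns the set high bit
        // into the index of the first non-ASCII byte.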
        #[inline(always)]
        fn find_non_ascii(word: usize, second_word: usize) -> Option<usize> {
            let word_masked = word & ASCII_MASK;
            let second_masked = second_word & ASCII_MASK;
            if (word_masked | second_masked) == 0 {
                return None;
            }
            if word_masked != 0 {
                let zeros = count_zeros(word_masked);
                // `zeros` now contains 8 times the number of ASCII bytes in text
                // order before the first non-ASCII byte, plus 7 in the
                // little-endian case (the seven bits below the set high bit of
                // the non-ASCII byte). Shifting right by 3 yields the byte count
                // either way.
                let num_ascii = (zeros >> 3) as usize;
                return Some(num_ascii);
            }
            let zeros = count_zeros(second_masked);
            // Same as above, but the result is offset by the ALU_ALIGNMENT bytes
            // already accounted for by the first word.
            let num_ascii = (zeros >> 3) as usize;
            Some(ALU_ALIGNMENT + num_ascii)
        }

        #[inline(always)]
        unsafe fn validate_ascii_stride(src: *const usize) -> Option<usize> {
            let word = *src;
            let second_word = *(src.add(1));
            find_non_ascii(word, second_word)
        }

        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
        #[inline(always)]
        pub fn validate_ascii(slice: &[u8]) -> Option<(u8, usize)> {
            let src = slice.as_ptr();
            let len = slice.len();
            let mut offset = 0usize;
            let mut until_alignment = (ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK)) & ALU_ALIGNMENT_MASK;
            if until_alignment + ALU_STRIDE_SIZE <= len {
                while until_alignment != 0 {
                    let code_unit = slice[offset];
                    if code_unit > 127 {
                        return Some((code_unit, offset));
                    }
                    offset += 1;
                    until_alignment -= 1;
                }
                let len_minus_stride = len - ALU_STRIDE_SIZE;
                loop {
                    let ptr = unsafe { src.add(offset) as *const usize };
                    if let Some(num_ascii) = unsafe { validate_ascii_stride(ptr) } {
                        offset += num_ascii;
                        return Some((unsafe { *(src.add(offset)) }, offset));
                    }
                    offset += ALU_STRIDE_SIZE;
                    if offset > len_minus_stride {
                        break;
                    }
                }
            }
            while offset < len {
                let code_unit = slice[offset];
                if code_unit > 127 {
                    return Some((code_unit, offset));
                }
                offset += 1;
            }
            None
        }

    }
}

cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"))))] {

    } else if #[cfg(all(feature = "simd-accel", target_endian = "little", target_feature = "neon"))] {
        // Even with NEON enabled, we use the ALU path for ASCII validation, because
        // testing on Exynos 5 indicated that using NEON isn't worthwhile where there
        // are only vector reads without vector writes.

        pub const ALU_STRIDE_SIZE: usize = 8;

        pub const ALU_ALIGNMENT: usize = 4;

        pub const ALU_ALIGNMENT_MASK: usize = 3;
    } else {
        #[inline(always)]
        unsafe fn unpack_latin1_stride_alu(src: *const usize, dst: *mut usize) {
            let word = *src;
            let second_word = *(src.add(1));
            unpack_alu(word, second_word, dst);
        }

        #[inline(always)]
        unsafe fn pack_latin1_stride_alu(src: *const usize, dst: *mut usize) {
            let first = *src;
            let second = *(src.add(1));
            let third = *(src.add(2));
            let fourth = *(src.add(3));
            pack_alu(first, second, third, fourth, dst);
        }

        #[inline(always)]
        unsafe fn ascii_to_basic_latin_stride_alu(src: *const usize, dst: *mut usize) -> bool {
            let word = *src;
            let second_word = *(src.add(1));
            // Check if the words contain non-ASCII
            if (word & ASCII_MASK) | (second_word & ASCII_MASK) != 0 {
                return false;
            }
            unpack_alu(word, second_word, dst);
            true
        }

        #[inline(always)]
        unsafe fn basic_latin_to_ascii_stride_alu(src: *const usize, dst: *mut usize) -> bool {
            let first = *src;
            let second = *(src.add(1));
            let third = *(src.add(2));
            let fourth = *(src.add(3));
            if (first & BASIC_LATIN_MASK) | (second & BASIC_LATIN_MASK) | (third & BASIC_LATIN_MASK) | (fourth & BASIC_LATIN_MASK) != 0 {
                return false;
            }
            pack_alu(first, second, third, fourth, dst);
            true
        }

        #[inline(always)]
        unsafe fn ascii_to_ascii_stride(src: *const usize, dst: *mut usize) -> Option<usize> {
            let word = *src;
            let second_word = *(src.add(1));
            *dst = word;
            *(dst.add(1)) = second_word;
            find_non_ascii(word, second_word)
        }

        basic_latin_alu!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_alu);
        basic_latin_alu!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_alu);
        latin1_alu!(unpack_latin1, u8, u16, unpack_latin1_stride_alu);
        latin1_alu!(pack_latin1, u16, u8, pack_latin1_stride_alu);
        ascii_alu!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride);
    }
}

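// Returns the number of bytes at the start of `bytes` that are ASCII; equals
// `bytes.len()` when the whole slice is ASCII.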
pub fn ascii_valid_up_to(bytes: &[u8]) -> usize {
    match validate_ascii(bytes) {
        None => bytes.len(),
        Some((_, num_valid)) => num_valid,
    }
}

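// Like `ascii_valid_up_to`, but stops at the first byte that is non-ASCII or is
// one of ESC (0x1B), SO (0x0E), or SI (0x0F), which are significant to
// ISO-2022-JP escape/shift handling even though they are ASCII bytes.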
pub fn iso_2022_jp_ascii_valid_up_to(bytes: &[u8]) -> usize {
    for (i, b_ref) in bytes.iter().enumerate() {
        let b = *b_ref;
        if b >= 0x80 || b == 0x1B || b == 0x0E || b == 0x0F {
            return i;
        }
    }
    bytes.len()
}

// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/

#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::*;
    use alloc::vec::Vec;

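    // Each generated test builds 32 inputs: input `i` is ASCII (0x40 + j) in
    // every position except position `i`, which holds the non-ASCII value 0xAA.
    // The conversion is expected to report 0xAA after exactly `i` converted
    // units and to have written the correct ASCII prefix to `dst`.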
    macro_rules! test_ascii {
        ($test_name:ident, $fn_tested:ident, $src_unit:ty, $dst_unit:ty) => {
            #[test]
            fn $test_name() {
                let mut src: Vec<$src_unit> = Vec::with_capacity(32);
                let mut dst: Vec<$dst_unit> = Vec::with_capacity(32);
                for i in 0..32 {
                    src.clear();
                    dst.clear();
                    dst.resize(32, 0);
                    for j in 0..32 {
                        let c = if i == j { 0xAA } else { j + 0x40 };
                        src.push(c as $src_unit);
                    }
                    match unsafe { $fn_tested(src.as_ptr(), dst.as_mut_ptr(), 32) } {
                        None => unreachable!("Should always find non-ASCII"),
                        Some((non_ascii, num_ascii)) => {
                            assert_eq!(non_ascii, 0xAA);
                            assert_eq!(num_ascii, i);
                            for j in 0..i {
                                assert_eq!(dst[j], (j + 0x40) as $dst_unit);
                            }
                        }
                    }
                }
            }
        };
    }

    test_ascii!(test_ascii_to_ascii, ascii_to_ascii, u8, u8);
    test_ascii!(test_ascii_to_basic_latin, ascii_to_basic_latin, u8, u16);
    test_ascii!(test_basic_latin_to_ascii, basic_latin_to_ascii, u16, u8);
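
    // Additional sketch of a test (not part of the original suite) exercising the
    // safe entry points defined in this file; it relies only on behavior visible
    // above: `validate_ascii` reports the first non-ASCII byte and its index, and
    // the `*_valid_up_to` functions return the length of the valid prefix.
    #[test]
    fn test_valid_up_to_helpers() {
        assert_eq!(validate_ascii(b"hello"), None);
        assert_eq!(validate_ascii(b"hell\xC3\xA9o"), Some((0xC3, 4)));
        assert_eq!(ascii_valid_up_to(b"hello"), 5);
        assert_eq!(ascii_valid_up_to(b"hell\xC3\xA9o"), 4);
        // ESC (0x1B) terminates the run for ISO-2022-JP purposes even though it
        // is an ASCII byte.
        assert_eq!(iso_2022_jp_ascii_valid_up_to(b"ab\x1Bcd"), 2);
        assert_eq!(iso_2022_jp_ascii_valid_up_to(b"abcd"), 4);
    }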
}