// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use packed_simd::u16x8;
use packed_simd::u8x16;
use packed_simd::FromBits;

// TODO: Migrate unaligned access to stdlib code if/when the RFC
// https://github.com/rust-lang/rfcs/pull/1725 is implemented.

#[inline(always)]
pub unsafe fn load16_unaligned(ptr: *const u8) -> u8x16 {
    let mut simd = ::std::mem::MaybeUninit::<u8x16>::uninit();
    ::std::ptr::copy_nonoverlapping(ptr, simd.as_mut_ptr() as *mut u8, 16);
    simd.assume_init()
}

#[allow(dead_code)]
#[inline(always)]
pub unsafe fn load16_aligned(ptr: *const u8) -> u8x16 {
    *(ptr as *const u8x16)
}

#[inline(always)]
pub unsafe fn store16_unaligned(ptr: *mut u8, s: u8x16) {
    ::std::ptr::copy_nonoverlapping(&s as *const u8x16 as *const u8, ptr, 16);
}

#[allow(dead_code)]
#[inline(always)]
pub unsafe fn store16_aligned(ptr: *mut u8, s: u8x16) {
    *(ptr as *mut u8x16) = s;
}

#[inline(always)]
pub unsafe fn load8_unaligned(ptr: *const u16) -> u16x8 {
    let mut simd = ::std::mem::MaybeUninit::<u16x8>::uninit();
    ::std::ptr::copy_nonoverlapping(ptr as *const u8, simd.as_mut_ptr() as *mut u8, 16);
    simd.assume_init()
}

#[allow(dead_code)]
#[inline(always)]
pub unsafe fn load8_aligned(ptr: *const u16) -> u16x8 {
    *(ptr as *const u16x8)
}

#[inline(always)]
pub unsafe fn store8_unaligned(ptr: *mut u16, s: u16x8) {
    ::std::ptr::copy_nonoverlapping(&s as *const u16x8 as *const u8, ptr as *mut u8, 16);
}

#[allow(dead_code)]
#[inline(always)]
pub unsafe fn store8_aligned(ptr: *mut u16, s: u16x8) {
    *(ptr as *mut u16x8) = s;
}

cfg_if! {
    if #[cfg(all(target_feature = "sse2", target_arch = "x86_64"))] {
        use std::arch::x86_64::__m128i;
        use std::arch::x86_64::_mm_movemask_epi8;
        use std::arch::x86_64::_mm_packus_epi16;
    } else if #[cfg(all(target_feature = "sse2", target_arch = "x86"))] {
        use std::arch::x86::__m128i;
        use std::arch::x86::_mm_movemask_epi8;
        use std::arch::x86::_mm_packus_epi16;
    } else if #[cfg(target_arch = "aarch64")] {
        use std::arch::aarch64::uint8x16_t;
        use std::arch::aarch64::uint16x8_t;
        use std::arch::aarch64::vmaxvq_u8;
        use std::arch::aarch64::vmaxvq_u16;
    } else {

    }
}

// #[inline(always)]
// fn simd_byte_swap_u8(s: u8x16) -> u8x16 {
//     unsafe {
//         shuffle!(s, s, [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14])
//     }
// }

// #[inline(always)]
// pub fn simd_byte_swap(s: u16x8) -> u16x8 {
//     to_u16_lanes(simd_byte_swap_u8(to_u8_lanes(s)))
// }

#[inline(always)]
pub fn simd_byte_swap(s: u16x8) -> u16x8 {
    let left = s << 8;
    let right = s >> 8;
    left | right
}
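// For example, a lane holding 0x2603 becomes 0x0326: (0x2603 << 8) wraps to
// 0x0300, (0x2603 >> 8) is 0x0026, and their OR is 0x0326, i.e. the two bytes
// of each u16 lane exchange places.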

#[inline(always)]
pub fn to_u16_lanes(s: u8x16) -> u16x8 {
    u16x8::from_bits(s)
}

cfg_if! {
    if #[cfg(target_feature = "sse2")] {

        // Expose low-level mask instead of higher-level conclusion,
        // because the non-ASCII case would perform less well otherwise.
        // Bit i of the mask is set iff byte i has its high bit set, so
        // `trailing_zeros()` gives the index of the first non-ASCII byte.
        #[inline(always)]
        pub fn mask_ascii(s: u8x16) -> i32 {
            unsafe {
                _mm_movemask_epi8(__m128i::from_bits(s))
            }
        }

    } else {

    }
}

cfg_if! {
    if #[cfg(target_feature = "sse2")] {
        #[inline(always)]
        pub fn simd_is_ascii(s: u8x16) -> bool {
            unsafe {
                _mm_movemask_epi8(__m128i::from_bits(s)) == 0
            }
        }
    } else if #[cfg(target_arch = "aarch64")] {
        #[inline(always)]
        pub fn simd_is_ascii(s: u8x16) -> bool {
            unsafe {
                vmaxvq_u8(uint8x16_t::from_bits(s)) < 0x80
            }
        }
    } else {
        #[inline(always)]
        pub fn simd_is_ascii(s: u8x16) -> bool {
            // This optimizes better on ARM than
            // the lt formulation.
            let highest_ascii = u8x16::splat(0x7F);
            !s.gt(highest_ascii).any()
        }
    }
}

cfg_if! {
    if #[cfg(target_feature = "sse2")] {
        #[inline(always)]
        pub fn simd_is_str_latin1(s: u8x16) -> bool {
            if simd_is_ascii(s) {
                return true;
            }
            // 0xC4 is the lowest UTF-8 lead byte above 0xC2 and 0xC3,
            // which are the lead bytes that encode U+0080..=U+00FF.
            let above_str_latin1 = u8x16::splat(0xC4);
            s.lt(above_str_latin1).all()
        }
    } else if #[cfg(target_arch = "aarch64")] {
        #[inline(always)]
        pub fn simd_is_str_latin1(s: u8x16) -> bool {
            unsafe {
                vmaxvq_u8(uint8x16_t::from_bits(s)) < 0xC4
            }
        }
    } else {
        #[inline(always)]
        pub fn simd_is_str_latin1(s: u8x16) -> bool {
            let above_str_latin1 = u8x16::splat(0xC4);
            s.lt(above_str_latin1).all()
        }
    }
}

cfg_if! {
    if #[cfg(target_arch = "aarch64")] {
        #[inline(always)]
        pub fn simd_is_basic_latin(s: u16x8) -> bool {
            unsafe {
                vmaxvq_u16(uint16x8_t::from_bits(s)) < 0x80
            }
        }

        #[inline(always)]
        pub fn simd_is_latin1(s: u16x8) -> bool {
            unsafe {
                vmaxvq_u16(uint16x8_t::from_bits(s)) < 0x100
            }
        }
    } else {
        #[inline(always)]
        pub fn simd_is_basic_latin(s: u16x8) -> bool {
            let above_ascii = u16x8::splat(0x80);
            s.lt(above_ascii).all()
        }

        #[inline(always)]
        pub fn simd_is_latin1(s: u16x8) -> bool {
            // For some reason, on SSE2 this formulation
            // seems faster in this case while the above
            // function is better the other way round...
            let highest_latin1 = u16x8::splat(0xFF);
            !s.gt(highest_latin1).any()
        }
    }
}

#[inline(always)]
pub fn contains_surrogates(s: u16x8) -> bool {
    let mask = u16x8::splat(0xF800);
    let surrogate_bits = u16x8::splat(0xD800);
    (s & mask).eq(surrogate_bits).any()
}
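// All surrogate code units (0xD800..=0xDFFF) share the top five bits 11011, so
// masking with 0xF800 maps exactly that range to 0xD800; e.g. 0xDCFF & 0xF800
// is 0xD800, while 0xD7FF & 0xF800 is 0xD000 and 0xE000 & 0xF800 is 0xE000.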

cfg_if! {
    if #[cfg(target_arch = "aarch64")] {
        macro_rules! aarch64_return_false_if_below_hebrew {
            ($s:ident) => ({
                unsafe {
                    if vmaxvq_u16(uint16x8_t::from_bits($s)) < 0x0590 {
                        return false;
                    }
                }
            })
        }

        macro_rules! non_aarch64_return_false_if_all {
            ($s:ident) => ()
        }
    } else {
        macro_rules! aarch64_return_false_if_below_hebrew {
            ($s:ident) => ()
        }

        macro_rules! non_aarch64_return_false_if_all {
            ($s:ident) => ({
                if $s.all() {
                    return false;
                }
            })
        }
    }
}

macro_rules! in_range16x8 {
    ($s:ident, $start:expr, $end:expr) => {{
        // SIMD sub is wrapping
        ($s - u16x8::splat($start)).lt(u16x8::splat($end - $start))
    }};
}
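// The wrapping subtraction turns the two-sided check `start <= x < end` into a
// single unsigned comparison. For example, with start = 0x0590 and end = 0x0900
// (so end - start = 0x0370): x = 0x0591 gives 0x0001, which is below 0x0370
// (in range), while x = 0x0100 wraps around to 0xFB70 (out of range).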

#[inline(always)]
pub fn is_u16x8_bidi(s: u16x8) -> bool {
    // We first try to quickly refute the RTL-ness of the vector. If that
    // fails, we do the real RTL check, so in that case the up-front quick
    // checks end up being wasted work. Even the quick check is two-fold in
    // order to return `false` as soon as possible if everything is below
    // Hebrew.

    aarch64_return_false_if_below_hebrew!(s);

    let below_hebrew = s.lt(u16x8::splat(0x0590));

    non_aarch64_return_false_if_all!(below_hebrew);

    if (below_hebrew | in_range16x8!(s, 0x0900, 0x200F) | in_range16x8!(s, 0x2068, 0xD802)).all() {
        return false;
    }

    // Quick refutation failed. Let's do the full check: Hebrew through
    // Arabic Extended-A (0x0590..0x0900), Hebrew and Arabic presentation
    // forms (0xFB1D..0xFE00 and 0xFE70..0xFEFF), lead surrogates for
    // supplementary-plane RTL scripts (0xD802..0xD804 and 0xD83A..0xD83C),
    // and the RLM, RLE, RLO and RLI formatting characters.

    (in_range16x8!(s, 0x0590, 0x0900)
        | in_range16x8!(s, 0xFB1D, 0xFE00)
        | in_range16x8!(s, 0xFE70, 0xFEFF)
        | in_range16x8!(s, 0xD802, 0xD804)
        | in_range16x8!(s, 0xD83A, 0xD83C)
        | s.eq(u16x8::splat(0x200F))
        | s.eq(u16x8::splat(0x202B))
        | s.eq(u16x8::splat(0x202E))
        | s.eq(u16x8::splat(0x2067)))
    .any()
}

#[inline(always)]
pub fn simd_unpack(s: u8x16) -> (u16x8, u16x8) {
    unsafe {
        let first: u8x16 = shuffle!(
            s,
            u8x16::splat(0),
            [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]
        );
        let second: u8x16 = shuffle!(
            s,
            u8x16::splat(0),
            [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31]
        );
        (u16x8::from_bits(first), u16x8::from_bits(second))
    }
}
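// Interleaving each input byte with a zero byte and reinterpreting the pairs
// as u16 lanes zero-extends u8 to u16. Note that the reinterpretation assumes
// a little-endian target; on a big-endian target the zero byte would land in
// the low half of each lane instead.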

cfg_if! {
    if #[cfg(target_feature = "sse2")] {
        #[inline(always)]
        pub fn simd_pack(a: u16x8, b: u16x8) -> u8x16 {
            unsafe {
                u8x16::from_bits(_mm_packus_epi16(__m128i::from_bits(a), __m128i::from_bits(b)))
            }
        }
    } else {
        #[inline(always)]
        pub fn simd_pack(a: u16x8, b: u16x8) -> u8x16 {
            unsafe {
                let first = u8x16::from_bits(a);
                let second = u8x16::from_bits(b);
                shuffle!(
                    first,
                    second,
                    [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
                )
            }
        }
    }
}
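// Both formulations give the intended result only when every lane fits in a
// byte: _mm_packus_epi16 saturates, whereas the shuffle keeps the low byte of
// each lane (again assuming little-endian), so callers need to have verified
// the input first, e.g. with simd_is_latin1 or simd_is_basic_latin.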

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_unpack() {
        let ascii: [u8; 16] = [
            0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
            0x75, 0x76,
        ];
        let basic_latin: [u16; 16] = [
            0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
            0x75, 0x76,
        ];
        let simd = unsafe { load16_unaligned(ascii.as_ptr()) };
        let mut vec = Vec::with_capacity(16);
        vec.resize(16, 0u16);
        let (first, second) = simd_unpack(simd);
        let ptr = vec.as_mut_ptr();
        unsafe {
            store8_unaligned(ptr, first);
            store8_unaligned(ptr.add(8), second);
        }
        assert_eq!(&vec[..], &basic_latin[..]);
    }

    #[test]
    fn test_simd_is_basic_latin_success() {
        let ascii: [u8; 16] = [
            0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
            0x75, 0x76,
        ];
        let basic_latin: [u16; 16] = [
            0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
            0x75, 0x76,
        ];
        let first = unsafe { load8_unaligned(basic_latin.as_ptr()) };
        let second = unsafe { load8_unaligned(basic_latin.as_ptr().add(8)) };
        let mut vec = Vec::with_capacity(16);
        vec.resize(16, 0u8);
        let ptr = vec.as_mut_ptr();
        assert!(simd_is_basic_latin(first | second));
        unsafe {
            store16_unaligned(ptr, simd_pack(first, second));
        }
        assert_eq!(&vec[..], &ascii[..]);
    }

    #[test]
    fn test_simd_is_basic_latin_c0() {
        let input: [u16; 16] = [
            0x61, 0x62, 0x63, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
            0x75, 0x76,
        ];
        let first = unsafe { load8_unaligned(input.as_ptr()) };
        let second = unsafe { load8_unaligned(input.as_ptr().add(8)) };
        assert!(!simd_is_basic_latin(first | second));
    }

    #[test]
    fn test_simd_is_basic_latin_0fff() {
        let input: [u16; 16] = [
            0x61, 0x62, 0x63, 0x0FFF, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
            0x75, 0x76,
        ];
        let first = unsafe { load8_unaligned(input.as_ptr()) };
        let second = unsafe { load8_unaligned(input.as_ptr().add(8)) };
        assert!(!simd_is_basic_latin(first | second));
    }

    #[test]
    fn test_simd_is_basic_latin_ffff() {
        let input: [u16; 16] = [
            0x61, 0x62, 0x63, 0xFFFF, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
            0x75, 0x76,
        ];
        let first = unsafe { load8_unaligned(input.as_ptr()) };
        let second = unsafe { load8_unaligned(input.as_ptr().add(8)) };
        assert!(!simd_is_basic_latin(first | second));
    }

    #[test]
    fn test_simd_is_ascii_success() {
        let ascii: [u8; 16] = [
            0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
            0x75, 0x76,
        ];
        let simd = unsafe { load16_unaligned(ascii.as_ptr()) };
        assert!(simd_is_ascii(simd));
    }

    #[test]
    fn test_simd_is_ascii_failure() {
        let input: [u8; 16] = [
            0x61, 0x62, 0x63, 0x64, 0x81, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
            0x75, 0x76,
        ];
        let simd = unsafe { load16_unaligned(input.as_ptr()) };
        assert!(!simd_is_ascii(simd));
    }

    #[cfg(target_feature = "sse2")]
    #[test]
    fn test_check_ascii() {
        let input: [u8; 16] = [
            0x61, 0x62, 0x63, 0x64, 0x81, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
            0x75, 0x76,
        ];
        let simd = unsafe { load16_unaligned(input.as_ptr()) };
        let mask = mask_ascii(simd);
        assert_ne!(mask, 0);
        assert_eq!(mask.trailing_zeros(), 4);
    }

    #[test]
    fn test_alu() {
        let input: [u8; 16] = [
            0x61, 0x62, 0x63, 0x64, 0x81, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
            0x75, 0x76,
        ];
        let mut alu = 0u64;
        unsafe {
            ::std::ptr::copy_nonoverlapping(input.as_ptr(), &mut alu as *mut u64 as *mut u8, 8);
        }
        let masked = alu & 0x8080808080808080;
        // On a little-endian target, input byte 4 (0x81) occupies bits 32..=39
        // of the u64, so the lowest set bit of the mask is bit 39.
        assert_eq!(masked.trailing_zeros(), 39);
    }
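
    // The following checks are illustrative sketches of the helpers that the
    // original suite does not exercise directly; the expected values are
    // computed from the definitions above.

    #[test]
    fn test_simd_byte_swap() {
        let input: [u16; 8] = [0x2603, 0x0061, 0xFFFE, 0x1234, 0x0000, 0xABCD, 0x00FF, 0xD800];
        let expected: [u16; 8] = [0x0326, 0x6100, 0xFEFF, 0x3412, 0x0000, 0xCDAB, 0xFF00, 0x00D8];
        let simd = unsafe { load8_unaligned(input.as_ptr()) };
        let mut vec = Vec::with_capacity(8);
        vec.resize(8, 0u16);
        unsafe {
            store8_unaligned(vec.as_mut_ptr(), simd_byte_swap(simd));
        }
        assert_eq!(&vec[..], &expected[..]);
    }

    #[test]
    fn test_contains_surrogates() {
        // 0xD83D/0xDE03 is a surrogate pair; 0xD7FF and 0xE000 bracket the
        // surrogate range without being in it.
        let with_surrogates: [u16; 8] = [0x61, 0x62, 0xD83D, 0xDE03, 0x65, 0x66, 0x67, 0x68];
        let without_surrogates: [u16; 8] = [0x61, 0x62, 0xD7FF, 0xE000, 0x65, 0x66, 0x67, 0x68];
        let first = unsafe { load8_unaligned(with_surrogates.as_ptr()) };
        let second = unsafe { load8_unaligned(without_surrogates.as_ptr()) };
        assert!(contains_surrogates(first));
        assert!(!contains_surrogates(second));
    }

    #[test]
    fn test_is_u16x8_bidi() {
        // 0x05D0 (Hebrew alef) is in the 0x0590..0x0900 range; plain ASCII is not.
        let rtl: [u16; 8] = [0x05D0, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67];
        let ltr: [u16; 8] = [0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68];
        let first = unsafe { load8_unaligned(rtl.as_ptr()) };
        let second = unsafe { load8_unaligned(ltr.as_ptr()) };
        assert!(is_u16x8_bidi(first));
        assert!(!is_u16x8_bidi(second));
    }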
}