// Copyright 2016 Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use packed_simd::u16x8;
use packed_simd::u8x16;
use packed_simd::FromBits;

// TODO: Migrate unaligned access to stdlib code if/when the RFC
// https://github.com/rust-lang/rfcs/pull/1725 is implemented.
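// (That RFC has since landed as `ptr::read_unaligned`/`ptr::write_unaligned`,
// stable since Rust 1.17, which could replace the copy_nonoverlapping-based
// helpers below.)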

#[inline(always)]
pub unsafe fn load16_unaligned(ptr: *const u8) -> u8x16 {
    // `MaybeUninit` avoids the deprecated `mem::uninitialized()`; the copy
    // below fully initializes the vector before `assume_init()`.
    let mut simd = ::std::mem::MaybeUninit::<u8x16>::uninit();
    ::std::ptr::copy_nonoverlapping(ptr, simd.as_mut_ptr() as *mut u8, 16);
    simd.assume_init()
}

#[allow(dead_code)]
#[inline(always)]
pub unsafe fn load16_aligned(ptr: *const u8) -> u8x16 {
    *(ptr as *const u8x16)
}

#[inline(always)]
pub unsafe fn store16_unaligned(ptr: *mut u8, s: u8x16) {
    ::std::ptr::copy_nonoverlapping(&s as *const u8x16 as *const u8, ptr, 16);
}

#[allow(dead_code)]
#[inline(always)]
pub unsafe fn store16_aligned(ptr: *mut u8, s: u8x16) {
    *(ptr as *mut u8x16) = s;
}

#[inline(always)]
pub unsafe fn load8_unaligned(ptr: *const u16) -> u16x8 {
    let mut simd = ::std::mem::MaybeUninit::<u16x8>::uninit();
    ::std::ptr::copy_nonoverlapping(ptr as *const u8, simd.as_mut_ptr() as *mut u8, 16);
    simd.assume_init()
}

#[allow(dead_code)]
#[inline(always)]
pub unsafe fn load8_aligned(ptr: *const u16) -> u16x8 {
    *(ptr as *const u16x8)
}

#[inline(always)]
pub unsafe fn store8_unaligned(ptr: *mut u16, s: u16x8) {
    ::std::ptr::copy_nonoverlapping(&s as *const u16x8 as *const u8, ptr as *mut u8, 16);
}

#[allow(dead_code)]
#[inline(always)]
pub unsafe fn store8_aligned(ptr: *mut u16, s: u16x8) {
    *(ptr as *mut u16x8) = s;
}

cfg_if! {
    if #[cfg(all(target_feature = "sse2", target_arch = "x86_64"))] {
        use std::arch::x86_64::__m128i;
        use std::arch::x86_64::_mm_movemask_epi8;
        use std::arch::x86_64::_mm_packus_epi16;
    } else if #[cfg(all(target_feature = "sse2", target_arch = "x86"))] {
        use std::arch::x86::__m128i;
        use std::arch::x86::_mm_movemask_epi8;
        use std::arch::x86::_mm_packus_epi16;
    } else if #[cfg(target_arch = "aarch64")]{
        use std::arch::aarch64::uint8x16_t;
        use std::arch::aarch64::uint16x8_t;
        use std::arch::aarch64::vmaxvq_u8;
        use std::arch::aarch64::vmaxvq_u16;
    } else {

    }
}

// #[inline(always)]
// fn simd_byte_swap_u8(s: u8x16) -> u8x16 {
//     unsafe {
//         shuffle!(s, s, [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14])
//     }
// }

// #[inline(always)]
// pub fn simd_byte_swap(s: u16x8) -> u16x8 {
//     to_u16_lanes(simd_byte_swap_u8(to_u8_lanes(s)))
// }

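// Swaps the two bytes of each u16 lane (0xAABB becomes 0xBBAA) using shifts;
// the shuffle-based formulation above is kept for reference.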
#[inline(always)]
pub fn simd_byte_swap(s: u16x8) -> u16x8 {
    let left = s << 8;
    let right = s >> 8;
    left | right
}

#[inline(always)]
pub fn to_u16_lanes(s: u8x16) -> u16x8 {
    u16x8::from_bits(s)
}

cfg_if! {
    if #[cfg(target_feature = "sse2")] {

        // Expose low-level mask instead of higher-level conclusion,
        // because the non-ASCII case would perform less well otherwise.
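        // Bit i of the mask is the high bit of byte i, so the mask is zero
        // for all-ASCII input and `trailing_zeros()` gives the index of the
        // first non-ASCII byte.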
        #[inline(always)]
        pub fn mask_ascii(s: u8x16) -> i32 {
            unsafe {
                _mm_movemask_epi8(__m128i::from_bits(s))
            }
        }

    } else {

    }
}

cfg_if! {
    if #[cfg(target_feature = "sse2")] {
        #[inline(always)]
        pub fn simd_is_ascii(s: u8x16) -> bool {
            unsafe {
                _mm_movemask_epi8(__m128i::from_bits(s)) == 0
            }
        }
    } else if #[cfg(target_arch = "aarch64")]{
        #[inline(always)]
        pub fn simd_is_ascii(s: u8x16) -> bool {
            unsafe {
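                // vmaxvq_u8 is a horizontal max across all 16 lanes; every
                // byte is ASCII iff the largest one is below 0x80.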
                vmaxvq_u8(uint8x16_t::from_bits(s)) < 0x80
            }
        }
    } else {
        #[inline(always)]
        pub fn simd_is_ascii(s: u8x16) -> bool {
            // This optimizes better on ARM than
            // the lt formulation.
            let highest_ascii = u8x16::splat(0x7F);
            !s.gt(highest_ascii).any()
        }
    }
}

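// These run on `str` data, i.e. known-valid UTF-8. There, U+0080..=U+00FF
// are encoded with lead bytes 0xC2 and 0xC3, and every code point above
// U+00FF has a lead byte of at least 0xC4, so "all bytes below 0xC4" is
// equivalent to "all characters are Latin1".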
cfg_if! {
    if #[cfg(target_feature = "sse2")] {
        #[inline(always)]
        pub fn simd_is_str_latin1(s: u8x16) -> bool {
            if simd_is_ascii(s) {
                return true;
            }
            let above_str_latin1 = u8x16::splat(0xC4);
            s.lt(above_str_latin1).all()
        }
    } else if #[cfg(target_arch = "aarch64")]{
        #[inline(always)]
        pub fn simd_is_str_latin1(s: u8x16) -> bool {
            unsafe {
                vmaxvq_u8(uint8x16_t::from_bits(s)) < 0xC4
            }
        }
    } else {
        #[inline(always)]
        pub fn simd_is_str_latin1(s: u8x16) -> bool {
            let above_str_latin1 = u8x16::splat(0xC4);
            s.lt(above_str_latin1).all()
        }
    }
}

cfg_if! {
    if #[cfg(target_arch = "aarch64")]{
        #[inline(always)]
        pub fn simd_is_basic_latin(s: u16x8) -> bool {
            unsafe {
                vmaxvq_u16(uint16x8_t::from_bits(s)) < 0x80
            }
        }

        #[inline(always)]
        pub fn simd_is_latin1(s: u16x8) -> bool {
            unsafe {
                vmaxvq_u16(uint16x8_t::from_bits(s)) < 0x100
            }
        }
    } else {
        #[inline(always)]
        pub fn simd_is_basic_latin(s: u16x8) -> bool {
            let above_ascii = u16x8::splat(0x80);
            s.lt(above_ascii).all()
        }

        #[inline(always)]
        pub fn simd_is_latin1(s: u16x8) -> bool {
            // For some reason, on SSE2 this formulation
            // seems faster in this case while the above
            // function is better the other way round...
            let highest_latin1 = u16x8::splat(0xFF);
            !s.gt(highest_latin1).any()
        }
    }
}

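// A u16 is a UTF-16 surrogate iff its five high bits are 11011, i.e. it
// lies in 0xD800..=0xDFFF, hence the 0xF800 mask below.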
#[inline(always)]
pub fn contains_surrogates(s: u16x8) -> bool {
    let mask = u16x8::splat(0xF800);
    let surrogate_bits = u16x8::splat(0xD800);
    (s & mask).eq(surrogate_bits).any()
}

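// On aarch64, the cheapest early exit in `is_u16x8_bidi` is a horizontal max
// against 0x0590 (the start of Hebrew); elsewhere it is an `all()` test on a
// lane mask. This macro pair lets `is_u16x8_bidi` below stay
// platform-independent.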
cfg_if! {
    if #[cfg(target_arch = "aarch64")]{
        macro_rules! aarch64_return_false_if_below_hebrew {
            ($s:ident) => ({
                unsafe {
                    if vmaxvq_u16(uint16x8_t::from_bits($s)) < 0x0590 {
                        return false;
                    }
                }
            })
        }

        macro_rules! non_aarch64_return_false_if_all {
            ($s:ident) => ()
        }
    } else {
        macro_rules! aarch64_return_false_if_below_hebrew {
            ($s:ident) => ()
        }

        macro_rules! non_aarch64_return_false_if_all {
            ($s:ident) => ({
                if $s.all() {
                    return false;
                }
            })
        }
    }
}

macro_rules! in_range16x8 {
    ($s:ident, $start:expr, $end:expr) => {{
        // SIMD sub is wrapping
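        // If a lane is below $start, the subtraction wraps around to a huge
        // value, so this single unsigned comparison implements
        // $start <= lane < $end.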
        ($s - u16x8::splat($start)).lt(u16x8::splat($end - $start))
    }};
}

#[inline(always)]
pub fn is_u16x8_bidi(s: u16x8) -> bool {
    // We try to first quickly refute the RTLness of the vector. If that
    // fails, we do the real RTL check, so in that case we end up wasting
    // the work for the up-front quick checks. Even the quick-check is
    // two-fold in order to return `false` ASAP if everything is below
    // Hebrew.

    aarch64_return_false_if_below_hebrew!(s);

    let below_hebrew = s.lt(u16x8::splat(0x0590));

    non_aarch64_return_false_if_all!(below_hebrew);

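    // Code units below Hebrew (0x0590), in 0x0900..0x200F (LTR scripts and
    // punctuation before RLM), or in 0x2068..0xD802 (the rest of the BMP up
    // to the first RTL-plane lead surrogate) are never RTL, so if every lane
    // lands in one of those ranges, the vector cannot be bidi.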
    if (below_hebrew | in_range16x8!(s, 0x0900, 0x200F) | in_range16x8!(s, 0x2068, 0xD802)).all() {
        return false;
    }

    // Quick refutation failed. Let's do the full check.

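    // The ranges below cover, in order: the RTL blocks from Hebrew up to
    // (but not including) Devanagari; Hebrew presentation forms and Arabic
    // Presentation Forms-A; Arabic Presentation Forms-B; lead surrogates for
    // the supplementary-plane RTL scripts; and the directional formatting
    // characters RLM, RLE, RLO and RLI.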
    (in_range16x8!(s, 0x0590, 0x0900)
        | in_range16x8!(s, 0xFB1D, 0xFE00)
        | in_range16x8!(s, 0xFE70, 0xFEFF)
        | in_range16x8!(s, 0xD802, 0xD804)
        | in_range16x8!(s, 0xD83A, 0xD83C)
        | s.eq(u16x8::splat(0x200F))
        | s.eq(u16x8::splat(0x202B))
        | s.eq(u16x8::splat(0x202E))
        | s.eq(u16x8::splat(0x2067)))
    .any()
}

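// Interleaving with a zero vector zero-extends each byte to a u16: with
// little-endian lane order, the byte pair [b, 0] reads back as the u16 `b`.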
#[inline(always)]
pub fn simd_unpack(s: u8x16) -> (u16x8, u16x8) {
    unsafe {
        let first: u8x16 = shuffle!(
            s,
            u8x16::splat(0),
            [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]
        );
        let second: u8x16 = shuffle!(
            s,
            u8x16::splat(0),
            [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31]
        );
        (u16x8::from_bits(first), u16x8::from_bits(second))
    }
}

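// Note that `_mm_packus_epi16` saturates each lane to 0..=0xFF, whereas the
// portable shuffle keeps the low byte of each lane; the two agree whenever
// the input lanes are already known to be at most 0xFF (e.g. after a
// `simd_is_latin1()` or `simd_is_basic_latin()` check).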
cfg_if! {
    if #[cfg(target_feature = "sse2")] {
        #[inline(always)]
        pub fn simd_pack(a: u16x8, b: u16x8) -> u8x16 {
            unsafe {
                u8x16::from_bits(_mm_packus_epi16(__m128i::from_bits(a), __m128i::from_bits(b)))
            }
        }
    } else {
        #[inline(always)]
        pub fn simd_pack(a: u16x8, b: u16x8) -> u8x16 {
            unsafe {
                let first = u8x16::from_bits(a);
                let second = u8x16::from_bits(b);
                shuffle!(
                    first,
                    second,
                    [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
                )
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_unpack() {
        let ascii: [u8; 16] = [
            0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
            0x75, 0x76,
        ];
        let basic_latin: [u16; 16] = [
            0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
            0x75, 0x76,
        ];
        let simd = unsafe { load16_unaligned(ascii.as_ptr()) };
        let mut vec = Vec::with_capacity(16);
        vec.resize(16, 0u16);
        let (first, second) = simd_unpack(simd);
        let ptr = vec.as_mut_ptr();
        unsafe {
            store8_unaligned(ptr, first);
            store8_unaligned(ptr.add(8), second);
        }
        assert_eq!(&vec[..], &basic_latin[..]);
    }

    #[test]
    fn test_simd_is_basic_latin_success() {
        let ascii: [u8; 16] = [
            0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
            0x75, 0x76,
        ];
        let basic_latin: [u16; 16] = [
            0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
            0x75, 0x76,
        ];
        let first = unsafe { load8_unaligned(basic_latin.as_ptr()) };
        let second = unsafe { load8_unaligned(basic_latin.as_ptr().add(8)) };
        let mut vec = Vec::with_capacity(16);
        vec.resize(16, 0u8);
        let ptr = vec.as_mut_ptr();
        assert!(simd_is_basic_latin(first | second));
        unsafe {
            store16_unaligned(ptr, simd_pack(first, second));
        }
        assert_eq!(&vec[..], &ascii[..]);
    }

    #[test]
    fn test_simd_is_basic_latin_c0() {
        let input: [u16; 16] = [
            0x61, 0x62, 0x63, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
            0x75, 0x76,
        ];
        let first = unsafe { load8_unaligned(input.as_ptr()) };
        let second = unsafe { load8_unaligned(input.as_ptr().add(8)) };
        assert!(!simd_is_basic_latin(first | second));
    }

    #[test]
    fn test_simd_is_basic_latin_0fff() {
        let input: [u16; 16] = [
            0x61, 0x62, 0x63, 0x0FFF, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
            0x75, 0x76,
        ];
        let first = unsafe { load8_unaligned(input.as_ptr()) };
        let second = unsafe { load8_unaligned(input.as_ptr().add(8)) };
        assert!(!simd_is_basic_latin(first | second));
    }

    #[test]
    fn test_simd_is_basic_latin_ffff() {
        let input: [u16; 16] = [
            0x61, 0x62, 0x63, 0xFFFF, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
            0x75, 0x76,
        ];
        let first = unsafe { load8_unaligned(input.as_ptr()) };
        let second = unsafe { load8_unaligned(input.as_ptr().add(8)) };
        assert!(!simd_is_basic_latin(first | second));
    }

    #[test]
    fn test_simd_is_ascii_success() {
        let ascii: [u8; 16] = [
            0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
            0x75, 0x76,
        ];
        let simd = unsafe { load16_unaligned(ascii.as_ptr()) };
        assert!(simd_is_ascii(simd));
    }

    #[test]
    fn test_simd_is_ascii_failure() {
        let input: [u8; 16] = [
            0x61, 0x62, 0x63, 0x64, 0x81, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
            0x75, 0x76,
        ];
        let simd = unsafe { load16_unaligned(input.as_ptr()) };
        assert!(!simd_is_ascii(simd));
    }

    #[cfg(target_feature = "sse2")]
    #[test]
    fn test_check_ascii() {
        let input: [u8; 16] = [
            0x61, 0x62, 0x63, 0x64, 0x81, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
            0x75, 0x76,
        ];
        let simd = unsafe { load16_unaligned(input.as_ptr()) };
        let mask = mask_ascii(simd);
        assert_ne!(mask, 0);
        assert_eq!(mask.trailing_zeros(), 4);
    }

    #[test]
    fn test_alu() {
        let input: [u8; 16] = [
            0x61, 0x62, 0x63, 0x64, 0x81, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
            0x75, 0x76,
        ];
        let mut alu = 0u64;
        unsafe {
            ::std::ptr::copy_nonoverlapping(input.as_ptr(), &mut alu as *mut u64 as *mut u8, 8);
        }
        let masked = alu & 0x8080808080808080;
        assert_eq!(masked.trailing_zeros(), 39);
    }
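
    // The two tests below are illustrative additions (not part of the
    // original suite) exercising `simd_byte_swap` and `contains_surrogates`.
    #[test]
    fn test_simd_byte_swap() {
        let input: [u16; 8] = [0x1234, 0xABCD, 0x0001, 0x0100, 0xFF00, 0x00FF, 0x8081, 0x7F7E];
        let expected: [u16; 8] = [0x3412, 0xCDAB, 0x0100, 0x0001, 0x00FF, 0xFF00, 0x8180, 0x7E7F];
        let simd = unsafe { load8_unaligned(input.as_ptr()) };
        let mut vec = Vec::with_capacity(8);
        vec.resize(8, 0u16);
        unsafe {
            store8_unaligned(vec.as_mut_ptr(), simd_byte_swap(simd));
        }
        assert_eq!(&vec[..], &expected[..]);
    }

    #[test]
    fn test_contains_surrogates() {
        // 0xD83D/0xDE0A is a surrogate pair; the other lanes are BMP scalars.
        let with: [u16; 8] = [0x61, 0x62, 0xD83D, 0xDE0A, 0x65, 0x66, 0x67, 0x68];
        let without: [u16; 8] = [0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68];
        assert!(contains_surrogates(unsafe { load8_unaligned(with.as_ptr()) }));
        assert!(!contains_surrogates(unsafe { load8_unaligned(without.as_ptr()) }));
    }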
}