1 //! Shuffle vector lanes with run-time indices.
2 
3 use crate::*;
4 
/// Shuffles the lanes of a vector using indices that are only known at
/// run-time.
pub trait Shuffle1Dyn {
    /// Vector type that carries one lane index per output lane.
    type Indices;

    /// Returns a vector of the same type as `self` whose lanes are picked
    /// from `self` by the given run-time indices.
    ///
    /// NOTE(review): how out-of-range indices are handled is decided by each
    /// implementation and may differ between targets — confirm against the
    /// implementation in use.
    fn shuffle1_dyn(self, _: Self::Indices) -> Self;
}
9 
// Portable implementation: builds the result one lane at a time.
//
// Output lane `k` receives the lane of `self` whose position is stored in
// lane `k` of `indices`. The accumulator starts out as an all-zero vector and
// every lane of it is overwritten exactly once.
macro_rules! impl_fallback {
    ($id:ident) => {
        impl Shuffle1Dyn for $id {
            type Indices = Self;
            #[inline]
            fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
                (0..$id::lanes()).fold(Self::splat(0), |acc, lane| {
                    // Position inside `self` that feeds output lane `lane`.
                    let source_lane = indices.extract(lane) as usize;
                    acc.replace(lane, self.extract(source_lane))
                })
            }
        }
    };
}
27 
// Implements `Shuffle1Dyn` for an unsigned integer vector type, selecting an
// implementation from the compilation target:
//
// * `u8x8` / `u8x16`: x86 SSSE3 byte shuffles, ARM/AArch64 NEON table
//   look-ups, or the portable `impl_fallback!`.
// * `u16x8`: expands its indices to byte indices and defers to `u8x16`.
// * `u32x4` / `u64x2`: x86 AVX variable permutes, otherwise expand the indices
//   to byte indices and defer to `u8x16`.
// * `u128x1`: a single lane, so the input is returned unchanged.
// * every other type: `impl_fallback!`.
//
// The NEON branches used to be guarded by the misspelled cfg keys
// `target_aarch` / `targt_arch`. Those keys are never set, so the branches
// could not be selected; they are spelled `target_arch` here.
macro_rules! impl_shuffle1_dyn {
    (u8x8) => {
        cfg_if! {
            if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"),
                         target_feature = "ssse3"))] {
                impl Shuffle1Dyn for u8x8 {
                    type Indices = Self;
                    #[inline]
                    fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
                        #[cfg(target_arch = "x86")]
                        use crate::arch::x86::_mm_shuffle_pi8;
                        #[cfg(target_arch = "x86_64")]
                        use crate::arch::x86_64::_mm_shuffle_pi8;

                        // SAFETY: this branch is only compiled when ssse3 is
                        // enabled at compile-time, so the binary can only run
                        // on CPUs that support the intrinsic.
                        unsafe {
                            crate::mem::transmute(
                                _mm_shuffle_pi8(
                                    crate::mem::transmute(self.0),
                                    crate::mem::transmute(indices.0)
                                )
                            )
                        }
                    }
                }
            } else if #[cfg(all(
                any(
                    all(target_arch = "aarch64", target_feature = "neon"),
                    all(target_arch = "arm", target_feature = "v7",
                        target_feature = "neon")
                ),
                any(feature = "core_arch", libcore_neon)
            ))] {
                impl Shuffle1Dyn for u8x8 {
                    type Indices = Self;
                    #[inline]
                    fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
                        #[cfg(target_arch = "aarch64")]
                        use crate::arch::aarch64::vtbl1_u8;
                        #[cfg(target_arch = "arm")]
                        use crate::arch::arm::vtbl1_u8;

                        // SAFETY: this is safe because the binary is compiled
                        // with neon enabled at compile-time and can therefore
                        // only run on CPUs that have it enabled.
                        unsafe {
                            Simd(mem::transmute(
                                vtbl1_u8(mem::transmute(self.0),
                                         crate::mem::transmute(indices.0))
                            ))
                        }
                    }
                }
            } else {
                impl_fallback!(u8x8);
            }
        }
    };
    (u8x16) => {
        cfg_if! {
            if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"),
                         target_feature = "ssse3"))] {
                impl Shuffle1Dyn for u8x16 {
                    type Indices = Self;
                    #[inline]
                    fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
                        #[cfg(target_arch = "x86")]
                        use crate::arch::x86::_mm_shuffle_epi8;
                        #[cfg(target_arch = "x86_64")]
                        use crate::arch::x86_64::_mm_shuffle_epi8;
                        // SAFETY: this is safe because the binary is compiled
                        // with ssse3 enabled at compile-time and can therefore
                        // only run on CPUs that have it enabled.
                        unsafe {
                            Simd(mem::transmute(
                                _mm_shuffle_epi8(mem::transmute(self.0),
                                                 crate::mem::transmute(indices))
                            ))
                        }
                    }
                }
            } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon",
                                any(feature = "core_arch", libcore_neon)))] {
                impl Shuffle1Dyn for u8x16 {
                    type Indices = Self;
                    #[inline]
                    fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
                        use crate::arch::aarch64::vqtbl1q_u8;

                        // SAFETY: this is safe because the binary is compiled
                        // with neon enabled at compile-time and can therefore
                        // only run on CPUs that have it enabled.
                        unsafe {
                            Simd(mem::transmute(
                                vqtbl1q_u8(mem::transmute(self.0),
                                           crate::mem::transmute(indices.0))
                            ))
                        }
                    }
                }
            } else if #[cfg(all(target_arch = "arm", target_feature = "v7",
                                target_feature = "neon",
                                any(feature = "core_arch", libcore_neon)))] {
                impl Shuffle1Dyn for u8x16 {
                    type Indices = Self;
                    #[inline]
                    fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
                        use crate::arch::arm::{
                            uint8x8_t, uint8x8x2_t, vtbl2_u8,
                        };

                        // SAFETY: this is safe because the binary is compiled
                        // with neon enabled at compile-time and can therefore
                        // only run on CPUs that have it enabled. Every
                        // transmute below is between 16-byte values made of
                        // plain `u8` lanes.
                        unsafe {
                            // The 16 source bytes become the two halves of
                            // the 16-entry look-up table.
                            let [t_lo, t_hi]: [uint8x8_t; 2] =
                                crate::mem::transmute(self);
                            let table = uint8x8x2_t(t_lo, t_hi);

                            // `vtbl2_u8` produces 8 lanes per call, so the 16
                            // indices are processed as two 8-lane halves.
                            let [i_lo, i_hi]: [uint8x8_t; 2] =
                                crate::mem::transmute(indices);

                            let r_lo: uint8x8_t = vtbl2_u8(table, i_lo);
                            let r_hi: uint8x8_t = vtbl2_u8(table, i_hi);

                            crate::mem::transmute([r_lo, r_hi])
                        }
                    }
                }
            } else {
                impl_fallback!(u8x16);
            }
        }
    };
    (u16x8) => {
        impl Shuffle1Dyn for u16x8 {
            type Indices = Self;
            #[inline]
            fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
                // Turn every 16-bit lane index `k` into the pair of byte
                // indices `2k, 2k + 1` and shuffle the bytes instead.
                let indices: u8x8 = (indices * 2).cast();
                let indices: u8x16 = shuffle!(
                    indices, [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7]
                );
                let v = u8x16::new(
                    0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
                );
                let indices = indices + v;
                // SAFETY: `u16x8` and `u8x16` are both 16 bytes of plain
                // integer lanes.
                unsafe {
                    let s: u8x16 = crate::mem::transmute(self);
                    crate::mem::transmute(s.shuffle1_dyn(indices))
                }
            }
        }
    };
    (u32x4) => {
        cfg_if! {
            if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"),
                         target_feature = "avx"))] {
                impl Shuffle1Dyn for u32x4 {
                    type Indices = Self;
                    #[inline]
                    fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
                        #[cfg(target_arch = "x86")]
                        use crate::arch::x86::{_mm_permutevar_ps};
                        #[cfg(target_arch = "x86_64")]
                        use crate::arch::x86_64::{_mm_permutevar_ps};

                        // SAFETY: this branch is only compiled when avx is
                        // enabled at compile-time, so the binary can only run
                        // on CPUs that support the intrinsic.
                        unsafe {
                            crate::mem::transmute(
                                _mm_permutevar_ps(
                                    crate::mem::transmute(self.0),
                                    crate::mem::transmute(indices.0)
                                )
                            )
                        }
                    }
                }
            } else {
                impl Shuffle1Dyn for u32x4 {
                    type Indices = Self;
                    #[inline]
                    fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
                        // Turn every 32-bit lane index `k` into the byte
                        // indices `4k .. 4k + 3` and shuffle the bytes.
                        let indices: u8x4 = (indices * 4).cast();
                        let indices: u8x16 = shuffle!(
                            indices,
                            [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3]
                        );
                        let v = u8x16::new(
                            0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
                        );
                        let indices = indices + v;
                        // SAFETY: `u32x4` and `u8x16` are both 16 bytes of
                        // plain integer lanes.
                        unsafe {
                            let s: u8x16 = crate::mem::transmute(self);
                            crate::mem::transmute(s.shuffle1_dyn(indices))
                        }
                    }
                }
            }
        }
    };
    (u64x2) => {
        cfg_if! {
            if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"),
                         target_feature = "avx"))] {
                impl Shuffle1Dyn for u64x2 {
                    type Indices = Self;
                    #[inline]
                    fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
                        #[cfg(target_arch = "x86")]
                        use crate::arch::x86::{_mm_permutevar_pd};
                        #[cfg(target_arch = "x86_64")]
                        use crate::arch::x86_64::{_mm_permutevar_pd};
                        // _mm_permutevar_pd uses the _second_ bit of each
                        // element to perform the selection, that is: 0b00 => 0,
                        // 0b10 => 1:
                        let indices = indices << 1;
                        // SAFETY: this branch is only compiled when avx is
                        // enabled at compile-time, so the binary can only run
                        // on CPUs that support the intrinsic.
                        unsafe {
                            crate::mem::transmute(
                                _mm_permutevar_pd(
                                    crate::mem::transmute(self),
                                    crate::mem::transmute(indices)
                                )
                            )
                        }
                    }
                }
            } else {
                impl Shuffle1Dyn for u64x2 {
                    type Indices = Self;
                    #[inline]
                    fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
                        // Turn every 64-bit lane index `k` into the byte
                        // indices `8k .. 8k + 7` and shuffle the bytes.
                        let indices: u8x2 = (indices * 8).cast();
                        let indices: u8x16 = shuffle!(
                            indices,
                            [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
                        );
                        let v = u8x16::new(
                            0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
                        );
                        let indices = indices + v;
                        // SAFETY: `u64x2` and `u8x16` are both 16 bytes of
                        // plain integer lanes.
                        unsafe {
                            let s: u8x16 = crate::mem::transmute(self);
                            crate::mem::transmute(s.shuffle1_dyn(indices))
                        }
                    }
                }
            }
        }
    };
    (u128x1) => {
        impl Shuffle1Dyn for u128x1 {
            type Indices = Self;
            #[inline]
            fn shuffle1_dyn(self, _indices: Self::Indices) -> Self {
                // A single-lane vector has nothing to move; the index is
                // ignored.
                self
            }
        }
    };
    ($id:ident) => { impl_fallback!($id); }
}
294 
295 impl_shuffle1_dyn!(u8x2);
296 impl_shuffle1_dyn!(u8x4);
297 impl_shuffle1_dyn!(u8x8);
298 impl_shuffle1_dyn!(u8x16);
299 impl_shuffle1_dyn!(u8x32);
300 impl_shuffle1_dyn!(u8x64);
301 
302 impl_shuffle1_dyn!(u16x2);
303 impl_shuffle1_dyn!(u16x4);
304 impl_shuffle1_dyn!(u16x8);
305 impl_shuffle1_dyn!(u16x16);
306 impl_shuffle1_dyn!(u16x32);
307 
308 impl_shuffle1_dyn!(u32x2);
309 impl_shuffle1_dyn!(u32x4);
310 impl_shuffle1_dyn!(u32x8);
311 impl_shuffle1_dyn!(u32x16);
312 
313 impl_shuffle1_dyn!(u64x2);
314 impl_shuffle1_dyn!(u64x4);
315 impl_shuffle1_dyn!(u64x8);
316 
317 impl_shuffle1_dyn!(usizex2);
318 impl_shuffle1_dyn!(usizex4);
319 impl_shuffle1_dyn!(usizex8);
320 
321 impl_shuffle1_dyn!(u128x1);
322 impl_shuffle1_dyn!(u128x2);
323 impl_shuffle1_dyn!(u128x4);
324 
// Implements `Shuffle1Dyn` for a vector type `$id` that is not an unsigned
// integer vector by round-tripping through the unsigned vector type `$uid`,
// which also serves as the index type.
macro_rules! impl_shuffle1_dyn_non_u {
    ($id:ident, $uid:ident) => {
        impl Shuffle1Dyn for $id {
            type Indices = $uid;
            #[inline]
            fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
                // Reinterpret the lanes as `$uid`; `transmute` only compiles
                // when both types have the same size.
                let bits: $uid = unsafe { crate::mem::transmute(self) };
                let shuffled: $uid = bits.shuffle1_dyn(indices);
                // Reinterpret the shuffled lanes as the original type again.
                unsafe { crate::mem::transmute(shuffled) }
            }
        }
    };
}
340 
341 impl_shuffle1_dyn_non_u!(i8x2, u8x2);
342 impl_shuffle1_dyn_non_u!(i8x4, u8x4);
343 impl_shuffle1_dyn_non_u!(i8x8, u8x8);
344 impl_shuffle1_dyn_non_u!(i8x16, u8x16);
345 impl_shuffle1_dyn_non_u!(i8x32, u8x32);
346 impl_shuffle1_dyn_non_u!(i8x64, u8x64);
347 
348 impl_shuffle1_dyn_non_u!(i16x2, u16x2);
349 impl_shuffle1_dyn_non_u!(i16x4, u16x4);
350 impl_shuffle1_dyn_non_u!(i16x8, u16x8);
351 impl_shuffle1_dyn_non_u!(i16x16, u16x16);
352 impl_shuffle1_dyn_non_u!(i16x32, u16x32);
353 
354 impl_shuffle1_dyn_non_u!(i32x2, u32x2);
355 impl_shuffle1_dyn_non_u!(i32x4, u32x4);
356 impl_shuffle1_dyn_non_u!(i32x8, u32x8);
357 impl_shuffle1_dyn_non_u!(i32x16, u32x16);
358 
359 impl_shuffle1_dyn_non_u!(i64x2, u64x2);
360 impl_shuffle1_dyn_non_u!(i64x4, u64x4);
361 impl_shuffle1_dyn_non_u!(i64x8, u64x8);
362 
363 impl_shuffle1_dyn_non_u!(isizex2, usizex2);
364 impl_shuffle1_dyn_non_u!(isizex4, usizex4);
365 impl_shuffle1_dyn_non_u!(isizex8, usizex8);
366 
367 impl_shuffle1_dyn_non_u!(i128x1, u128x1);
368 impl_shuffle1_dyn_non_u!(i128x2, u128x2);
369 impl_shuffle1_dyn_non_u!(i128x4, u128x4);
370 
371 impl_shuffle1_dyn_non_u!(m8x2, u8x2);
372 impl_shuffle1_dyn_non_u!(m8x4, u8x4);
373 impl_shuffle1_dyn_non_u!(m8x8, u8x8);
374 impl_shuffle1_dyn_non_u!(m8x16, u8x16);
375 impl_shuffle1_dyn_non_u!(m8x32, u8x32);
376 impl_shuffle1_dyn_non_u!(m8x64, u8x64);
377 
378 impl_shuffle1_dyn_non_u!(m16x2, u16x2);
379 impl_shuffle1_dyn_non_u!(m16x4, u16x4);
380 impl_shuffle1_dyn_non_u!(m16x8, u16x8);
381 impl_shuffle1_dyn_non_u!(m16x16, u16x16);
382 impl_shuffle1_dyn_non_u!(m16x32, u16x32);
383 
384 impl_shuffle1_dyn_non_u!(m32x2, u32x2);
385 impl_shuffle1_dyn_non_u!(m32x4, u32x4);
386 impl_shuffle1_dyn_non_u!(m32x8, u32x8);
387 impl_shuffle1_dyn_non_u!(m32x16, u32x16);
388 
389 impl_shuffle1_dyn_non_u!(m64x2, u64x2);
390 impl_shuffle1_dyn_non_u!(m64x4, u64x4);
391 impl_shuffle1_dyn_non_u!(m64x8, u64x8);
392 
393 impl_shuffle1_dyn_non_u!(msizex2, usizex2);
394 impl_shuffle1_dyn_non_u!(msizex4, usizex4);
395 impl_shuffle1_dyn_non_u!(msizex8, usizex8);
396 
397 impl_shuffle1_dyn_non_u!(m128x1, u128x1);
398 impl_shuffle1_dyn_non_u!(m128x2, u128x2);
399 impl_shuffle1_dyn_non_u!(m128x4, u128x4);
400 
401 impl_shuffle1_dyn_non_u!(f32x2, u32x2);
402 impl_shuffle1_dyn_non_u!(f32x4, u32x4);
403 impl_shuffle1_dyn_non_u!(f32x8, u32x8);
404 impl_shuffle1_dyn_non_u!(f32x16, u32x16);
405 
406 impl_shuffle1_dyn_non_u!(f64x2, u64x2);
407 impl_shuffle1_dyn_non_u!(f64x4, u64x4);
408 impl_shuffle1_dyn_non_u!(f64x8, u64x8);
409 
// Implements `Shuffle1Dyn` for a generic pointer vector type `$id<T>` by
// round-tripping through the unsigned vector type `$uid`, which also serves as
// the index type.
macro_rules! impl_shuffle1_dyn_ptr {
    ($id:ident, $uid:ident) => {
        impl<T> Shuffle1Dyn for $id<T> {
            type Indices = $uid;
            #[inline]
            fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
                // Reinterpret the pointer lanes as `$uid`; `transmute` only
                // compiles when both types have the same size.
                let bits: $uid = unsafe { crate::mem::transmute(self) };
                let shuffled: $uid = bits.shuffle1_dyn(indices);
                // Reinterpret the shuffled lanes as pointers again.
                unsafe { crate::mem::transmute(shuffled) }
            }
        }
    };
}
425 
426 impl_shuffle1_dyn_ptr!(cptrx2, usizex2);
427 impl_shuffle1_dyn_ptr!(cptrx4, usizex4);
428 impl_shuffle1_dyn_ptr!(cptrx8, usizex8);
429 
430 impl_shuffle1_dyn_ptr!(mptrx2, usizex2);
431 impl_shuffle1_dyn_ptr!(mptrx4, usizex4);
432 impl_shuffle1_dyn_ptr!(mptrx8, usizex8);
433