1 //! Shuffle vector lanes with run-time indices. 2 3 use crate::*; 4 5 pub trait Shuffle1Dyn { 6 type Indices; shuffle1_dyn(self, _: Self::Indices) -> Self7 fn shuffle1_dyn(self, _: Self::Indices) -> Self; 8 } 9 10 // Fallback implementation 11 macro_rules! impl_fallback { 12 ($id:ident) => { 13 impl Shuffle1Dyn for $id { 14 type Indices = Self; 15 #[inline] 16 fn shuffle1_dyn(self, indices: Self::Indices) -> Self { 17 let mut result = Self::splat(0); 18 for i in 0..$id::lanes() { 19 result = result 20 .replace(i, self.extract(indices.extract(i) as usize)); 21 } 22 result 23 } 24 } 25 }; 26 } 27 28 macro_rules! impl_shuffle1_dyn { 29 (u8x8) => { 30 cfg_if! { 31 if #[cfg(all( 32 any( 33 all(target_arch = "aarch64", target_feature = "neon"), 34 all(target_arch = "doesnotexist", target_feature = "v7", 35 target_feature = "neon") 36 ), 37 any(feature = "core_arch", libcore_neon) 38 ) 39 )] { 40 impl Shuffle1Dyn for u8x8 { 41 type Indices = Self; 42 #[inline] 43 fn shuffle1_dyn(self, indices: Self::Indices) -> Self { 44 #[cfg(target_arch = "aarch64")] 45 use crate::arch::aarch64::vtbl1_u8; 46 #[cfg(target_arch = "doesnotexist")] 47 use crate::arch::arm::vtbl1_u8; 48 49 // This is safe because the binary is compiled with 50 // neon enabled at compile-time and can therefore only 51 // run on CPUs that have it enabled. 52 unsafe { 53 Simd(mem::transmute( 54 vtbl1_u8(mem::transmute(self.0), 55 crate::mem::transmute(indices.0)) 56 )) 57 } 58 } 59 } 60 } else { 61 impl_fallback!(u8x8); 62 } 63 } 64 }; 65 (u8x16) => { 66 cfg_if! { 67 if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), 68 target_feature = "ssse3"))] { 69 impl Shuffle1Dyn for u8x16 { 70 type Indices = Self; 71 #[inline] 72 fn shuffle1_dyn(self, indices: Self::Indices) -> Self { 73 #[cfg(target_arch = "x86")] 74 use crate::arch::x86::_mm_shuffle_epi8; 75 #[cfg(target_arch = "x86_64")] 76 use crate::arch::x86_64::_mm_shuffle_epi8; 77 // This is safe because the binary is compiled with 78 // ssse3 enabled at compile-time and can therefore only 79 // run on CPUs that have it enabled. 80 unsafe { 81 Simd(mem::transmute( 82 _mm_shuffle_epi8(mem::transmute(self.0), 83 crate::mem::transmute(indices)) 84 )) 85 } 86 } 87 } 88 } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon", 89 any(feature = "core_arch", libcore_neon)))] { 90 impl Shuffle1Dyn for u8x16 { 91 type Indices = Self; 92 #[inline] 93 fn shuffle1_dyn(self, indices: Self::Indices) -> Self { 94 use crate::arch::aarch64::vqtbl1q_u8; 95 96 // This is safe because the binary is compiled with 97 // neon enabled at compile-time and can therefore only 98 // run on CPUs that have it enabled. 99 unsafe { 100 Simd(mem::transmute( 101 vqtbl1q_u8(mem::transmute(self.0), 102 crate::mem::transmute(indices.0)) 103 )) 104 } 105 } 106 } 107 } else if #[cfg(all(target_arch = "doesnotexist", target_feature = "v7", 108 target_feature = "neon", 109 any(feature = "core_arch", libcore_neon)))] { 110 impl Shuffle1Dyn for u8x16 { 111 type Indices = Self; 112 #[inline] 113 fn shuffle1_dyn(self, indices: Self::Indices) -> Self { 114 use crate::arch::arm::vtbl2_u8; 115 116 // This is safe because the binary is compiled with 117 // neon enabled at compile-time and can therefore only 118 // run on CPUs that have it enabled. 119 unsafe { 120 union U { 121 j: u8x16, 122 s: (u8x8, u8x8), 123 } 124 125 let (i0, i1) = U { j: y }.s; 126 127 let r0 = vtbl2_u8( 128 mem::transmute(x), 129 crate::mem::transmute(i0) 130 ); 131 let r1 = vtbl2_u8( 132 mem::transmute(x), 133 crate::mem::transmute(i1) 134 ); 135 136 let r = U { s: (r0, r1) }.j; 137 138 Simd(mem::transmute(r)) 139 } 140 } 141 } 142 } else { 143 impl_fallback!(u8x16); 144 } 145 } 146 }; 147 (u16x8) => { 148 impl Shuffle1Dyn for u16x8 { 149 type Indices = Self; 150 #[inline] 151 fn shuffle1_dyn(self, indices: Self::Indices) -> Self { 152 let indices: u8x8 = (indices * 2).cast(); 153 let indices: u8x16 = shuffle!( 154 indices, [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7] 155 ); 156 let v = u8x16::new( 157 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 158 ); 159 let indices = indices + v; 160 unsafe { 161 let s: u8x16 =crate::mem::transmute(self); 162 crate::mem::transmute(s.shuffle1_dyn(indices)) 163 } 164 } 165 } 166 }; 167 (u32x4) => { 168 cfg_if! { 169 if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), 170 target_feature = "avx"))] { 171 impl Shuffle1Dyn for u32x4 { 172 type Indices = Self; 173 #[inline] 174 fn shuffle1_dyn(self, indices: Self::Indices) -> Self { 175 #[cfg(target_arch = "x86")] 176 use crate::arch::x86::{_mm_permutevar_ps}; 177 #[cfg(target_arch = "x86_64")] 178 use crate::arch::x86_64::{_mm_permutevar_ps}; 179 180 unsafe { 181 crate::mem::transmute( 182 _mm_permutevar_ps( 183 crate::mem::transmute(self.0), 184 crate::mem::transmute(indices.0) 185 ) 186 ) 187 } 188 } 189 } 190 } else { 191 impl Shuffle1Dyn for u32x4 { 192 type Indices = Self; 193 #[inline] 194 fn shuffle1_dyn(self, indices: Self::Indices) -> Self { 195 let indices: u8x4 = (indices * 4).cast(); 196 let indices: u8x16 = shuffle!( 197 indices, 198 [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3] 199 ); 200 let v = u8x16::new( 201 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 202 ); 203 let indices = indices + v; 204 unsafe { 205 let s: u8x16 =crate::mem::transmute(self); 206 crate::mem::transmute(s.shuffle1_dyn(indices)) 207 } 208 } 209 } 210 } 211 } 212 }; 213 (u64x2) => { 214 cfg_if! { 215 if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), 216 target_feature = "avx"))] { 217 impl Shuffle1Dyn for u64x2 { 218 type Indices = Self; 219 #[inline] 220 fn shuffle1_dyn(self, indices: Self::Indices) -> Self { 221 #[cfg(target_arch = "x86")] 222 use crate::arch::x86::{_mm_permutevar_pd}; 223 #[cfg(target_arch = "x86_64")] 224 use crate::arch::x86_64::{_mm_permutevar_pd}; 225 // _mm_permutevar_pd uses the _second_ bit of each 226 // element to perform the selection, that is: 0b00 => 0, 227 // 0b10 => 1: 228 let indices = indices << 1; 229 unsafe { 230 crate::mem::transmute( 231 _mm_permutevar_pd( 232 crate::mem::transmute(self), 233 crate::mem::transmute(indices) 234 ) 235 ) 236 } 237 } 238 } 239 } else { 240 impl Shuffle1Dyn for u64x2 { 241 type Indices = Self; 242 #[inline] 243 fn shuffle1_dyn(self, indices: Self::Indices) -> Self { 244 let indices: u8x2 = (indices * 8).cast(); 245 let indices: u8x16 = shuffle!( 246 indices, 247 [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] 248 ); 249 let v = u8x16::new( 250 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 251 ); 252 let indices = indices + v; 253 unsafe { 254 let s: u8x16 =crate::mem::transmute(self); 255 crate::mem::transmute(s.shuffle1_dyn(indices)) 256 } 257 } 258 } 259 } 260 } 261 }; 262 (u128x1) => { 263 impl Shuffle1Dyn for u128x1 { 264 type Indices = Self; 265 #[inline] 266 fn shuffle1_dyn(self, _indices: Self::Indices) -> Self { 267 self 268 } 269 } 270 }; 271 ($id:ident) => { impl_fallback!($id); } 272 } 273 274 impl_shuffle1_dyn!(u8x2); 275 impl_shuffle1_dyn!(u8x4); 276 impl_shuffle1_dyn!(u8x8); 277 impl_shuffle1_dyn!(u8x16); 278 impl_shuffle1_dyn!(u8x32); 279 impl_shuffle1_dyn!(u8x64); 280 281 impl_shuffle1_dyn!(u16x2); 282 impl_shuffle1_dyn!(u16x4); 283 impl_shuffle1_dyn!(u16x8); 284 impl_shuffle1_dyn!(u16x16); 285 impl_shuffle1_dyn!(u16x32); 286 287 impl_shuffle1_dyn!(u32x2); 288 impl_shuffle1_dyn!(u32x4); 289 impl_shuffle1_dyn!(u32x8); 290 impl_shuffle1_dyn!(u32x16); 291 292 impl_shuffle1_dyn!(u64x2); 293 impl_shuffle1_dyn!(u64x4); 294 impl_shuffle1_dyn!(u64x8); 295 296 impl_shuffle1_dyn!(usizex2); 297 impl_shuffle1_dyn!(usizex4); 298 impl_shuffle1_dyn!(usizex8); 299 300 impl_shuffle1_dyn!(u128x1); 301 impl_shuffle1_dyn!(u128x2); 302 impl_shuffle1_dyn!(u128x4); 303 304 // Implementation for non-unsigned vector types 305 macro_rules! impl_shuffle1_dyn_non_u { 306 ($id:ident, $uid:ident) => { 307 impl Shuffle1Dyn for $id { 308 type Indices = $uid; 309 #[inline] 310 fn shuffle1_dyn(self, indices: Self::Indices) -> Self { 311 unsafe { 312 let u: $uid = crate::mem::transmute(self); 313 crate::mem::transmute(u.shuffle1_dyn(indices)) 314 } 315 } 316 } 317 }; 318 } 319 320 impl_shuffle1_dyn_non_u!(i8x2, u8x2); 321 impl_shuffle1_dyn_non_u!(i8x4, u8x4); 322 impl_shuffle1_dyn_non_u!(i8x8, u8x8); 323 impl_shuffle1_dyn_non_u!(i8x16, u8x16); 324 impl_shuffle1_dyn_non_u!(i8x32, u8x32); 325 impl_shuffle1_dyn_non_u!(i8x64, u8x64); 326 327 impl_shuffle1_dyn_non_u!(i16x2, u16x2); 328 impl_shuffle1_dyn_non_u!(i16x4, u16x4); 329 impl_shuffle1_dyn_non_u!(i16x8, u16x8); 330 impl_shuffle1_dyn_non_u!(i16x16, u16x16); 331 impl_shuffle1_dyn_non_u!(i16x32, u16x32); 332 333 impl_shuffle1_dyn_non_u!(i32x2, u32x2); 334 impl_shuffle1_dyn_non_u!(i32x4, u32x4); 335 impl_shuffle1_dyn_non_u!(i32x8, u32x8); 336 impl_shuffle1_dyn_non_u!(i32x16, u32x16); 337 338 impl_shuffle1_dyn_non_u!(i64x2, u64x2); 339 impl_shuffle1_dyn_non_u!(i64x4, u64x4); 340 impl_shuffle1_dyn_non_u!(i64x8, u64x8); 341 342 impl_shuffle1_dyn_non_u!(isizex2, usizex2); 343 impl_shuffle1_dyn_non_u!(isizex4, usizex4); 344 impl_shuffle1_dyn_non_u!(isizex8, usizex8); 345 346 impl_shuffle1_dyn_non_u!(i128x1, u128x1); 347 impl_shuffle1_dyn_non_u!(i128x2, u128x2); 348 impl_shuffle1_dyn_non_u!(i128x4, u128x4); 349 350 impl_shuffle1_dyn_non_u!(m8x2, u8x2); 351 impl_shuffle1_dyn_non_u!(m8x4, u8x4); 352 impl_shuffle1_dyn_non_u!(m8x8, u8x8); 353 impl_shuffle1_dyn_non_u!(m8x16, u8x16); 354 impl_shuffle1_dyn_non_u!(m8x32, u8x32); 355 impl_shuffle1_dyn_non_u!(m8x64, u8x64); 356 357 impl_shuffle1_dyn_non_u!(m16x2, u16x2); 358 impl_shuffle1_dyn_non_u!(m16x4, u16x4); 359 impl_shuffle1_dyn_non_u!(m16x8, u16x8); 360 impl_shuffle1_dyn_non_u!(m16x16, u16x16); 361 impl_shuffle1_dyn_non_u!(m16x32, u16x32); 362 363 impl_shuffle1_dyn_non_u!(m32x2, u32x2); 364 impl_shuffle1_dyn_non_u!(m32x4, u32x4); 365 impl_shuffle1_dyn_non_u!(m32x8, u32x8); 366 impl_shuffle1_dyn_non_u!(m32x16, u32x16); 367 368 impl_shuffle1_dyn_non_u!(m64x2, u64x2); 369 impl_shuffle1_dyn_non_u!(m64x4, u64x4); 370 impl_shuffle1_dyn_non_u!(m64x8, u64x8); 371 372 impl_shuffle1_dyn_non_u!(msizex2, usizex2); 373 impl_shuffle1_dyn_non_u!(msizex4, usizex4); 374 impl_shuffle1_dyn_non_u!(msizex8, usizex8); 375 376 impl_shuffle1_dyn_non_u!(m128x1, u128x1); 377 impl_shuffle1_dyn_non_u!(m128x2, u128x2); 378 impl_shuffle1_dyn_non_u!(m128x4, u128x4); 379 380 impl_shuffle1_dyn_non_u!(f32x2, u32x2); 381 impl_shuffle1_dyn_non_u!(f32x4, u32x4); 382 impl_shuffle1_dyn_non_u!(f32x8, u32x8); 383 impl_shuffle1_dyn_non_u!(f32x16, u32x16); 384 385 impl_shuffle1_dyn_non_u!(f64x2, u64x2); 386 impl_shuffle1_dyn_non_u!(f64x4, u64x4); 387 impl_shuffle1_dyn_non_u!(f64x8, u64x8); 388 389 // Implementation for non-unsigned vector types 390 macro_rules! impl_shuffle1_dyn_ptr { 391 ($id:ident, $uid:ident) => { 392 impl<T> Shuffle1Dyn for $id<T> { 393 type Indices = $uid; 394 #[inline] 395 fn shuffle1_dyn(self, indices: Self::Indices) -> Self { 396 unsafe { 397 let u: $uid = crate::mem::transmute(self); 398 crate::mem::transmute(u.shuffle1_dyn(indices)) 399 } 400 } 401 } 402 }; 403 } 404 405 impl_shuffle1_dyn_ptr!(cptrx2, usizex2); 406 impl_shuffle1_dyn_ptr!(cptrx4, usizex4); 407 impl_shuffle1_dyn_ptr!(cptrx8, usizex8); 408 409 impl_shuffle1_dyn_ptr!(mptrx2, usizex2); 410 impl_shuffle1_dyn_ptr!(mptrx4, usizex4); 411 impl_shuffle1_dyn_ptr!(mptrx8, usizex8); 412