1 // crate minimums: sse2, x86_64 2 3 use core::arch::x86_64::{__m128i, __m256i}; 4 use crate::types::*; 5 6 mod sse2; 7 8 #[derive(Copy, Clone)] 9 pub struct YesS3; 10 #[derive(Copy, Clone)] 11 pub struct NoS3; 12 13 #[derive(Copy, Clone)] 14 pub struct YesS4; 15 #[derive(Copy, Clone)] 16 pub struct NoS4; 17 18 #[derive(Copy, Clone)] 19 pub struct YesA1; 20 #[derive(Copy, Clone)] 21 pub struct NoA1; 22 23 #[derive(Copy, Clone)] 24 pub struct YesA2; 25 #[derive(Copy, Clone)] 26 pub struct NoA2; 27 28 #[derive(Copy, Clone)] 29 pub struct YesNI; 30 #[derive(Copy, Clone)] 31 pub struct NoNI; 32 33 use core::marker::PhantomData; 34 35 #[derive(Copy, Clone)] 36 pub struct SseMachine<S3, S4, NI>(PhantomData<(S3, S4, NI)>); 37 impl<S3: Copy, S4: Copy, NI: Copy> Machine for SseMachine<S3, S4, NI> 38 where 39 sse2::u128x1_sse2<S3, S4, NI>: Swap64, 40 sse2::u64x2_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>, 41 sse2::u32x4_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>, 42 sse2::u64x4_sse2<S3, S4, NI>: BSwap + Words4, 43 sse2::u128x1_sse2<S3, S4, NI>: BSwap, 44 sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x2x2_sse2<S3, S4, NI>>, 45 sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x4_sse2<S3, S4, NI>>, 46 sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u32x4x2_sse2<S3, S4, NI>>, 47 sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u64x2x4_sse2<S3, S4, NI>>, 48 sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u32x4x4_sse2<S3, S4, NI>>, 49 { 50 type u32x4 = sse2::u32x4_sse2<S3, S4, NI>; 51 type u64x2 = sse2::u64x2_sse2<S3, S4, NI>; 52 type u128x1 = sse2::u128x1_sse2<S3, S4, NI>; 53 54 type u32x4x2 = sse2::u32x4x2_sse2<S3, S4, NI>; 55 type u64x2x2 = sse2::u64x2x2_sse2<S3, S4, NI>; 56 type u64x4 = sse2::u64x4_sse2<S3, S4, NI>; 57 type u128x2 = sse2::u128x2_sse2<S3, S4, NI>; 58 59 type u32x4x4 = sse2::u32x4x4_sse2<S3, S4, NI>; 60 type u64x2x4 = sse2::u64x2x4_sse2<S3, S4, NI>; 61 type u128x4 = sse2::u128x4_sse2<S3, S4, NI>; 62 63 #[inline(always)] instance() -> Self64 unsafe fn instance() -> Self { 65 SseMachine(PhantomData) 66 } 67 } 68 69 #[derive(Copy, Clone)] 70 pub struct Avx2Machine<NI>(PhantomData<NI>); 71 impl<NI: Copy> Machine for Avx2Machine<NI> 72 where 73 sse2::u128x1_sse2<YesS3, YesS4, NI>: BSwap + Swap64, 74 sse2::u64x2_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>, 75 sse2::u32x4_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>, 76 sse2::u64x4_sse2<YesS3, YesS4, NI>: BSwap + Words4, 77 { 78 type u32x4 = sse2::u32x4_sse2<YesS3, YesS4, NI>; 79 type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>; 80 type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>; 81 82 type u32x4x2 = sse2::u32x4x2_sse2<YesS3, YesS4, NI>; 83 type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>; 84 type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>; 85 type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>; 86 87 type u32x4x4 = sse2::avx2::u32x4x4_avx2<NI>; 88 type u64x2x4 = sse2::u64x2x4_sse2<YesS3, YesS4, NI>; 89 type u128x4 = sse2::u128x4_sse2<YesS3, YesS4, NI>; 90 91 #[inline(always)] instance() -> Self92 unsafe fn instance() -> Self { 93 Avx2Machine(PhantomData) 94 } 95 } 96 97 pub type SSE2 = SseMachine<NoS3, NoS4, NoNI>; 98 pub type SSSE3 = SseMachine<YesS3, NoS4, NoNI>; 99 pub type SSE41 = SseMachine<YesS3, YesS4, NoNI>; 100 /// AVX but not AVX2: only 128-bit integer operations, but use VEX versions of everything 101 /// to avoid expensive SSE/VEX conflicts. 102 pub type AVX = SseMachine<YesS3, YesS4, NoNI>; 103 pub type AVX2 = Avx2Machine<NoNI>; 104 105 /// Generic wrapper for unparameterized storage of any of the possible impls. 106 /// Converting into and out of this type should be essentially free, although it may be more 107 /// aligned than a particular impl requires. 108 #[allow(non_camel_case_types)] 109 #[derive(Copy, Clone)] 110 pub union vec128_storage { 111 u32x4: [u32; 4], 112 u64x2: [u64; 2], 113 u128x1: [u128; 1], 114 sse2: __m128i, 115 } 116 impl Store<vec128_storage> for vec128_storage { 117 #[inline(always)] unpack(p: vec128_storage) -> Self118 unsafe fn unpack(p: vec128_storage) -> Self { 119 p 120 } 121 } 122 impl<'a> Into<&'a [u32; 4]> for &'a vec128_storage { 123 #[inline(always)] into(self) -> &'a [u32; 4]124 fn into(self) -> &'a [u32; 4] { 125 unsafe { &self.u32x4 } 126 } 127 } 128 impl Into<vec128_storage> for [u32; 4] { 129 #[inline(always)] into(self) -> vec128_storage130 fn into(self) -> vec128_storage { 131 vec128_storage { u32x4: self } 132 } 133 } 134 impl Default for vec128_storage { 135 #[inline(always)] default() -> Self136 fn default() -> Self { 137 vec128_storage { u128x1: [0] } 138 } 139 } 140 impl Eq for vec128_storage {} 141 impl PartialEq for vec128_storage { 142 #[inline(always)] eq(&self, rhs: &Self) -> bool143 fn eq(&self, rhs: &Self) -> bool { 144 unsafe { self.u128x1 == rhs.u128x1 } 145 } 146 } 147 148 #[allow(non_camel_case_types)] 149 #[derive(Copy, Clone)] 150 pub union vec256_storage { 151 u32x8: [u32; 8], 152 u64x4: [u64; 4], 153 u128x2: [u128; 2], 154 sse2: [vec128_storage; 2], 155 avx: __m256i, 156 } 157 impl Into<vec256_storage> for [u64; 4] { 158 #[inline(always)] into(self) -> vec256_storage159 fn into(self) -> vec256_storage { 160 vec256_storage { u64x4: self } 161 } 162 } 163 impl Default for vec256_storage { 164 #[inline(always)] default() -> Self165 fn default() -> Self { 166 vec256_storage { u128x2: [0, 0] } 167 } 168 } 169 impl vec256_storage { new128(xs: [vec128_storage; 2]) -> Self170 pub fn new128(xs: [vec128_storage; 2]) -> Self { 171 Self { sse2: xs } 172 } split128(self) -> [vec128_storage; 2]173 pub fn split128(self) -> [vec128_storage; 2] { 174 unsafe { self.sse2 } 175 } 176 } 177 impl Eq for vec256_storage {} 178 impl PartialEq for vec256_storage { 179 #[inline(always)] eq(&self, rhs: &Self) -> bool180 fn eq(&self, rhs: &Self) -> bool { 181 unsafe { self.sse2 == rhs.sse2 } 182 } 183 } 184 185 #[allow(non_camel_case_types)] 186 #[derive(Copy, Clone)] 187 pub union vec512_storage { 188 u32x16: [u32; 16], 189 u64x8: [u64; 8], 190 u128x4: [u128; 4], 191 sse2: [vec128_storage; 4], 192 avx: [vec256_storage; 2], 193 } 194 impl Default for vec512_storage { 195 #[inline(always)] default() -> Self196 fn default() -> Self { 197 vec512_storage { 198 u128x4: [0, 0, 0, 0], 199 } 200 } 201 } 202 impl vec512_storage { new128(xs: [vec128_storage; 4]) -> Self203 pub fn new128(xs: [vec128_storage; 4]) -> Self { 204 Self { sse2: xs } 205 } split128(self) -> [vec128_storage; 4]206 pub fn split128(self) -> [vec128_storage; 4] { 207 unsafe { self.sse2 } 208 } 209 } 210 impl Eq for vec512_storage {} 211 impl PartialEq for vec512_storage { 212 #[inline(always)] eq(&self, rhs: &Self) -> bool213 fn eq(&self, rhs: &Self) -> bool { 214 unsafe { self.avx == rhs.avx } 215 } 216 } 217 218 macro_rules! impl_into { 219 ($storage:ident, $array:ty, $name:ident) => { 220 impl Into<$array> for $storage { 221 #[inline(always)] 222 fn into(self) -> $array { 223 unsafe { self.$name } 224 } 225 } 226 }; 227 } 228 impl_into!(vec128_storage, [u32; 4], u32x4); 229 impl_into!(vec128_storage, [u64; 2], u64x2); 230 impl_into!(vec128_storage, [u128; 1], u128x1); 231 impl_into!(vec256_storage, [u32; 8], u32x8); 232 impl_into!(vec256_storage, [u64; 4], u64x4); 233 impl_into!(vec256_storage, [u128; 2], u128x2); 234 impl_into!(vec512_storage, [u32; 16], u32x16); 235 impl_into!(vec512_storage, [u64; 8], u64x8); 236 impl_into!(vec512_storage, [u128; 4], u128x4); 237 238 /// Generate the full set of optimized implementations to take advantage of the most important 239 /// hardware feature sets. 240 /// 241 /// This dispatcher is suitable for maximizing throughput. 242 #[macro_export] 243 macro_rules! dispatch { 244 ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { 245 #[cfg(feature = "std")] 246 $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { 247 #[inline(always)] 248 fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body 249 use std::arch::x86_64::*; 250 #[target_feature(enable = "avx2")] 251 unsafe fn impl_avx2($($arg: $argty),*) -> $ret { 252 let ret = fn_impl($crate::x86_64::AVX2::instance(), $($arg),*); 253 _mm256_zeroupper(); 254 ret 255 } 256 #[target_feature(enable = "avx")] 257 #[target_feature(enable = "sse4.1")] 258 #[target_feature(enable = "ssse3")] 259 unsafe fn impl_avx($($arg: $argty),*) -> $ret { 260 let ret = fn_impl($crate::x86_64::AVX::instance(), $($arg),*); 261 _mm256_zeroupper(); 262 ret 263 } 264 #[target_feature(enable = "sse4.1")] 265 #[target_feature(enable = "ssse3")] 266 unsafe fn impl_sse41($($arg: $argty),*) -> $ret { 267 fn_impl($crate::x86_64::SSE41::instance(), $($arg),*) 268 } 269 #[target_feature(enable = "ssse3")] 270 unsafe fn impl_ssse3($($arg: $argty),*) -> $ret { 271 fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*) 272 } 273 #[target_feature(enable = "sse2")] 274 unsafe fn impl_sse2($($arg: $argty),*) -> $ret { 275 fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) 276 } 277 unsafe { 278 if is_x86_feature_detected!("avx2") { 279 impl_avx2($($arg),*) 280 } else if is_x86_feature_detected!("avx") { 281 impl_avx($($arg),*) 282 } else if is_x86_feature_detected!("sse4.1") { 283 impl_sse41($($arg),*) 284 } else if is_x86_feature_detected!("ssse3") { 285 impl_ssse3($($arg),*) 286 } else if is_x86_feature_detected!("sse2") { 287 impl_sse2($($arg),*) 288 } else { 289 unimplemented!() 290 } 291 } 292 } 293 #[cfg(not(feature = "std"))] 294 #[inline(always)] 295 $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { 296 unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body 297 unsafe { 298 if cfg!(target_feature = "avx2") { 299 fn_impl($crate::x86_64::AVX2::instance(), $($arg),*) 300 } else if cfg!(target_feature = "avx") { 301 fn_impl($crate::x86_64::AVX::instance(), $($arg),*) 302 } else if cfg!(target_feature = "sse4.1") { 303 fn_impl($crate::x86_64::SSE41::instance(), $($arg),*) 304 } else if cfg!(target_feature = "ssse3") { 305 fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*) 306 } else { 307 fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) 308 } 309 } 310 } 311 }; 312 ($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => { 313 dispatch!($mach, $MTy, { 314 $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body 315 }); 316 } 317 } 318 319 /// Generate only the basic implementations necessary to be able to operate efficiently on 128-bit 320 /// vectors on this platfrom. For x86-64, that would mean SSE2 and AVX. 321 /// 322 /// This dispatcher is suitable for vector operations that do not benefit from advanced hardware 323 /// features (e.g. because they are done infrequently), so minimizing their contribution to code 324 /// size is more important. 325 #[macro_export] 326 macro_rules! dispatch_light128 { 327 ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { 328 #[cfg(feature = "std")] 329 $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret { 330 #[inline(always)] 331 fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body 332 use std::arch::x86_64::*; 333 #[target_feature(enable = "avx")] 334 unsafe fn impl_avx($($arg: $argty),*) -> $ret { 335 fn_impl($crate::x86_64::AVX::instance(), $($arg),*) 336 } 337 #[target_feature(enable = "sse2")] 338 unsafe fn impl_sse2($($arg: $argty),*) -> $ret { 339 fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) 340 } 341 unsafe { 342 if is_x86_feature_detected!("avx") { 343 impl_avx($($arg),*) 344 } else if is_x86_feature_detected!("sse2") { 345 impl_sse2($($arg),*) 346 } else { 347 unimplemented!() 348 } 349 } 350 } 351 #[cfg(not(feature = "std"))] 352 #[inline(always)] 353 $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { 354 unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body 355 unsafe { 356 if cfg!(target_feature = "avx2") { 357 fn_impl($crate::x86_64::AVX2::instance(), $($arg),*) 358 } else if cfg!(target_feature = "avx") { 359 fn_impl($crate::x86_64::AVX::instance(), $($arg),*) 360 } else if cfg!(target_feature = "sse4.1") { 361 fn_impl($crate::x86_64::SSE41::instance(), $($arg),*) 362 } else if cfg!(target_feature = "ssse3") { 363 fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*) 364 } else { 365 fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) 366 } 367 } 368 } 369 }; 370 ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => { 371 dispatch_light128!($mach, $MTy, { 372 $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body 373 }); 374 } 375 } 376 377 /// Generate only the basic implementations necessary to be able to operate efficiently on 256-bit 378 /// vectors on this platfrom. For x86-64, that would mean SSE2, AVX, and AVX2. 379 /// 380 /// This dispatcher is suitable for vector operations that do not benefit from advanced hardware 381 /// features (e.g. because they are done infrequently), so minimizing their contribution to code 382 /// size is more important. 383 #[macro_export] 384 macro_rules! dispatch_light256 { 385 ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { 386 #[cfg(feature = "std")] 387 $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> $ret { 388 #[inline(always)] 389 fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body 390 use std::arch::x86_64::*; 391 #[target_feature(enable = "avx")] 392 unsafe fn impl_avx($($arg: $argty),*) -> $ret { 393 fn_impl($crate::x86_64::AVX::instance(), $($arg),*) 394 } 395 #[target_feature(enable = "sse2")] 396 unsafe fn impl_sse2($($arg: $argty),*) -> $ret { 397 fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) 398 } 399 unsafe { 400 if is_x86_feature_detected!("avx") { 401 impl_avx($($arg),*) 402 } else if is_x86_feature_detected!("sse2") { 403 impl_sse2($($arg),*) 404 } else { 405 unimplemented!() 406 } 407 } 408 } 409 #[cfg(not(feature = "std"))] 410 #[inline(always)] 411 $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { 412 unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body 413 unsafe { 414 if cfg!(target_feature = "avx2") { 415 fn_impl($crate::x86_64::AVX2::instance(), $($arg),*) 416 } else if cfg!(target_feature = "avx") { 417 fn_impl($crate::x86_64::AVX::instance(), $($arg),*) 418 } else if cfg!(target_feature = "sse4.1") { 419 fn_impl($crate::x86_64::SSE41::instance(), $($arg),*) 420 } else if cfg!(target_feature = "ssse3") { 421 fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*) 422 } else { 423 fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) 424 } 425 } 426 } 427 }; 428 ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => { 429 dispatch_light256!($mach, $MTy, { 430 $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body 431 }); 432 } 433 } 434