1 // crate minimums: sse2, x86_64 2 3 use core::arch::x86_64::{__m128i, __m256i}; 4 use crate::types::*; 5 6 mod sse2; 7 8 #[derive(Copy, Clone)] 9 pub struct YesS3; 10 #[derive(Copy, Clone)] 11 pub struct NoS3; 12 13 #[derive(Copy, Clone)] 14 pub struct YesS4; 15 #[derive(Copy, Clone)] 16 pub struct NoS4; 17 18 #[derive(Copy, Clone)] 19 pub struct YesA1; 20 #[derive(Copy, Clone)] 21 pub struct NoA1; 22 23 #[derive(Copy, Clone)] 24 pub struct YesA2; 25 #[derive(Copy, Clone)] 26 pub struct NoA2; 27 28 #[derive(Copy, Clone)] 29 pub struct YesNI; 30 #[derive(Copy, Clone)] 31 pub struct NoNI; 32 33 use core::marker::PhantomData; 34 35 #[derive(Copy, Clone)] 36 pub struct SseMachine<S3, S4, NI>(PhantomData<(S3, S4, NI)>); 37 impl<S3: Copy, S4: Copy, NI: Copy> Machine for SseMachine<S3, S4, NI> 38 where 39 sse2::u128x1_sse2<S3, S4, NI>: Swap64, 40 sse2::u64x2_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>, 41 sse2::u32x4_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>, 42 sse2::u64x4_sse2<S3, S4, NI>: BSwap + Words4, 43 sse2::u128x1_sse2<S3, S4, NI>: BSwap, 44 sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x2x2_sse2<S3, S4, NI>>, 45 sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x4_sse2<S3, S4, NI>>, 46 sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u32x4x2_sse2<S3, S4, NI>>, 47 sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u64x2x4_sse2<S3, S4, NI>>, 48 sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u32x4x4_sse2<S3, S4, NI>>, 49 { 50 type u32x4 = sse2::u32x4_sse2<S3, S4, NI>; 51 type u64x2 = sse2::u64x2_sse2<S3, S4, NI>; 52 type u128x1 = sse2::u128x1_sse2<S3, S4, NI>; 53 54 type u32x4x2 = sse2::u32x4x2_sse2<S3, S4, NI>; 55 type u64x2x2 = sse2::u64x2x2_sse2<S3, S4, NI>; 56 type u64x4 = sse2::u64x4_sse2<S3, S4, NI>; 57 type u128x2 = sse2::u128x2_sse2<S3, S4, NI>; 58 59 type u32x4x4 = sse2::u32x4x4_sse2<S3, S4, NI>; 60 type u64x2x4 = sse2::u64x2x4_sse2<S3, S4, NI>; 61 type u128x4 = sse2::u128x4_sse2<S3, S4, NI>; 62 63 #[inline(always)] instance() -> Self64 unsafe fn instance() -> Self { 65 SseMachine(PhantomData) 66 } 67 } 68 69 #[derive(Copy, Clone)] 70 pub struct Avx2Machine<NI>(PhantomData<NI>); 71 impl<NI: Copy> Machine for Avx2Machine<NI> 72 where 73 sse2::u128x1_sse2<YesS3, YesS4, NI>: BSwap + Swap64, 74 sse2::u64x2_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>, 75 sse2::u32x4_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>, 76 sse2::u64x4_sse2<YesS3, YesS4, NI>: BSwap + Words4, 77 { 78 type u32x4 = sse2::u32x4_sse2<YesS3, YesS4, NI>; 79 type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>; 80 type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>; 81 82 type u32x4x2 = sse2::u32x4x2_sse2<YesS3, YesS4, NI>; 83 type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>; 84 type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>; 85 type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>; 86 87 type u32x4x4 = sse2::avx2::u32x4x4_avx2<NI>; 88 type u64x2x4 = sse2::u64x2x4_sse2<YesS3, YesS4, NI>; 89 type u128x4 = sse2::u128x4_sse2<YesS3, YesS4, NI>; 90 91 #[inline(always)] instance() -> Self92 unsafe fn instance() -> Self { 93 Avx2Machine(PhantomData) 94 } 95 } 96 97 pub type SSE2 = SseMachine<NoS3, NoS4, NoNI>; 98 pub type SSSE3 = SseMachine<YesS3, NoS4, NoNI>; 99 pub type SSE41 = SseMachine<YesS3, YesS4, NoNI>; 100 /// AVX but not AVX2: only 128-bit integer operations, but use VEX versions of everything 101 /// to avoid expensive SSE/VEX conflicts. 102 pub type AVX = SseMachine<YesS3, YesS4, NoNI>; 103 pub type AVX2 = Avx2Machine<NoNI>; 104 105 /// Generic wrapper for unparameterized storage of any of the possible impls. 106 /// Converting into and out of this type should be essentially free, although it may be more 107 /// aligned than a particular impl requires. 108 #[allow(non_camel_case_types)] 109 #[derive(Copy, Clone)] 110 pub union vec128_storage { 111 u32x4: [u32; 4], 112 u64x2: [u64; 2], 113 u128x1: [u128; 1], 114 sse2: __m128i, 115 } 116 impl Store<vec128_storage> for vec128_storage { 117 #[inline(always)] unpack(p: vec128_storage) -> Self118 unsafe fn unpack(p: vec128_storage) -> Self { 119 p 120 } 121 } 122 impl<'a> Into<&'a [u32; 4]> for &'a vec128_storage { 123 #[inline(always)] into(self) -> &'a [u32; 4]124 fn into(self) -> &'a [u32; 4] { 125 unsafe { &self.u32x4 } 126 } 127 } 128 impl Into<vec128_storage> for [u32; 4] { 129 #[inline(always)] into(self) -> vec128_storage130 fn into(self) -> vec128_storage { 131 vec128_storage { u32x4: self } 132 } 133 } 134 impl Default for vec128_storage { 135 #[inline(always)] default() -> Self136 fn default() -> Self { 137 vec128_storage { u128x1: [0] } 138 } 139 } 140 141 #[allow(non_camel_case_types)] 142 #[derive(Copy, Clone)] 143 pub union vec256_storage { 144 u32x8: [u32; 8], 145 u64x4: [u64; 4], 146 u128x2: [u128; 2], 147 sse2: [vec128_storage; 2], 148 avx: __m256i, 149 } 150 impl Into<vec256_storage> for [u64; 4] { 151 #[inline(always)] into(self) -> vec256_storage152 fn into(self) -> vec256_storage { 153 vec256_storage { u64x4: self } 154 } 155 } 156 impl Default for vec256_storage { 157 #[inline(always)] default() -> Self158 fn default() -> Self { 159 vec256_storage { u128x2: [0, 0] } 160 } 161 } 162 impl vec256_storage { new128(xs: [vec128_storage; 2]) -> Self163 pub fn new128(xs: [vec128_storage; 2]) -> Self { 164 Self { sse2: xs } 165 } split128(self) -> [vec128_storage; 2]166 pub fn split128(self) -> [vec128_storage; 2] { 167 unsafe { self.sse2 } 168 } 169 } 170 171 #[allow(non_camel_case_types)] 172 #[derive(Copy, Clone)] 173 pub union vec512_storage { 174 u32x16: [u32; 16], 175 u64x8: [u64; 8], 176 u128x4: [u128; 4], 177 sse2: [vec128_storage; 4], 178 avx: [vec256_storage; 2], 179 } 180 impl Default for vec512_storage { 181 #[inline(always)] default() -> Self182 fn default() -> Self { 183 vec512_storage { 184 u128x4: [0, 0, 0, 0], 185 } 186 } 187 } 188 impl vec512_storage { new128(xs: [vec128_storage; 4]) -> Self189 pub fn new128(xs: [vec128_storage; 4]) -> Self { 190 Self { sse2: xs } 191 } split128(self) -> [vec128_storage; 4]192 pub fn split128(self) -> [vec128_storage; 4] { 193 unsafe { self.sse2 } 194 } 195 } 196 197 macro_rules! impl_into { 198 ($storage:ident, $array:ty, $name:ident) => { 199 impl Into<$array> for $storage { 200 #[inline(always)] 201 fn into(self) -> $array { 202 unsafe { self.$name } 203 } 204 } 205 }; 206 } 207 impl_into!(vec128_storage, [u32; 4], u32x4); 208 impl_into!(vec128_storage, [u64; 2], u64x2); 209 impl_into!(vec128_storage, [u128; 1], u128x1); 210 impl_into!(vec256_storage, [u32; 8], u32x8); 211 impl_into!(vec256_storage, [u64; 4], u64x4); 212 impl_into!(vec256_storage, [u128; 2], u128x2); 213 impl_into!(vec512_storage, [u32; 16], u32x16); 214 impl_into!(vec512_storage, [u64; 8], u64x8); 215 impl_into!(vec512_storage, [u128; 4], u128x4); 216 217 /// Generate the full set of optimized implementations to take advantage of the most important 218 /// hardware feature sets. 219 /// 220 /// This dispatcher is suitable for maximizing throughput. 221 #[macro_export] 222 macro_rules! dispatch { 223 ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { 224 #[cfg(feature = "std")] 225 $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { 226 #[inline(always)] 227 fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body 228 use std::arch::x86_64::*; 229 #[target_feature(enable = "avx2")] 230 unsafe fn impl_avx2($($arg: $argty),*) -> $ret { 231 let ret = fn_impl($crate::x86_64::AVX2::instance(), $($arg),*); 232 _mm256_zeroupper(); 233 ret 234 } 235 #[target_feature(enable = "avx")] 236 #[target_feature(enable = "sse4.1")] 237 #[target_feature(enable = "ssse3")] 238 unsafe fn impl_avx($($arg: $argty),*) -> $ret { 239 let ret = fn_impl($crate::x86_64::AVX::instance(), $($arg),*); 240 _mm256_zeroupper(); 241 ret 242 } 243 #[target_feature(enable = "sse4.1")] 244 #[target_feature(enable = "ssse3")] 245 unsafe fn impl_sse41($($arg: $argty),*) -> $ret { 246 fn_impl($crate::x86_64::SSE41::instance(), $($arg),*) 247 } 248 #[target_feature(enable = "ssse3")] 249 unsafe fn impl_ssse3($($arg: $argty),*) -> $ret { 250 fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*) 251 } 252 #[target_feature(enable = "sse2")] 253 unsafe fn impl_sse2($($arg: $argty),*) -> $ret { 254 fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) 255 } 256 unsafe { 257 if is_x86_feature_detected!("avx2") { 258 impl_avx2($($arg),*) 259 } else if is_x86_feature_detected!("avx") { 260 impl_avx($($arg),*) 261 } else if is_x86_feature_detected!("sse4.1") { 262 impl_sse41($($arg),*) 263 } else if is_x86_feature_detected!("ssse3") { 264 impl_ssse3($($arg),*) 265 } else if is_x86_feature_detected!("sse2") { 266 impl_sse2($($arg),*) 267 } else { 268 unimplemented!() 269 } 270 } 271 } 272 #[cfg(not(feature = "std"))] 273 #[inline(always)] 274 $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { 275 unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body 276 unsafe { 277 if cfg!(target_feature = "avx2") { 278 fn_impl($crate::x86_64::AVX2::instance(), $($arg),*) 279 } else if cfg!(target_feature = "avx") { 280 fn_impl($crate::x86_64::AVX::instance(), $($arg),*) 281 } else if cfg!(target_feature = "sse4.1") { 282 fn_impl($crate::x86_64::SSE41::instance(), $($arg),*) 283 } else if cfg!(target_feature = "ssse3") { 284 fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*) 285 } else { 286 fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) 287 } 288 } 289 } 290 }; 291 ($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => { 292 dispatch!($mach, $MTy, { 293 $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body 294 }); 295 } 296 } 297 298 /// Generate only the basic implementations necessary to be able to operate efficiently on 128-bit 299 /// vectors on this platfrom. For x86-64, that would mean SSE2 and AVX. 300 /// 301 /// This dispatcher is suitable for vector operations that do not benefit from advanced hardware 302 /// features (e.g. because they are done infrequently), so minimizing their contribution to code 303 /// size is more important. 304 #[macro_export] 305 macro_rules! dispatch_light128 { 306 ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { 307 #[cfg(feature = "std")] 308 $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret { 309 #[inline(always)] 310 fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body 311 use std::arch::x86_64::*; 312 #[target_feature(enable = "avx")] 313 unsafe fn impl_avx($($arg: $argty),*) -> $ret { 314 fn_impl($crate::x86_64::AVX::instance(), $($arg),*) 315 } 316 #[target_feature(enable = "sse2")] 317 unsafe fn impl_sse2($($arg: $argty),*) -> $ret { 318 fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) 319 } 320 unsafe { 321 if is_x86_feature_detected!("avx") { 322 impl_avx($($arg),*) 323 } else if is_x86_feature_detected!("sse2") { 324 impl_sse2($($arg),*) 325 } else { 326 unimplemented!() 327 } 328 } 329 } 330 #[cfg(not(feature = "std"))] 331 #[inline(always)] 332 $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { 333 unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body 334 unsafe { 335 if cfg!(target_feature = "avx2") { 336 fn_impl($crate::x86_64::AVX2::instance(), $($arg),*) 337 } else if cfg!(target_feature = "avx") { 338 fn_impl($crate::x86_64::AVX::instance(), $($arg),*) 339 } else if cfg!(target_feature = "sse4.1") { 340 fn_impl($crate::x86_64::SSE41::instance(), $($arg),*) 341 } else if cfg!(target_feature = "ssse3") { 342 fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*) 343 } else { 344 fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) 345 } 346 } 347 } 348 }; 349 ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => { 350 dispatch_light128!($mach, $MTy, { 351 $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body 352 }); 353 } 354 } 355 356 /// Generate only the basic implementations necessary to be able to operate efficiently on 256-bit 357 /// vectors on this platfrom. For x86-64, that would mean SSE2, AVX, and AVX2. 358 /// 359 /// This dispatcher is suitable for vector operations that do not benefit from advanced hardware 360 /// features (e.g. because they are done infrequently), so minimizing their contribution to code 361 /// size is more important. 362 #[macro_export] 363 macro_rules! dispatch_light256 { 364 ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { 365 #[cfg(feature = "std")] 366 $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> $ret { 367 #[inline(always)] 368 fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body 369 use std::arch::x86_64::*; 370 #[target_feature(enable = "avx")] 371 unsafe fn impl_avx($($arg: $argty),*) -> $ret { 372 fn_impl($crate::x86_64::AVX::instance(), $($arg),*) 373 } 374 #[target_feature(enable = "sse2")] 375 unsafe fn impl_sse2($($arg: $argty),*) -> $ret { 376 fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) 377 } 378 unsafe { 379 if is_x86_feature_detected!("avx") { 380 impl_avx($($arg),*) 381 } else if is_x86_feature_detected!("sse2") { 382 impl_sse2($($arg),*) 383 } else { 384 unimplemented!() 385 } 386 } 387 } 388 #[cfg(not(feature = "std"))] 389 #[inline(always)] 390 $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { 391 unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body 392 unsafe { 393 if cfg!(target_feature = "avx2") { 394 fn_impl($crate::x86_64::AVX2::instance(), $($arg),*) 395 } else if cfg!(target_feature = "avx") { 396 fn_impl($crate::x86_64::AVX::instance(), $($arg),*) 397 } else if cfg!(target_feature = "sse4.1") { 398 fn_impl($crate::x86_64::SSE41::instance(), $($arg),*) 399 } else if cfg!(target_feature = "ssse3") { 400 fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*) 401 } else { 402 fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) 403 } 404 } 405 } 406 }; 407 ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => { 408 dispatch_light256!($mach, $MTy, { 409 $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body 410 }); 411 } 412 } 413