1 // crate minimums: sse2, x86_64
2 
3 use core::arch::x86_64::{__m128i, __m256i};
4 use crate::types::*;
5 
6 mod sse2;
7 
/// Marker type for the `S3` parameter of `SseMachine`, "enabled" variant.
///
/// In this file it is selected by the `SSSE3`, `SSE41` and `AVX` aliases and by every
/// `sse2` type named in the `Avx2Machine` impl.
// NOTE(review): presumably stands for "SSSE3 is available" -- confirm against the `sse2` module.
#[derive(Copy, Clone)]
pub struct YesS3;
/// Marker type for the `S3` parameter of `SseMachine`, "disabled" variant.
///
/// In this file only the `SSE2` alias selects it.
#[derive(Copy, Clone)]
pub struct NoS3;
12 
/// Marker type for the `S4` parameter of `SseMachine`, "enabled" variant.
///
/// In this file it is selected by the `SSE41` and `AVX` aliases and by every `sse2` type
/// named in the `Avx2Machine` impl.
// NOTE(review): presumably stands for "SSE4.1 is available" -- confirm against the `sse2` module.
#[derive(Copy, Clone)]
pub struct YesS4;
/// Marker type for the `S4` parameter of `SseMachine`, "disabled" variant.
///
/// In this file it is selected by the `SSE2` and `SSSE3` aliases.
#[derive(Copy, Clone)]
pub struct NoS4;
17 
/// Marker type, "enabled" variant of the `A1` flag.
// NOTE(review): nothing else in this file refers to `YesA1`/`NoA1`; presumably an AVX flag
// consumed elsewhere in the crate -- confirm before relying on it.
#[derive(Copy, Clone)]
pub struct YesA1;
/// Marker type, "disabled" variant of the `A1` flag.
#[derive(Copy, Clone)]
pub struct NoA1;
22 
/// Marker type, "enabled" variant of the `A2` flag.
// NOTE(review): nothing else in this file refers to `YesA2`/`NoA2`; presumably an AVX2 flag
// consumed elsewhere in the crate -- confirm before relying on it.
#[derive(Copy, Clone)]
pub struct YesA2;
/// Marker type, "disabled" variant of the `A2` flag.
#[derive(Copy, Clone)]
pub struct NoA2;
27 
/// Marker type for the `NI` parameter of `SseMachine` / `Avx2Machine`, "enabled" variant.
///
/// None of the machine aliases defined in this file select it.
// NOTE(review): which instruction-set extension `NI` denotes is not visible here -- confirm
// against the `sse2` module.
#[derive(Copy, Clone)]
pub struct YesNI;
/// Marker type for the `NI` parameter of `SseMachine` / `Avx2Machine`, "disabled" variant.
///
/// Every machine alias defined in this file (`SSE2`, `SSSE3`, `SSE41`, `AVX`, `AVX2`) uses it.
#[derive(Copy, Clone)]
pub struct NoNI;
32 
33 use core::marker::PhantomData;
34 
35 #[derive(Copy, Clone)]
36 pub struct SseMachine<S3, S4, NI>(PhantomData<(S3, S4, NI)>);
37 impl<S3: Copy, S4: Copy, NI: Copy> Machine for SseMachine<S3, S4, NI>
38 where
39     sse2::u128x1_sse2<S3, S4, NI>: Swap64,
40     sse2::u64x2_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
41     sse2::u32x4_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
42     sse2::u64x4_sse2<S3, S4, NI>: BSwap + Words4,
43     sse2::u128x1_sse2<S3, S4, NI>: BSwap,
44     sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x2x2_sse2<S3, S4, NI>>,
45     sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x4_sse2<S3, S4, NI>>,
46     sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u32x4x2_sse2<S3, S4, NI>>,
47     sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u64x2x4_sse2<S3, S4, NI>>,
48     sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u32x4x4_sse2<S3, S4, NI>>,
49 {
50     type u32x4 = sse2::u32x4_sse2<S3, S4, NI>;
51     type u64x2 = sse2::u64x2_sse2<S3, S4, NI>;
52     type u128x1 = sse2::u128x1_sse2<S3, S4, NI>;
53 
54     type u32x4x2 = sse2::u32x4x2_sse2<S3, S4, NI>;
55     type u64x2x2 = sse2::u64x2x2_sse2<S3, S4, NI>;
56     type u64x4 = sse2::u64x4_sse2<S3, S4, NI>;
57     type u128x2 = sse2::u128x2_sse2<S3, S4, NI>;
58 
59     type u32x4x4 = sse2::u32x4x4_sse2<S3, S4, NI>;
60     type u64x2x4 = sse2::u64x2x4_sse2<S3, S4, NI>;
61     type u128x4 = sse2::u128x4_sse2<S3, S4, NI>;
62 
63     #[inline(always)]
instance() -> Self64     unsafe fn instance() -> Self {
65         SseMachine(PhantomData)
66     }
67 }
68 
69 #[derive(Copy, Clone)]
70 pub struct Avx2Machine<NI>(PhantomData<NI>);
71 impl<NI: Copy> Machine for Avx2Machine<NI>
72 where
73     sse2::u128x1_sse2<YesS3, YesS4, NI>: BSwap + Swap64,
74     sse2::u64x2_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
75     sse2::u32x4_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
76     sse2::u64x4_sse2<YesS3, YesS4, NI>: BSwap + Words4,
77 {
78     type u32x4 = sse2::u32x4_sse2<YesS3, YesS4, NI>;
79     type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>;
80     type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>;
81 
82     type u32x4x2 = sse2::u32x4x2_sse2<YesS3, YesS4, NI>;
83     type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>;
84     type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>;
85     type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>;
86 
87     type u32x4x4 = sse2::avx2::u32x4x4_avx2<NI>;
88     type u64x2x4 = sse2::u64x2x4_sse2<YesS3, YesS4, NI>;
89     type u128x4 = sse2::u128x4_sse2<YesS3, YesS4, NI>;
90 
91     #[inline(always)]
instance() -> Self92     unsafe fn instance() -> Self {
93         Avx2Machine(PhantomData)
94     }
95 }
96 
/// `SseMachine` with every feature marker disabled; SSE2 is the crate's stated minimum.
pub type SSE2 = SseMachine<NoS3, NoS4, NoNI>;
/// `SseMachine` with only the `S3` marker enabled.
pub type SSSE3 = SseMachine<YesS3, NoS4, NoNI>;
/// `SseMachine` with the `S3` and `S4` markers enabled.
pub type SSE41 = SseMachine<YesS3, YesS4, NoNI>;
/// AVX but not AVX2: only 128-bit integer operations, but use VEX versions of everything
/// to avoid expensive SSE/VEX conflicts.
///
/// This alias resolves to exactly the same type as `SSE41`.
// NOTE(review): presumably the VEX encoding comes from the `#[target_feature(enable = "avx")]`
// function the dispatch macros instantiate this machine in, not from the type -- confirm.
pub type AVX = SseMachine<YesS3, YesS4, NoNI>;
/// `Avx2Machine` with the `NI` marker disabled.
pub type AVX2 = Avx2Machine<NoNI>;
104 
105 /// Generic wrapper for unparameterized storage of any of the possible impls.
106 /// Converting into and out of this type should be essentially free, although it may be more
107 /// aligned than a particular impl requires.
108 #[allow(non_camel_case_types)]
109 #[derive(Copy, Clone)]
110 pub union vec128_storage {
111     u32x4: [u32; 4],
112     u64x2: [u64; 2],
113     u128x1: [u128; 1],
114     sse2: __m128i,
115 }
116 impl Store<vec128_storage> for vec128_storage {
117     #[inline(always)]
unpack(p: vec128_storage) -> Self118     unsafe fn unpack(p: vec128_storage) -> Self {
119         p
120     }
121 }
122 impl<'a> Into<&'a [u32; 4]> for &'a vec128_storage {
123     #[inline(always)]
into(self) -> &'a [u32; 4]124     fn into(self) -> &'a [u32; 4] {
125         unsafe { &self.u32x4 }
126     }
127 }
128 impl Into<vec128_storage> for [u32; 4] {
129     #[inline(always)]
into(self) -> vec128_storage130     fn into(self) -> vec128_storage {
131         vec128_storage { u32x4: self }
132     }
133 }
134 impl Default for vec128_storage {
135     #[inline(always)]
default() -> Self136     fn default() -> Self {
137         vec128_storage { u128x1: [0] }
138     }
139 }
140 impl Eq for vec128_storage {}
141 impl PartialEq for vec128_storage {
142     #[inline(always)]
eq(&self, rhs: &Self) -> bool143     fn eq(&self, rhs: &Self) -> bool {
144         unsafe { self.u128x1 == rhs.u128x1 }
145     }
146 }
147 
148 #[allow(non_camel_case_types)]
149 #[derive(Copy, Clone)]
150 pub union vec256_storage {
151     u32x8: [u32; 8],
152     u64x4: [u64; 4],
153     u128x2: [u128; 2],
154     sse2: [vec128_storage; 2],
155     avx: __m256i,
156 }
157 impl Into<vec256_storage> for [u64; 4] {
158     #[inline(always)]
into(self) -> vec256_storage159     fn into(self) -> vec256_storage {
160         vec256_storage { u64x4: self }
161     }
162 }
163 impl Default for vec256_storage {
164     #[inline(always)]
default() -> Self165     fn default() -> Self {
166         vec256_storage { u128x2: [0, 0] }
167     }
168 }
169 impl vec256_storage {
new128(xs: [vec128_storage; 2]) -> Self170     pub fn new128(xs: [vec128_storage; 2]) -> Self {
171         Self { sse2: xs }
172     }
split128(self) -> [vec128_storage; 2]173     pub fn split128(self) -> [vec128_storage; 2] {
174         unsafe { self.sse2 }
175     }
176 }
177 impl Eq for vec256_storage {}
178 impl PartialEq for vec256_storage {
179     #[inline(always)]
eq(&self, rhs: &Self) -> bool180     fn eq(&self, rhs: &Self) -> bool {
181         unsafe { self.sse2 == rhs.sse2 }
182     }
183 }
184 
185 #[allow(non_camel_case_types)]
186 #[derive(Copy, Clone)]
187 pub union vec512_storage {
188     u32x16: [u32; 16],
189     u64x8: [u64; 8],
190     u128x4: [u128; 4],
191     sse2: [vec128_storage; 4],
192     avx: [vec256_storage; 2],
193 }
194 impl Default for vec512_storage {
195     #[inline(always)]
default() -> Self196     fn default() -> Self {
197         vec512_storage {
198             u128x4: [0, 0, 0, 0],
199         }
200     }
201 }
202 impl vec512_storage {
new128(xs: [vec128_storage; 4]) -> Self203     pub fn new128(xs: [vec128_storage; 4]) -> Self {
204         Self { sse2: xs }
205     }
split128(self) -> [vec128_storage; 4]206     pub fn split128(self) -> [vec128_storage; 4] {
207         unsafe { self.sse2 }
208     }
209 }
210 impl Eq for vec512_storage {}
211 impl PartialEq for vec512_storage {
212     #[inline(always)]
eq(&self, rhs: &Self) -> bool213     fn eq(&self, rhs: &Self) -> bool {
214         unsafe { self.avx == rhs.avx }
215     }
216 }
217 
218 macro_rules! impl_into {
219     ($storage:ident, $array:ty, $name:ident) => {
220         impl Into<$array> for $storage {
221             #[inline(always)]
222             fn into(self) -> $array {
223                 unsafe { self.$name }
224             }
225         }
226     };
227 }
228 impl_into!(vec128_storage, [u32; 4], u32x4);
229 impl_into!(vec128_storage, [u64; 2], u64x2);
230 impl_into!(vec128_storage, [u128; 1], u128x1);
231 impl_into!(vec256_storage, [u32; 8], u32x8);
232 impl_into!(vec256_storage, [u64; 4], u64x4);
233 impl_into!(vec256_storage, [u128; 2], u128x2);
234 impl_into!(vec512_storage, [u32; 16], u32x16);
235 impl_into!(vec512_storage, [u64; 8], u64x8);
236 impl_into!(vec512_storage, [u128; 4], u128x4);
237 
/// Generate the full set of optimized implementations to take advantage of the most important
/// hardware feature sets.
///
/// This dispatcher is suitable for maximizing throughput.
///
/// Invocation shape accepted by the matcher:
/// `dispatch!(m, M, { fn name(arg: Ty, ...) -> Ret { body } })`, where `m` is the name the
/// machine value has inside `body` and `M` is the name of its generic type. The `-> Ret` part
/// may be omitted, and `fn` may be preceded by a bracketed visibility such as `[pub]` or
/// `[pub(crate)]`.
///
/// With the `std` feature the generated function selects an implementation at run time using
/// `is_x86_feature_detected!`; without it the selection uses `cfg!(target_feature = ...)`.
#[macro_export]
macro_rules! dispatch {
    // Form with an explicit return type; this arm does all the work.
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        #[cfg(feature = "std")]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            // The user's body, generic over the machine; force-inlined into each of the
            // `#[target_feature]` wrappers below.
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            use std::arch::x86_64::*;
            #[target_feature(enable = "avx2")]
            unsafe fn impl_avx2($($arg: $argty),*) -> $ret {
                let ret = fn_impl($crate::x86_64::AVX2::instance(), $($arg),*);
                // NOTE(review): presumably clears the upper YMM halves to avoid SSE/AVX
                // transition penalties in the caller -- confirm.
                _mm256_zeroupper();
                ret
            }
            #[target_feature(enable = "avx")]
            #[target_feature(enable = "sse4.1")]
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                let ret = fn_impl($crate::x86_64::AVX::instance(), $($arg),*);
                _mm256_zeroupper();
                ret
            }
            #[target_feature(enable = "sse4.1")]
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_sse41($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
            }
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_ssse3($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
            }
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }
            // Run-time selection, most capable feature set first.
            unsafe {
                if is_x86_feature_detected!("avx2") {
                    impl_avx2($($arg),*)
                } else if is_x86_feature_detected!("avx") {
                    impl_avx($($arg),*)
                } else if is_x86_feature_detected!("sse4.1") {
                    impl_sse41($($arg),*)
                } else if is_x86_feature_detected!("ssse3") {
                    impl_ssse3($($arg),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($arg),*)
                } else {
                    unimplemented!()
                }
            }
        }
        // Without `std` there is no run-time detection: the machine is chosen from the target
        // features the code is compiled with. Note that `fn_impl` is an `unsafe fn` here, unlike
        // in the `std` variant above.
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    // Form without a return type: re-invoke the arm above with `-> ()`.
    ($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        dispatch!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}
318 
/// Generate only the basic implementations necessary to be able to operate efficiently on 128-bit
/// vectors on this platform. For x86-64, that would mean SSE2 and AVX.
///
/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
/// features (e.g. because they are done infrequently), so minimizing their contribution to code
/// size is more important.
///
/// The invocation shape is `dispatch_light128!(m, M, { fn name(arg: Ty, ...) -> Ret { body } })`;
/// the `-> Ret` part may be omitted and `fn` may be preceded by a bracketed visibility such as
/// `[pub]`.
#[macro_export]
macro_rules! dispatch_light128 {
    // Form with an explicit return type; this arm does all the work.
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        #[cfg(feature = "std")]
        $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            // The user's body, generic over the machine; force-inlined into the two
            // `#[target_feature]` wrappers below.
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            use std::arch::x86_64::*;
            #[target_feature(enable = "avx")]
            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
            }
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }
            // Run-time selection between the two generated variants.
            unsafe {
                if is_x86_feature_detected!("avx") {
                    impl_avx($($arg),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($arg),*)
                } else {
                    unimplemented!()
                }
            }
        }
        // Without `std` the machine is chosen from the compile-time target features; this
        // branch considers the same five machines as `dispatch!`.
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    // Form without a return type: re-invoke the arm above with `-> ()`.
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        dispatch_light128!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}
376 
/// Generate only the basic implementations necessary to be able to operate efficiently on 256-bit
/// vectors on this platform. For x86-64, that would mean SSE2, AVX, and AVX2.
///
/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
/// features (e.g. because they are done infrequently), so minimizing their contribution to code
/// size is more important.
///
/// The invocation shape is `dispatch_light256!(m, M, { fn name(arg: Ty, ...) -> Ret { body } })`;
/// the `-> Ret` part may be omitted and `fn` may be preceded by a bracketed visibility such as
/// `[pub]` or `[pub(crate)]`.
#[macro_export]
macro_rules! dispatch_light256 {
    // Form with an explicit return type; this arm does all the work.
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        // The visibility is emitted WITHOUT the surrounding brackets: the brackets belong to the
        // macro's input syntax only, and `[pub] fn ...` is not a valid item.
        #[cfg(feature = "std")]
        $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            // The user's body, generic over the machine; force-inlined into the two
            // `#[target_feature]` wrappers below.
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            use std::arch::x86_64::*;
            // NOTE(review): the documentation above mentions AVX2, but run-time selection only
            // distinguishes AVX and SSE2; AVX2 is used only on the `not(std)` path below.
            #[target_feature(enable = "avx")]
            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
            }
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }
            // Run-time selection between the two generated variants.
            unsafe {
                if is_x86_feature_detected!("avx") {
                    impl_avx($($arg),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($arg),*)
                } else {
                    unimplemented!()
                }
            }
        }
        // Without `std` the machine is chosen from the compile-time target features.
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    // Form without a return type: re-invoke the arm above with `-> ()`. The brackets are kept
    // here because this is macro input again.
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        dispatch_light256!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}
434