1 // crate minimums: sse2, x86_64
2 
3 use core::arch::x86_64::{__m128i, __m256i};
4 use crate::types::*;
5 
6 mod sse2;
7 
/// Type-level flag: SSSE3 is available (set by the `SSSE3`, `SSE41` and `AVX` aliases).
#[derive(Clone, Copy)]
pub struct YesS3;
/// Type-level flag: SSSE3 is not assumed (used by the baseline `SSE2` alias).
#[derive(Clone, Copy)]
pub struct NoS3;

/// Type-level flag: SSE4.1 is available (set by the `SSE41` and `AVX` aliases).
#[derive(Clone, Copy)]
pub struct YesS4;
/// Type-level flag: SSE4.1 is not assumed.
#[derive(Clone, Copy)]
pub struct NoS4;

/// Type-level flag, positive half of the `A1` pair.
// NOTE(review): presumably "AVX available"; nothing in this file uses it -- confirm.
#[derive(Clone, Copy)]
pub struct YesA1;
/// Type-level flag, negative half of the `A1` pair.
#[derive(Clone, Copy)]
pub struct NoA1;

/// Type-level flag, positive half of the `A2` pair.
// NOTE(review): presumably "AVX2 available"; nothing in this file uses it -- confirm.
#[derive(Clone, Copy)]
pub struct YesA2;
/// Type-level flag, negative half of the `A2` pair.
#[derive(Clone, Copy)]
pub struct NoA2;

/// Type-level flag, positive half of the `NI` pair.
// NOTE(review): presumably AES-NI; no machine alias in this file selects it -- confirm.
#[derive(Clone, Copy)]
pub struct YesNI;
/// Type-level flag, negative half of the `NI` pair (used by every machine alias in this file).
#[derive(Clone, Copy)]
pub struct NoNI;
32 
33 use core::marker::PhantomData;
34 
35 #[derive(Copy, Clone)]
36 pub struct SseMachine<S3, S4, NI>(PhantomData<(S3, S4, NI)>);
37 impl<S3: Copy, S4: Copy, NI: Copy> Machine for SseMachine<S3, S4, NI>
38 where
39     sse2::u128x1_sse2<S3, S4, NI>: Swap64,
40     sse2::u64x2_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
41     sse2::u32x4_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
42     sse2::u64x4_sse2<S3, S4, NI>: BSwap + Words4,
43     sse2::u128x1_sse2<S3, S4, NI>: BSwap,
44     sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x2x2_sse2<S3, S4, NI>>,
45     sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x4_sse2<S3, S4, NI>>,
46     sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u32x4x2_sse2<S3, S4, NI>>,
47     sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u64x2x4_sse2<S3, S4, NI>>,
48     sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u32x4x4_sse2<S3, S4, NI>>,
49 {
50     type u32x4 = sse2::u32x4_sse2<S3, S4, NI>;
51     type u64x2 = sse2::u64x2_sse2<S3, S4, NI>;
52     type u128x1 = sse2::u128x1_sse2<S3, S4, NI>;
53 
54     type u32x4x2 = sse2::u32x4x2_sse2<S3, S4, NI>;
55     type u64x2x2 = sse2::u64x2x2_sse2<S3, S4, NI>;
56     type u64x4 = sse2::u64x4_sse2<S3, S4, NI>;
57     type u128x2 = sse2::u128x2_sse2<S3, S4, NI>;
58 
59     type u32x4x4 = sse2::u32x4x4_sse2<S3, S4, NI>;
60     type u64x2x4 = sse2::u64x2x4_sse2<S3, S4, NI>;
61     type u128x4 = sse2::u128x4_sse2<S3, S4, NI>;
62 
63     #[inline(always)]
instance() -> Self64     unsafe fn instance() -> Self {
65         SseMachine(PhantomData)
66     }
67 }
68 
69 #[derive(Copy, Clone)]
70 pub struct Avx2Machine<NI>(PhantomData<NI>);
71 impl<NI: Copy> Machine for Avx2Machine<NI>
72 where
73     sse2::u128x1_sse2<YesS3, YesS4, NI>: BSwap + Swap64,
74     sse2::u64x2_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
75     sse2::u32x4_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
76     sse2::u64x4_sse2<YesS3, YesS4, NI>: BSwap + Words4,
77 {
78     type u32x4 = sse2::u32x4_sse2<YesS3, YesS4, NI>;
79     type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>;
80     type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>;
81 
82     type u32x4x2 = sse2::u32x4x2_sse2<YesS3, YesS4, NI>;
83     type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>;
84     type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>;
85     type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>;
86 
87     type u32x4x4 = sse2::avx2::u32x4x4_avx2<NI>;
88     type u64x2x4 = sse2::u64x2x4_sse2<YesS3, YesS4, NI>;
89     type u128x4 = sse2::u128x4_sse2<YesS3, YesS4, NI>;
90 
91     #[inline(always)]
instance() -> Self92     unsafe fn instance() -> Self {
93         Avx2Machine(PhantomData)
94     }
95 }
96 
97 pub type SSE2 = SseMachine<NoS3, NoS4, NoNI>;
98 pub type SSSE3 = SseMachine<YesS3, NoS4, NoNI>;
99 pub type SSE41 = SseMachine<YesS3, YesS4, NoNI>;
100 /// AVX but not AVX2: only 128-bit integer operations, but use VEX versions of everything
101 /// to avoid expensive SSE/VEX conflicts.
102 pub type AVX = SseMachine<YesS3, YesS4, NoNI>;
103 pub type AVX2 = Avx2Machine<NoNI>;
104 
105 /// Generic wrapper for unparameterized storage of any of the possible impls.
106 /// Converting into and out of this type should be essentially free, although it may be more
107 /// aligned than a particular impl requires.
108 #[allow(non_camel_case_types)]
109 #[derive(Copy, Clone)]
110 pub union vec128_storage {
111     u32x4: [u32; 4],
112     u64x2: [u64; 2],
113     u128x1: [u128; 1],
114     sse2: __m128i,
115 }
116 impl Store<vec128_storage> for vec128_storage {
117     #[inline(always)]
unpack(p: vec128_storage) -> Self118     unsafe fn unpack(p: vec128_storage) -> Self {
119         p
120     }
121 }
122 impl<'a> Into<&'a [u32; 4]> for &'a vec128_storage {
123     #[inline(always)]
into(self) -> &'a [u32; 4]124     fn into(self) -> &'a [u32; 4] {
125         unsafe { &self.u32x4 }
126     }
127 }
128 impl Into<vec128_storage> for [u32; 4] {
129     #[inline(always)]
into(self) -> vec128_storage130     fn into(self) -> vec128_storage {
131         vec128_storage { u32x4: self }
132     }
133 }
134 impl Default for vec128_storage {
135     #[inline(always)]
default() -> Self136     fn default() -> Self {
137         vec128_storage { u128x1: [0] }
138     }
139 }
140 
141 #[allow(non_camel_case_types)]
142 #[derive(Copy, Clone)]
143 pub union vec256_storage {
144     u32x8: [u32; 8],
145     u64x4: [u64; 4],
146     u128x2: [u128; 2],
147     sse2: [vec128_storage; 2],
148     avx: __m256i,
149 }
150 impl Into<vec256_storage> for [u64; 4] {
151     #[inline(always)]
into(self) -> vec256_storage152     fn into(self) -> vec256_storage {
153         vec256_storage { u64x4: self }
154     }
155 }
156 impl Default for vec256_storage {
157     #[inline(always)]
default() -> Self158     fn default() -> Self {
159         vec256_storage { u128x2: [0, 0] }
160     }
161 }
162 impl vec256_storage {
new128(xs: [vec128_storage; 2]) -> Self163     pub fn new128(xs: [vec128_storage; 2]) -> Self {
164         Self { sse2: xs }
165     }
split128(self) -> [vec128_storage; 2]166     pub fn split128(self) -> [vec128_storage; 2] {
167         unsafe { self.sse2 }
168     }
169 }
170 
171 #[allow(non_camel_case_types)]
172 #[derive(Copy, Clone)]
173 pub union vec512_storage {
174     u32x16: [u32; 16],
175     u64x8: [u64; 8],
176     u128x4: [u128; 4],
177     sse2: [vec128_storage; 4],
178     avx: [vec256_storage; 2],
179 }
180 impl Default for vec512_storage {
181     #[inline(always)]
default() -> Self182     fn default() -> Self {
183         vec512_storage {
184             u128x4: [0, 0, 0, 0],
185         }
186     }
187 }
188 impl vec512_storage {
new128(xs: [vec128_storage; 4]) -> Self189     pub fn new128(xs: [vec128_storage; 4]) -> Self {
190         Self { sse2: xs }
191     }
split128(self) -> [vec128_storage; 4]192     pub fn split128(self) -> [vec128_storage; 4] {
193         unsafe { self.sse2 }
194     }
195 }
196 
// Implements `Into<$target>` for the union `$union_ty` by copying out its `$field` view.
// The caller is responsible for naming a field whose type is exactly `$target`.
macro_rules! impl_into {
    ($union_ty:ident, $target:ty, $field:ident) => {
        impl Into<$target> for $union_ty {
            #[inline(always)]
            fn into(self) -> $target {
                let storage = self;
                // Reading a union field is unsafe; the storage unions hold only plain
                // integer views, so the read itself is a reinterpretation of the same bytes.
                unsafe { storage.$field }
            }
        }
    };
}
// By-value conversions from each storage union into each of its integer-lane array views.
// 128-bit storage.
impl_into!(vec128_storage, [u32; 4], u32x4);
impl_into!(vec128_storage, [u64; 2], u64x2);
impl_into!(vec128_storage, [u128; 1], u128x1);
// 256-bit storage.
impl_into!(vec256_storage, [u32; 8], u32x8);
impl_into!(vec256_storage, [u64; 4], u64x4);
impl_into!(vec256_storage, [u128; 2], u128x2);
// 512-bit storage.
impl_into!(vec512_storage, [u32; 16], u32x16);
impl_into!(vec512_storage, [u64; 8], u64x8);
impl_into!(vec512_storage, [u128; 4], u128x4);
216 
/// Wraps a function that is generic over a `Machine` in a dispatcher covering every feature
/// level this module knows: AVX2, AVX, SSE4.1, SSSE3 and SSE2.
///
/// Use this dispatcher where throughput matters more than code size.
///
/// Invocation shape: `dispatch!(m, M, { [pub] fn name(arg: Ty, ...) -> Ret { body } })`.
/// `m` names the machine value and `M` its type parameter inside `body`; the optional
/// visibility is written in square brackets; the return type may be omitted for `()`.
///
/// With `feature = "std"` the generated function picks the best implementation at run time
/// via `is_x86_feature_detected!`, and `body` is compiled as a safe `fn`. Without it, the
/// choice is made at compile time from `cfg!(target_feature = ...)`, and `body` is compiled
/// as the body of an `unsafe fn`.
#[macro_export]
macro_rules! dispatch {
    ($mach:ident, $MTy:ident, { $([$vis:tt $(($scope:tt))*])* fn $func:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        // Run-time dispatch.
        #[cfg(feature = "std")]
        $($vis $(($scope))*)* fn $func($($arg: $argty),*) -> $ret {
            use std::arch::x86_64::*;
            // The user's code, generic over the machine; forced inline so that each wrapper
            // below compiles it under its own target features.
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            // One monomorphic wrapper per feature level, lowest first.
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_ssse3($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
            }
            #[target_feature(enable = "sse4.1,ssse3")]
            unsafe fn impl_sse41($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
            }
            #[target_feature(enable = "avx,sse4.1,ssse3")]
            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                let out = fn_impl($crate::x86_64::AVX::instance(), $($arg),*);
                // Clear the upper register halves before returning to the caller.
                _mm256_zeroupper();
                out
            }
            #[target_feature(enable = "avx2")]
            unsafe fn impl_avx2($($arg: $argty),*) -> $ret {
                let out = fn_impl($crate::x86_64::AVX2::instance(), $($arg),*);
                // Clear the upper register halves before returning to the caller.
                _mm256_zeroupper();
                out
            }
            // Probe from the most capable level downwards.
            unsafe {
                if is_x86_feature_detected!("avx2") {
                    impl_avx2($($arg),*)
                } else if is_x86_feature_detected!("avx") {
                    impl_avx($($arg),*)
                } else if is_x86_feature_detected!("sse4.1") {
                    impl_sse41($($arg),*)
                } else if is_x86_feature_detected!("ssse3") {
                    impl_ssse3($($arg),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($arg),*)
                } else {
                    unimplemented!()
                }
            }
        }
        // Compile-time dispatch: no detection macro is used on this path.
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($vis $(($scope))*)* fn $func($($arg: $argty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    // No return type written: forward to the arm above with an explicit `-> ()`.
    ($mach:ident, $MTy:ident, { $([$vis:tt $(($scope:tt))*])* fn $func:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        dispatch!($mach, $MTy, {
            $([$vis $(($scope))*])* fn $func($($arg: $argty),*) -> () $body
        });
    }
}
297 
/// Wraps a function that is generic over a `Machine` in a small dispatcher intended for code
/// that works on 128-bit vectors. With run-time detection, only the SSE2 and AVX
/// implementations are generated.
///
/// Use this dispatcher for operations that gain little from newer hardware features (for
/// example because they run rarely), where keeping code size down matters more.
///
/// Invocation shape: `dispatch_light128!(m, M, { [pub] fn name(arg: Ty, ...) -> Ret { body } })`.
/// `m` names the machine value and `M` its type parameter inside `body`; the optional
/// visibility is written in square brackets; the return type may be omitted for `()`.
///
/// With `feature = "std"` the implementation is chosen at run time and `body` is compiled as
/// a safe `fn`. Without it, the choice is made at compile time from
/// `cfg!(target_feature = ...)` across all five feature levels, and `body` is compiled as the
/// body of an `unsafe fn`.
#[macro_export]
macro_rules! dispatch_light128 {
    ($mach:ident, $MTy:ident, { $([$vis:tt $(($scope:tt))*])* fn $func:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        // Run-time dispatch.
        #[cfg(feature = "std")]
        $($vis $(($scope))*)* fn $func($($arg: $argty),*) -> $ret {
            use std::arch::x86_64::*;
            // The user's code, generic over the machine; forced inline so that each wrapper
            // below compiles it under its own target features.
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }
            #[target_feature(enable = "avx")]
            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
            }
            // Prefer AVX, fall back to SSE2.
            unsafe {
                if is_x86_feature_detected!("avx") {
                    impl_avx($($arg),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($arg),*)
                } else {
                    unimplemented!()
                }
            }
        }
        // Compile-time dispatch: no detection macro is used on this path.
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($vis $(($scope))*)* fn $func($($arg: $argty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    // No return type written: forward to the arm above with an explicit `-> ()`.
    ($mach:ident, $MTy:ident, { $([$vis:tt $(($scope:tt))*])* fn $func:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        dispatch_light128!($mach, $MTy, {
            $([$vis $(($scope))*])* fn $func($($arg: $argty),*) -> () $body
        });
    }
}
355 
/// Generate only the basic implementations necessary to be able to operate efficiently on 256-bit
/// vectors on this platform.
///
/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
/// features (e.g. because they are done infrequently), so minimizing their contribution to code
/// size is more important.
///
/// Invocation shape: `dispatch_light256!(m, M, { [pub] fn name(arg: Ty, ...) -> Ret { body } })`.
/// `m` names the machine value and `M` its type parameter inside `body`; the optional
/// visibility is written in square brackets; the return type may be omitted for `()`.
///
/// With `feature = "std"` the implementation is chosen at run time and `body` is compiled as
/// a safe `fn`. Without it, the choice is made at compile time from
/// `cfg!(target_feature = ...)` across all five feature levels, and `body` is compiled as the
/// body of an `unsafe fn`.
///
/// NOTE(review): earlier documentation said the x86-64 set is "SSE2, AVX, and AVX2", but the
/// run-time path below only generates the SSE2 and AVX implementations. Confirm which is
/// intended before adding an AVX2 branch.
#[macro_export]
macro_rules! dispatch_light256 {
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        // Run-time dispatch.
        // The visibility tokens must be emitted bare: wrapping them in `[...]` again would
        // expand to `[pub] fn ...`, which is not a valid item and fails to parse whenever a
        // visibility is supplied.
        #[cfg(feature = "std")]
        $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            // The user's code, generic over the machine; forced inline so that each wrapper
            // below compiles it under its own target features.
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            use std::arch::x86_64::*;
            #[target_feature(enable = "avx")]
            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
            }
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }
            // Prefer AVX, fall back to SSE2.
            unsafe {
                if is_x86_feature_detected!("avx") {
                    impl_avx($($arg),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($arg),*)
                } else {
                    unimplemented!()
                }
            }
        }
        // Compile-time dispatch: no detection macro is used on this path.
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    // No return type written: forward to the arm above with an explicit `-> ()`.
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        dispatch_light256!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}
413