1 // crate minimums: sse2, x86_64
2 
3 use crate::types::*;
4 use core::arch::x86_64::{__m128i, __m256i};
5 
6 mod sse2;
7 
// Zero-sized, type-level feature flags. Each `Yes*`/`No*` pair fills one type parameter of
// the machine types declared in this module.

/// Marker used for the `S3` parameter of `SseMachine`; the `SSSE3` alias selects it.
#[derive(Copy, Clone, Debug)]
pub struct YesS3;
/// Counterpart of [`YesS3`]; the `SSE2` alias selects it.
#[derive(Copy, Clone, Debug)]
pub struct NoS3;

/// Marker used for the `S4` parameter of `SseMachine`; the `SSE41` alias selects it.
#[derive(Copy, Clone, Debug)]
pub struct YesS4;
/// Counterpart of [`YesS4`]; the `SSE2` and `SSSE3` aliases select it.
#[derive(Copy, Clone, Debug)]
pub struct NoS4;

/// Marker that is not referenced elsewhere in this module.
// NOTE(review): presumably flags AVX support — confirm against the `sse2` module.
#[derive(Copy, Clone, Debug)]
pub struct YesA1;
/// Counterpart of [`YesA1`].
#[derive(Copy, Clone, Debug)]
pub struct NoA1;

/// Marker that is not referenced elsewhere in this module.
// NOTE(review): presumably flags AVX2 support — confirm against the `sse2` module.
#[derive(Copy, Clone, Debug)]
pub struct YesA2;
/// Counterpart of [`YesA2`].
#[derive(Copy, Clone, Debug)]
pub struct NoA2;

/// Marker for the `NI` parameter of the machine types. No alias in this module selects it.
// NOTE(review): the feature that `NI` stands for is not visible here — confirm.
#[derive(Copy, Clone, Debug)]
pub struct YesNI;
/// Counterpart of [`YesNI`]; every machine alias in this module selects it.
#[derive(Copy, Clone, Debug)]
pub struct NoNI;
32 
33 use core::marker::PhantomData;
34 
35 #[derive(Copy, Clone)]
36 pub struct SseMachine<S3, S4, NI>(PhantomData<(S3, S4, NI)>);
37 impl<S3: Copy, S4: Copy, NI: Copy> Machine for SseMachine<S3, S4, NI>
38 where
39     sse2::u128x1_sse2<S3, S4, NI>: Swap64,
40     sse2::u64x2_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
41     sse2::u32x4_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
42     sse2::u64x4_sse2<S3, S4, NI>: BSwap + Words4,
43     sse2::u128x1_sse2<S3, S4, NI>: BSwap,
44     sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x2x2_sse2<S3, S4, NI>>,
45     sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x4_sse2<S3, S4, NI>>,
46     sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u32x4x2_sse2<S3, S4, NI>>,
47     sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u64x2x4_sse2<S3, S4, NI>>,
48     sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u32x4x4_sse2<S3, S4, NI>>,
49 {
50     type u32x4 = sse2::u32x4_sse2<S3, S4, NI>;
51     type u64x2 = sse2::u64x2_sse2<S3, S4, NI>;
52     type u128x1 = sse2::u128x1_sse2<S3, S4, NI>;
53 
54     type u32x4x2 = sse2::u32x4x2_sse2<S3, S4, NI>;
55     type u64x2x2 = sse2::u64x2x2_sse2<S3, S4, NI>;
56     type u64x4 = sse2::u64x4_sse2<S3, S4, NI>;
57     type u128x2 = sse2::u128x2_sse2<S3, S4, NI>;
58 
59     type u32x4x4 = sse2::u32x4x4_sse2<S3, S4, NI>;
60     type u64x2x4 = sse2::u64x2x4_sse2<S3, S4, NI>;
61     type u128x4 = sse2::u128x4_sse2<S3, S4, NI>;
62 
63     #[inline(always)]
instance() -> Self64     unsafe fn instance() -> Self {
65         SseMachine(PhantomData)
66     }
67 }
68 
69 #[derive(Copy, Clone)]
70 pub struct Avx2Machine<NI>(PhantomData<NI>);
71 impl<NI: Copy> Machine for Avx2Machine<NI>
72 where
73     sse2::u128x1_sse2<YesS3, YesS4, NI>: BSwap + Swap64,
74     sse2::u64x2_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
75     sse2::u32x4_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
76     sse2::u64x4_sse2<YesS3, YesS4, NI>: BSwap + Words4,
77 {
78     type u32x4 = sse2::u32x4_sse2<YesS3, YesS4, NI>;
79     type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>;
80     type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>;
81 
82     type u32x4x2 = sse2::u32x4x2_sse2<YesS3, YesS4, NI>;
83     type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>;
84     type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>;
85     type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>;
86 
87     type u32x4x4 = sse2::avx2::u32x4x4_avx2<NI>;
88     type u64x2x4 = sse2::u64x2x4_sse2<YesS3, YesS4, NI>;
89     type u128x4 = sse2::u128x4_sse2<YesS3, YesS4, NI>;
90 
91     #[inline(always)]
instance() -> Self92     unsafe fn instance() -> Self {
93         Avx2Machine(PhantomData)
94     }
95 }
96 
/// Machine for the crate's minimum feature set: neither the `S3` nor the `S4` marker is set.
pub type SSE2 = SseMachine<NoS3, NoS4, NoNI>;
/// Machine with the `S3` marker set (`YesS3`) and the `S4` marker clear.
pub type SSSE3 = SseMachine<YesS3, NoS4, NoNI>;
/// Machine with both the `S3` and `S4` markers set.
pub type SSE41 = SseMachine<YesS3, YesS4, NoNI>;
/// AVX but not AVX2: only 128-bit integer operations, but use VEX versions of everything
/// to avoid expensive SSE/VEX conflicts.
///
/// As a type this alias is identical to [`SSE41`]; nothing in the alias itself selects VEX
/// encoding.
// NOTE(review): presumably the VEX encoding comes from the `#[target_feature(enable = "avx")]`
// wrappers generated by the dispatch macros — confirm.
pub type AVX = SseMachine<YesS3, YesS4, NoNI>;
/// Machine backed by `Avx2Machine`.
pub type AVX2 = Avx2Machine<NoNI>;
104 
/// Generic wrapper for unparameterized storage of any of the possible impls.
/// Converting into and out of this type should be essentially free, although it may be more
/// aligned than a particular impl requires.
///
/// All fields are 16 bytes wide and overlay the same memory, so each one is simply a
/// different view of the same 128 bits.
#[allow(non_camel_case_types)]
#[derive(Copy, Clone)]
pub union vec128_storage {
    // Four 32-bit lanes.
    u32x4: [u32; 4],
    // Two 64-bit lanes.
    u64x2: [u64; 2],
    // A single 128-bit lane; also the view used by `Default` and `PartialEq`.
    u128x1: [u128; 1],
    // The native SSE register type; its presence raises the union's alignment to that of
    // `__m128i`.
    sse2: __m128i,
}
impl Store<vec128_storage> for vec128_storage {
    /// Identity conversion: the packed form is already a `vec128_storage`, so `p` is returned
    /// unchanged.
    // NOTE(review): the safety contract of `unpack` is defined by the `Store` trait, which is
    // not visible here; this implementation performs no unsafe operation itself.
    #[inline(always)]
    unsafe fn unpack(p: vec128_storage) -> Self {
        p
    }
}
122 impl<'a> Into<&'a [u32; 4]> for &'a vec128_storage {
123     #[inline(always)]
into(self) -> &'a [u32; 4]124     fn into(self) -> &'a [u32; 4] {
125         unsafe { &self.u32x4 }
126     }
127 }
128 impl Into<vec128_storage> for [u32; 4] {
129     #[inline(always)]
into(self) -> vec128_storage130     fn into(self) -> vec128_storage {
131         vec128_storage { u32x4: self }
132     }
133 }
134 impl Default for vec128_storage {
135     #[inline(always)]
default() -> Self136     fn default() -> Self {
137         vec128_storage { u128x1: [0] }
138     }
139 }
140 impl Eq for vec128_storage {}
141 impl PartialEq for vec128_storage {
142     #[inline(always)]
eq(&self, rhs: &Self) -> bool143     fn eq(&self, rhs: &Self) -> bool {
144         unsafe { self.u128x1 == rhs.u128x1 }
145     }
146 }
147 
/// 256-bit counterpart of `vec128_storage`: a union whose fields are all 32 bytes wide and
/// overlay the same memory, so each one is a different view of the same 256 bits.
#[allow(non_camel_case_types)]
#[derive(Copy, Clone)]
pub union vec256_storage {
    // Eight 32-bit lanes.
    u32x8: [u32; 8],
    // Four 64-bit lanes.
    u64x4: [u64; 4],
    // Two 128-bit lanes; also the view written by `Default`.
    u128x2: [u128; 2],
    // Two 128-bit storage halves; the view used by `new128`, `split128` and `PartialEq`.
    sse2: [vec128_storage; 2],
    // The native 256-bit register type.
    avx: __m256i,
}
157 impl Into<vec256_storage> for [u64; 4] {
158     #[inline(always)]
into(self) -> vec256_storage159     fn into(self) -> vec256_storage {
160         vec256_storage { u64x4: self }
161     }
162 }
163 impl Default for vec256_storage {
164     #[inline(always)]
default() -> Self165     fn default() -> Self {
166         vec256_storage { u128x2: [0, 0] }
167     }
168 }
169 impl vec256_storage {
170     #[inline(always)]
new128(xs: [vec128_storage; 2]) -> Self171     pub fn new128(xs: [vec128_storage; 2]) -> Self {
172         Self { sse2: xs }
173     }
174     #[inline(always)]
split128(self) -> [vec128_storage; 2]175     pub fn split128(self) -> [vec128_storage; 2] {
176         unsafe { self.sse2 }
177     }
178 }
179 impl Eq for vec256_storage {}
180 impl PartialEq for vec256_storage {
181     #[inline(always)]
eq(&self, rhs: &Self) -> bool182     fn eq(&self, rhs: &Self) -> bool {
183         unsafe { self.sse2 == rhs.sse2 }
184     }
185 }
186 
/// 512-bit counterpart of `vec128_storage`: a union whose fields are all 64 bytes wide and
/// overlay the same memory, so each one is a different view of the same 512 bits.
#[allow(non_camel_case_types)]
#[derive(Copy, Clone)]
pub union vec512_storage {
    // Sixteen 32-bit lanes.
    u32x16: [u32; 16],
    // Eight 64-bit lanes.
    u64x8: [u64; 8],
    // Four 128-bit lanes; also the view written by `Default`.
    u128x4: [u128; 4],
    // Four 128-bit storage quarters; the view used by `new128` and `split128`.
    sse2: [vec128_storage; 4],
    // Two 256-bit storage halves; the view used by `PartialEq`.
    avx: [vec256_storage; 2],
}
196 impl Default for vec512_storage {
197     #[inline(always)]
default() -> Self198     fn default() -> Self {
199         vec512_storage {
200             u128x4: [0, 0, 0, 0],
201         }
202     }
203 }
204 impl vec512_storage {
205     #[inline(always)]
new128(xs: [vec128_storage; 4]) -> Self206     pub fn new128(xs: [vec128_storage; 4]) -> Self {
207         Self { sse2: xs }
208     }
209     #[inline(always)]
split128(self) -> [vec128_storage; 4]210     pub fn split128(self) -> [vec128_storage; 4] {
211         unsafe { self.sse2 }
212     }
213 }
214 impl Eq for vec512_storage {}
215 impl PartialEq for vec512_storage {
216     #[inline(always)]
eq(&self, rhs: &Self) -> bool217     fn eq(&self, rhs: &Self) -> bool {
218         unsafe { self.avx == rhs.avx }
219     }
220 }
221 
// Implements `Into<$lanes>` for the union `$owner` by copying out its `$field` view.
//
// `$field` must name a field of `$owner` whose type is `$lanes`.
macro_rules! impl_into {
    ($owner:ident, $lanes:ty, $field:ident) => {
        impl Into<$lanes> for $owner {
            #[inline(always)]
            fn into(self) -> $lanes {
                // SAFETY: the caller of this macro only names plain-data views for which
                // every bit pattern is a valid value.
                let view: $lanes = unsafe { self.$field };
                view
            }
        }
    };
}
// By-value conversions from each storage union into each of its integer-array views.
impl_into!(vec128_storage, [u32; 4], u32x4);
impl_into!(vec128_storage, [u64; 2], u64x2);
impl_into!(vec128_storage, [u128; 1], u128x1);
impl_into!(vec256_storage, [u32; 8], u32x8);
impl_into!(vec256_storage, [u64; 4], u64x4);
impl_into!(vec256_storage, [u128; 2], u128x2);
impl_into!(vec512_storage, [u32; 16], u32x16);
impl_into!(vec512_storage, [u64; 8], u64x8);
impl_into!(vec512_storage, [u128; 4], u128x4);
241 
/// Generate the full set of optimized implementations to take advantage of the most important
/// hardware feature sets.
///
/// This dispatcher is suitable for maximizing throughput.
///
/// The macro takes the identifier that names the machine value inside the body, the
/// identifier for its generic type, and a braced function item whose visibility (if any) is
/// wrapped in square brackets, e.g. `[pub]` or `[pub(crate)]`. With the `std` feature the
/// generated function picks an implementation at run time (AVX2, AVX, SSE4.1, SSSE3, SSE2, in
/// that order); without it the choice is made at compile time from the enabled target
/// features.
#[macro_export]
macro_rules! dispatch {
    // Function with an explicit return type.
    ($m:ident, $M:ident, { $([$vis:tt$(($scope:tt))*])* fn $func:ident($($p:ident: $pty:ty),* $(,)*) -> $out:ty $code:block }) => {
        #[cfg(feature = "std")]
        $($vis$(($scope))*)* fn $func($($p: $pty),*) -> $out {
            use std::arch::x86_64::*;
            // The user's body, generic over the machine; inlined into every wrapper below so
            // that each copy is compiled with that wrapper's target features.
            #[inline(always)]
            fn fn_impl<$M: $crate::Machine>($m: $M, $($p: $pty),*) -> $out $code
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($p: $pty),*) -> $out {
                fn_impl($crate::x86_64::SSE2::instance(), $($p),*)
            }
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_ssse3($($p: $pty),*) -> $out {
                fn_impl($crate::x86_64::SSSE3::instance(), $($p),*)
            }
            #[target_feature(enable = "sse4.1")]
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_sse41($($p: $pty),*) -> $out {
                fn_impl($crate::x86_64::SSE41::instance(), $($p),*)
            }
            #[target_feature(enable = "avx")]
            #[target_feature(enable = "sse4.1")]
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_avx($($p: $pty),*) -> $out {
                let result = fn_impl($crate::x86_64::AVX::instance(), $($p),*);
                // Clear the upper register halves before returning to the caller.
                _mm256_zeroupper();
                result
            }
            #[target_feature(enable = "avx2")]
            unsafe fn impl_avx2($($p: $pty),*) -> $out {
                let result = fn_impl($crate::x86_64::AVX2::instance(), $($p),*);
                // Clear the upper register halves before returning to the caller.
                _mm256_zeroupper();
                result
            }
            // Probe from the most to the least capable feature set.
            unsafe {
                if is_x86_feature_detected!("avx2") {
                    impl_avx2($($p),*)
                } else if is_x86_feature_detected!("avx") {
                    impl_avx($($p),*)
                } else if is_x86_feature_detected!("sse4.1") {
                    impl_sse41($($p),*)
                } else if is_x86_feature_detected!("ssse3") {
                    impl_ssse3($($p),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($p),*)
                } else {
                    unimplemented!()
                }
            }
        }
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($vis$(($scope))*)* fn $func($($p: $pty),*) -> $out {
            unsafe fn fn_impl<$M: $crate::Machine>($m: $M, $($p: $pty),*) -> $out $code
            // No run-time detection is used here: the machine is selected from the target
            // features enabled at compile time.
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($p),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($p),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($p),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($p),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($p),*)
                }
            }
        }
    };
    // Function without a return type: forward to the arm above with `-> ()`.
    ($m:ident, $M:ident, { $([$vis:tt $(($scope:tt))*])* fn $func:ident($($p:ident: $pty:ty),* $(,)*) $code:block }) => {
        dispatch!($m, $M, {
            $([$vis $(($scope))*])* fn $func($($p: $pty),*) -> () $code
        });
    }
}
322 
/// Generate only the basic implementations necessary to be able to operate efficiently on 128-bit
/// vectors on this platform. For x86-64, that would mean SSE2 and AVX.
///
/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
/// features (e.g. because they are done infrequently), so minimizing their contribution to code
/// size is more important.
///
/// The invocation syntax is the same as for `dispatch!`: machine identifier, machine type
/// identifier, and a braced function item whose visibility (if any) is wrapped in square
/// brackets.
#[macro_export]
macro_rules! dispatch_light128 {
    // Function with an explicit return type.
    ($m:ident, $M:ident, { $([$vis:tt$(($scope:tt))*])* fn $func:ident($($p:ident: $pty:ty),* $(,)*) -> $out:ty $code:block }) => {
        #[cfg(feature = "std")]
        $($vis $(($scope))*)* fn $func($($p: $pty),*) -> $out {
            use std::arch::x86_64::*;
            // The user's body, generic over the machine; inlined into both wrappers below.
            #[inline(always)]
            fn fn_impl<$M: $crate::Machine>($m: $M, $($p: $pty),*) -> $out $code
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($p: $pty),*) -> $out {
                fn_impl($crate::x86_64::SSE2::instance(), $($p),*)
            }
            #[target_feature(enable = "avx")]
            unsafe fn impl_avx($($p: $pty),*) -> $out {
                fn_impl($crate::x86_64::AVX::instance(), $($p),*)
            }
            // Only two variants are generated, to keep the code small.
            unsafe {
                if is_x86_feature_detected!("avx") {
                    impl_avx($($p),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($p),*)
                } else {
                    unimplemented!()
                }
            }
        }
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($vis$(($scope))*)* fn $func($($p: $pty),*) -> $out {
            unsafe fn fn_impl<$M: $crate::Machine>($m: $M, $($p: $pty),*) -> $out $code
            // No run-time detection is used here: the machine is selected from the target
            // features enabled at compile time.
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($p),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($p),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($p),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($p),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($p),*)
                }
            }
        }
    };
    // Function without a return type: forward to the arm above with `-> ()`.
    ($m:ident, $M:ident, { $([$vis:tt$(($scope:tt))*])* fn $func:ident($($p:ident: $pty:ty),* $(,)*) $code:block }) => {
        dispatch_light128!($m, $M, {
            $([$vis $(($scope))*])* fn $func($($p: $pty),*) -> () $code
        });
    }
}
380 
/// Generate only the basic implementations necessary to be able to operate efficiently on 256-bit
/// vectors on this platform. For x86-64, that would mean SSE2, AVX, and AVX2.
///
/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
/// features (e.g. because they are done infrequently), so minimizing their contribution to code
/// size is more important.
///
/// The invocation syntax is the same as for `dispatch!`: machine identifier, machine type
/// identifier, and a braced function item whose visibility (if any) is wrapped in square
/// brackets, e.g. `[pub]` or `[pub(crate)]`.
#[macro_export]
macro_rules! dispatch_light256 {
    // Function with an explicit return type.
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        #[cfg(feature = "std")]
        // The square brackets only delimit the visibility in the macro input; they must not
        // be emitted in front of the generated `fn`.
        $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            // The user's body, generic over the machine; inlined into every wrapper below so
            // that each copy is compiled with that wrapper's target features.
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            use std::arch::x86_64::*;
            #[target_feature(enable = "avx2")]
            unsafe fn impl_avx2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
            }
            #[target_feature(enable = "avx")]
            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
            }
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }
            // Probe from the most to the least capable feature set.
            unsafe {
                if is_x86_feature_detected!("avx2") {
                    impl_avx2($($arg),*)
                } else if is_x86_feature_detected!("avx") {
                    impl_avx($($arg),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($arg),*)
                } else {
                    unimplemented!()
                }
            }
        }
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            // No run-time detection is used here: the machine is selected from the target
            // features enabled at compile time.
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    // Function without a return type: forward to the arm above with `-> ()`.
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        dispatch_light256!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}
438