use crate::soft::{x2, x4};
use crate::types::*;
use crate::vec128_storage;
use crate::x86_64::Avx2Machine;
use crate::x86_64::SseMachine as Machine86;
use crate::x86_64::{NoS3, NoS4, YesS3, YesS4};
use core::arch::x86_64::*;
use core::marker::PhantomData;
use core::ops::{
    Add, AddAssign, BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not,
};

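// Forwarding macros for the operator impls below: `impl_binop` maps a
// `core::ops` binary trait onto a raw SSE2 intrinsic, and `impl_binop_assign`
// derives the `*Assign` form from the non-assigning one.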
macro_rules! impl_binop {
    ($vec:ident, $trait:ident, $fn:ident, $impl_fn:ident) => {
        impl<S3, S4, NI> $trait for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn $fn(self, rhs: Self) -> Self::Output {
                Self::new(unsafe { $impl_fn(self.x, rhs.x) })
            }
        }
    };
}

macro_rules! impl_binop_assign {
    ($vec:ident, $trait:ident, $fn_assign:ident, $fn:ident) => {
        impl<S3, S4, NI> $trait for $vec<S3, S4, NI>
        where
            $vec<S3, S4, NI>: Copy,
        {
            #[inline(always)]
            fn $fn_assign(&mut self, rhs: Self) {
                *self = self.$fn(rhs);
            }
        }
    };
}

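// `def_vec` defines one 128-bit vector type wrapping an `__m128i`. The
// phantom parameters record which instruction-set extensions the type may
// assume; the macro also provides storage conversion, unaligned
// little/big-endian I/O, and the bitwise operators common to all widths.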
macro_rules! def_vec {
    ($vec:ident, $word:ident) => {
        #[allow(non_camel_case_types)]
        #[derive(Copy, Clone)]
        pub struct $vec<S3, S4, NI> {
            x: __m128i,
            s3: PhantomData<S3>,
            s4: PhantomData<S4>,
            ni: PhantomData<NI>,
        }

        impl<S3, S4, NI> Store<vec128_storage> for $vec<S3, S4, NI> {
            #[inline(always)]
            unsafe fn unpack(x: vec128_storage) -> Self {
                Self::new(x.sse2)
            }
        }
        impl<S3, S4, NI> From<$vec<S3, S4, NI>> for vec128_storage {
            #[inline(always)]
            fn from(x: $vec<S3, S4, NI>) -> Self {
                vec128_storage { sse2: x.x }
            }
        }
        impl<S3, S4, NI> $vec<S3, S4, NI> {
            #[inline(always)]
            fn new(x: __m128i) -> Self {
                $vec {
                    x,
                    s3: PhantomData,
                    s4: PhantomData,
                    ni: PhantomData,
                }
            }
        }

        impl<S3, S4, NI> StoreBytes for $vec<S3, S4, NI>
        where
            Self: BSwap,
        {
            #[inline(always)]
            unsafe fn unsafe_read_le(input: &[u8]) -> Self {
                assert_eq!(input.len(), 16);
                Self::new(_mm_loadu_si128(input.as_ptr() as *const _))
            }
            #[inline(always)]
            unsafe fn unsafe_read_be(input: &[u8]) -> Self {
                assert_eq!(input.len(), 16);
                Self::new(_mm_loadu_si128(input.as_ptr() as *const _)).bswap()
            }
            #[inline(always)]
            fn write_le(self, out: &mut [u8]) {
                assert_eq!(out.len(), 16);
                unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut _, self.x) }
            }
            #[inline(always)]
            fn write_be(self, out: &mut [u8]) {
                assert_eq!(out.len(), 16);
                let x = self.bswap().x;
                unsafe {
                    _mm_storeu_si128(out.as_mut_ptr() as *mut _, x);
                }
            }
        }

        impl<S3, S4, NI> Default for $vec<S3, S4, NI> {
            #[inline(always)]
            fn default() -> Self {
                Self::new(unsafe { _mm_setzero_si128() })
            }
        }

        impl<S3, S4, NI> Not for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn not(self) -> Self::Output {
                unsafe {
                    let ff = _mm_set1_epi64x(-1i64);
                    self ^ Self::new(ff)
                }
            }
        }

        impl<S3: Copy, S4: Copy, NI: Copy> BitOps0 for $vec<S3, S4, NI> {}
        impl_binop!($vec, BitAnd, bitand, _mm_and_si128);
        impl_binop!($vec, BitOr, bitor, _mm_or_si128);
        impl_binop!($vec, BitXor, bitxor, _mm_xor_si128);
        impl_binop_assign!($vec, BitAndAssign, bitand_assign, bitand);
        impl_binop_assign!($vec, BitOrAssign, bitor_assign, bitor);
        impl_binop_assign!($vec, BitXorAssign, bitxor_assign, bitxor);
        impl<S3: Copy, S4: Copy, NI: Copy> AndNot for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn andnot(self, rhs: Self) -> Self {
                Self::new(unsafe { _mm_andnot_si128(self.x, rhs.x) })
            }
        }
    };
}

macro_rules! impl_bitops32 {
    ($vec:ident) => {
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps32 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord32
        {
        }
    };
}

macro_rules! impl_bitops64 {
    ($vec:ident) => {
        impl_bitops32!($vec);
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps64 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord64 + RotateEachWord32
        {
        }
    };
}

macro_rules! impl_bitops128 {
    ($vec:ident) => {
        impl_bitops64!($vec);
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps128 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord128
        {
        }
    };
}

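// Word rotations. When SSSE3 is available (`YesS3`), rotations by a whole
// number of bytes compile to a single `pshufb` (`rotr_32_s3`/`rotr_64_s3`;
// the constants are byte-permutation masks). Other amounts, and the plain
// SSE2 path (`NoS3`), use the usual shift/shift/or sequence.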
macro_rules! rotr_32_s3 {
    ($name:ident, $k0:expr, $k1:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
        }
    };
}
macro_rules! rotr_32 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_epi32(self.x, $i as i32),
                    _mm_slli_epi32(self.x, 32 - $i as i32),
                )
            })
        }
    };
}
impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<YesS3, S4, NI> {
    rotr_32!(rotate_each_word_right7, 7);
    rotr_32_s3!(
        rotate_each_word_right8,
        0x0c0f_0e0d_080b_0a09,
        0x0407_0605_0003_0201
    );
    rotr_32!(rotate_each_word_right11, 11);
    rotr_32!(rotate_each_word_right12, 12);
    rotr_32_s3!(
        rotate_each_word_right16,
        0x0d0c_0f0e_0908_0b0a,
        0x0504_0706_0100_0302
    );
    rotr_32!(rotate_each_word_right20, 20);
    rotr_32_s3!(
        rotate_each_word_right24,
        0x0e0d_0c0f_0a09_080b,
        0x0605_0407_0201_0003
    );
    rotr_32!(rotate_each_word_right25, 25);
}
impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<NoS3, S4, NI> {
    rotr_32!(rotate_each_word_right7, 7);
    rotr_32!(rotate_each_word_right8, 8);
    rotr_32!(rotate_each_word_right11, 11);
    rotr_32!(rotate_each_word_right12, 12);
    #[inline(always)]
    fn rotate_each_word_right16(self) -> Self {
        Self::new(swap16_s2(self.x))
    }
    rotr_32!(rotate_each_word_right20, 20);
    rotr_32!(rotate_each_word_right24, 24);
    rotr_32!(rotate_each_word_right25, 25);
}

macro_rules! rotr_64_s3 {
    ($name:ident, $k0:expr, $k1:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
        }
    };
}
macro_rules! rotr_64 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_epi64(self.x, $i as i32),
                    _mm_slli_epi64(self.x, 64 - $i as i32),
                )
            })
        }
    };
}
impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<YesS3, S4, NI> {
    rotr_64!(rotate_each_word_right7, 7);
    rotr_64_s3!(
        rotate_each_word_right8,
        0x080f_0e0d_0c0b_0a09,
        0x0007_0605_0403_0201
    );
    rotr_64!(rotate_each_word_right11, 11);
    rotr_64!(rotate_each_word_right12, 12);
    rotr_64_s3!(
        rotate_each_word_right16,
        0x0908_0f0e_0d0c_0b0a,
        0x0100_0706_0504_0302
    );
    rotr_64!(rotate_each_word_right20, 20);
    rotr_64_s3!(
        rotate_each_word_right24,
        0x0a09_080f_0e0d_0c0b,
        0x0201_0007_0605_0403
    );
    rotr_64!(rotate_each_word_right25, 25);
}
impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<NoS3, S4, NI> {
    rotr_64!(rotate_each_word_right7, 7);
    rotr_64!(rotate_each_word_right8, 8);
    rotr_64!(rotate_each_word_right11, 11);
    rotr_64!(rotate_each_word_right12, 12);
    #[inline(always)]
    fn rotate_each_word_right16(self) -> Self {
        Self::new(swap16_s2(self.x))
    }
    rotr_64!(rotate_each_word_right20, 20);
    rotr_64!(rotate_each_word_right24, 24);
    rotr_64!(rotate_each_word_right25, 25);
}
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    fn rotate_each_word_right32(self) -> Self {
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
}

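// Note that `_mm_srli_si128`/`_mm_slli_si128` shift by whole bytes, not bits,
// so the macro below is not a true bit-rotation for most amounts; see the
// TODOs. These impls seem to exist mainly to satisfy trait bounds.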
macro_rules! rotr_128 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_si128(self.x, $i as i32),
                    _mm_slli_si128(self.x, 128 - $i as i32),
                )
            })
        }
    };
}
// TODO: completely unoptimized
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord32 for u128x1_sse2<S3, S4, NI> {
    rotr_128!(rotate_each_word_right7, 7);
    rotr_128!(rotate_each_word_right8, 8);
    rotr_128!(rotate_each_word_right11, 11);
    rotr_128!(rotate_each_word_right12, 12);
    rotr_128!(rotate_each_word_right16, 16);
    rotr_128!(rotate_each_word_right20, 20);
    rotr_128!(rotate_each_word_right24, 24);
    rotr_128!(rotate_each_word_right25, 25);
}
// TODO: completely unoptimized
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u128x1_sse2<S3, S4, NI> {
    rotr_128!(rotate_each_word_right32, 32);
}
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord128 for u128x1_sse2<S3, S4, NI> {}

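// The concrete 128-bit vector types. S3/S4 are instantiated with the
// YesS3/NoS3 and YesS4/NoS4 markers for SSSE3 and SSE4.1; NI is presumably
// the analogous marker for AES-NI, carried through but not consulted here.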
def_vec!(u32x4_sse2, u32);
def_vec!(u64x2_sse2, u64);
def_vec!(u128x1_sse2, u128);

impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            let x = _mm_cvtsi128_si64(self.x) as u64;
            let y = _mm_extract_epi64(self.x, 1) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            let mut x = _mm_cvtsi64_si128((xs[0] as u64 | ((xs[1] as u64) << 32)) as i64);
            x = _mm_insert_epi64(x, (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64, 1);
            Self::new(x)
        }
    }
}
impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            let x = _mm_cvtsi128_si64(self.x) as u64;
            let y = _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b1110_1110)) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            let x = (xs[0] as u64 | ((xs[1] as u64) << 32)) as i64;
            let y = (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64;
            let x = _mm_cvtsi64_si128(x);
            let y = _mm_slli_si128(_mm_cvtsi64_si128(y), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            [
                _mm_cvtsi128_si64(self.x) as u64,
                _mm_extract_epi64(self.x, 1) as u64,
            ]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            let mut x = _mm_cvtsi64_si128(xs[0] as i64);
            x = _mm_insert_epi64(x, xs[1] as i64, 1);
            Self::new(x)
        }
    }
}
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            [
                _mm_cvtsi128_si64(self.x) as u64,
                _mm_cvtsi128_si64(_mm_srli_si128(self.x, 8)) as u64,
            ]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            let x = _mm_cvtsi64_si128(xs[0] as i64);
            let y = _mm_slli_si128(_mm_cvtsi64_si128(xs[1] as i64), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
impl<S3, S4, NI> MultiLane<[u128; 1]> for u128x1_sse2<S3, S4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u128; 1] {
        unimplemented!()
    }
    #[inline(always)]
    fn from_lanes(xs: [u128; 1]) -> Self {
        unimplemented!("{:?}", xs)
    }
}

impl<S3, S4, NI> MultiLane<[u64; 4]> for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: MultiLane<[u64; 2]> + Copy,
{
    #[inline(always)]
    fn to_lanes(self) -> [u64; 4] {
        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
        [a[0], a[1], b[0], b[1]]
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 4]) -> Self {
        let (a, b) = (
            u64x2_sse2::from_lanes([xs[0], xs[1]]),
            u64x2_sse2::from_lanes([xs[2], xs[3]]),
        );
        x2::new([a, b])
    }
}

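// Reinterpreting conversions between the 128-bit types; the underlying
// `__m128i` passes through unchanged.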
macro_rules! impl_into {
    ($from:ident, $to:ident) => {
        impl<S3, S4, NI> From<$from<S3, S4, NI>> for $to<S3, S4, NI> {
            #[inline(always)]
            fn from(x: $from<S3, S4, NI>) -> Self {
                $to::new(x.x)
            }
        }
    };
}

impl_into!(u128x1_sse2, u32x4_sse2);
impl_into!(u128x1_sse2, u64x2_sse2);

impl_bitops32!(u32x4_sse2);
impl_bitops64!(u64x2_sse2);
impl_bitops128!(u128x1_sse2);

impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u32x4_sse2<S3, S4, NI> where
    u32x4_sse2<S3, S4, NI>: BSwap
{
}
impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u64x2_sse2<S3, S4, NI> where
    u64x2_sse2<S3, S4, NI>: BSwap
{
}
impl_binop!(u32x4_sse2, Add, add, _mm_add_epi32);
impl_binop!(u64x2_sse2, Add, add, _mm_add_epi64);
impl_binop_assign!(u32x4_sse2, AddAssign, add_assign, add);
impl_binop_assign!(u64x2_sse2, AddAssign, add_assign, add);

impl<S3: Copy, S4: Copy, NI: Copy> u32x4<Machine86<S3, S4, NI>> for u32x4_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
    Machine86<S3, S4, NI>: Machine,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2<Machine86<S3, S4, NI>> for u64x2_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>:
        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
    Machine86<S3, S4, NI>: Machine,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x1<Machine86<S3, S4, NI>> for u128x1_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4>,
    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}

impl<NI: Copy> u32x4<Avx2Machine<NI>> for u32x4_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
    Machine86<YesS3, YesS4, NI>: Machine,
{
}
impl<NI: Copy> u64x2<Avx2Machine<NI>> for u64x2_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>:
        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
    Machine86<YesS3, YesS4, NI>: Machine,
{
}
impl<NI: Copy> u128x1<Avx2Machine<NI>> for u128x1_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<YesS3, YesS4, NI>: Machine,
    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u32x4>,
    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u64x2>,
{
}

impl<S3, S4, NI> UnsafeFrom<[u32; 4]> for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    unsafe fn unsafe_from(xs: [u32; 4]) -> Self {
        Self::new(_mm_set_epi32(
            xs[3] as i32,
            xs[2] as i32,
            xs[1] as i32,
            xs[0] as i32,
        ))
    }
}

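// Lane extract/insert. With SSE4.1 (`YesS4`), insertion is a single
// `pinsrd`; the `NoS4` fallback builds it from SSE2 operations, rotating the
// target lane into position 0, splicing in the new value, and rotating back.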
impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, YesS4, NI>
where
    Self: MultiLane<[u32; 4]>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u32 {
        self.to_lanes()[i as usize]
    }
    #[inline(always)]
    fn insert(self, v: u32, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi32(self.x, v as i32, 0),
                1 => _mm_insert_epi32(self.x, v as i32, 1),
                2 => _mm_insert_epi32(self.x, v as i32, 2),
                3 => _mm_insert_epi32(self.x, v as i32, 3),
                _ => unreachable!(),
            }
        })
    }
}
impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, NoS4, NI>
where
    Self: MultiLane<[u32; 4]>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u32 {
        self.to_lanes()[i as usize]
    }
    #[inline(always)]
    fn insert(self, v: u32, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => {
                    let x = _mm_andnot_si128(_mm_cvtsi32_si128(-1), self.x);
                    _mm_or_si128(x, _mm_cvtsi32_si128(v as i32))
                }
                1 => {
                    let mut x = _mm_shuffle_epi32(self.x, 0b0111_1000);
                    x = _mm_slli_si128(x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b1110_0001)
                }
                2 => {
                    let mut x = _mm_shuffle_epi32(self.x, 0b1011_0100);
                    x = _mm_slli_si128(x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b1100_1001)
                }
                3 => {
                    let mut x = _mm_slli_si128(self.x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b0011_1001)
                }
                _ => unreachable!(),
            }
        })
    }
}

impl<S3, S4, NI> LaneWords4 for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn shuffle_lane_words2301(self) -> Self {
        self.shuffle2301()
    }
    #[inline(always)]
    fn shuffle_lane_words1230(self) -> Self {
        self.shuffle1230()
    }
    #[inline(always)]
    fn shuffle_lane_words3012(self) -> Self {
        self.shuffle3012()
    }
}

impl<S3, S4, NI> Words4 for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b1001_0011) })
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0011_1001) })
    }
}

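// Words4 for the emulated u64x4 (a pair of u64x2 halves): `shuffle2301` just
// swaps the halves, while the cross-half shuffles use `palignr` under SSSE3
// and a shift/or sequence under plain SSE2.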
impl<S4, NI> Words4 for u64x4_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        unsafe {
            x2::new([
                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
            ])
        }
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        unsafe {
            x2::new([
                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
            ])
        }
    }
}
impl<S4, NI> Words4 for u64x4_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        unsafe {
            let a = _mm_srli_si128(self.0[0].x, 8);
            let b = _mm_slli_si128(self.0[0].x, 8);
            let c = _mm_srli_si128(self.0[1].x, 8);
            let d = _mm_slli_si128(self.0[1].x, 8);
            let da = _mm_or_si128(d, a);
            let bc = _mm_or_si128(b, c);
            x2::new([u64x2_sse2::new(da), u64x2_sse2::new(bc)])
        }
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        unsafe {
            let a = _mm_srli_si128(self.0[0].x, 8);
            let b = _mm_slli_si128(self.0[0].x, 8);
            let c = _mm_srli_si128(self.0[1].x, 8);
            let d = _mm_slli_si128(self.0[1].x, 8);
            let da = _mm_or_si128(d, a);
            let bc = _mm_or_si128(b, c);
            x2::new([u64x2_sse2::new(bc), u64x2_sse2::new(da)])
        }
    }
}

impl<S3, S4, NI> UnsafeFrom<[u64; 2]> for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    unsafe fn unsafe_from(xs: [u64; 2]) -> Self {
        Self::new(_mm_set_epi64x(xs[1] as i64, xs[0] as i64))
    }
}

impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        unsafe {
            match i {
                0 => _mm_cvtsi128_si64(self.x) as u64,
                1 => _mm_extract_epi64(self.x, 1) as u64,
                _ => unreachable!(),
            }
        }
    }
    #[inline(always)]
    fn insert(self, x: u64, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi64(self.x, x as i64, 0),
                1 => _mm_insert_epi64(self.x, x as i64, 1),
                _ => unreachable!(),
            }
        })
    }
}
impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        unsafe {
            match i {
                0 => _mm_cvtsi128_si64(self.x) as u64,
                1 => _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b1110_1110)) as u64,
                _ => unreachable!(),
            }
        }
    }
    #[inline(always)]
    fn insert(self, x: u64, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_or_si128(
                    _mm_andnot_si128(_mm_cvtsi64_si128(-1), self.x),
                    _mm_cvtsi64_si128(x as i64),
                ),
                1 => _mm_or_si128(
                    _mm_move_epi64(self.x),
                    _mm_slli_si128(_mm_cvtsi64_si128(x as i64), 8),
                ),
                _ => unreachable!(),
            }
        })
    }
}

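// Byte swaps. The SSSE3 versions are single `pshufb`s; `bswap32_s2` is the
// SSE2 fallback, widening each half to 16-bit lanes, reversing bytes with
// word shuffles, and packing back down. The u64x2 SSE2 path reuses it after
// swapping the 32-bit words of each 64-bit lane.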
impl<S4, NI> BSwap for u32x4_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            let k = _mm_set_epi64x(0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
#[inline(always)]
fn bswap32_s2(x: __m128i) -> __m128i {
    unsafe {
        let mut y = _mm_unpacklo_epi8(x, _mm_setzero_si128());
        y = _mm_shufflehi_epi16(y, 0b0001_1011);
        y = _mm_shufflelo_epi16(y, 0b0001_1011);
        let mut z = _mm_unpackhi_epi8(x, _mm_setzero_si128());
        z = _mm_shufflehi_epi16(z, 0b0001_1011);
        z = _mm_shufflelo_epi16(z, 0b0001_1011);
        _mm_packus_epi16(y, z)
    }
}
impl<S4, NI> BSwap for u32x4_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(bswap32_s2(self.x))
    }
}

impl<S4, NI> BSwap for u64x2_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            let k = _mm_set_epi64x(0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
impl<S4, NI> BSwap for u64x2_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe { bswap32_s2(_mm_shuffle_epi32(self.x, 0b1011_0001)) })
    }
}

impl<S4, NI> BSwap for u128x1_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            // Reverse all 16 bytes; an ascending mask here would be the
            // identity permutation and swap nothing.
            let k = _mm_set_epi64x(0x0001_0203_0405_0607, 0x0809_0a0b_0c0d_0e0f);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
impl<S4, NI> BSwap for u128x1_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        unimplemented!()
    }
}

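// `swapi!` exchanges adjacent $i-bit groups with the classic mask/shift/or
// bit trick; `swap16_s2` swaps adjacent 16-bit words using two SSE2 word
// shuffles.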
macro_rules! swapi {
    ($x:expr, $i:expr, $k:expr) => {
        unsafe {
            const K: u8 = $k;
            let k = _mm_set1_epi8(K as i8);
            u128x1_sse2::new(_mm_or_si128(
                _mm_srli_epi16(_mm_and_si128($x.x, k), $i),
                _mm_and_si128(_mm_slli_epi16($x.x, $i), k),
            ))
        }
    };
}
#[inline(always)]
fn swap16_s2(x: __m128i) -> __m128i {
    unsafe { _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0b1011_0001), 0b1011_0001) }
}
impl<S4, NI> Swap64 for u128x1_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn swap1(self) -> Self {
        swapi!(self, 1, 0xaa)
    }
    #[inline(always)]
    fn swap2(self) -> Self {
        swapi!(self, 2, 0xcc)
    }
    #[inline(always)]
    fn swap4(self) -> Self {
        swapi!(self, 4, 0xf0)
    }
    #[inline(always)]
    fn swap8(self) -> Self {
        u128x1_sse2::new(unsafe {
            let k = _mm_set_epi64x(0x0e0f_0c0d_0a0b_0809, 0x0607_0405_0203_0001);
            _mm_shuffle_epi8(self.x, k)
        })
    }
    #[inline(always)]
    fn swap16(self) -> Self {
        u128x1_sse2::new(unsafe {
            let k = _mm_set_epi64x(0x0d0c_0f0e_0908_0b0a, 0x0504_0706_0100_0302);
            _mm_shuffle_epi8(self.x, k)
        })
    }
    #[inline(always)]
    fn swap32(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
    #[inline(always)]
    fn swap64(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
}
impl<S4, NI> Swap64 for u128x1_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn swap1(self) -> Self {
        swapi!(self, 1, 0xaa)
    }
    #[inline(always)]
    fn swap2(self) -> Self {
        swapi!(self, 2, 0xcc)
    }
    #[inline(always)]
    fn swap4(self) -> Self {
        swapi!(self, 4, 0xf0)
    }
    #[inline(always)]
    fn swap8(self) -> Self {
        u128x1_sse2::new(unsafe {
            _mm_or_si128(_mm_slli_epi16(self.x, 8), _mm_srli_epi16(self.x, 8))
        })
    }
    #[inline(always)]
    fn swap16(self) -> Self {
        u128x1_sse2::new(swap16_s2(self.x))
    }
    #[inline(always)]
    fn swap32(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
    #[inline(always)]
    fn swap64(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
}

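// Zero-sized tags that distinguish otherwise identical `x2` compositions:
// u64x2x2 (tag G0) and u64x4 (tag G1) carry the same data but implement
// different traits.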
#[derive(Copy, Clone)]
pub struct G0;
#[derive(Copy, Clone)]
pub struct G1;

#[allow(non_camel_case_types)]
pub type u32x4x2_sse2<S3, S4, NI> = x2<u32x4_sse2<S3, S4, NI>, G0>;
#[allow(non_camel_case_types)]
pub type u64x2x2_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G0>;
#[allow(non_camel_case_types)]
pub type u64x4_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G1>;
#[allow(non_camel_case_types)]
pub type u128x2_sse2<S3, S4, NI> = x2<u128x1_sse2<S3, S4, NI>, G0>;

#[allow(non_camel_case_types)]
pub type u32x4x4_sse2<S3, S4, NI> = x4<u32x4_sse2<S3, S4, NI>>;
#[allow(non_camel_case_types)]
pub type u64x2x4_sse2<S3, S4, NI> = x4<u64x2_sse2<S3, S4, NI>>;
#[allow(non_camel_case_types)]
pub type u128x4_sse2<S3, S4, NI> = x4<u128x1_sse2<S3, S4, NI>>;

impl<S3, S4, NI> Vector<[u32; 16]> for u32x4x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn to_scalars(self) -> [u32; 16] {
        unsafe { core::mem::transmute(self) }
    }
}

impl<S3: Copy, S4: Copy, NI: Copy> u32x4x2<Machine86<S3, S4, NI>> for u32x4x2_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u32x4x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 2]>,
    u32x4x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u32x4>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2x2<Machine86<S3, S4, NI>> for u64x2x2_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x2x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 2]>,
    u64x2x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x4<Machine86<S3, S4, NI>> for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x4_sse2<S3, S4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x2<Machine86<S3, S4, NI>> for u128x2_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 2]>,
    u128x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u128x1>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x2>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x2>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x4>,
{
}

impl<NI: Copy> u32x4x2<Avx2Machine<NI>> for u32x4x2_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u32x4x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 2]>,
    u32x4x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u32x4>,
{
}
impl<NI: Copy> u64x2x2<Avx2Machine<NI>> for u64x2x2_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x2x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 2]>,
    u64x2x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u64x2>,
{
}
impl<NI: Copy> u64x4<Avx2Machine<NI>> for u64x4_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x4_sse2<YesS3, YesS4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
{
}
impl<NI: Copy> u128x2<Avx2Machine<NI>> for u128x2_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
    Avx2Machine<NI>: Machine,
    u128x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 2]>,
    u128x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u128x1>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x2>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x2>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x4>,
{
}

impl<S3, S4, NI> Vec4<u64> for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: Copy + Vec2<u64>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        match i {
            0 => self.0[0].extract(0),
            1 => self.0[0].extract(1),
            2 => self.0[1].extract(0),
            3 => self.0[1].extract(1),
            _ => panic!(),
        }
    }
    #[inline(always)]
    fn insert(mut self, w: u64, i: u32) -> Self {
        match i {
            0 => self.0[0] = self.0[0].insert(w, 0),
            1 => self.0[0] = self.0[0].insert(w, 1),
            2 => self.0[1] = self.0[1].insert(w, 0),
            3 => self.0[1] = self.0[1].insert(w, 1),
            _ => panic!(),
        };
        self
    }
}

impl<S3: Copy, S4: Copy, NI: Copy> u32x4x4<Machine86<S3, S4, NI>> for u32x4x4_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u32x4x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 4]>,
    u32x4x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u32x4>,
    u32x4x4_sse2<S3, S4, NI>: Vec4Ext<<Machine86<S3, S4, NI> as Machine>::u32x4>,
    u32x4x4_sse2<S3, S4, NI>: Vector<[u32; 16]>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2x4<Machine86<S3, S4, NI>> for u64x2x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x2x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 4]>,
    u64x2x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x4<Machine86<S3, S4, NI>> for u128x4_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 4]>,
    u128x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u128x1>,
    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x4>,
    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x4>,
{
}

impl<NI: Copy> u64x2x4<Avx2Machine<NI>> for u64x2x4_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x2x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 4]>,
    u64x2x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u64x2>,
{
}
impl<NI: Copy> u128x4<Avx2Machine<NI>> for u128x4_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
    Avx2Machine<NI>: Machine,
    u128x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 4]>,
    u128x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u128x1>,
    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x4>,
    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x4>,
{
}

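// Lift the scalar `From` conversions above over the x2/x4 compositions.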
macro_rules! impl_into_x {
    ($from:ident, $to:ident) => {
        impl<S3: Copy, S4: Copy, NI: Copy, Gf, Gt> From<x2<$from<S3, S4, NI>, Gf>>
            for x2<$to<S3, S4, NI>, Gt>
        {
            #[inline(always)]
            fn from(x: x2<$from<S3, S4, NI>, Gf>) -> Self {
                x2::new([$to::from(x.0[0]), $to::from(x.0[1])])
            }
        }
        impl<S3: Copy, S4: Copy, NI: Copy> From<x4<$from<S3, S4, NI>>> for x4<$to<S3, S4, NI>> {
            #[inline(always)]
            fn from(x: x4<$from<S3, S4, NI>>) -> Self {
                x4::new([
                    $to::from(x.0[0]),
                    $to::from(x.0[1]),
                    $to::from(x.0[2]),
                    $to::from(x.0[3]),
                ])
            }
        }
    };
}
impl_into_x!(u128x1_sse2, u64x2_sse2);
impl_into_x!(u128x1_sse2, u32x4_sse2);

///// Debugging

use core::fmt::{Debug, Formatter, Result};

impl<W: PartialEq, G> PartialEq for x2<W, G> {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        self.0[0] == rhs.0[0] && self.0[1] == rhs.0[1]
    }
}

#[allow(unused)]
#[inline(always)]
unsafe fn eq128_s4(x: __m128i, y: __m128i) -> bool {
    let q = _mm_shuffle_epi32(_mm_cmpeq_epi64(x, y), 0b1100_0110);
    _mm_cvtsi128_si64(q) == -1
}

#[inline(always)]
unsafe fn eq128_s2(x: __m128i, y: __m128i) -> bool {
    let q = _mm_cmpeq_epi32(x, y);
    let p = _mm_cvtsi128_si64(_mm_srli_si128(q, 8));
    let q = _mm_cvtsi128_si64(q);
    (p & q) == -1
}

impl<S3, S4, NI> PartialEq for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
impl<S3, S4, NI> Debug for u32x4_sse2<S3, S4, NI>
where
    Self: Copy + MultiLane<[u32; 4]>,
{
    #[cold]
    fn fmt(&self, fmt: &mut Formatter) -> Result {
        fmt.write_fmt(format_args!("{:08x?}", &self.to_lanes()))
    }
}

impl<S3, S4, NI> PartialEq for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
impl<S3, S4, NI> Debug for u64x2_sse2<S3, S4, NI>
where
    Self: Copy + MultiLane<[u64; 2]>,
{
    #[cold]
    fn fmt(&self, fmt: &mut Formatter) -> Result {
        fmt.write_fmt(format_args!("{:016x?}", &self.to_lanes()))
    }
}

impl<S3, S4, NI> Debug for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: Copy + MultiLane<[u64; 2]>,
{
    #[cold]
    fn fmt(&self, fmt: &mut Formatter) -> Result {
        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
        fmt.write_fmt(format_args!("{:016x?}", &[a[0], a[1], b[0], b[1]]))
    }
}

#[cfg(test)]
#[cfg(target_arch = "x86_64")]
mod test {
    use super::*;
    use crate::x86_64::{SSE2, SSE41, SSSE3};
    use crate::Machine;

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_bswap32_s2_vs_s3() {
        let xs = [0x0f0e_0d0c, 0x0b0a_0908, 0x0706_0504, 0x0302_0100];
        let ys = [0x0c0d_0e0f, 0x0809_0a0b, 0x0405_0607, 0x0001_0203];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.bswap()
        };

        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.bswap()
        };

        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
        assert_eq!(x_s2, s2.vec(ys));
    }

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_bswap64_s2_vs_s3() {
        let xs = [0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100];
        let ys = [0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
            x_s2.bswap()
        };

        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
            x_s3.bswap()
        };

        assert_eq!(x_s2, s2.vec(ys));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
    }

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_shuffle32_s2_vs_s3() {
        let xs = [0x0, 0x1, 0x2, 0x3];
        let ys = [0x2, 0x3, 0x0, 0x1];
        let zs = [0x1, 0x2, 0x3, 0x0];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.shuffle2301()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.shuffle2301()
        };
        assert_eq!(x_s2, s2.vec(ys));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.shuffle3012()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.shuffle3012()
        };
        assert_eq!(x_s2, s2.vec(zs));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });

        let x_s2 = x_s2.shuffle1230();
        let x_s3 = x_s3.shuffle1230();
        assert_eq!(x_s2, s2.vec(xs));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
    }

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_shuffle64_s2_vs_s3() {
        let xs = [0x0, 0x1, 0x2, 0x3];
        let ys = [0x2, 0x3, 0x0, 0x1];
        let zs = [0x1, 0x2, 0x3, 0x0];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
            x_s2.shuffle2301()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
            x_s3.shuffle2301()
        };
        assert_eq!(x_s2, s2.vec(ys));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
            x_s2.shuffle3012()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
            x_s3.shuffle3012()
        };
        assert_eq!(x_s2, s2.vec(zs));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });

        let x_s2 = x_s2.shuffle1230();
        let x_s3 = x_s3.shuffle1230();
        assert_eq!(x_s2, s2.vec(xs));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_lanes_u32x4() {
        let xs = [0x1, 0x2, 0x3, 0x4];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };
        let s4 = unsafe { SSE41::instance() };

        {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            let y_s2 = <SSE2 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s2, y_s2);
            assert_eq!(xs, y_s2.to_lanes());
        }

        {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            let y_s3 = <SSSE3 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s3, y_s3);
            assert_eq!(xs, y_s3.to_lanes());
        }

        {
            let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
            let y_s4 = <SSE41 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s4, y_s4);
            assert_eq!(xs, y_s4.to_lanes());
        }
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_lanes_u64x2() {
        let xs = [0x1, 0x2];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };
        let s4 = unsafe { SSE41::instance() };

        {
            let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
            let y_s2 = <SSE2 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s2, y_s2);
            assert_eq!(xs, y_s2.to_lanes());
        }

        {
            let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
            let y_s3 = <SSSE3 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s3, y_s3);
            assert_eq!(xs, y_s3.to_lanes());
        }

        {
            let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
            let y_s4 = <SSE41 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s4, y_s4);
            assert_eq!(xs, y_s4.to_lanes());
        }
    }

    #[test]
    fn test_vec4_u32x4_s2() {
        let xs = [1, 2, 3, 4];
        let s2 = unsafe { SSE2::instance() };
        let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
        assert_eq!(x_s2.extract(0), 1);
        assert_eq!(x_s2.extract(1), 2);
        assert_eq!(x_s2.extract(2), 3);
        assert_eq!(x_s2.extract(3), 4);
        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2, 3, 4]));
        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf, 3, 4]));
        assert_eq!(x_s2.insert(0xf, 2), s2.vec([1, 2, 0xf, 4]));
        assert_eq!(x_s2.insert(0xf, 3), s2.vec([1, 2, 3, 0xf]));
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_vec4_u32x4_s4() {
        let xs = [1, 2, 3, 4];
        let s4 = unsafe { SSE41::instance() };
        let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
        assert_eq!(x_s4.extract(0), 1);
        assert_eq!(x_s4.extract(1), 2);
        assert_eq!(x_s4.extract(2), 3);
        assert_eq!(x_s4.extract(3), 4);
        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2, 3, 4]));
        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf, 3, 4]));
        assert_eq!(x_s4.insert(0xf, 2), s4.vec([1, 2, 0xf, 4]));
        assert_eq!(x_s4.insert(0xf, 3), s4.vec([1, 2, 3, 0xf]));
    }

    #[test]
    fn test_vec2_u64x2_s2() {
        let xs = [0x1, 0x2];
        let s2 = unsafe { SSE2::instance() };
        let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
        assert_eq!(x_s2.extract(0), 1);
        assert_eq!(x_s2.extract(1), 2);
        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2]));
        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf]));
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_vec4_u64x2_s4() {
        let xs = [0x1, 0x2];
        let s4 = unsafe { SSE41::instance() };
        let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
        assert_eq!(x_s4.extract(0), 1);
        assert_eq!(x_s4.extract(1), 2);
        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2]));
        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf]));
    }
}

pub mod avx2 {
    #![allow(non_camel_case_types)]
    use crate::soft::{x2, x4};
    use crate::types::*;
    use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2, G0};
    use crate::x86_64::{vec256_storage, vec512_storage, Avx2Machine, YesS3, YesS4};
    use core::arch::x86_64::*;
    use core::marker::PhantomData;
    use core::ops::*;

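    // Native AVX2 form of u32x4x2: both u32x4 halves live in a single
    // `__m256i` rather than in two `__m128i`s composed via `x2`.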
    #[derive(Copy, Clone)]
    pub struct u32x4x2_avx2<NI> {
        x: __m256i,
        ni: PhantomData<NI>,
    }

    impl<NI> u32x4x2_avx2<NI> {
        #[inline(always)]
        fn new(x: __m256i) -> Self {
            Self { x, ni: PhantomData }
        }
    }

    impl<NI> u32x4x2<Avx2Machine<NI>> for u32x4x2_avx2<NI> where NI: Copy {}
    impl<NI> Store<vec256_storage> for u32x4x2_avx2<NI> {
        #[inline(always)]
        unsafe fn unpack(p: vec256_storage) -> Self {
            Self::new(p.avx)
        }
    }
    impl<NI> StoreBytes for u32x4x2_avx2<NI> {
        #[inline(always)]
        unsafe fn unsafe_read_le(input: &[u8]) -> Self {
            assert_eq!(input.len(), 32);
            Self::new(_mm256_loadu_si256(input.as_ptr() as *const _))
        }
        #[inline(always)]
        unsafe fn unsafe_read_be(input: &[u8]) -> Self {
            Self::unsafe_read_le(input).bswap()
        }
        #[inline(always)]
        fn write_le(self, out: &mut [u8]) {
            unsafe {
                assert_eq!(out.len(), 32);
                _mm256_storeu_si256(out.as_mut_ptr() as *mut _, self.x)
            }
        }
        #[inline(always)]
        fn write_be(self, out: &mut [u8]) {
            self.bswap().write_le(out)
        }
    }
    impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 2]> for u32x4x2_avx2<NI> {
        #[inline(always)]
        fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 2] {
            unsafe {
                [
                    u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)),
                    u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)),
                ]
            }
        }
        #[inline(always)]
        fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 2]) -> Self {
            Self::new(unsafe { _mm256_setr_m128i(x[0].x, x[1].x) })
        }
    }
    impl<NI> Vec2<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x2_avx2<NI> {
        #[inline(always)]
        fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> {
            unsafe {
                match i {
                    0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)),
                    1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)),
                    _ => panic!(),
                }
            }
        }
        #[inline(always)]
        fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self {
            Self::new(unsafe {
                match i {
                    0 => _mm256_inserti128_si256(self.x, w.x, 0),
                    1 => _mm256_inserti128_si256(self.x, w.x, 1),
                    _ => panic!(),
                }
            })
        }
    }
    impl<NI> BitOps32 for u32x4x2_avx2<NI> where NI: Copy {}
    impl<NI> ArithOps for u32x4x2_avx2<NI> where NI: Copy {}
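    // Same rotation strategy as the SSE2 types above: byte-granular amounts
    // via `vpshufb` (the shuffle mask is repeated for both 128-bit lanes),
    // everything else via shift/shift/or.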
1467     macro_rules! shuf_lane_bytes {
1468         ($name:ident, $k0:expr, $k1:expr) => {
1469             #[inline(always)]
1470             fn $name(self) -> Self {
1471                 Self::new(unsafe {
1472                     _mm256_shuffle_epi8(self.x, _mm256_set_epi64x($k0, $k1, $k0, $k1))
1473                 })
1474             }
1475         };
1476     }
    macro_rules! rotr_32 {
        ($name:ident, $i:expr) => {
            #[inline(always)]
            fn $name(self) -> Self {
                // Rotate each 32-bit word right by $i: (x >> $i) | (x << (32 - $i)).
                Self::new(unsafe {
                    _mm256_or_si256(
                        _mm256_srli_epi32(self.x, $i as i32),
                        _mm256_slli_epi32(self.x, 32 - $i as i32),
                    )
                })
            }
        };
    }
    impl<NI: Copy> RotateEachWord32 for u32x4x2_avx2<NI> {
        rotr_32!(rotate_each_word_right7, 7);
        shuf_lane_bytes!(
            rotate_each_word_right8,
            0x0c0f_0e0d_080b_0a09,
            0x0407_0605_0003_0201
        );
        rotr_32!(rotate_each_word_right11, 11);
        rotr_32!(rotate_each_word_right12, 12);
        shuf_lane_bytes!(
            rotate_each_word_right16,
            0x0d0c_0f0e_0908_0b0a,
            0x0504_0706_0100_0302
        );
        rotr_32!(rotate_each_word_right20, 20);
        shuf_lane_bytes!(
            rotate_each_word_right24,
            0x0e0d_0c0f_0a09_080b,
            0x0605_0407_0201_0003
        );
        rotr_32!(rotate_each_word_right25, 25);
    }
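    // Hedged sanity sketch, not from the original source: both rotation
    // strategies (shift-or and byte shuffle) should agree with scalar
    // u32::rotate_right. Same `()` NI stand-in and AVX2 assumption as above.
    #[cfg(all(test, target_feature = "avx2"))]
    #[test]
    fn rotate_each_word_matches_scalar() {
        let words: [u32; 8] = [1, 2, 0xdead_beef, 4, 5, 6, 7, 0x8000_0001];
        let mut bytes = [0u8; 32];
        for i in 0..8 {
            bytes[4 * i..4 * i + 4].copy_from_slice(&words[i].to_le_bytes());
        }
        let v: u32x4x2_avx2<()> = unsafe { u32x4x2_avx2::unsafe_read_le(&bytes) };
        let check = |rotated: u32x4x2_avx2<()>, n: u32| {
            let mut out = [0u8; 32];
            rotated.write_le(&mut out);
            for i in 0..8 {
                let w = u32::from_le_bytes([
                    out[4 * i],
                    out[4 * i + 1],
                    out[4 * i + 2],
                    out[4 * i + 3],
                ]);
                assert_eq!(w, words[i].rotate_right(n));
            }
        };
        check(v.rotate_each_word_right12(), 12); // shift-or path
        check(v.rotate_each_word_right16(), 16); // byte-shuffle path
    }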
    impl<NI> BitOps0 for u32x4x2_avx2<NI> where NI: Copy {}
    impl<NI> From<u32x4x2_avx2<NI>> for vec256_storage {
        #[inline(always)]
        fn from(x: u32x4x2_avx2<NI>) -> Self {
            Self { avx: x.x }
        }
    }

    macro_rules! impl_assign {
        ($vec:ident, $Assign:ident, $assign_fn:ident, $bin_fn:ident) => {
            impl<NI> $Assign for $vec<NI>
            where
                NI: Copy,
            {
                #[inline(always)]
                fn $assign_fn(&mut self, rhs: Self) {
                    *self = self.$bin_fn(rhs);
                }
            }
        };
    }
    impl_assign!(u32x4x2_avx2, BitXorAssign, bitxor_assign, bitxor);
    impl_assign!(u32x4x2_avx2, BitOrAssign, bitor_assign, bitor);
    impl_assign!(u32x4x2_avx2, BitAndAssign, bitand_assign, bitand);
    impl_assign!(u32x4x2_avx2, AddAssign, add_assign, add);

    macro_rules! impl_bitop {
        ($vec:ident, $Op:ident, $op_fn:ident, $impl_fn:ident) => {
            impl<NI> $Op for $vec<NI> {
                type Output = Self;
                #[inline(always)]
                fn $op_fn(self, rhs: Self) -> Self::Output {
                    Self::new(unsafe { $impl_fn(self.x, rhs.x) })
                }
            }
        };
    }
    impl_bitop!(u32x4x2_avx2, BitXor, bitxor, _mm256_xor_si256);
    impl_bitop!(u32x4x2_avx2, BitOr, bitor, _mm256_or_si256);
    impl_bitop!(u32x4x2_avx2, BitAnd, bitand, _mm256_and_si256);
    impl_bitop!(u32x4x2_avx2, AndNot, andnot, _mm256_andnot_si256);
    impl_bitop!(u32x4x2_avx2, Add, add, _mm256_add_epi32);

    impl<NI> Not for u32x4x2_avx2<NI> {
        type Output = Self;
        #[inline(always)]
        fn not(self) -> Self::Output {
            unsafe {
                // NOT(x) == x ^ all-ones; set1_epi64x(-1) builds the all-ones mask.
                let f = _mm256_set1_epi64x(-1);
                Self::new(f) ^ self
            }
        }
    }
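    // Hedged check, not from the original source: NOT must flip every bit,
    // i.e. each output byte equals the input byte XOR 0xff. Same `()` NI
    // stand-in and AVX2 assumption as the sketches above.
    #[cfg(all(test, target_feature = "avx2"))]
    #[test]
    fn not_flips_every_bit() {
        let mut bytes = [0u8; 32];
        for (i, b) in bytes.iter_mut().enumerate() {
            *b = (i as u8).wrapping_mul(0x1d);
        }
        let v: u32x4x2_avx2<()> = unsafe { u32x4x2_avx2::unsafe_read_le(&bytes) };
        let mut out = [0u8; 32];
        (!v).write_le(&mut out);
        for (i, o) in bytes.iter().zip(out.iter()) {
            assert_eq!(*i ^ 0xff, *o);
        }
    }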

    impl<NI> BSwap for u32x4x2_avx2<NI> {
        // Byte-reverses each 32-bit word: the table maps result bytes
        // (0,1,2,3) to source bytes (3,2,1,0) within every word.
        shuf_lane_bytes!(bswap, 0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
    }

    impl<NI> From<x2<u128x1_sse2<YesS3, YesS4, NI>, G0>> for u32x4x2_avx2<NI>
    where
        NI: Copy,
    {
        #[inline(always)]
        fn from(x: x2<u128x1_sse2<YesS3, YesS4, NI>, G0>) -> Self {
            Self::new(unsafe { _mm256_setr_m128i(x.0[0].x, x.0[1].x) })
        }
    }

    impl<NI> LaneWords4 for u32x4x2_avx2<NI> {
        #[inline(always)]
        fn shuffle_lane_words1230(self) -> Self {
            Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b1001_0011) })
        }
        #[inline(always)]
        fn shuffle_lane_words2301(self) -> Self {
            Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0100_1110) })
        }
        #[inline(always)]
        fn shuffle_lane_words3012(self) -> Self {
            Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0011_1001) })
        }
    }
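    // Added commentary: _mm256_shuffle_epi32 applies one 8-bit control to each
    // 128-bit lane; reading the immediate two bits at a time from the low end
    // names the source word for each destination word. For example,
    // 0b1001_0011 decodes to selectors (3, 0, 1, 2): destination words
    // (0, 1, 2, 3) take source words (3, 0, 1, 2).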

    ///////////////////////////////////////////////////////////////////////////////////////////

    pub type u32x4x4_avx2<NI> = x2<u32x4x2_avx2<NI>, G0>;
    impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> {}

    impl<NI: Copy> Store<vec512_storage> for u32x4x4_avx2<NI> {
        #[inline(always)]
        unsafe fn unpack(p: vec512_storage) -> Self {
            Self::new([
                u32x4x2_avx2::unpack(p.avx[0]),
                u32x4x2_avx2::unpack(p.avx[1]),
            ])
        }
    }
    impl<NI: Copy> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> {
        #[inline(always)]
        fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] {
            let [a, b] = self.0[0].to_lanes();
            let [c, d] = self.0[1].to_lanes();
            [a, b, c, d]
        }
        #[inline(always)]
        fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self {
            let ab = u32x4x2_avx2::from_lanes([x[0], x[1]]);
            let cd = u32x4x2_avx2::from_lanes([x[2], x[3]]);
            Self::new([ab, cd])
        }
    }
    impl<NI: Copy> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
        #[inline(always)]
        fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> {
            match i {
                0 => self.0[0].extract(0),
                1 => self.0[0].extract(1),
                2 => self.0[1].extract(0),
                3 => self.0[1].extract(1),
                _ => panic!(),
            }
        }
        #[inline(always)]
        fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self {
            Self::new(match i {
                0 | 1 => [self.0[0].insert(w, i), self.0[1]],
                2 | 3 => [self.0[0], self.0[1].insert(w, i - 2)],
                _ => panic!(),
            })
        }
    }
    impl<NI: Copy> Vec4Ext<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
        #[inline(always)]
        fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) {
            /*
             * a00:a01 a10:a11
             * b00:b01 b10:b11
             * c00:c01 c10:c11
             * d00:d01 d10:d11
             *       =>
             * a00:b00 c00:d00
             * a01:b01 c01:d01
             * a10:b10 c10:d10
             * a11:b11 c11:d11
             */
            unsafe {
                // _mm256_permute2x128_si256 selects one 128-bit lane from each
                // source: control 0x20 pairs the two low lanes, 0x31 the two
                // high lanes.
                let ab00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x20));
                let ab01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x31));
                let ab10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x20));
                let ab11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x31));
                let cd00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x20));
                let cd01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x31));
                let cd10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x20));
                let cd11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x31));
                (
                    Self::new([ab00, cd00]),
                    Self::new([ab01, cd01]),
                    Self::new([ab10, cd10]),
                    Self::new([ab11, cd11]),
                )
            }
        }
    }
    impl<NI: Copy> Vector<[u32; 16]> for u32x4x4_avx2<NI> {
        #[inline(always)]
        fn to_scalars(self) -> [u32; 16] {
            // Size-preserving reinterpretation: the two __m256i halves total
            // 64 bytes, the same as [u32; 16].
            unsafe { core::mem::transmute(self) }
        }
    }
    impl<NI: Copy> From<u32x4x4_avx2<NI>> for vec512_storage {
        #[inline(always)]
        fn from(x: u32x4x4_avx2<NI>) -> Self {
            Self {
                avx: [
                    vec256_storage { avx: x.0[0].x },
                    vec256_storage { avx: x.0[1].x },
                ],
            }
        }
    }
    impl<NI: Copy> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI> {
        #[inline(always)]
        fn from(x: x4<u128x1_sse2<YesS3, YesS4, NI>>) -> Self {
            Self::new(unsafe {
                [
                    u32x4x2_avx2::new(_mm256_setr_m128i(x.0[0].x, x.0[1].x)),
                    u32x4x2_avx2::new(_mm256_setr_m128i(x.0[2].x, x.0[3].x)),
                ]
            })
        }
    }
}