1 use crate::soft::{x2, x4};
2 use crate::types::*;
3 use crate::vec128_storage;
4 use crate::x86_64::Avx2Machine;
5 use crate::x86_64::SseMachine as Machine86;
6 use crate::x86_64::{NoS3, NoS4, YesS3, YesS4};
7 use core::arch::x86_64::*;
8 use core::marker::PhantomData;
9 use core::ops::{
10     Add, AddAssign, BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not,
11 };
12 
/// Generates a binary-operator trait impl for an SSE2 wrapper type,
/// forwarding the operation to the named `core::arch` intrinsic.
macro_rules! impl_binop {
    ($vec:ident, $trait:ident, $fn:ident, $impl_fn:ident) => {
        impl<S3, S4, NI> $trait for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn $fn(self, rhs: Self) -> Self::Output {
                // SAFETY: callers pair this macro with baseline-SSE2 intrinsics,
                // which are valid for any 128-bit lane interpretation.
                let raw = unsafe { $impl_fn(self.x, rhs.x) };
                Self::new(raw)
            }
        }
    };
}
24 
/// Generates the compound-assignment counterpart of `impl_binop`,
/// defined simply as `*self = self <op> rhs`.
macro_rules! impl_binop_assign {
    ($vec:ident, $trait:ident, $fn_assign:ident, $fn:ident) => {
        impl<S3, S4, NI> $trait for $vec<S3, S4, NI>
        where
            $vec<S3, S4, NI>: Copy,
        {
            #[inline(always)]
            fn $fn_assign(&mut self, rhs: Self) {
                let updated = self.$fn(rhs);
                *self = updated;
            }
        }
    };
}
38 
/// Defines one 128-bit SSE2 wrapper type together with its storage
/// conversions, byte-order I/O, `Default`, and bitwise operators.
///
/// `$vec` is the wrapper's name; `$word` names the lane type for the
/// caller's benefit (it is not otherwise used inside this macro).
macro_rules! def_vec {
    ($vec:ident, $word:ident) => {
        #[allow(non_camel_case_types)]
        #[derive(Copy, Clone)]
        pub struct $vec<S3, S4, NI> {
            // Raw SSE register; the lane interpretation is fixed by the wrapper type.
            x: __m128i,
            // Zero-sized markers recording the assumed feature levels (SSSE3 / SSE4.1 / NI).
            s3: PhantomData<S3>,
            s4: PhantomData<S4>,
            ni: PhantomData<NI>,
        }

        impl<S3, S4, NI> Store<vec128_storage> for $vec<S3, S4, NI> {
            #[inline(always)]
            unsafe fn unpack(x: vec128_storage) -> Self {
                Self::new(x.sse2)
            }
        }
        impl<S3, S4, NI> From<$vec<S3, S4, NI>> for vec128_storage {
            #[inline(always)]
            fn from(x: $vec<S3, S4, NI>) -> Self {
                vec128_storage { sse2: x.x }
            }
        }
        impl<S3, S4, NI> $vec<S3, S4, NI> {
            // Wrap a raw register in this marker type.
            #[inline(always)]
            fn new(x: __m128i) -> Self {
                $vec {
                    x,
                    s3: PhantomData,
                    s4: PhantomData,
                    ni: PhantomData,
                }
            }
        }

        impl<S3, S4, NI> StoreBytes for $vec<S3, S4, NI>
        where
            Self: BSwap,
        {
            #[inline(always)]
            unsafe fn unsafe_read_le(input: &[u8]) -> Self {
                assert_eq!(input.len(), 16);
                // Unaligned load; x86 is little-endian, so LE needs no swap.
                Self::new(_mm_loadu_si128(input.as_ptr() as *const _))
            }
            #[inline(always)]
            unsafe fn unsafe_read_be(input: &[u8]) -> Self {
                assert_eq!(input.len(), 16);
                // Load little-endian, then byte-swap each word.
                Self::new(_mm_loadu_si128(input.as_ptr() as *const _)).bswap()
            }
            #[inline(always)]
            fn write_le(self, out: &mut [u8]) {
                assert_eq!(out.len(), 16);
                unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut _, self.x) }
            }
            #[inline(always)]
            fn write_be(self, out: &mut [u8]) {
                assert_eq!(out.len(), 16);
                // Byte-swap first, then do the unaligned store.
                let x = self.bswap().x;
                unsafe {
                    _mm_storeu_si128(out.as_mut_ptr() as *mut _, x);
                }
            }
        }

        impl<S3, S4, NI> Default for $vec<S3, S4, NI> {
            #[inline(always)]
            fn default() -> Self {
                Self::new(unsafe { _mm_setzero_si128() })
            }
        }

        impl<S3, S4, NI> Not for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn not(self) -> Self::Output {
                // !x == x ^ all-ones.
                unsafe {
                    let ff = _mm_set1_epi64x(-1i64);
                    self ^ Self::new(ff)
                }
            }
        }

        impl<S3: Copy, S4: Copy, NI: Copy> BitOps0 for $vec<S3, S4, NI> {}
        impl_binop!($vec, BitAnd, bitand, _mm_and_si128);
        impl_binop!($vec, BitOr, bitor, _mm_or_si128);
        impl_binop!($vec, BitXor, bitxor, _mm_xor_si128);
        impl_binop_assign!($vec, BitAndAssign, bitand_assign, bitand);
        impl_binop_assign!($vec, BitOrAssign, bitor_assign, bitor);
        impl_binop_assign!($vec, BitXorAssign, bitxor_assign, bitxor);
        impl<S3: Copy, S4: Copy, NI: Copy> AndNot for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn andnot(self, rhs: Self) -> Self {
                // Note: the intrinsic computes (!self) & rhs.
                Self::new(unsafe { _mm_andnot_si128(self.x, rhs.x) })
            }
        }
    };
}
137 
/// Marks `$vec` as supporting the 32-bit bitwise-operation interface,
/// conditional on it providing the 32-bit rotations.
macro_rules! impl_bitops32 {
    ($vec:ident) => {
        impl<S3, S4, NI> BitOps32 for $vec<S3, S4, NI>
        where
            S3: Copy,
            S4: Copy,
            NI: Copy,
            $vec<S3, S4, NI>: RotateEachWord32,
        {
        }
    };
}
146 
/// Marks `$vec` as supporting 64-bit bitwise operations; implies the
/// 32-bit marker as well.
macro_rules! impl_bitops64 {
    ($vec:ident) => {
        impl_bitops32!($vec);
        impl<S3, S4, NI> BitOps64 for $vec<S3, S4, NI>
        where
            S3: Copy,
            S4: Copy,
            NI: Copy,
            $vec<S3, S4, NI>: RotateEachWord64 + RotateEachWord32,
        {
        }
    };
}
156 
/// Marks `$vec` as supporting 128-bit bitwise operations; implies the
/// 64-bit (and hence 32-bit) markers as well.
macro_rules! impl_bitops128 {
    ($vec:ident) => {
        impl_bitops64!($vec);
        impl<S3, S4, NI> BitOps128 for $vec<S3, S4, NI>
        where
            S3: Copy,
            S4: Copy,
            NI: Copy,
            $vec<S3, S4, NI>: RotateEachWord128,
        {
        }
    };
}
166 
/// Rotates each 32-bit lane right by a multiple of 8 bits as a single
/// SSSE3 `pshufb`; `$k0`/`$k1` are the high/low halves of the byte-index mask.
macro_rules! rotr_32_s3 {
    ($name:ident, $k0:expr, $k1:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
        }
    };
}
/// Generic rotate-right of each 32-bit lane by `$i` bits using only SSE2:
/// `(x >> i) | (x << (32 - i))`.
macro_rules! rotr_32 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_epi32(self.x, $i as i32),
                    _mm_slli_epi32(self.x, 32 - $i as i32),
                )
            })
        }
    };
}
// SSSE3 path: byte-aligned rotations (8/16/24) are a single pshufb;
// the remaining amounts use the shift-and-or form.
impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<YesS3, S4, NI> {
    rotr_32!(rotate_each_word_right7, 7);
    rotr_32_s3!(
        rotate_each_word_right8,
        0x0c0f0e0d_080b0a09,
        0x04070605_00030201
    );
    rotr_32!(rotate_each_word_right11, 11);
    rotr_32!(rotate_each_word_right12, 12);
    rotr_32_s3!(
        rotate_each_word_right16,
        0x0d0c0f0e_09080b0a,
        0x05040706_01000302
    );
    rotr_32!(rotate_each_word_right20, 20);
    rotr_32_s3!(
        rotate_each_word_right24,
        0x0e0d0c0f_0a09080b,
        0x06050407_02010003
    );
    rotr_32!(rotate_each_word_right25, 25);
}
// SSE2-only path: shift-and-or throughout, except rotation by 16, which is
// exactly a swap of the two 16-bit halves of each 32-bit lane (swap16_s2).
impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<NoS3, S4, NI> {
    rotr_32!(rotate_each_word_right7, 7);
    rotr_32!(rotate_each_word_right8, 8);
    rotr_32!(rotate_each_word_right11, 11);
    rotr_32!(rotate_each_word_right12, 12);
    #[inline(always)]
    fn rotate_each_word_right16(self) -> Self {
        Self::new(swap16_s2(self.x))
    }
    rotr_32!(rotate_each_word_right20, 20);
    rotr_32!(rotate_each_word_right24, 24);
    rotr_32!(rotate_each_word_right25, 25);
}
223 
/// Rotates each 64-bit lane right by a multiple of 8 bits as a single
/// SSSE3 `pshufb`; `$k0`/`$k1` are the high/low halves of the byte-index mask.
macro_rules! rotr_64_s3 {
    ($name:ident, $k0:expr, $k1:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
        }
    };
}
/// Generic rotate-right of each 64-bit lane by `$i` bits using only SSE2:
/// `(x >> i) | (x << (64 - i))`.
macro_rules! rotr_64 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_epi64(self.x, $i as i32),
                    _mm_slli_epi64(self.x, 64 - $i as i32),
                )
            })
        }
    };
}
// SSSE3 path for 64-bit lanes: byte-aligned rotations (8/16/24) are a
// single pshufb; the rest use the shift-and-or form.
impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<YesS3, S4, NI> {
    rotr_64!(rotate_each_word_right7, 7);
    rotr_64_s3!(
        rotate_each_word_right8,
        0x080f_0e0d_0c0b_0a09,
        0x0007_0605_0403_0201
    );
    rotr_64!(rotate_each_word_right11, 11);
    rotr_64!(rotate_each_word_right12, 12);
    rotr_64_s3!(
        rotate_each_word_right16,
        0x0908_0f0e_0d0c_0b0a,
        0x0100_0706_0504_0302
    );
    rotr_64!(rotate_each_word_right20, 20);
    rotr_64_s3!(
        rotate_each_word_right24,
        0x0a09_080f_0e0d_0c0b,
        0x0201_0007_0605_0403
    );
    rotr_64!(rotate_each_word_right25, 25);
}
267 impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<NoS3, S4, NI> {
268     rotr_64!(rotate_each_word_right7, 7);
269     rotr_64!(rotate_each_word_right8, 8);
270     rotr_64!(rotate_each_word_right11, 11);
271     rotr_64!(rotate_each_word_right12, 12);
272     #[inline(always)]
rotate_each_word_right16(self) -> Self273     fn rotate_each_word_right16(self) -> Self {
274         Self::new(swap16_s2(self.x))
275     }
276     rotr_64!(rotate_each_word_right20, 20);
277     rotr_64!(rotate_each_word_right24, 24);
278     rotr_64!(rotate_each_word_right25, 25);
279 }
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    fn rotate_each_word_right32(self) -> Self {
        // Rotating a 64-bit lane by 32 is just swapping its two 32-bit halves
        // (dword order (1,0,3,2)); SSE2-only, no shifts needed.
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b10110001) })
    }
}
286 
/// Rotates the full 128-bit register right by `$i` bits (0 < `$i` < 64).
///
/// Fixed: the previous version used `_mm_srli_si128`/`_mm_slli_si128`, which
/// shift by BYTES, not bits — and the left-shift amount `128 - $i` exceeds
/// the 15-byte maximum, so its result was zero. The macro therefore never
/// performed a bit rotate at all.
///
/// SSE2 has no 128-bit bit-shift, so build the rotate from 64-bit lane
/// shifts. For x = (hi, lo):
///   lo' = (lo >> i) | (hi << (64 - i))
///   hi' = (hi >> i) | (lo << (64 - i))
/// The cross-half carry terms come from swapping the two 64-bit halves
/// (dword shuffle (2,3,0,1)) and shifting that left.
macro_rules! rotr_128 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                let swapped = _mm_shuffle_epi32(self.x, 0b0100_1110);
                _mm_or_si128(
                    _mm_srli_epi64(self.x, $i as i32),
                    _mm_slli_epi64(swapped, 64 - $i as i32),
                )
            })
        }
    };
}
// TODO: completely unoptimized
// u128x1 has a single 128-bit word, so every rotation delegates to the
// whole-register `rotr_128!` form.
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord32 for u128x1_sse2<S3, S4, NI> {
    rotr_128!(rotate_each_word_right7, 7);
    rotr_128!(rotate_each_word_right8, 8);
    rotr_128!(rotate_each_word_right11, 11);
    rotr_128!(rotate_each_word_right12, 12);
    rotr_128!(rotate_each_word_right16, 16);
    rotr_128!(rotate_each_word_right20, 20);
    rotr_128!(rotate_each_word_right24, 24);
    rotr_128!(rotate_each_word_right25, 25);
}
// TODO: completely unoptimized
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u128x1_sse2<S3, S4, NI> {
    rotr_128!(rotate_each_word_right32, 32);
}
// Empty impl: the trait demands no additional methods for this type.
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord128 for u128x1_sse2<S3, S4, NI> {}
316 
// Instantiate the three SSE2-backed wrapper types.
def_vec!(u32x4_sse2, u32);
def_vec!(u64x2_sse2, u64);
def_vec!(u128x1_sse2, u128);
320 
impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            // Extract both 64-bit halves (pextrq requires SSE4.1, guaranteed
            // by YesS4), then split each into its low/high 32-bit words.
            let x = _mm_cvtsi128_si64(self.x) as u64;
            let y = _mm_extract_epi64(self.x, 1) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            // Pack lane pairs into 64-bit halves; pinsrq requires SSE4.1.
            let mut x = _mm_cvtsi64_si128((xs[0] as u64 | ((xs[1] as u64) << 32)) as i64);
            x = _mm_insert_epi64(x, (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64, 1);
            Self::new(x)
        }
    }
}
impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            // SSE2-only: read the high half by first shuffling it down
            // (dwords (2,3,2,3)) instead of using SSE4.1 pextrq.
            let x = _mm_cvtsi128_si64(self.x) as u64;
            let y = _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            // Build each 64-bit half in the low lane, move one up by 8 bytes,
            // and OR them together — avoids SSE4.1 pinsrq.
            let x = (xs[0] as u64 | ((xs[1] as u64) << 32)) as i64;
            let y = (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64;
            let x = _mm_cvtsi64_si128(x);
            let y = _mm_slli_si128(_mm_cvtsi64_si128(y), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            // Low lane via movq, high lane via pextrq (SSE4.1).
            [
                _mm_cvtsi128_si64(self.x) as u64,
                _mm_extract_epi64(self.x, 1) as u64,
            ]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            // Low lane via movq, high lane via pinsrq (SSE4.1).
            let mut x = _mm_cvtsi64_si128(xs[0] as i64);
            x = _mm_insert_epi64(x, xs[1] as i64, 1);
            Self::new(x)
        }
    }
}
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            // SSE2-only: shift the high qword down by 8 bytes to read it.
            [
                _mm_cvtsi128_si64(self.x) as u64,
                _mm_cvtsi128_si64(_mm_srli_si128(self.x, 8)) as u64,
            ]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            // Place each qword separately and OR — avoids SSE4.1 pinsrq.
            let x = _mm_cvtsi64_si128(xs[0] as i64);
            let y = _mm_slli_si128(_mm_cvtsi64_si128(xs[1] as i64), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
impl<S3, S4, NI> MultiLane<[u128; 1]> for u128x1_sse2<S3, S4, NI> {
    // Lane access for the 128-bit view is left unimplemented (panics if
    // called).
    #[inline(always)]
    fn to_lanes(self) -> [u128; 1] {
        unimplemented!()
    }
    #[inline(always)]
    fn from_lanes(xs: [u128; 1]) -> Self {
        unimplemented!("{:?}", xs)
    }
}
407 
408 impl<S3, S4, NI> MultiLane<[u64; 4]> for u64x4_sse2<S3, S4, NI>
409 where
410     u64x2_sse2<S3, S4, NI>: MultiLane<[u64; 2]> + Copy,
411 {
412     #[inline(always)]
to_lanes(self) -> [u64; 4]413     fn to_lanes(self) -> [u64; 4] {
414         let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
415         [a[0], a[1], b[0], b[1]]
416     }
417     #[inline(always)]
from_lanes(xs: [u64; 4]) -> Self418     fn from_lanes(xs: [u64; 4]) -> Self {
419         let (a, b) = (
420             u64x2_sse2::from_lanes([xs[0], xs[1]]),
421             u64x2_sse2::from_lanes([xs[2], xs[3]]),
422         );
423         x2::new([a, b])
424     }
425 }
426 
/// Generates a free `From` conversion between two wrapper types that share
/// the same underlying `__m128i` register.
macro_rules! impl_into {
    ($from:ident, $to:ident) => {
        impl<S3, S4, NI> From<$from<S3, S4, NI>> for $to<S3, S4, NI> {
            #[inline(always)]
            fn from(x: $from<S3, S4, NI>) -> Self {
                // Reinterpret the raw register; no data movement occurs.
                Self::new(x.x)
            }
        }
    };
}
437 
// A u128x1 shares its register with the narrower views, so conversion is free.
impl_into!(u128x1_sse2, u32x4_sse2);
impl_into!(u128x1_sse2, u64x2_sse2);

impl_bitops32!(u32x4_sse2);
impl_bitops64!(u64x2_sse2);
impl_bitops128!(u128x1_sse2);

// Arithmetic markers plus lane-wise (wrapping) addition for the two
// integer lane widths.
impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u32x4_sse2<S3, S4, NI> where
    u32x4_sse2<S3, S4, NI>: BSwap
{
}
impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u64x2_sse2<S3, S4, NI> where
    u64x2_sse2<S3, S4, NI>: BSwap
{
}
impl_binop!(u32x4_sse2, Add, add, _mm_add_epi32);
impl_binop!(u64x2_sse2, Add, add, _mm_add_epi64);
impl_binop_assign!(u32x4_sse2, AddAssign, add_assign, add);
impl_binop_assign!(u64x2_sse2, AddAssign, add_assign, add);
457 
// Tie the concrete SSE2 types to the generic `Machine` vector interfaces.
// The bodies are empty: all behavior comes from the prerequisite bounds.
impl<S3: Copy, S4: Copy, NI: Copy> u32x4<Machine86<S3, S4, NI>> for u32x4_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
    Machine86<S3, S4, NI>: Machine,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2<Machine86<S3, S4, NI>> for u64x2_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>:
        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
    Machine86<S3, S4, NI>: Machine,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x1<Machine86<S3, S4, NI>> for u128x1_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4>,
    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}

// The AVX2 machine reuses the same 128-bit types, pinned to the
// YesS3/YesS4 (SSSE3 + SSE4.1) feature level.
impl<NI: Copy> u32x4<Avx2Machine<NI>> for u32x4_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
    Machine86<YesS3, YesS4, NI>: Machine,
{
}
impl<NI: Copy> u64x2<Avx2Machine<NI>> for u64x2_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>:
        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
    Machine86<YesS3, YesS4, NI>: Machine,
{
}
impl<NI: Copy> u128x1<Avx2Machine<NI>> for u128x1_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<YesS3, YesS4, NI>: Machine,
    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u32x4>,
    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u64x2>,
{
}
501 
impl<S3, S4, NI> UnsafeFrom<[u32; 4]> for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    unsafe fn unsafe_from(xs: [u32; 4]) -> Self {
        // _mm_set_epi32 takes its arguments highest-lane-first, hence the
        // reversed order.
        Self::new(_mm_set_epi32(
            xs[3] as i32,
            xs[2] as i32,
            xs[1] as i32,
            xs[0] as i32,
        ))
    }
}
513 
impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, YesS4, NI>
where
    Self: MultiLane<[u32; 4]>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u32 {
        self.to_lanes()[i as usize]
    }
    #[inline(always)]
    fn insert(self, v: u32, i: u32) -> Self {
        // pinsrd (SSE4.1) needs a compile-time lane index, so dispatch on i.
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi32(self.x, v as i32, 0),
                1 => _mm_insert_epi32(self.x, v as i32, 1),
                2 => _mm_insert_epi32(self.x, v as i32, 2),
                3 => _mm_insert_epi32(self.x, v as i32, 3),
                _ => unreachable!(),
            }
        })
    }
}
impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, NoS4, NI>
where
    Self: MultiLane<[u32; 4]>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u32 {
        self.to_lanes()[i as usize]
    }
    // SSE2-only insert: each arm rearranges the three surviving words with a
    // dword shuffle / byte shift, ORs the new word into lane 0, then shuffles
    // everything back into place.
    #[inline(always)]
    fn insert(self, v: u32, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => {
                    // Clear lane 0 (cvtsi32_si128(-1) is all-ones in lane 0
                    // only), then OR the new value in.
                    let x = _mm_andnot_si128(_mm_cvtsi32_si128(-1), self.x);
                    _mm_or_si128(x, _mm_cvtsi32_si128(v as i32))
                }
                1 => {
                    let mut x = _mm_shuffle_epi32(self.x, 0b0111_1000);
                    x = _mm_slli_si128(x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b1110_0001)
                }
                2 => {
                    let mut x = _mm_shuffle_epi32(self.x, 0b1011_0100);
                    x = _mm_slli_si128(x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b1100_1001)
                }
                3 => {
                    let mut x = _mm_slli_si128(self.x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b0011_1001)
                }
                _ => unreachable!(),
            }
        })
    }
}
573 
574 impl<S3, S4, NI> LaneWords4 for u32x4_sse2<S3, S4, NI> {
575     #[inline(always)]
shuffle_lane_words2301(self) -> Self576     fn shuffle_lane_words2301(self) -> Self {
577         self.shuffle2301()
578     }
579     #[inline(always)]
shuffle_lane_words1230(self) -> Self580     fn shuffle_lane_words1230(self) -> Self {
581         self.shuffle1230()
582     }
583     #[inline(always)]
shuffle_lane_words3012(self) -> Self584     fn shuffle_lane_words3012(self) -> Self {
585         self.shuffle3012()
586     }
587 }
588 
impl<S3, S4, NI> Words4 for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        // Dword selection (2,3,0,1): swaps the two 64-bit halves.
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        // Dword selection (3,0,1,2): rotates words one lane upward.
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b1001_0011) })
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        // Dword selection (1,2,3,0): rotates words one lane downward.
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0011_1001) })
    }
}
603 
// Word shuffles for four 64-bit words held as two registers.
impl<S4, NI> Words4 for u64x4_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        // Whole-register swap of the two halves.
        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        // palignr (SSSE3) stitches an 8-byte-offset view across the halves.
        unsafe {
            x2::new([
                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
            ])
        }
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        // Same trick with the register roles reversed.
        unsafe {
            x2::new([
                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
            ])
        }
    }
}
// SSE2-only variant: emulates the palignr stitching above with 8-byte
// shifts and ORs.
impl<S4, NI> Words4 for u64x4_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        unsafe {
            // a/c: high qword moved down; b/d: low qword moved up.
            let a = _mm_srli_si128(self.0[0].x, 8);
            let b = _mm_slli_si128(self.0[0].x, 8);
            let c = _mm_srli_si128(self.0[1].x, 8);
            let d = _mm_slli_si128(self.0[1].x, 8);
            let da = _mm_or_si128(d, a);
            let bc = _mm_or_si128(b, c);
            x2::new([u64x2_sse2::new(da), u64x2_sse2::new(bc)])
        }
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        unsafe {
            // Same pieces as shuffle3012, recombined in the opposite order.
            let a = _mm_srli_si128(self.0[0].x, 8);
            let b = _mm_slli_si128(self.0[0].x, 8);
            let c = _mm_srli_si128(self.0[1].x, 8);
            let d = _mm_slli_si128(self.0[1].x, 8);
            let da = _mm_or_si128(d, a);
            let bc = _mm_or_si128(b, c);
            x2::new([u64x2_sse2::new(bc), u64x2_sse2::new(da)])
        }
    }
}
658 
impl<S3, S4, NI> UnsafeFrom<[u64; 2]> for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    unsafe fn unsafe_from(xs: [u64; 2]) -> Self {
        // _mm_set_epi64x takes the high lane first.
        Self::new(_mm_set_epi64x(xs[1] as i64, xs[0] as i64))
    }
}
665 
impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        // pextrq (SSE4.1) needs a constant index, hence the dispatch.
        unsafe {
            match i {
                0 => _mm_cvtsi128_si64(self.x) as u64,
                1 => _mm_extract_epi64(self.x, 1) as u64,
                _ => unreachable!(),
            }
        }
    }
    #[inline(always)]
    fn insert(self, x: u64, i: u32) -> Self {
        // pinsrq (SSE4.1), same constant-index constraint.
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi64(self.x, x as i64, 0),
                1 => _mm_insert_epi64(self.x, x as i64, 1),
                _ => unreachable!(),
            }
        })
    }
}
impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        unsafe {
            match i {
                0 => _mm_cvtsi128_si64(self.x) as u64,
                // Shuffle the high qword down (dwords (2,3,2,3)) and read it.
                1 => _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64,
                _ => unreachable!(),
            }
        }
    }
    #[inline(always)]
    fn insert(self, x: u64, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                // Clear the low qword (cvtsi64_si128(-1) is all-ones in lane 0
                // only) and OR the new value in.
                0 => _mm_or_si128(
                    _mm_andnot_si128(_mm_cvtsi64_si128(-1), self.x),
                    _mm_cvtsi64_si128(x as i64),
                ),
                // movq zeroes the high qword; OR in the new value shifted up.
                1 => _mm_or_si128(
                    _mm_move_epi64(self.x),
                    _mm_slli_si128(_mm_cvtsi64_si128(x as i64), 8),
                ),
                _ => unreachable!(),
            }
        })
    }
}
716 
impl<S4, NI> BSwap for u32x4_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            // pshufb mask reversing the bytes within each 32-bit word.
            let k = _mm_set_epi64x(0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
/// Byte-swaps each 32-bit word of `x` using only SSE2: widen the bytes to
/// 16-bit words, reverse each group of four words, then re-pack.
#[inline(always)]
fn bswap32_s2(x: __m128i) -> __m128i {
    unsafe {
        let mut y = _mm_unpacklo_epi8(x, _mm_setzero_si128());
        // 0b0001_1011 selects words in order (3,2,1,0) within each half.
        y = _mm_shufflehi_epi16(y, 0b0001_1011);
        y = _mm_shufflelo_epi16(y, 0b0001_1011);
        let mut z = _mm_unpackhi_epi8(x, _mm_setzero_si128());
        z = _mm_shufflehi_epi16(z, 0b0001_1011);
        z = _mm_shufflelo_epi16(z, 0b0001_1011);
        // All word values are <= 0xff, so the saturating pack is lossless.
        _mm_packus_epi16(y, z)
    }
}
impl<S4, NI> BSwap for u32x4_sse2<NoS3, S4, NI> {
    // SSE2-only fallback for the pshufb version above.
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(bswap32_s2(self.x))
    }
}
744 
impl<S4, NI> BSwap for u64x2_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            // pshufb mask reversing the bytes within each 64-bit word.
            let k = _mm_set_epi64x(0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
impl<S4, NI> BSwap for u64x2_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        // Swap the dwords within each qword, then byte-swap each dword:
        // together this reverses all 8 bytes of each 64-bit word.
        Self::new(unsafe { bswap32_s2(_mm_shuffle_epi32(self.x, 0b1011_0001)) })
    }
}
760 
761 impl<S4, NI> BSwap for u128x1_sse2<YesS3, S4, NI> {
762     #[inline(always)]
bswap(self) -> Self763     fn bswap(self) -> Self {
764         Self::new(unsafe {
765             let k = _mm_set_epi64x(0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100);
766             _mm_shuffle_epi8(self.x, k)
767         })
768     }
769 }
impl<S4, NI> BSwap for u128x1_sse2<NoS3, S4, NI> {
    // No SSE2-only 128-bit byte reversal is provided; panics if called.
    #[inline(always)]
    fn bswap(self) -> Self {
        unimplemented!()
    }
}
776 
/// Swaps adjacent `$i`-bit fields within every byte. `$k` is a per-byte mask
/// of the high field of each pair (e.g. 0xaa/1, 0xcc/2, 0xf0/4): the masked
/// high fields are shifted down while the shifted-up low fields are masked
/// back into the high positions; masking keeps the 16-bit shifts from
/// leaking across byte boundaries.
macro_rules! swapi {
    ($x:expr, $i:expr, $k:expr) => {
        unsafe {
            const K: u8 = $k;
            let k = _mm_set1_epi8(K as i8);
            u128x1_sse2::new(_mm_or_si128(
                _mm_srli_epi16(_mm_and_si128($x.x, k), $i),
                _mm_and_si128(_mm_slli_epi16($x.x, $i), k),
            ))
        }
    };
}
/// Swaps the two 16-bit halves of every 32-bit word using SSE2 word shuffles
/// (selection (1,0,3,2) applied to both the low and high four words).
#[inline(always)]
fn swap16_s2(x: __m128i) -> __m128i {
    unsafe { _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0b1011_0001), 0b1011_0001) }
}
// Swap adjacent fields of each width: sub-byte widths via masked shifts,
// byte-aligned widths via pshufb (SSSE3), dword/qword widths via shuffles.
impl<S4, NI> Swap64 for u128x1_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn swap1(self) -> Self {
        swapi!(self, 1, 0xaa)
    }
    #[inline(always)]
    fn swap2(self) -> Self {
        swapi!(self, 2, 0xcc)
    }
    #[inline(always)]
    fn swap4(self) -> Self {
        swapi!(self, 4, 0xf0)
    }
    #[inline(always)]
    fn swap8(self) -> Self {
        u128x1_sse2::new(unsafe {
            // pshufb mask swapping adjacent bytes.
            let k = _mm_set_epi64x(0x0e0f_0c0d_0a0b_0809, 0x0607_0405_0203_0001);
            _mm_shuffle_epi8(self.x, k)
        })
    }
    #[inline(always)]
    fn swap16(self) -> Self {
        u128x1_sse2::new(unsafe {
            // pshufb mask swapping adjacent 16-bit words.
            let k = _mm_set_epi64x(0x0d0c_0f0e_0908_0b0a, 0x0504_0706_0100_0302);
            _mm_shuffle_epi8(self.x, k)
        })
    }
    #[inline(always)]
    fn swap32(self) -> Self {
        // Dword selection (1,0,3,2): swaps adjacent 32-bit words.
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
    #[inline(always)]
    fn swap64(self) -> Self {
        // Dword selection (2,3,0,1): swaps the two 64-bit halves.
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
}
// SSE2-only variant: identical to the SSSE3 impl except that the
// byte-aligned swaps avoid pshufb (16-bit shifts / word shuffles instead).
impl<S4, NI> Swap64 for u128x1_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn swap1(self) -> Self {
        swapi!(self, 1, 0xaa)
    }
    #[inline(always)]
    fn swap2(self) -> Self {
        swapi!(self, 2, 0xcc)
    }
    #[inline(always)]
    fn swap4(self) -> Self {
        swapi!(self, 4, 0xf0)
    }
    #[inline(always)]
    fn swap8(self) -> Self {
        // Byte-swap each 16-bit word: high byte down, low byte up.
        u128x1_sse2::new(unsafe {
            _mm_or_si128(_mm_slli_epi16(self.x, 8), _mm_srli_epi16(self.x, 8))
        })
    }
    #[inline(always)]
    fn swap16(self) -> Self {
        u128x1_sse2::new(swap16_s2(self.x))
    }
    #[inline(always)]
    fn swap32(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
    #[inline(always)]
    fn swap64(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
}
861 
// Disambiguation tags for the `x2` combinator: u64x2x2 (tag G0) and u64x4
// (tag G1) wrap the same storage but must remain distinct types.
#[derive(Copy, Clone)]
pub struct G0;
#[derive(Copy, Clone)]
pub struct G1;

// Wider vectors are built from the soft (polyfill) x2/x4 combinators over
// the 128-bit SSE2 types.
#[allow(non_camel_case_types)]
pub type u32x4x2_sse2<S3, S4, NI> = x2<u32x4_sse2<S3, S4, NI>, G0>;
#[allow(non_camel_case_types)]
pub type u64x2x2_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G0>;
#[allow(non_camel_case_types)]
pub type u64x4_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G1>;
#[allow(non_camel_case_types)]
pub type u128x2_sse2<S3, S4, NI> = x2<u128x1_sse2<S3, S4, NI>, G0>;

#[allow(non_camel_case_types)]
pub type u32x4x4_sse2<S3, S4, NI> = x4<u32x4_sse2<S3, S4, NI>>;
#[allow(non_camel_case_types)]
pub type u64x2x4_sse2<S3, S4, NI> = x4<u64x2_sse2<S3, S4, NI>>;
#[allow(non_camel_case_types)]
pub type u128x4_sse2<S3, S4, NI> = x4<u128x1_sse2<S3, S4, NI>>;
882 
// Wide-vector trait impls for the generic SSE machine. The impl bodies are
// empty: all required behaviour is supplied by the bounds listed in the
// `where` clauses, so these only assert that the bounds are satisfiable.
impl<S3: Copy, S4: Copy, NI: Copy> u32x4x2<Machine86<S3, S4, NI>> for u32x4x2_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u32x4x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 2]>,
    u32x4x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u32x4>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2x2<Machine86<S3, S4, NI>> for u64x2x2_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x2x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 2]>,
    u64x2x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x4<Machine86<S3, S4, NI>> for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x4_sse2<S3, S4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x2<Machine86<S3, S4, NI>> for u128x2_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 2]>,
    u128x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u128x1>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x2>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x2>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x4>,
{
}
917 
// Same marker impls for the AVX2 machine; these still use the SSE-backed
// x2 types, pinned to the SSSE3+SSE4.1 (`YesS3`, `YesS4`) code paths.
impl<NI: Copy> u32x4x2<Avx2Machine<NI>> for u32x4x2_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u32x4x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 2]>,
    u32x4x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u32x4>,
{
}
impl<NI: Copy> u64x2x2<Avx2Machine<NI>> for u64x2x2_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x2x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 2]>,
    u64x2x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u64x2>,
{
}
impl<NI: Copy> u64x4<Avx2Machine<NI>> for u64x4_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x4_sse2<YesS3, YesS4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
{
}
impl<NI: Copy> u128x2<Avx2Machine<NI>> for u128x2_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
    Avx2Machine<NI>: Machine,
    u128x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 2]>,
    u128x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u128x1>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x2>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x2>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x4>,
{
}
952 
953 impl<S3, S4, NI> Vec4<u64> for u64x4_sse2<S3, S4, NI>
954 where
955     u64x2_sse2<S3, S4, NI>: Copy + Vec2<u64>,
956 {
957     #[inline(always)]
extract(self, i: u32) -> u64958     fn extract(self, i: u32) -> u64 {
959         match i {
960             0 => self.0[0].extract(0),
961             1 => self.0[0].extract(1),
962             2 => self.0[1].extract(0),
963             3 => self.0[1].extract(1),
964             _ => panic!(),
965         }
966     }
967     #[inline(always)]
insert(mut self, w: u64, i: u32) -> Self968     fn insert(mut self, w: u64, i: u32) -> Self {
969         match i {
970             0 => self.0[0] = self.0[0].insert(w, 0),
971             1 => self.0[0] = self.0[0].insert(w, 1),
972             2 => self.0[1] = self.0[1].insert(w, 0),
973             3 => self.0[1] = self.0[1].insert(w, 1),
974             _ => panic!(),
975         };
976         self
977     }
978 }
979 
// Quad-width marker impls, again empty: behaviour is fully determined by
// the bounds (first for the generic SSE machine, then for AVX2).
impl<S3: Copy, S4: Copy, NI: Copy> u32x4x4<Machine86<S3, S4, NI>> for u32x4x4_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u32x4x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 4]>,
    u32x4x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u32x4>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2x4<Machine86<S3, S4, NI>> for u64x2x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x2x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 4]>,
    u64x2x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x4<Machine86<S3, S4, NI>> for u128x4_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 4]>,
    u128x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u128x1>,
    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x4>,
    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x4>,
{
}

impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u32x4x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 4]>,
    u32x4x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u32x4>,
{
}
impl<NI: Copy> u64x2x4<Avx2Machine<NI>> for u64x2x4_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x2x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 4]>,
    u64x2x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u64x2>,
{
}
impl<NI: Copy> u128x4<Avx2Machine<NI>> for u128x4_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
    Avx2Machine<NI>: Machine,
    u128x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 4]>,
    u128x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u128x1>,
    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x4>,
    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x4>,
{
}
1033 
// Generates lane-wise `From` conversions between two 128-bit element
// types, lifted over both the soft `x2` and `x4` wrappers.
macro_rules! impl_into_x {
    ($from:ident, $to:ident) => {
        impl<S3: Copy, S4: Copy, NI: Copy, Gf, Gt> From<x2<$from<S3, S4, NI>, Gf>>
            for x2<$to<S3, S4, NI>, Gt>
        {
            #[inline(always)]
            fn from(x: x2<$from<S3, S4, NI>, Gf>) -> Self {
                // Convert each of the two lanes independently.
                x2::new([$to::from(x.0[0]), $to::from(x.0[1])])
            }
        }
        impl<S3: Copy, S4: Copy, NI: Copy> From<x4<$from<S3, S4, NI>>> for x4<$to<S3, S4, NI>> {
            #[inline(always)]
            fn from(x: x4<$from<S3, S4, NI>>) -> Self {
                // Convert each of the four lanes independently.
                x4::new([
                    $to::from(x.0[0]),
                    $to::from(x.0[1]),
                    $to::from(x.0[2]),
                    $to::from(x.0[3]),
                ])
            }
        }
    };
}
impl_into_x!(u128x1_sse2, u64x2_sse2);
impl_into_x!(u128x1_sse2, u32x4_sse2);
1059 
1060 ///// Debugging
1061 
1062 use core::fmt::{Debug, Formatter, Result};
1063 
1064 impl<W: PartialEq, G> PartialEq for x2<W, G> {
1065     #[inline(always)]
eq(&self, rhs: &Self) -> bool1066     fn eq(&self, rhs: &Self) -> bool {
1067         self.0[0] == rhs.0[0] && self.0[1] == rhs.0[1]
1068     }
1069 }
1070 
#[allow(unused)]
#[inline(always)]
unsafe fn eq128_s4(x: __m128i, y: __m128i) -> bool {
    // Compare the two 64-bit lanes (SSE4.1), shuffle one 32-bit word of
    // each lane's mask into the low half, and check for all ones.
    let folded = _mm_shuffle_epi32(_mm_cmpeq_epi64(x, y), 0b1100_0110);
    _mm_cvtsi128_si64(folded) == -1
}
1077 
#[inline(always)]
unsafe fn eq128_s2(x: __m128i, y: __m128i) -> bool {
    // SSE2-only equality: 32-bit lane-wise compare, then both 64-bit
    // halves of the mask must be all ones.
    let mask = _mm_cmpeq_epi32(x, y);
    let hi = _mm_cvtsi128_si64(_mm_srli_si128(mask, 8));
    let lo = _mm_cvtsi128_si64(mask);
    (hi & lo) == -1
}
1085 
impl<S3, S4, NI> PartialEq for u32x4_sse2<S3, S4, NI> {
    // All four 32-bit lanes must match (SSE2-only comparison helper).
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
1092 impl<S3, S4, NI> Debug for u32x4_sse2<S3, S4, NI>
1093 where
1094     Self: Copy + MultiLane<[u32; 4]>,
1095 {
1096     #[cold]
fmt(&self, fmt: &mut Formatter) -> Result1097     fn fmt(&self, fmt: &mut Formatter) -> Result {
1098         fmt.write_fmt(format_args!("{:08x?}", &self.to_lanes()))
1099     }
1100 }
1101 
impl<S3, S4, NI> PartialEq for u64x2_sse2<S3, S4, NI> {
    // A 32-bit lane-wise compare also decides 64-bit lane equality.
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
1108 impl<S3, S4, NI> Debug for u64x2_sse2<S3, S4, NI>
1109 where
1110     Self: Copy + MultiLane<[u64; 2]>,
1111 {
1112     #[cold]
fmt(&self, fmt: &mut Formatter) -> Result1113     fn fmt(&self, fmt: &mut Formatter) -> Result {
1114         fmt.write_fmt(format_args!("{:016x?}", &self.to_lanes()))
1115     }
1116 }
1117 
1118 impl<S3, S4, NI> Debug for u64x4_sse2<S3, S4, NI>
1119 where
1120     u64x2_sse2<S3, S4, NI>: Copy + MultiLane<[u64; 2]>,
1121 {
1122     #[cold]
fmt(&self, fmt: &mut Formatter) -> Result1123     fn fmt(&self, fmt: &mut Formatter) -> Result {
1124         let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
1125         fmt.write_fmt(format_args!("{:016x?}", &[a[0], a[1], b[0], b[1]]))
1126     }
1127 }
1128 
1129 #[cfg(test)]
1130 #[cfg(target_arch = "x86_64")]
1131 mod test {
1132     use super::*;
1133     use crate::x86_64::{SSE2, SSE41, SSSE3};
1134     use crate::Machine;
1135 
    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_bswap32_s2_vs_s3() {
        // Byte-swap of each 32-bit word: the SSE2 and SSSE3 code paths
        // must agree with each other and with the expectation `ys`.
        let xs = [0x0f0e_0d0c, 0x0b0a_0908, 0x0706_0504, 0x0302_0100];
        let ys = [0x0c0d_0e0f, 0x0809_0a0b, 0x0405_0607, 0x0001_0203];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.bswap()
        };

        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.bswap()
        };

        // The machine types differ only in their type-level tags, so a
        // transmute lets the SSE2 PartialEq compare across them.
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
        assert_eq!(x_s2, s2.vec(ys));
    }
1158 
1159     #[test]
1160     #[cfg_attr(not(target_feature = "ssse3"), ignore)]
test_bswap64_s2_vs_s3()1161     fn test_bswap64_s2_vs_s3() {
1162         let xs = [0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100];
1163         let ys = [0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607];
1164 
1165         let s2 = unsafe { SSE2::instance() };
1166         let s3 = unsafe { SSSE3::instance() };
1167 
1168         let x_s2 = {
1169             let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
1170             x_s2.bswap()
1171         };
1172 
1173         let x_s3 = {
1174             let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
1175             x_s3.bswap()
1176         };
1177 
1178         assert_eq!(x_s2, s2.vec(ys));
1179         assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
1180     }
1181 
1182     #[test]
1183     #[cfg_attr(not(target_feature = "ssse3"), ignore)]
test_shuffle32_s2_vs_s3()1184     fn test_shuffle32_s2_vs_s3() {
1185         let xs = [0x0, 0x1, 0x2, 0x3];
1186         let ys = [0x2, 0x3, 0x0, 0x1];
1187         let zs = [0x1, 0x2, 0x3, 0x0];
1188 
1189         let s2 = unsafe { SSE2::instance() };
1190         let s3 = unsafe { SSSE3::instance() };
1191 
1192         let x_s2 = {
1193             let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
1194             x_s2.shuffle2301()
1195         };
1196         let x_s3 = {
1197             let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
1198             x_s3.shuffle2301()
1199         };
1200         assert_eq!(x_s2, s2.vec(ys));
1201         assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
1202 
1203         let x_s2 = {
1204             let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
1205             x_s2.shuffle3012()
1206         };
1207         let x_s3 = {
1208             let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
1209             x_s3.shuffle3012()
1210         };
1211         assert_eq!(x_s2, s2.vec(zs));
1212         assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
1213 
1214         let x_s2 = x_s2.shuffle1230();
1215         let x_s3 = x_s3.shuffle1230();
1216         assert_eq!(x_s2, s2.vec(xs));
1217         assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
1218     }
1219 
1220     #[test]
1221     #[cfg_attr(not(target_feature = "ssse3"), ignore)]
test_shuffle64_s2_vs_s3()1222     fn test_shuffle64_s2_vs_s3() {
1223         let xs = [0x0, 0x1, 0x2, 0x3];
1224         let ys = [0x2, 0x3, 0x0, 0x1];
1225         let zs = [0x1, 0x2, 0x3, 0x0];
1226 
1227         let s2 = unsafe { SSE2::instance() };
1228         let s3 = unsafe { SSSE3::instance() };
1229 
1230         let x_s2 = {
1231             let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
1232             x_s2.shuffle2301()
1233         };
1234         let x_s3 = {
1235             let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
1236             x_s3.shuffle2301()
1237         };
1238         assert_eq!(x_s2, s2.vec(ys));
1239         assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
1240 
1241         let x_s2 = {
1242             let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
1243             x_s2.shuffle3012()
1244         };
1245         let x_s3 = {
1246             let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
1247             x_s3.shuffle3012()
1248         };
1249         assert_eq!(x_s2, s2.vec(zs));
1250         assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
1251 
1252         let x_s2 = x_s2.shuffle1230();
1253         let x_s3 = x_s3.shuffle1230();
1254         assert_eq!(x_s2, s2.vec(xs));
1255         assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
1256     }
1257 
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    #[test]
    fn test_lanes_u32x4() {
        // `Machine::vec` and `MultiLane::from_lanes` must agree, and
        // `to_lanes` must round-trip, on every SSE feature level.
        let xs = [0x1, 0x2, 0x3, 0x4];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };
        let s4 = unsafe { SSE41::instance() };

        {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            let y_s2 = <SSE2 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s2, y_s2);
            assert_eq!(xs, y_s2.to_lanes());
        }

        {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            let y_s3 = <SSSE3 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s3, y_s3);
            assert_eq!(xs, y_s3.to_lanes());
        }

        {
            let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
            let y_s4 = <SSE41 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s4, y_s4);
            assert_eq!(xs, y_s4.to_lanes());
        }
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_lanes_u64x2() {
        // Same vec/from_lanes/to_lanes round-trip checks, for u64x2.
        let xs = [0x1, 0x2];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };
        let s4 = unsafe { SSE41::instance() };

        {
            let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
            let y_s2 = <SSE2 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s2, y_s2);
            assert_eq!(xs, y_s2.to_lanes());
        }

        {
            let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
            let y_s3 = <SSSE3 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s3, y_s3);
            assert_eq!(xs, y_s3.to_lanes());
        }

        {
            let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
            let y_s4 = <SSE41 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s4, y_s4);
            assert_eq!(xs, y_s4.to_lanes());
        }
    }
1319 
    #[test]
    fn test_vec4_u32x4_s2() {
        // Per-lane extract/insert on the SSE2 u32x4 path.
        let xs = [1, 2, 3, 4];
        let s2 = unsafe { SSE2::instance() };
        let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
        assert_eq!(x_s2.extract(0), 1);
        assert_eq!(x_s2.extract(1), 2);
        assert_eq!(x_s2.extract(2), 3);
        assert_eq!(x_s2.extract(3), 4);
        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2, 3, 4]));
        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf, 3, 4]));
        assert_eq!(x_s2.insert(0xf, 2), s2.vec([1, 2, 0xf, 4]));
        assert_eq!(x_s2.insert(0xf, 3), s2.vec([1, 2, 3, 0xf]));
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_vec4_u32x4_s4() {
        // Per-lane extract/insert on the SSE4.1 u32x4 path.
        let xs = [1, 2, 3, 4];
        let s4 = unsafe { SSE41::instance() };
        let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
        assert_eq!(x_s4.extract(0), 1);
        assert_eq!(x_s4.extract(1), 2);
        assert_eq!(x_s4.extract(2), 3);
        assert_eq!(x_s4.extract(3), 4);
        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2, 3, 4]));
        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf, 3, 4]));
        assert_eq!(x_s4.insert(0xf, 2), s4.vec([1, 2, 0xf, 4]));
        assert_eq!(x_s4.insert(0xf, 3), s4.vec([1, 2, 3, 0xf]));
    }

    #[test]
    fn test_vec2_u64x2_s2() {
        // Per-lane extract/insert on the SSE2 u64x2 path.
        let xs = [0x1, 0x2];
        let s2 = unsafe { SSE2::instance() };
        let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
        assert_eq!(x_s2.extract(0), 1);
        assert_eq!(x_s2.extract(1), 2);
        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2]));
        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf]));
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_vec4_u64x2_s4() {
        // Per-lane extract/insert on the SSE4.1 u64x2 path.
        let xs = [0x1, 0x2];
        let s4 = unsafe { SSE41::instance() };
        let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
        assert_eq!(x_s4.extract(0), 1);
        assert_eq!(x_s4.extract(1), 2);
        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2]));
        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf]));
    }
1373 }
1374 
1375 pub mod avx2 {
1376     #![allow(non_camel_case_types)]
1377     use crate::soft::x4;
1378     use crate::types::*;
1379     use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2};
1380     use crate::x86_64::{vec256_storage, vec512_storage, Avx2Machine, YesS3, YesS4};
1381     use core::arch::x86_64::*;
1382     use core::marker::PhantomData;
1383     use core::ops::*;
1384 
    /// Four `u32x4` lanes packed into two 256-bit AVX2 registers
    /// (two 128-bit halves per register).
    #[derive(Copy, Clone)]
    pub struct u32x4x4_avx2<NI> {
        x: [__m256i; 2],
        ni: PhantomData<NI>,
    }

    impl<NI> u32x4x4_avx2<NI> {
        // Internal constructor wrapping two raw 256-bit registers.
        #[inline(always)]
        fn new(x: [__m256i; 2]) -> Self {
            Self { x, ni: PhantomData }
        }
    }

    // Marker impl; all behaviour comes from the trait's bounds.
    impl<NI> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> where NI: Copy {}
    impl<NI> Store<vec512_storage> for u32x4x4_avx2<NI> {
        // Reinterprets generic 512-bit storage as two AVX registers.
        #[inline(always)]
        unsafe fn unpack(p: vec512_storage) -> Self {
            Self::new([p.avx[0].avx, p.avx[1].avx])
        }
    }
    impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> {
        /// Splits the two 256-bit registers into four 128-bit SSE lanes.
        #[inline(always)]
        fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] {
            unsafe {
                [
                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)),
                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)),
                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)),
                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)),
                ]
            }
        }
        /// Packs four 128-bit SSE lanes into two 256-bit registers.
        #[inline(always)]
        fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self {
            Self::new(unsafe {
                [
                    _mm256_setr_m128i(x[0].x, x[1].x),
                    _mm256_setr_m128i(x[2].x, x[3].x),
                ]
            })
        }
    }
    impl<NI> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
        /// Reads 128-bit lane `i` (0..4); panics on out-of-range `i`.
        #[inline(always)]
        fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> {
            unsafe {
                match i {
                    0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)),
                    1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)),
                    2 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)),
                    3 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)),
                    _ => panic!(),
                }
            }
        }
        /// Returns a copy with 128-bit lane `i` replaced by `w`; panics on
        /// out-of-range `i`.
        #[inline(always)]
        fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self {
            Self::new(unsafe {
                match i {
                    0 => [_mm256_inserti128_si256(self.x[0], w.x, 0), self.x[1]],
                    1 => [_mm256_inserti128_si256(self.x[0], w.x, 1), self.x[1]],
                    2 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 0)],
                    3 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 1)],
                    _ => panic!(),
                }
            })
        }
    }
    impl<NI> LaneWords4 for u32x4x4_avx2<NI> {
        // Rotations of the four 32-bit words *within* each 128-bit lane,
        // done as 32-bit shuffles applied to both 256-bit registers.
        #[inline(always)]
        fn shuffle_lane_words1230(self) -> Self {
            Self::new(unsafe {
                [
                    _mm256_shuffle_epi32(self.x[0], 0b1001_0011),
                    _mm256_shuffle_epi32(self.x[1], 0b1001_0011),
                ]
            })
        }
        #[inline(always)]
        fn shuffle_lane_words2301(self) -> Self {
            Self::new(unsafe {
                [
                    _mm256_shuffle_epi32(self.x[0], 0b0100_1110),
                    _mm256_shuffle_epi32(self.x[1], 0b0100_1110),
                ]
            })
        }
        #[inline(always)]
        fn shuffle_lane_words3012(self) -> Self {
            Self::new(unsafe {
                [
                    _mm256_shuffle_epi32(self.x[0], 0b0011_1001),
                    _mm256_shuffle_epi32(self.x[1], 0b0011_1001),
                ]
            })
        }
    }
    // Marker impls; behaviour supplied entirely by supertraits.
    impl<NI> BitOps32 for u32x4x4_avx2<NI> where NI: Copy {}
    impl<NI> ArithOps for u32x4x4_avx2<NI> where NI: Copy {}
    // Byte-granularity word rotation: a PSHUFB whose 16-byte control
    // pattern ($k0/$k1) is replicated into both 128-bit halves of each
    // 256-bit register.
    macro_rules! shuf_lane_bytes {
        ($name:ident, $k0:expr, $k1:expr) => {
            #[inline(always)]
            fn $name(self) -> Self {
                Self::new(unsafe {
                    [
                        _mm256_shuffle_epi8(self.x[0], _mm256_set_epi64x($k0, $k1, $k0, $k1)),
                        _mm256_shuffle_epi8(self.x[1], _mm256_set_epi64x($k0, $k1, $k0, $k1)),
                    ]
                })
            }
        };
    }
    // Generic 32-bit right-rotate by $i: (x >> i) | (x << (32 - i)) on
    // both 256-bit registers.
    macro_rules! rotr_32 {
        ($name:ident, $i:expr) => {
            #[inline(always)]
            fn $name(self) -> Self {
                Self::new(unsafe {
                    [
                        _mm256_or_si256(
                            _mm256_srli_epi32(self.x[0], $i as i32),
                            _mm256_slli_epi32(self.x[0], 32 - $i as i32),
                        ),
                        _mm256_or_si256(
                            _mm256_srli_epi32(self.x[1], $i as i32),
                            _mm256_slli_epi32(self.x[1], 32 - $i as i32),
                        ),
                    ]
                })
            }
        };
    }
    impl<NI: Copy> RotateEachWord32 for u32x4x4_avx2<NI> {
        // Rotations by whole bytes (8/16/24) use the cheaper byte shuffle;
        // the others use the shift-and-or form.
        rotr_32!(rotate_each_word_right7, 7);
        shuf_lane_bytes!(
            rotate_each_word_right8,
            0x0c0f0e0d_080b0a09,
            0x04070605_00030201
        );
        rotr_32!(rotate_each_word_right11, 11);
        rotr_32!(rotate_each_word_right12, 12);
        shuf_lane_bytes!(
            rotate_each_word_right16,
            0x0d0c0f0e_09080b0a,
            0x05040706_01000302
        );
        rotr_32!(rotate_each_word_right20, 20);
        shuf_lane_bytes!(
            rotate_each_word_right24,
            0x0e0d0c0f_0a09080b,
            0x06050407_02010003
        );
        rotr_32!(rotate_each_word_right25, 25);
    }
    // Marker impl; behaviour supplied entirely by supertraits.
    impl<NI> BitOps0 for u32x4x4_avx2<NI> where NI: Copy {}
    impl<NI> From<u32x4x4_avx2<NI>> for vec512_storage {
        // Repacks the two AVX registers into generic 512-bit storage.
        #[inline(always)]
        fn from(x: u32x4x4_avx2<NI>) -> Self {
            Self {
                avx: [
                    vec256_storage { avx: x.x[0] },
                    vec256_storage { avx: x.x[1] },
                ],
            }
        }
    }

    // Derives each `*Assign` operator from the corresponding binary op.
    macro_rules! impl_assign {
        ($vec:ident, $Assign:ident, $assign_fn:ident, $bin_fn:ident) => {
            impl<NI> $Assign for $vec<NI>
            where
                NI: Copy,
            {
                #[inline(always)]
                fn $assign_fn(&mut self, rhs: Self) {
                    *self = self.$bin_fn(rhs);
                }
            }
        };
    }
    impl_assign!(u32x4x4_avx2, BitXorAssign, bitxor_assign, bitxor);
    impl_assign!(u32x4x4_avx2, BitOrAssign, bitor_assign, bitor);
    impl_assign!(u32x4x4_avx2, BitAndAssign, bitand_assign, bitand);
    impl_assign!(u32x4x4_avx2, AddAssign, add_assign, add);
1568 
    // Lifts a lane-wise AVX2 intrinsic to a binary operator applied to
    // both 256-bit registers of the pair.
    macro_rules! impl_bitop_x2 {
        ($vec:ident, $Op:ident, $op_fn:ident, $impl_fn:ident) => {
            impl<NI> $Op for $vec<NI> {
                type Output = Self;
                #[inline(always)]
                fn $op_fn(self, rhs: Self) -> Self::Output {
                    Self::new(unsafe {
                        [$impl_fn(self.x[0], rhs.x[0]), $impl_fn(self.x[1], rhs.x[1])]
                    })
                }
            }
        };
    }
    impl_bitop_x2!(u32x4x4_avx2, BitXor, bitxor, _mm256_xor_si256);
    impl_bitop_x2!(u32x4x4_avx2, BitOr, bitor, _mm256_or_si256);
    impl_bitop_x2!(u32x4x4_avx2, BitAnd, bitand, _mm256_and_si256);
    impl_bitop_x2!(u32x4x4_avx2, AndNot, andnot, _mm256_andnot_si256);
    impl_bitop_x2!(u32x4x4_avx2, Add, add, _mm256_add_epi32);
1587 
1588     impl<NI> Not for u32x4x4_avx2<NI> {
1589         type Output = Self;
1590         #[inline(always)]
not(self) -> Self::Output1591         fn not(self) -> Self::Output {
1592             unsafe {
1593                 let f = _mm256_set1_epi8(-0x7f);
1594                 Self::new([f, f]) ^ self
1595             }
1596         }
1597     }
1598 
    impl<NI> BSwap for u32x4x4_avx2<NI> {
        // Byte-reverse each 32-bit word: the PSHUFB pattern reverses every
        // 4-byte group in place.
        shuf_lane_bytes!(bswap, 0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
    }

    impl<NI> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI>
    where
        NI: Copy,
    {
        /// Packs the four 128-bit lanes of a soft `x4` into two AVX
        /// registers (lanes 0-1 low register, lanes 2-3 high register).
        #[inline(always)]
        fn from(x: x4<u128x1_sse2<YesS3, YesS4, NI>>) -> Self {
            Self::new(unsafe {
                [
                    _mm256_setr_m128i(x.0[0].x, x.0[1].x),
                    _mm256_setr_m128i(x.0[2].x, x.0[3].x),
                ]
            })
        }
    }
1617 }
1618