1 use super::*;
2 
3 pick! {
4   if #[cfg(target_feature="avx")] {
5     #[derive(Default, Clone, Copy, PartialEq)]
6     #[repr(C, align(32))]
7     pub struct f32x8 { avx: m256 }
8   } else if #[cfg(target_feature="sse2")] {
9     #[derive(Default, Clone, Copy, PartialEq)]
10     #[repr(C, align(32))]
11     pub struct f32x8 { sse0: m128, sse1: m128 }
12   } else if #[cfg(target_feature="simd128")] {
13     use core::arch::wasm32::*;
14 
15     #[derive(Clone, Copy)]
16     #[repr(C, align(32))]
17     pub struct f32x8 { simd0: v128, simd1: v128 }
18 
19     impl Default for f32x8 {
20       fn default() -> Self {
21         Self::splat(0.0)
22       }
23     }
24 
25     impl PartialEq for f32x8 {
26       fn eq(&self, other: &Self) -> bool {
27         u32x4_all_true(f32x4_eq(self.simd0, other.simd0)) &
28           u32x4_all_true(f32x4_eq(self.simd1, other.simd1))
29       }
30     }
31   } else {
32     #[derive(Default, Clone, Copy, PartialEq)]
33     #[repr(C, align(32))]
34     pub struct f32x8 { arr: [f32;8] }
35   }
36 }
37 
38 macro_rules! const_f32_as_f32x8 {
39   ($i:ident, $f:expr) => {
40     pub const $i: f32x8 =
41       unsafe { ConstUnionHack256bit { f32a8: [$f; 8] }.f32x8 };
42   };
43 }
44 
45 impl f32x8 {
46   const_f32_as_f32x8!(ONE, 1.0);
47   const_f32_as_f32x8!(HALF, 0.5);
48   const_f32_as_f32x8!(ZERO, 0.0);
49   const_f32_as_f32x8!(E, core::f32::consts::E);
50   const_f32_as_f32x8!(FRAC_1_PI, core::f32::consts::FRAC_1_PI);
51   const_f32_as_f32x8!(FRAC_2_PI, core::f32::consts::FRAC_2_PI);
52   const_f32_as_f32x8!(FRAC_2_SQRT_PI, core::f32::consts::FRAC_2_SQRT_PI);
53   const_f32_as_f32x8!(FRAC_1_SQRT_2, core::f32::consts::FRAC_1_SQRT_2);
54   const_f32_as_f32x8!(FRAC_PI_2, core::f32::consts::FRAC_PI_2);
55   const_f32_as_f32x8!(FRAC_PI_3, core::f32::consts::FRAC_PI_3);
56   const_f32_as_f32x8!(FRAC_PI_4, core::f32::consts::FRAC_PI_4);
57   const_f32_as_f32x8!(FRAC_PI_6, core::f32::consts::FRAC_PI_6);
58   const_f32_as_f32x8!(FRAC_PI_8, core::f32::consts::FRAC_PI_8);
59   const_f32_as_f32x8!(LN_2, core::f32::consts::LN_2);
60   const_f32_as_f32x8!(LN_10, core::f32::consts::LN_10);
61   const_f32_as_f32x8!(LOG2_E, core::f32::consts::LOG2_E);
62   const_f32_as_f32x8!(LOG10_E, core::f32::consts::LOG10_E);
63   const_f32_as_f32x8!(LOG10_2, core::f32::consts::LOG10_2);
64   const_f32_as_f32x8!(LOG2_10, core::f32::consts::LOG2_10);
65   const_f32_as_f32x8!(PI, core::f32::consts::PI);
66   const_f32_as_f32x8!(SQRT_2, core::f32::consts::SQRT_2);
67   const_f32_as_f32x8!(TAU, core::f32::consts::TAU);
68 }
69 
70 unsafe impl Zeroable for f32x8 {}
71 unsafe impl Pod for f32x8 {}
72 
73 impl Add for f32x8 {
74   type Output = Self;
75   #[inline]
76   #[must_use]
add(self, rhs: Self) -> Self::Output77   fn add(self, rhs: Self) -> Self::Output {
78     pick! {
79       if #[cfg(target_feature="avx")] {
80         Self { avx: add_m256(self.avx, rhs.avx) }
81       } else if #[cfg(target_feature="sse2")] {
82         Self { sse0: add_m128(self.sse0, rhs.sse0), sse1: add_m128(self.sse1, rhs.sse1) }
83       } else if #[cfg(target_feature="simd128")] {
84         Self { simd0: f32x4_add(self.simd0, rhs.simd0), simd1: f32x4_add(self.simd1, rhs.simd1) }
85       } else {
86         Self { arr: [
87           self.arr[0] + rhs.arr[0],
88           self.arr[1] + rhs.arr[1],
89           self.arr[2] + rhs.arr[2],
90           self.arr[3] + rhs.arr[3],
91           self.arr[4] + rhs.arr[4],
92           self.arr[5] + rhs.arr[5],
93           self.arr[6] + rhs.arr[6],
94           self.arr[7] + rhs.arr[7],
95         ]}
96       }
97     }
98   }
99 }
100 
101 impl Sub for f32x8 {
102   type Output = Self;
103   #[inline]
104   #[must_use]
sub(self, rhs: Self) -> Self::Output105   fn sub(self, rhs: Self) -> Self::Output {
106     pick! {
107       if #[cfg(target_feature="avx")] {
108         Self { avx: sub_m256(self.avx, rhs.avx) }
109       } else if #[cfg(target_feature="sse2")] {
110         Self { sse0: sub_m128(self.sse0, rhs.sse0), sse1: sub_m128(self.sse1, rhs.sse1) }
111       } else if #[cfg(target_feature="simd128")] {
112         Self { simd0: f32x4_sub(self.simd0, rhs.simd0), simd1: f32x4_sub(self.simd1, rhs.simd1) }
113       } else {
114         Self { arr: [
115           self.arr[0] - rhs.arr[0],
116           self.arr[1] - rhs.arr[1],
117           self.arr[2] - rhs.arr[2],
118           self.arr[3] - rhs.arr[3],
119           self.arr[4] - rhs.arr[4],
120           self.arr[5] - rhs.arr[5],
121           self.arr[6] - rhs.arr[6],
122           self.arr[7] - rhs.arr[7],
123         ]}
124       }
125     }
126   }
127 }
128 
129 impl Mul for f32x8 {
130   type Output = Self;
131   #[inline]
132   #[must_use]
mul(self, rhs: Self) -> Self::Output133   fn mul(self, rhs: Self) -> Self::Output {
134     pick! {
135       if #[cfg(target_feature="avx")] {
136         Self { avx: mul_m256(self.avx, rhs.avx) }
137       } else if #[cfg(target_feature="sse2")] {
138         Self { sse0: mul_m128(self.sse0, rhs.sse0), sse1: mul_m128(self.sse1, rhs.sse1) }
139       } else if #[cfg(target_feature="simd128")] {
140         Self { simd0: f32x4_mul(self.simd0, rhs.simd0), simd1: f32x4_mul(self.simd1, rhs.simd1) }
141       } else {
142         Self { arr: [
143           self.arr[0] * rhs.arr[0],
144           self.arr[1] * rhs.arr[1],
145           self.arr[2] * rhs.arr[2],
146           self.arr[3] * rhs.arr[3],
147           self.arr[4] * rhs.arr[4],
148           self.arr[5] * rhs.arr[5],
149           self.arr[6] * rhs.arr[6],
150           self.arr[7] * rhs.arr[7],
151         ]}
152       }
153     }
154   }
155 }
156 
157 impl Div for f32x8 {
158   type Output = Self;
159   #[inline]
160   #[must_use]
div(self, rhs: Self) -> Self::Output161   fn div(self, rhs: Self) -> Self::Output {
162     pick! {
163       if #[cfg(target_feature="avx")] {
164         Self { avx: div_m256(self.avx, rhs.avx) }
165       } else if #[cfg(target_feature="sse2")] {
166         Self { sse0: div_m128(self.sse0, rhs.sse0), sse1: div_m128(self.sse1, rhs.sse1) }
167       } else if #[cfg(target_feature="simd128")] {
168         Self { simd0: f32x4_div(self.simd0, rhs.simd0), simd1: f32x4_div(self.simd1, rhs.simd1) }
169       } else {
170         Self { arr: [
171           self.arr[0] / rhs.arr[0],
172           self.arr[1] / rhs.arr[1],
173           self.arr[2] / rhs.arr[2],
174           self.arr[3] / rhs.arr[3],
175           self.arr[4] / rhs.arr[4],
176           self.arr[5] / rhs.arr[5],
177           self.arr[6] / rhs.arr[6],
178           self.arr[7] / rhs.arr[7],
179         ]}
180       }
181     }
182   }
183 }
184 
185 impl Add<f32> for f32x8 {
186   type Output = Self;
187   #[inline]
188   #[must_use]
add(self, rhs: f32) -> Self::Output189   fn add(self, rhs: f32) -> Self::Output {
190     self.add(Self::splat(rhs))
191   }
192 }
193 
194 impl Sub<f32> for f32x8 {
195   type Output = Self;
196   #[inline]
197   #[must_use]
sub(self, rhs: f32) -> Self::Output198   fn sub(self, rhs: f32) -> Self::Output {
199     self.sub(Self::splat(rhs))
200   }
201 }
202 
203 impl Mul<f32> for f32x8 {
204   type Output = Self;
205   #[inline]
206   #[must_use]
mul(self, rhs: f32) -> Self::Output207   fn mul(self, rhs: f32) -> Self::Output {
208     self.mul(Self::splat(rhs))
209   }
210 }
211 
212 impl Div<f32> for f32x8 {
213   type Output = Self;
214   #[inline]
215   #[must_use]
div(self, rhs: f32) -> Self::Output216   fn div(self, rhs: f32) -> Self::Output {
217     self.div(Self::splat(rhs))
218   }
219 }
220 
221 impl Add<f32x8> for f32 {
222   type Output = f32x8;
223   #[inline]
224   #[must_use]
add(self, rhs: f32x8) -> Self::Output225   fn add(self, rhs: f32x8) -> Self::Output {
226     f32x8::splat(self).add(rhs)
227   }
228 }
229 
230 impl Sub<f32x8> for f32 {
231   type Output = f32x8;
232   #[inline]
233   #[must_use]
sub(self, rhs: f32x8) -> Self::Output234   fn sub(self, rhs: f32x8) -> Self::Output {
235     f32x8::splat(self).sub(rhs)
236   }
237 }
238 
239 impl Mul<f32x8> for f32 {
240   type Output = f32x8;
241   #[inline]
242   #[must_use]
mul(self, rhs: f32x8) -> Self::Output243   fn mul(self, rhs: f32x8) -> Self::Output {
244     f32x8::splat(self).mul(rhs)
245   }
246 }
247 
248 impl Div<f32x8> for f32 {
249   type Output = f32x8;
250   #[inline]
251   #[must_use]
div(self, rhs: f32x8) -> Self::Output252   fn div(self, rhs: f32x8) -> Self::Output {
253     f32x8::splat(self).div(rhs)
254   }
255 }
256 
257 impl BitAnd for f32x8 {
258   type Output = Self;
259   #[inline]
260   #[must_use]
bitand(self, rhs: Self) -> Self::Output261   fn bitand(self, rhs: Self) -> Self::Output {
262     pick! {
263       if #[cfg(target_feature="avx")] {
264         Self { avx: bitand_m256(self.avx, rhs.avx) }
265       } else if #[cfg(target_feature="sse2")] {
266         Self { sse0: bitand_m128(self.sse0, rhs.sse0), sse1: bitand_m128(self.sse1, rhs.sse1) }
267       } else if #[cfg(target_feature="simd128")] {
268         Self { simd0: v128_and(self.simd0, rhs.simd0), simd1: v128_and(self.simd1, rhs.simd1) }
269       } else {
270         Self { arr: [
271           f32::from_bits(self.arr[0].to_bits() & rhs.arr[0].to_bits()),
272           f32::from_bits(self.arr[1].to_bits() & rhs.arr[1].to_bits()),
273           f32::from_bits(self.arr[2].to_bits() & rhs.arr[2].to_bits()),
274           f32::from_bits(self.arr[3].to_bits() & rhs.arr[3].to_bits()),
275           f32::from_bits(self.arr[4].to_bits() & rhs.arr[4].to_bits()),
276           f32::from_bits(self.arr[5].to_bits() & rhs.arr[5].to_bits()),
277           f32::from_bits(self.arr[6].to_bits() & rhs.arr[6].to_bits()),
278           f32::from_bits(self.arr[7].to_bits() & rhs.arr[7].to_bits()),
279         ]}
280       }
281     }
282   }
283 }
284 
285 impl BitOr for f32x8 {
286   type Output = Self;
287   #[inline]
288   #[must_use]
bitor(self, rhs: Self) -> Self::Output289   fn bitor(self, rhs: Self) -> Self::Output {
290     pick! {
291       if #[cfg(target_feature="avx")] {
292         Self { avx: bitor_m256(self.avx, rhs.avx) }
293       } else if #[cfg(target_feature="sse2")] {
294         Self { sse0: bitor_m128(self.sse0, rhs.sse0), sse1: bitor_m128(self.sse1, rhs.sse1) }
295       } else if #[cfg(target_feature="simd128")] {
296         Self { simd0: v128_or(self.simd0, rhs.simd0), simd1: v128_or(self.simd1, rhs.simd1) }
297       } else {
298         Self { arr: [
299           f32::from_bits(self.arr[0].to_bits() | rhs.arr[0].to_bits()),
300           f32::from_bits(self.arr[1].to_bits() | rhs.arr[1].to_bits()),
301           f32::from_bits(self.arr[2].to_bits() | rhs.arr[2].to_bits()),
302           f32::from_bits(self.arr[3].to_bits() | rhs.arr[3].to_bits()),
303           f32::from_bits(self.arr[4].to_bits() | rhs.arr[4].to_bits()),
304           f32::from_bits(self.arr[5].to_bits() | rhs.arr[5].to_bits()),
305           f32::from_bits(self.arr[6].to_bits() | rhs.arr[6].to_bits()),
306           f32::from_bits(self.arr[7].to_bits() | rhs.arr[7].to_bits()),
307         ]}
308       }
309     }
310   }
311 }
312 
313 impl BitXor for f32x8 {
314   type Output = Self;
315   #[inline]
316   #[must_use]
bitxor(self, rhs: Self) -> Self::Output317   fn bitxor(self, rhs: Self) -> Self::Output {
318     pick! {
319       if #[cfg(target_feature="avx")] {
320         Self { avx: bitxor_m256(self.avx, rhs.avx) }
321       } else if #[cfg(target_feature="sse2")] {
322         Self { sse0: bitxor_m128(self.sse0, rhs.sse0), sse1: bitxor_m128(self.sse1, rhs.sse1) }
323       } else if #[cfg(target_feature="simd128")] {
324         Self { simd0: v128_xor(self.simd0, rhs.simd0), simd1: v128_xor(self.simd1, rhs.simd1) }
325       } else {
326         Self { arr: [
327           f32::from_bits(self.arr[0].to_bits() ^ rhs.arr[0].to_bits()),
328           f32::from_bits(self.arr[1].to_bits() ^ rhs.arr[1].to_bits()),
329           f32::from_bits(self.arr[2].to_bits() ^ rhs.arr[2].to_bits()),
330           f32::from_bits(self.arr[3].to_bits() ^ rhs.arr[3].to_bits()),
331           f32::from_bits(self.arr[4].to_bits() ^ rhs.arr[4].to_bits()),
332           f32::from_bits(self.arr[5].to_bits() ^ rhs.arr[5].to_bits()),
333           f32::from_bits(self.arr[6].to_bits() ^ rhs.arr[6].to_bits()),
334           f32::from_bits(self.arr[7].to_bits() ^ rhs.arr[7].to_bits()),
335         ]}
336       }
337     }
338   }
339 }
340 
341 impl CmpEq for f32x8 {
342   type Output = Self;
343   #[inline]
344   #[must_use]
cmp_eq(self, rhs: Self) -> Self::Output345   fn cmp_eq(self, rhs: Self) -> Self::Output {
346     pick! {
347       if #[cfg(target_feature="avx")] {
348         Self { avx: cmp_op_mask_m256::<{cmp_op!(EqualOrdered)}>(self.avx, rhs.avx) }
349       } else if #[cfg(target_feature="sse2")] {
350         Self { sse0: cmp_eq_mask_m128(self.sse0, rhs.sse0), sse1: cmp_eq_mask_m128(self.sse1, rhs.sse1) }
351       } else if #[cfg(target_feature="simd128")] {
352         Self { simd0: f32x4_eq(self.simd0, rhs.simd0), simd1: f32x4_eq(self.simd1, rhs.simd1) }
353       } else {
354         Self { arr: [
355           if self.arr[0] == rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
356           if self.arr[1] == rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
357           if self.arr[2] == rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
358           if self.arr[3] == rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
359           if self.arr[4] == rhs.arr[4] { f32::from_bits(u32::MAX) } else { 0.0 },
360           if self.arr[5] == rhs.arr[5] { f32::from_bits(u32::MAX) } else { 0.0 },
361           if self.arr[6] == rhs.arr[6] { f32::from_bits(u32::MAX) } else { 0.0 },
362           if self.arr[7] == rhs.arr[7] { f32::from_bits(u32::MAX) } else { 0.0 },
363         ]}
364       }
365     }
366   }
367 }
368 
369 impl CmpGe for f32x8 {
370   type Output = Self;
371   #[inline]
372   #[must_use]
cmp_ge(self, rhs: Self) -> Self::Output373   fn cmp_ge(self, rhs: Self) -> Self::Output {
374     pick! {
375       if #[cfg(target_feature="avx")] {
376         Self { avx: cmp_op_mask_m256::<{cmp_op!(GreaterEqualOrdered)}>(self.avx, rhs.avx) }
377       } else if #[cfg(target_feature="sse2")] {
378         Self { sse0: cmp_ge_mask_m128(self.sse0, rhs.sse0), sse1: cmp_ge_mask_m128(self.sse1, rhs.sse1) }
379       } else if #[cfg(target_feature="simd128")] {
380         Self { simd0: f32x4_ge(self.simd0, rhs.simd0), simd1: f32x4_ge(self.simd1, rhs.simd1) }
381       } else {
382         Self { arr: [
383           if self.arr[0] >= rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
384           if self.arr[1] >= rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
385           if self.arr[2] >= rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
386           if self.arr[3] >= rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
387           if self.arr[4] >= rhs.arr[4] { f32::from_bits(u32::MAX) } else { 0.0 },
388           if self.arr[5] >= rhs.arr[5] { f32::from_bits(u32::MAX) } else { 0.0 },
389           if self.arr[6] >= rhs.arr[6] { f32::from_bits(u32::MAX) } else { 0.0 },
390           if self.arr[7] >= rhs.arr[7] { f32::from_bits(u32::MAX) } else { 0.0 },
391         ]}
392       }
393     }
394   }
395 }
396 
397 impl CmpGt for f32x8 {
398   type Output = Self;
399   #[inline]
400   #[must_use]
cmp_gt(self, rhs: Self) -> Self::Output401   fn cmp_gt(self, rhs: Self) -> Self::Output {
402     pick! {
403       if #[cfg(target_feature="avx")] {
404         Self { avx: cmp_op_mask_m256::<{cmp_op!(GreaterThanOrdered)}>(self.avx, rhs.avx) }
405       } else if #[cfg(target_feature="sse2")] {
406         Self { sse0: cmp_gt_mask_m128(self.sse0, rhs.sse0), sse1: cmp_gt_mask_m128(self.sse1, rhs.sse1) }
407       } else if #[cfg(target_feature="simd128")] {
408         Self { simd0: f32x4_gt(self.simd0, rhs.simd0), simd1: f32x4_gt(self.simd1, rhs.simd1) }
409       } else {
410         Self { arr: [
411           if self.arr[0] > rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
412           if self.arr[1] > rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
413           if self.arr[2] > rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
414           if self.arr[3] > rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
415           if self.arr[4] > rhs.arr[4] { f32::from_bits(u32::MAX) } else { 0.0 },
416           if self.arr[5] > rhs.arr[5] { f32::from_bits(u32::MAX) } else { 0.0 },
417           if self.arr[6] > rhs.arr[6] { f32::from_bits(u32::MAX) } else { 0.0 },
418           if self.arr[7] > rhs.arr[7] { f32::from_bits(u32::MAX) } else { 0.0 },
419         ]}
420       }
421     }
422   }
423 }
424 
425 impl CmpNe for f32x8 {
426   type Output = Self;
427   #[inline]
428   #[must_use]
cmp_ne(self, rhs: Self) -> Self::Output429   fn cmp_ne(self, rhs: Self) -> Self::Output {
430     pick! {
431       if #[cfg(target_feature="avx")] {
432         Self { avx: cmp_op_mask_m256::<{cmp_op!(NotEqualOrdered)}>(self.avx, rhs.avx) }
433       } else if #[cfg(target_feature="sse2")] {
434         Self { sse0: cmp_neq_mask_m128(self.sse0, rhs.sse0), sse1: cmp_neq_mask_m128(self.sse1, rhs.sse1) }
435       } else if #[cfg(target_feature="simd128")] {
436         Self { simd0: f32x4_ne(self.simd0, rhs.simd0), simd1: f32x4_ne(self.simd1, rhs.simd1) }
437       } else {
438         Self { arr: [
439           if self.arr[0] != rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
440           if self.arr[1] != rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
441           if self.arr[2] != rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
442           if self.arr[3] != rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
443           if self.arr[4] != rhs.arr[4] { f32::from_bits(u32::MAX) } else { 0.0 },
444           if self.arr[5] != rhs.arr[5] { f32::from_bits(u32::MAX) } else { 0.0 },
445           if self.arr[6] != rhs.arr[6] { f32::from_bits(u32::MAX) } else { 0.0 },
446           if self.arr[7] != rhs.arr[7] { f32::from_bits(u32::MAX) } else { 0.0 },
447         ]}
448       }
449     }
450   }
451 }
452 
453 impl CmpLe for f32x8 {
454   type Output = Self;
455   #[inline]
456   #[must_use]
cmp_le(self, rhs: Self) -> Self::Output457   fn cmp_le(self, rhs: Self) -> Self::Output {
458     pick! {
459       if #[cfg(target_feature="avx")] {
460         Self { avx: cmp_op_mask_m256::<{cmp_op!(LessEqualOrdered)}>(self.avx, rhs.avx) }
461       } else if #[cfg(target_feature="sse2")] {
462         Self { sse0: cmp_le_mask_m128(self.sse0, rhs.sse0), sse1: cmp_le_mask_m128(self.sse1, rhs.sse1) }
463       } else if #[cfg(target_feature="simd128")] {
464         Self { simd0: f32x4_le(self.simd0, rhs.simd0), simd1: f32x4_le(self.simd1, rhs.simd1) }
465       } else {
466         Self { arr: [
467           if self.arr[0] <= rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
468           if self.arr[1] <= rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
469           if self.arr[2] <= rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
470           if self.arr[3] <= rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
471           if self.arr[4] <= rhs.arr[4] { f32::from_bits(u32::MAX) } else { 0.0 },
472           if self.arr[5] <= rhs.arr[5] { f32::from_bits(u32::MAX) } else { 0.0 },
473           if self.arr[6] <= rhs.arr[6] { f32::from_bits(u32::MAX) } else { 0.0 },
474           if self.arr[7] <= rhs.arr[7] { f32::from_bits(u32::MAX) } else { 0.0 },
475         ]}
476       }
477     }
478   }
479 }
480 
481 impl CmpLt for f32x8 {
482   type Output = Self;
483   #[inline]
484   #[must_use]
cmp_lt(self, rhs: Self) -> Self::Output485   fn cmp_lt(self, rhs: Self) -> Self::Output {
486     pick! {
487         if #[cfg(target_feature="avx")] {
488           Self { avx: cmp_op_mask_m256::<{cmp_op!(LessThanOrdered)}>(self.avx, rhs.avx) }
489         } else if #[cfg(target_feature="sse2")] {
490           Self { sse0: cmp_lt_mask_m128(self.sse0, rhs.sse0), sse1: cmp_lt_mask_m128(self.sse1, rhs.sse1) }
491         } else if #[cfg(target_feature="simd128")] {
492           Self { simd0: f32x4_lt(self.simd0, rhs.simd0), simd1: f32x4_lt(self.simd1, rhs.simd1) }
493         } else {
494           Self { arr: [
495             if self.arr[0] < rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
496             if self.arr[1] < rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
497             if self.arr[2] < rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
498             if self.arr[3] < rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
499             if self.arr[4] < rhs.arr[4] { f32::from_bits(u32::MAX) } else { 0.0 },
500             if self.arr[5] < rhs.arr[5] { f32::from_bits(u32::MAX) } else { 0.0 },
501             if self.arr[6] < rhs.arr[6] { f32::from_bits(u32::MAX) } else { 0.0 },
502             if self.arr[7] < rhs.arr[7] { f32::from_bits(u32::MAX) } else { 0.0 },
503           ]}
504         }
505     }
506   }
507 }
508 
509 impl f32x8 {
510   #[inline]
511   #[must_use]
new(array: [f32; 8]) -> Self512   pub fn new(array: [f32; 8]) -> Self {
513     Self::from(array)
514   }
515   #[inline]
516   #[must_use]
blend(self, t: Self, f: Self) -> Self517   pub fn blend(self, t: Self, f: Self) -> Self {
518     pick! {
519       if #[cfg(target_feature="avx")] {
520         Self { avx: blend_varying_m256(f.avx, t.avx, self.avx) }
521       } else if #[cfg(target_feature="sse4.1")] {
522         Self { sse0: blend_varying_m128(f.sse0, t.sse0, self.sse0), sse1: blend_varying_m128(f.sse1, t.sse1, self.sse1) }
523       } else if #[cfg(target_feature="simd128")] {
524         Self { simd0: v128_bitselect(t.simd0, f.simd0, self.simd0), simd1: v128_bitselect(t.simd1, f.simd1, self.simd1) }
525       } else {
526         generic_bit_blend(self, t, f)
527       }
528     }
529   }
530   #[inline]
531   #[must_use]
abs(self) -> Self532   pub fn abs(self) -> Self {
533     pick! {
534       if #[cfg(target_feature="simd128")] {
535         Self { simd0: f32x4_abs(self.simd0), simd1: f32x4_abs(self.simd1) }
536       } else {
537         let non_sign_bits = f32x8::from(f32::from_bits(i32::MAX as u32));
538         self & non_sign_bits
539       }
540     }
541   }
542 
543   /// Calculates the lanewise maximum of both vectors. This is a faster
544   /// implementation than `max`, but it doesn't specify any behavior if NaNs are
545   /// involved.
546   #[inline]
547   #[must_use]
fast_max(self, rhs: Self) -> Self548   pub fn fast_max(self, rhs: Self) -> Self {
549     pick! {
550       if #[cfg(target_feature="avx")] {
551         Self { avx: max_m256(self.avx, rhs.avx) }
552       } else if #[cfg(target_feature="sse2")] {
553         Self { sse0: max_m128(self.sse0, rhs.sse0), sse1: max_m128(self.sse1, rhs.sse1) }
554       } else if #[cfg(target_feature="simd128")] {
555         Self { simd0: f32x4_pmax(self.simd0, rhs.simd0), simd1: f32x4_pmax(self.simd1, rhs.simd1) }
556       } else {
557         Self { arr: [
558           if self.arr[0] < rhs.arr[0] { rhs.arr[0] } else { self.arr[0] },
559           if self.arr[1] < rhs.arr[1] { rhs.arr[1] } else { self.arr[1] },
560           if self.arr[2] < rhs.arr[2] { rhs.arr[2] } else { self.arr[2] },
561           if self.arr[3] < rhs.arr[3] { rhs.arr[3] } else { self.arr[3] },
562           if self.arr[4] < rhs.arr[4] { rhs.arr[4] } else { self.arr[4] },
563           if self.arr[5] < rhs.arr[5] { rhs.arr[5] } else { self.arr[5] },
564           if self.arr[6] < rhs.arr[6] { rhs.arr[6] } else { self.arr[6] },
565           if self.arr[7] < rhs.arr[7] { rhs.arr[7] } else { self.arr[7] },
566         ]}
567       }
568     }
569   }
570 
571   /// Calculates the lanewise maximum of both vectors. This doesn't match
572   /// IEEE-754 and instead is defined as `self < rhs ? rhs : self`.
573   #[inline]
574   #[must_use]
max(self, rhs: Self) -> Self575   pub fn max(self, rhs: Self) -> Self {
576     pick! {
577       if #[cfg(target_feature="avx")] {
578         // max_m256 seems to do rhs < self ? self : rhs. So if there's any NaN
579         // involved, it chooses rhs, so we need to specifically check rhs for
580         // NaN.
581         rhs.is_nan().blend(self, Self { avx: max_m256(self.avx, rhs.avx) })
582       } else if #[cfg(target_feature="sse2")] {
583         // max_m128 seems to do rhs < self ? self : rhs. So if there's any NaN
584         // involved, it chooses rhs, so we need to specifically check rhs for
585         // NaN.
586         rhs.is_nan().blend(self, Self { sse0: max_m128(self.sse0, rhs.sse0), sse1: max_m128(self.sse1, rhs.sse1) })
587       } else if #[cfg(target_feature="simd128")] {
588         // WASM has two max intrinsics:
589         // - max: This propagates NaN, that's the opposite of what we need.
590         // - pmax: This is defined as self < rhs ? rhs : self, which basically
591         //   chooses self if either is NaN.
592         //
593         // pmax is what we want, but we need to specifically check self for NaN.
594         Self {
595           simd0: v128_bitselect(
596             rhs.simd0,
597             f32x4_pmax(self.simd0, rhs.simd0),
598             f32x4_ne(self.simd0, self.simd0), // NaN check
599           ),
600           simd1: v128_bitselect(
601             rhs.simd1,
602             f32x4_pmax(self.simd1, rhs.simd1),
603             f32x4_ne(self.simd1, self.simd1), // NaN check
604           ),
605         }
606       } else {
607         Self { arr: [
608           self.arr[0].max(rhs.arr[0]),
609           self.arr[1].max(rhs.arr[1]),
610           self.arr[2].max(rhs.arr[2]),
611           self.arr[3].max(rhs.arr[3]),
612           self.arr[4].max(rhs.arr[4]),
613           self.arr[5].max(rhs.arr[5]),
614           self.arr[6].max(rhs.arr[6]),
615           self.arr[7].max(rhs.arr[7]),
616         ]}
617       }
618     }
619   }
620 
621   /// Calculates the lanewise minimum of both vectors. This is a faster
622   /// implementation than `min`, but it doesn't specify any behavior if NaNs are
623   /// involved.
624   #[inline]
625   #[must_use]
fast_min(self, rhs: Self) -> Self626   pub fn fast_min(self, rhs: Self) -> Self {
627     pick! {
628       if #[cfg(target_feature="avx")] {
629         Self { avx: min_m256(self.avx, rhs.avx) }
630       } else if #[cfg(target_feature="sse2")] {
631         Self { sse0: min_m128(self.sse0, rhs.sse0), sse1: min_m128(self.sse1, rhs.sse1) }
632       } else if #[cfg(target_feature="simd128")] {
633         Self { simd0: f32x4_pmin(self.simd0, rhs.simd0), simd1: f32x4_pmin(self.simd1, rhs.simd1) }
634       } else {
635         Self { arr: [
636           if self.arr[0] > rhs.arr[0] { rhs.arr[0] } else { self.arr[0] },
637           if self.arr[1] > rhs.arr[1] { rhs.arr[1] } else { self.arr[1] },
638           if self.arr[2] > rhs.arr[2] { rhs.arr[2] } else { self.arr[2] },
639           if self.arr[3] > rhs.arr[3] { rhs.arr[3] } else { self.arr[3] },
640           if self.arr[4] > rhs.arr[4] { rhs.arr[4] } else { self.arr[4] },
641           if self.arr[5] > rhs.arr[5] { rhs.arr[5] } else { self.arr[5] },
642           if self.arr[6] > rhs.arr[6] { rhs.arr[6] } else { self.arr[6] },
643           if self.arr[7] > rhs.arr[7] { rhs.arr[7] } else { self.arr[7] },
644         ]}
645       }
646     }
647   }
648 
649   /// Calculates the lanewise minimum of both vectors. If either lane is NaN,
650   /// the other lane gets chosen. Use `fast_min` for a faster implementation
651   /// that doesn't handle NaNs.
652   #[inline]
653   #[must_use]
min(self, rhs: Self) -> Self654   pub fn min(self, rhs: Self) -> Self {
655     pick! {
656       if #[cfg(target_feature="avx")] {
657         // min_m256 seems to do rhs > self ? self : rhs. So if there's any NaN
658         // involved, it chooses rhs, so we need to specifically check rhs for
659         // NaN.
660         rhs.is_nan().blend(self, Self { avx: min_m256(self.avx, rhs.avx) })
661       } else if #[cfg(target_feature="sse2")] {
662         // min_m128 seems to do rhs > self ? self : rhs. So if there's any NaN
663         // involved, it chooses rhs, so we need to specifically check rhs for
664         // NaN.
665         rhs.is_nan().blend(self, Self { sse0: min_m128(self.sse0, rhs.sse0), sse1: min_m128(self.sse1, rhs.sse1) })
666       } else if #[cfg(target_feature="simd128")] {
667         // WASM has two min intrinsics:
668         // - min: This propagates NaN, that's the opposite of what we need.
669         // - pmin: This is defined as rhs < self ? rhs : self, which basically
670         //   chooses self if either is NaN.
671         //
672         // pmin is what we want, but we need to specifically check self for NaN.
673         Self {
674           simd0: v128_bitselect(
675             rhs.simd0,
676             f32x4_pmin(self.simd0, rhs.simd0),
677             f32x4_ne(self.simd0, self.simd0), // NaN check
678           ),
679           simd1: v128_bitselect(
680             rhs.simd1,
681             f32x4_pmin(self.simd1, rhs.simd1),
682             f32x4_ne(self.simd1, self.simd1), // NaN check
683           ),
684         }
685       } else {
686         Self { arr: [
687           self.arr[0].min(rhs.arr[0]),
688           self.arr[1].min(rhs.arr[1]),
689           self.arr[2].min(rhs.arr[2]),
690           self.arr[3].min(rhs.arr[3]),
691           self.arr[4].min(rhs.arr[4]),
692           self.arr[5].min(rhs.arr[5]),
693           self.arr[6].min(rhs.arr[6]),
694           self.arr[7].min(rhs.arr[7]),
695         ]}
696       }
697     }
698   }
699   #[inline]
700   #[must_use]
is_nan(self) -> Self701   pub fn is_nan(self) -> Self {
702     pick! {
703       if #[cfg(target_feature="avx")] {
704         Self { avx: cmp_op_mask_m256::<{cmp_op!(Unordered)}>(self.avx, self.avx) }
705       } else if #[cfg(target_feature="sse2")] {
706         Self { sse0: cmp_unord_mask_m128(self.sse0, self.sse0) , sse1: cmp_unord_mask_m128(self.sse1, self.sse1) }
707       } else if #[cfg(target_feature="simd128")] {
708         Self { simd0: f32x4_ne(self.simd0, self.simd0), simd1: f32x4_ne(self.simd1, self.simd1) }
709       } else {
710         Self { arr: [
711           if self.arr[0].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
712           if self.arr[1].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
713           if self.arr[2].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
714           if self.arr[3].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
715           if self.arr[4].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
716           if self.arr[5].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
717           if self.arr[6].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
718           if self.arr[7].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
719         ]}
720       }
721     }
722   }
723   #[inline]
724   #[must_use]
is_finite(self) -> Self725   pub fn is_finite(self) -> Self {
726     let shifted_exp_mask = u32x8::from(0xFF000000);
727     let u: u32x8 = cast(self);
728     let shift_u = u << 1_u64;
729     let out = !(shift_u & shifted_exp_mask).cmp_eq(shifted_exp_mask);
730     cast(out)
731   }
732   #[inline]
733   #[must_use]
is_inf(self) -> Self734   pub fn is_inf(self) -> Self {
735     let shifted_inf = u32x8::from(0xFF000000);
736     let u: u32x8 = cast(self);
737     let shift_u = u << 1_u64;
738     let out = (shift_u).cmp_eq(shifted_inf);
739     cast(out)
740   }
741 
742   #[inline]
743   #[must_use]
round(self) -> Self744   pub fn round(self) -> Self {
745     pick! {
746       // NOTE: Is there an SSE2 version of this? f32x4 version probably translates but I've not had time to figure it out
747       if #[cfg(target_feature="avx")] {
748         Self { avx: round_m256::<{round_op!(Nearest)}>(self.avx) }
749       }  else if #[cfg(target_feature="sse4.1")] {
750         Self { sse0: round_m128::<{round_op!(Nearest)}>(self.sse0), sse1: round_m128::<{round_op!(Nearest)}>(self.sse1) }
751       } else if #[cfg(target_feature="simd128")] {
752         Self { simd0: f32x4_nearest(self.simd0), simd1: f32x4_nearest(self.simd1) }
753       } else {
754         // Note(Lokathor): This software fallback is probably very slow compared
755         // to having a hardware option available, even just the sse2 version is
756         // better than this. Oh well.
757         let to_int = f32x8::from(1.0 / f32::EPSILON);
758         let u: u32x8 = cast(self);
759         let e: i32x8 = cast((u >> 23) & u32x8::from(0xff));
760         let mut y: f32x8;
761 
762         let no_op_magic = i32x8::from(0x7f + 23);
763         let no_op_mask: f32x8 = cast(e.cmp_gt(no_op_magic) | e.cmp_eq(no_op_magic));
764         let no_op_val: f32x8 = self;
765 
766         let zero_magic = i32x8::from(0x7f - 1);
767         let zero_mask: f32x8 = cast(e.cmp_lt(zero_magic));
768         let zero_val: f32x8 = self * f32x8::from(0.0);
769 
770         let neg_bit: f32x8 = cast(cast::<u32x8, i32x8>(u).cmp_lt(i32x8::default()));
771         let x: f32x8 = neg_bit.blend(-self, self);
772         y = x + to_int - to_int - x;
773         y = y.cmp_gt(f32x8::from(0.5)).blend(
774           y + x - f32x8::from(-1.0),
775           y.cmp_lt(f32x8::from(-0.5)).blend(y + x + f32x8::from(1.0), y + x),
776         );
777         y = neg_bit.blend(-y, y);
778 
779         no_op_mask.blend(no_op_val, zero_mask.blend(zero_val, y))
780       }
781     }
782   }
783 
784   /// Rounds each lane into an integer. This is a faster implementation than
785   /// `round_int`, but it doesn't handle out of range values or NaNs. For those
786   /// values you get implementation defined behavior.
787   #[inline]
788   #[must_use]
fast_round_int(self) -> i32x8789   pub fn fast_round_int(self) -> i32x8 {
790     pick! {
791       if #[cfg(target_feature="avx")] {
792         cast(convert_to_i32_m256i_from_m256(self.avx))
793       } else if #[cfg(target_feature="sse2")] {
794         i32x8 { sse0: convert_to_i32_m128i_from_m128(self.sse0), sse1: convert_to_i32_m128i_from_m128(self.sse1) }
795       } else {
796         self.round_int()
797       }
798     }
799   }
800 
801   /// Rounds each lane into an integer. This saturates out of range values and
802   /// turns NaNs into 0. Use `fast_round_int` for a faster implementation that
803   /// doesn't handle out of range values or NaNs.
804   #[inline]
805   #[must_use]
round_int(self) -> i32x8806   pub fn round_int(self) -> i32x8 {
807     pick! {
808       if #[cfg(target_feature="avx")] {
809         // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
810         let non_nan_mask = self.cmp_eq(self);
811         let non_nan = self & non_nan_mask;
812         let flip_to_max: i32x8 = cast(self.cmp_ge(Self::splat(2147483648.0)));
813         let cast: i32x8 = cast(convert_to_i32_m256i_from_m256(non_nan.avx));
814         flip_to_max ^ cast
815       } else if #[cfg(target_feature="sse2")] {
816         // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
817         let non_nan_mask = self.cmp_eq(self);
818         let non_nan = self & non_nan_mask;
819         let flip_to_max: i32x8 = cast(self.cmp_ge(Self::splat(2147483648.0)));
820         let cast: i32x8 = i32x8 { sse0: convert_to_i32_m128i_from_m128(non_nan.sse0), sse1: convert_to_i32_m128i_from_m128(non_nan.sse1) };
821         flip_to_max ^ cast
822       } else if #[cfg(target_feature="simd128")] {
823         cast(Self {
824           simd0: i32x4_trunc_sat_f32x4(f32x4_nearest(self.simd0)),
825           simd1: i32x4_trunc_sat_f32x4(f32x4_nearest(self.simd1)),
826         })
827       } else {
828         let rounded: [f32; 8] = cast(self.round());
829         cast([
830           rounded[0] as i32,
831           rounded[1] as i32,
832           rounded[2] as i32,
833           rounded[3] as i32,
834           rounded[4] as i32,
835           rounded[5] as i32,
836           rounded[6] as i32,
837           rounded[7] as i32,
838         ])
839       }
840     }
841   }
842 
843   /// Truncates each lane into an integer. This is a faster implementation than
844   /// `trunc_int`, but it doesn't handle out of range values or NaNs. For those
845   /// values you get implementation defined behavior.
846   #[inline]
847   #[must_use]
fast_trunc_int(self) -> i32x8848   pub fn fast_trunc_int(self) -> i32x8 {
849     pick! {
850       if #[cfg(all(target_feature="avx"))] {
851         cast(convert_truncate_to_i32_m256i_from_m256(self.avx))
852       } else if #[cfg(target_feature="sse2")] {
853         i32x8 { sse0: truncate_m128_to_m128i(self.sse0), sse1: truncate_m128_to_m128i(self.sse1) }
854       } else {
855         self.trunc_int()
856       }
857     }
858   }
859 
860   /// Truncates each lane into an integer. This saturates out of range values
861   /// and turns NaNs into 0. Use `fast_trunc_int` for a faster implementation
862   /// that doesn't handle out of range values or NaNs.
863   #[inline]
864   #[must_use]
trunc_int(self) -> i32x8865   pub fn trunc_int(self) -> i32x8 {
866     pick! {
867         if #[cfg(target_feature="avx")] {
868         // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
869         let non_nan_mask = self.cmp_eq(self);
870         let non_nan = self & non_nan_mask;
871         let flip_to_max: i32x8 = cast(self.cmp_ge(Self::splat(2147483648.0)));
872         let cast: i32x8 = cast(convert_truncate_to_i32_m256i_from_m256(non_nan.avx));
873         flip_to_max ^ cast
874       } else if #[cfg(target_feature="sse2")] {
875         // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
876         let non_nan_mask = self.cmp_eq(self);
877         let non_nan = self & non_nan_mask;
878         let flip_to_max: i32x8 = cast(self.cmp_ge(Self::splat(2147483648.0)));
879         let cast: i32x8 = i32x8 { sse0: truncate_m128_to_m128i(non_nan.sse0), sse1: truncate_m128_to_m128i(non_nan.sse1) };
880         flip_to_max ^ cast
881       } else if #[cfg(target_feature="simd128")] {
882         cast(Self {
883           simd0: i32x4_trunc_sat_f32x4(self.simd0),
884           simd1: i32x4_trunc_sat_f32x4(self.simd1),
885         })
886       } else {
887         let n: [f32; 8] = cast(self);
888         cast([
889           n[0] as i32,
890           n[1] as i32,
891           n[2] as i32,
892           n[3] as i32,
893           n[4] as i32,
894           n[5] as i32,
895           n[6] as i32,
896           n[7] as i32,
897         ])
898       }
899     }
900   }
901   #[inline]
902   #[must_use]
mul_add(self, m: Self, a: Self) -> Self903   pub fn mul_add(self, m: Self, a: Self) -> Self {
904     pick! {
905       if #[cfg(all(target_feature="avx",target_feature="fma"))] {
906         Self { avx: fused_mul_add_m256(self.avx, m.avx, a.avx) }
907       } else if #[cfg(all(target_feature="avx",target_feature="fma"))] {
908         Self { sse0: fused_mul_add_m128(self.sse0, m.sse0, a.sse0), sse1: fused_mul_add_m128(self.sse1, m.sse1, a.sse1) }
909       } else {
910         (self * m) + a
911       }
912     }
913   }
914 
915   #[inline]
916   #[must_use]
mul_sub(self, m: Self, a: Self) -> Self917   pub fn mul_sub(self, m: Self, a: Self) -> Self {
918     pick! {
919       if #[cfg(all(target_feature="avx",target_feature="fma"))] {
920         Self { avx: fused_mul_sub_m256(self.avx, m.avx, a.avx) }
921       } else if #[cfg(all(target_feature="avx",target_feature="fma"))] {
922         Self { sse0: fused_mul_sub_m128(self.sse0, m.sse0, a.sse0), sse1: fused_mul_sub_m128(self.sse1, m.sse1, a.sse1) }
923       } else {
924         (self * m) - a
925       }
926     }
927   }
928 
929   #[inline]
930   #[must_use]
mul_neg_add(self, m: Self, a: Self) -> Self931   pub fn mul_neg_add(self, m: Self, a: Self) -> Self {
932     pick! {
933       if #[cfg(all(target_feature="avx",target_feature="fma"))] {
934         Self { avx: fused_mul_neg_add_m256(self.avx, m.avx, a.avx) }
935       } else if #[cfg(all(target_feature="avx",target_feature="fma"))] {
936         Self { sse0: fused_mul_neg_add_m128(self.sse0, m.sse0, a.sse0), sse1: fused_mul_neg_add_m128(self.sse1, m.sse1, a.sse1) }
937       } else {
938         a - (self * m)
939       }
940     }
941   }
942 
943   #[inline]
944   #[must_use]
mul_neg_sub(self, m: Self, a: Self) -> Self945   pub fn mul_neg_sub(self, m: Self, a: Self) -> Self {
946     pick! {
947       if #[cfg(all(target_feature="avx",target_feature="fma"))] {
948         Self { avx: fused_mul_neg_sub_m256(self.avx, m.avx, a.avx) }
949       } else if #[cfg(all(target_feature="avx",target_feature="fma"))] {
950         Self { sse0: fused_mul_neg_sub_m128(self.sse0, m.sse0, a.sse0), sse1: fused_mul_neg_sub_m128(self.sse1, m.sse1, a.sse1) }
951       } else {
952         -(self * m) - a
953       }
954     }
955   }
956 
957   #[inline]
958   #[must_use]
flip_signs(self, signs: Self) -> Self959   pub fn flip_signs(self, signs: Self) -> Self {
960     self ^ (signs & Self::from(-0.0))
961   }
962 
963   #[inline]
964   #[must_use]
copysign(self, sign: Self) -> Self965   pub fn copysign(self, sign: Self) -> Self {
966     let magnitude_mask = Self::from(f32::from_bits(u32::MAX >> 1));
967     (self & magnitude_mask) | (sign & Self::from(-0.0))
968   }
969 
970   #[allow(non_upper_case_globals)]
asin_acos(self) -> (Self, Self)971   pub fn asin_acos(self) -> (Self, Self) {
972     // Based on the Agner Fog "vector class library":
973     // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
974     const_f32_as_f32x8!(P4asinf, 4.2163199048E-2);
975     const_f32_as_f32x8!(P3asinf, 2.4181311049E-2);
976     const_f32_as_f32x8!(P2asinf, 4.5470025998E-2);
977     const_f32_as_f32x8!(P1asinf, 7.4953002686E-2);
978     const_f32_as_f32x8!(P0asinf, 1.6666752422E-1);
979 
980     let xa = self.abs();
981     let big = xa.cmp_ge(f32x8::splat(0.5));
982 
983     let x1 = f32x8::splat(0.5) * (f32x8::ONE - xa);
984     let x2 = xa * xa;
985     let x3 = big.blend(x1, x2);
986 
987     let xb = x1.sqrt();
988 
989     let x4 = big.blend(xb, xa);
990 
991     let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
992     let z = z.mul_add(x3 * x4, x4);
993 
994     let z1 = z + z;
995 
996     // acos
997     let z3 = self.cmp_lt(f32x8::ZERO).blend(f32x8::PI - z1, z1);
998     let z4 = f32x8::FRAC_PI_2 - z.flip_signs(self);
999     let acos = big.blend(z3, z4);
1000 
1001     // asin
1002     let z3 = f32x8::FRAC_PI_2 - z1;
1003     let asin = big.blend(z3, z);
1004     let asin = asin.flip_signs(self);
1005 
1006     (asin, acos)
1007   }
1008 
1009   #[inline]
1010   #[must_use]
1011   #[allow(non_upper_case_globals)]
asin(self) -> Self1012   pub fn asin(self) -> Self {
1013     // Based on the Agner Fog "vector class library":
1014     // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
1015     const_f32_as_f32x8!(P4asinf, 4.2163199048E-2);
1016     const_f32_as_f32x8!(P3asinf, 2.4181311049E-2);
1017     const_f32_as_f32x8!(P2asinf, 4.5470025998E-2);
1018     const_f32_as_f32x8!(P1asinf, 7.4953002686E-2);
1019     const_f32_as_f32x8!(P0asinf, 1.6666752422E-1);
1020 
1021     let xa = self.abs();
1022     let big = xa.cmp_ge(f32x8::splat(0.5));
1023 
1024     let x1 = f32x8::splat(0.5) * (f32x8::ONE - xa);
1025     let x2 = xa * xa;
1026     let x3 = big.blend(x1, x2);
1027 
1028     let xb = x1.sqrt();
1029 
1030     let x4 = big.blend(xb, xa);
1031 
1032     let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
1033     let z = z.mul_add(x3 * x4, x4);
1034 
1035     let z1 = z + z;
1036 
1037     // asin
1038     let z3 = f32x8::FRAC_PI_2 - z1;
1039     let asin = big.blend(z3, z);
1040     let asin = asin.flip_signs(self);
1041 
1042     asin
1043   }
1044 
1045   #[inline]
1046   #[must_use]
1047   #[allow(non_upper_case_globals)]
acos(self) -> Self1048   pub fn acos(self) -> Self {
1049     // Based on the Agner Fog "vector class library":
1050     // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
1051     const_f32_as_f32x8!(P4asinf, 4.2163199048E-2);
1052     const_f32_as_f32x8!(P3asinf, 2.4181311049E-2);
1053     const_f32_as_f32x8!(P2asinf, 4.5470025998E-2);
1054     const_f32_as_f32x8!(P1asinf, 7.4953002686E-2);
1055     const_f32_as_f32x8!(P0asinf, 1.6666752422E-1);
1056 
1057     let xa = self.abs();
1058     let big = xa.cmp_ge(f32x8::splat(0.5));
1059 
1060     let x1 = f32x8::splat(0.5) * (f32x8::ONE - xa);
1061     let x2 = xa * xa;
1062     let x3 = big.blend(x1, x2);
1063 
1064     let xb = x1.sqrt();
1065 
1066     let x4 = big.blend(xb, xa);
1067 
1068     let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
1069     let z = z.mul_add(x3 * x4, x4);
1070 
1071     let z1 = z + z;
1072 
1073     // acos
1074     let z3 = self.cmp_lt(f32x8::ZERO).blend(f32x8::PI - z1, z1);
1075     let z4 = f32x8::FRAC_PI_2 - z.flip_signs(self);
1076     let acos = big.blend(z3, z4);
1077 
1078     acos
1079   }
1080 
1081   #[allow(non_upper_case_globals)]
atan(self) -> Self1082   pub fn atan(self) -> Self {
1083     // Based on the Agner Fog "vector class library":
1084     // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
1085     const_f32_as_f32x8!(P3atanf, 8.05374449538E-2);
1086     const_f32_as_f32x8!(P2atanf, -1.38776856032E-1);
1087     const_f32_as_f32x8!(P1atanf, 1.99777106478E-1);
1088     const_f32_as_f32x8!(P0atanf, -3.33329491539E-1);
1089 
1090     let t = self.abs();
1091 
1092     // small:  z = t / 1.0;
1093     // medium: z = (t-1.0) / (t+1.0);
1094     // big:    z = -1.0 / t;
1095     let notsmal = t.cmp_ge(Self::SQRT_2 - Self::ONE);
1096     let notbig = t.cmp_le(Self::SQRT_2 + Self::ONE);
1097 
1098     let mut s = notbig.blend(Self::FRAC_PI_4, Self::FRAC_PI_2);
1099     s = notsmal & s;
1100 
1101     let mut a = notbig & t;
1102     a = notsmal.blend(a - Self::ONE, a);
1103     let mut b = notbig & Self::ONE;
1104     b = notsmal.blend(b + t, b);
1105     let z = a / b;
1106 
1107     let zz = z * z;
1108 
1109     // Taylor expansion
1110     let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
1111     re = re.mul_add(zz * z, z) + s;
1112 
1113     // get sign bit
1114     re = (self.sign_bit()).blend(-re, re);
1115 
1116     re
1117   }
1118 
1119   #[allow(non_upper_case_globals)]
atan2(self, x: Self) -> Self1120   pub fn atan2(self, x: Self) -> Self {
1121     // Based on the Agner Fog "vector class library":
1122     // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
1123     const_f32_as_f32x8!(P3atanf, 8.05374449538E-2);
1124     const_f32_as_f32x8!(P2atanf, -1.38776856032E-1);
1125     const_f32_as_f32x8!(P1atanf, 1.99777106478E-1);
1126     const_f32_as_f32x8!(P0atanf, -3.33329491539E-1);
1127 
1128     let y = self;
1129 
1130     // move in first octant
1131     let x1 = x.abs();
1132     let y1 = y.abs();
1133     let swapxy = y1.cmp_gt(x1);
1134     // swap x and y if y1 > x1
1135     let mut x2 = swapxy.blend(y1, x1);
1136     let mut y2 = swapxy.blend(x1, y1);
1137 
1138     // check for special case: x and y are both +/- INF
1139     let both_infinite = x.is_inf() & y.is_inf();
1140     if both_infinite.any() {
1141       let mone = -Self::ONE;
1142       x2 = both_infinite.blend(x2 & mone, x2);
1143       y2 = both_infinite.blend(y2 & mone, y2);
1144     }
1145 
1146     // x = y = 0 will produce NAN. No problem, fixed below
1147     let t = y2 / x2;
1148 
1149     // small:  z = t / 1.0;
1150     // medium: z = (t-1.0) / (t+1.0);
1151     let notsmal = t.cmp_ge(Self::SQRT_2 - Self::ONE);
1152 
1153     let a = notsmal.blend(t - Self::ONE, t);
1154     let b = notsmal.blend(t + Self::ONE, Self::ONE);
1155     let s = notsmal & Self::FRAC_PI_4;
1156     let z = a / b;
1157 
1158     let zz = z * z;
1159 
1160     // Taylor expansion
1161     let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
1162     re = re.mul_add(zz * z, z) + s;
1163 
1164     // move back in place
1165     re = swapxy.blend(Self::FRAC_PI_2 - re, re);
1166     re = ((x | y).cmp_eq(Self::ZERO)).blend(Self::ZERO, re);
1167     re = (x.sign_bit()).blend(Self::PI - re, re);
1168 
1169     // get sign bit
1170     re = (y.sign_bit()).blend(-re, re);
1171 
1172     re
1173   }
1174 
1175   #[inline]
1176   #[must_use]
1177   #[allow(non_upper_case_globals)]
sin_cos(self) -> (Self, Self)1178   pub fn sin_cos(self) -> (Self, Self) {
1179     // Based on the Agner Fog "vector class library":
1180     // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
1181 
1182     const_f32_as_f32x8!(DP1F, 0.78515625_f32 * 2.0);
1183     const_f32_as_f32x8!(DP2F, 2.4187564849853515625E-4_f32 * 2.0);
1184     const_f32_as_f32x8!(DP3F, 3.77489497744594108E-8_f32 * 2.0);
1185 
1186     const_f32_as_f32x8!(P0sinf, -1.6666654611E-1);
1187     const_f32_as_f32x8!(P1sinf, 8.3321608736E-3);
1188     const_f32_as_f32x8!(P2sinf, -1.9515295891E-4);
1189 
1190     const_f32_as_f32x8!(P0cosf, 4.166664568298827E-2);
1191     const_f32_as_f32x8!(P1cosf, -1.388731625493765E-3);
1192     const_f32_as_f32x8!(P2cosf, 2.443315711809948E-5);
1193 
1194     const_f32_as_f32x8!(TWO_OVER_PI, 2.0 / core::f32::consts::PI);
1195 
1196     let xa = self.abs();
1197 
1198     // Find quadrant
1199     let y = (xa * TWO_OVER_PI).round();
1200     let q: i32x8 = y.round_int();
1201 
1202     let x = y.mul_neg_add(DP3F, y.mul_neg_add(DP2F, y.mul_neg_add(DP1F, xa)));
1203 
1204     let x2 = x * x;
1205     let mut s = polynomial_2!(x2, P0sinf, P1sinf, P2sinf) * (x * x2) + x;
1206     let mut c = polynomial_2!(x2, P0cosf, P1cosf, P2cosf) * (x2 * x2)
1207       + f32x8::from(0.5).mul_neg_add(x2, f32x8::from(1.0));
1208 
1209     let swap = !(q & i32x8::from(1)).cmp_eq(i32x8::from(0));
1210 
1211     let mut overflow: f32x8 = cast(q.cmp_gt(i32x8::from(0x2000000)));
1212     overflow &= xa.is_finite();
1213     s = overflow.blend(f32x8::from(0.0), s);
1214     c = overflow.blend(f32x8::from(1.0), c);
1215 
1216     // calc sin
1217     let mut sin1 = cast::<_, f32x8>(swap).blend(c, s);
1218     let sign_sin: i32x8 = (q << 30) ^ cast::<_, i32x8>(self);
1219     sin1 = sin1.flip_signs(cast(sign_sin));
1220 
1221     // calc cos
1222     let mut cos1 = cast::<_, f32x8>(swap).blend(s, c);
1223     let sign_cos: i32x8 = ((q + i32x8::from(1)) & i32x8::from(2)) << 30;
1224     cos1 ^= cast::<_, f32x8>(sign_cos);
1225 
1226     (sin1, cos1)
1227   }
1228   #[inline]
1229   #[must_use]
sin(self) -> Self1230   pub fn sin(self) -> Self {
1231     let (s, _) = self.sin_cos();
1232     s
1233   }
1234   #[inline]
1235   #[must_use]
cos(self) -> Self1236   pub fn cos(self) -> Self {
1237     let (_, c) = self.sin_cos();
1238     c
1239   }
1240   #[inline]
1241   #[must_use]
tan(self) -> Self1242   pub fn tan(self) -> Self {
1243     let (s, c) = self.sin_cos();
1244     s / c
1245   }
1246   #[inline]
1247   #[must_use]
to_degrees(self) -> Self1248   pub fn to_degrees(self) -> Self {
1249     const_f32_as_f32x8!(RAD_TO_DEG_RATIO, 180.0_f32 / core::f32::consts::PI);
1250     self * RAD_TO_DEG_RATIO
1251   }
1252   #[inline]
1253   #[must_use]
to_radians(self) -> Self1254   pub fn to_radians(self) -> Self {
1255     const_f32_as_f32x8!(DEG_TO_RAD_RATIO, core::f32::consts::PI / 180.0_f32);
1256     self * DEG_TO_RAD_RATIO
1257   }
1258   #[inline]
1259   #[must_use]
recip(self) -> Self1260   pub fn recip(self) -> Self {
1261     pick! {
1262       if #[cfg(target_feature="avx")] {
1263         Self { avx: reciprocal_m256(self.avx) }
1264       } else if #[cfg(target_feature="sse2")] {
1265         Self { sse0: reciprocal_m128(self.sse0), sse1: reciprocal_m128(self.sse1) }
1266       } else if #[cfg(target_feature="simd128")] {
1267         let one = f32x4_splat(1.0);
1268         Self { simd0: f32x4_div(one, self.simd0), simd1: f32x4_div(one, self.simd1) }
1269       } else {
1270         Self { arr: [
1271           1.0 / self.arr[0],
1272           1.0 / self.arr[1],
1273           1.0 / self.arr[2],
1274           1.0 / self.arr[3],
1275           1.0 / self.arr[4],
1276           1.0 / self.arr[5],
1277           1.0 / self.arr[6],
1278           1.0 / self.arr[7],
1279         ]}
1280       }
1281     }
1282   }
1283   #[inline]
1284   #[must_use]
recip_sqrt(self) -> Self1285   pub fn recip_sqrt(self) -> Self {
1286     pick! {
1287       if #[cfg(target_feature="avx")] {
1288         Self { avx: reciprocal_sqrt_m256(self.avx) }
1289       } else if #[cfg(target_feature="sse2")] {
1290         Self { sse0: reciprocal_sqrt_m128(self.sse0), sse1: reciprocal_sqrt_m128(self.sse1) }
1291       } else if #[cfg(target_feature="simd128")] {
1292         let one = f32x4_splat(1.0);
1293         Self { simd0: f32x4_div(one, f32x4_sqrt(self.simd0)), simd1: f32x4_div(one, f32x4_sqrt(self.simd1)) }
1294       } else if #[cfg(feature="std")] {
1295         Self { arr: [
1296           1.0 / self.arr[0].sqrt(),
1297           1.0 / self.arr[1].sqrt(),
1298           1.0 / self.arr[2].sqrt(),
1299           1.0 / self.arr[3].sqrt(),
1300           1.0 / self.arr[4].sqrt(),
1301           1.0 / self.arr[5].sqrt(),
1302           1.0 / self.arr[6].sqrt(),
1303           1.0 / self.arr[7].sqrt(),
1304         ]}
1305       } else {
1306         Self { arr: [
1307           1.0 / software_sqrt(self.arr[0] as f64) as f32,
1308           1.0 / software_sqrt(self.arr[1] as f64) as f32,
1309           1.0 / software_sqrt(self.arr[2] as f64) as f32,
1310           1.0 / software_sqrt(self.arr[3] as f64) as f32,
1311           1.0 / software_sqrt(self.arr[4] as f64) as f32,
1312           1.0 / software_sqrt(self.arr[5] as f64) as f32,
1313           1.0 / software_sqrt(self.arr[6] as f64) as f32,
1314           1.0 / software_sqrt(self.arr[7] as f64) as f32,
1315         ]}
1316       }
1317     }
1318   }
1319   #[inline]
1320   #[must_use]
sqrt(self) -> Self1321   pub fn sqrt(self) -> Self {
1322     pick! {
1323       if #[cfg(target_feature="avx")] {
1324         Self { avx: sqrt_m256(self.avx) }
1325       } else if #[cfg(target_feature="sse2")] {
1326         Self { sse0: sqrt_m128(self.sse0), sse1: sqrt_m128(self.sse1) }
1327       } else if #[cfg(target_feature="simd128")] {
1328         Self { simd0: f32x4_sqrt(self.simd0), simd1: f32x4_sqrt(self.simd1) }
1329       } else if #[cfg(feature="std")] {
1330         Self { arr: [
1331           self.arr[0].sqrt(),
1332           self.arr[1].sqrt(),
1333           self.arr[2].sqrt(),
1334           self.arr[3].sqrt(),
1335           self.arr[4].sqrt(),
1336           self.arr[5].sqrt(),
1337           self.arr[6].sqrt(),
1338           self.arr[7].sqrt(),
1339         ]}
1340       } else {
1341         Self { arr: [
1342           software_sqrt(self.arr[0] as f64) as f32,
1343           software_sqrt(self.arr[1] as f64) as f32,
1344           software_sqrt(self.arr[2] as f64) as f32,
1345           software_sqrt(self.arr[3] as f64) as f32,
1346           software_sqrt(self.arr[4] as f64) as f32,
1347           software_sqrt(self.arr[5] as f64) as f32,
1348           software_sqrt(self.arr[6] as f64) as f32,
1349           software_sqrt(self.arr[7] as f64) as f32,
1350         ]}
1351       }
1352     }
1353   }
1354   #[inline]
1355   #[must_use]
move_mask(self) -> i321356   pub fn move_mask(self) -> i32 {
1357     pick! {
1358       if #[cfg(target_feature="avx")] {
1359         move_mask_m256(self.avx)
1360       } else if #[cfg(target_feature="sse2")] {
1361         (move_mask_m128(self.sse1) << 4) ^ move_mask_m128(self.sse0)
1362       } else if #[cfg(target_feature="simd128")] {
1363         ((u32x4_bitmask(self.simd1) as i32) << 4) ^ u32x4_bitmask(self.simd0) as i32
1364       } else {
1365         (((self.arr[0].to_bits() as i32) < 0) as i32) << 0 |
1366         (((self.arr[1].to_bits() as i32) < 0) as i32) << 1 |
1367         (((self.arr[2].to_bits() as i32) < 0) as i32) << 2 |
1368         (((self.arr[3].to_bits() as i32) < 0) as i32) << 3 |
1369         (((self.arr[4].to_bits() as i32) < 0) as i32) << 4 |
1370         (((self.arr[5].to_bits() as i32) < 0) as i32) << 5 |
1371         (((self.arr[6].to_bits() as i32) < 0) as i32) << 6 |
1372         (((self.arr[7].to_bits() as i32) < 0) as i32) << 7
1373       }
1374     }
1375   }
1376   #[inline]
1377   #[must_use]
any(self) -> bool1378   pub fn any(self) -> bool {
1379     pick! {
1380       if #[cfg(target_feature="simd128")] {
1381         v128_any_true(self.simd0) | v128_any_true(self.simd1)
1382       } else {
1383         self.move_mask() != 0
1384       }
1385     }
1386   }
1387   #[inline]
1388   #[must_use]
all(self) -> bool1389   pub fn all(self) -> bool {
1390     pick! {
1391       if #[cfg(target_feature="simd128")] {
1392         u32x4_all_true(self.simd0) & u32x4_all_true(self.simd1)
1393       } else {
1394         // eight lanes
1395         self.move_mask() == 0b11111111
1396       }
1397     }
1398   }
1399   #[inline]
1400   #[must_use]
none(self) -> bool1401   pub fn none(self) -> bool {
1402     !self.any()
1403   }
1404 
1405   #[inline]
1406   #[allow(non_upper_case_globals)]
vm_pow2n(self) -> Self1407   fn vm_pow2n(self) -> Self {
1408     const_f32_as_f32x8!(pow2_23, 8388608.0);
1409     const_f32_as_f32x8!(bias, 127.0);
1410     let a = self + (bias + pow2_23);
1411     let c = cast::<_, i32x8>(a) << 23;
1412     cast::<_, f32x8>(c)
1413   }
1414 
1415   /// Calculate the exponent of a packed f32x8
1416   #[inline]
1417   #[must_use]
1418   #[allow(non_upper_case_globals)]
exp(self) -> Self1419   pub fn exp(self) -> Self {
1420     const_f32_as_f32x8!(P0, 1.0 / 2.0);
1421     const_f32_as_f32x8!(P1, 1.0 / 6.0);
1422     const_f32_as_f32x8!(P2, 1. / 24.);
1423     const_f32_as_f32x8!(P3, 1. / 120.);
1424     const_f32_as_f32x8!(P4, 1. / 720.);
1425     const_f32_as_f32x8!(P5, 1. / 5040.);
1426     const_f32_as_f32x8!(LN2D_HI, 0.693359375);
1427     const_f32_as_f32x8!(LN2D_LO, -2.12194440e-4);
1428     let max_x = f32x8::from(87.3);
1429     let r = (self * Self::LOG2_E).round();
1430     let x = r.mul_neg_add(LN2D_HI, self);
1431     let x = r.mul_neg_add(LN2D_LO, x);
1432     let z = polynomial_5!(x, P0, P1, P2, P3, P4, P5);
1433     let x2 = x * x;
1434     let z = z.mul_add(x2, x);
1435     let n2 = Self::vm_pow2n(r);
1436     let z = (z + Self::ONE) * n2;
1437     // check for overflow
1438     let in_range = self.abs().cmp_lt(max_x);
1439     let in_range = in_range & self.is_finite();
1440     in_range.blend(z, Self::ZERO)
1441   }
1442 
1443   #[inline]
1444   #[allow(non_upper_case_globals)]
exponent(self) -> f32x81445   fn exponent(self) -> f32x8 {
1446     const_f32_as_f32x8!(pow2_23, 8388608.0);
1447     const_f32_as_f32x8!(bias, 127.0);
1448     let a = cast::<_, u32x8>(self);
1449     let b = a >> 23;
1450     let c = b | cast::<_, u32x8>(pow2_23);
1451     let d = cast::<_, f32x8>(c);
1452     let e = d - (pow2_23 + bias);
1453     e
1454   }
1455 
1456   #[inline]
1457   #[allow(non_upper_case_globals)]
fraction_2(self) -> Self1458   fn fraction_2(self) -> Self {
1459     let t1 = cast::<_, u32x8>(self);
1460     let t2 = cast::<_, u32x8>(
1461       (t1 & u32x8::from(0x007FFFFF)) | u32x8::from(0x3F000000),
1462     );
1463     cast::<_, f32x8>(t2)
1464   }
1465 
is_zero_or_subnormal(self) -> Self1466   fn is_zero_or_subnormal(self) -> Self {
1467     let t = cast::<_, i32x8>(self);
1468     let t = t & i32x8::splat(0x7F800000);
1469     i32x8::round_float(t.cmp_eq(i32x8::splat(0)))
1470   }
1471 
infinity() -> Self1472   fn infinity() -> Self {
1473     cast::<_, f32x8>(i32x8::splat(0x7F800000))
1474   }
1475 
nan_log() -> Self1476   fn nan_log() -> Self {
1477     cast::<_, f32x8>(i32x8::splat(0x7FC00000 | 0x101 & 0x003FFFFF))
1478   }
1479 
nan_pow() -> Self1480   fn nan_pow() -> Self {
1481     cast::<_, f32x8>(i32x8::splat(0x7FC00000 | 0x101 & 0x003FFFFF))
1482   }
1483 
sign_bit(self) -> Self1484   pub fn sign_bit(self) -> Self {
1485     let t1 = cast::<_, i32x8>(self);
1486     let t2 = t1 >> 31;
1487     !cast::<_, f32x8>(t2).cmp_eq(f32x8::ZERO)
1488   }
1489 
reduce_add(self) -> f321490   pub fn reduce_add(self) -> f32 {
1491     pick! {
1492       // From https://stackoverflow.com/questions/13219146/how-to-sum-m256-horizontally
1493       if #[cfg(target_feature="avx")]{
1494         let hi_quad = extract_m128_from_m256::<1>(self.avx);
1495         let lo_quad = cast_to_m128_from_m256(self.avx);
1496         let sum_quad = add_m128(lo_quad,hi_quad);
1497         let lo_dual = sum_quad;
1498         let hi_dual = move_high_low_m128(sum_quad,sum_quad);
1499         let sum_dual = add_m128(lo_dual,hi_dual);
1500         let lo = sum_dual;
1501         let hi = shuffle_abi_f32_all_m128::<0b_01>(sum_dual, sum_dual);
1502         let sum = add_m128_s(lo, hi);
1503         get_f32_from_m128_s(sum)
1504       }
1505       else if #[cfg(target_feature="sse3")] {
1506         let a = add_horizontal_m128(self.sse0, self.sse0);
1507         let b = add_horizontal_m128(a, a);
1508         let c = add_horizontal_m128(self.sse1, self.sse1);
1509         let d = add_horizontal_m128(c, c);
1510         let sum = add_m128_s(b, d);
1511         get_f32_from_m128_s(sum)
1512       } else {
1513         let arr: [f32; 8] = cast(self);
1514         arr.iter().sum()
1515       }
1516     }
1517   }
1518 
1519   /// Natural log (ln(x))
1520   #[inline]
1521   #[must_use]
1522   #[allow(non_upper_case_globals)]
ln(self) -> Self1523   pub fn ln(self) -> Self {
1524     const_f32_as_f32x8!(HALF, 0.5);
1525     const_f32_as_f32x8!(P0, 3.3333331174E-1);
1526     const_f32_as_f32x8!(P1, -2.4999993993E-1);
1527     const_f32_as_f32x8!(P2, 2.0000714765E-1);
1528     const_f32_as_f32x8!(P3, -1.6668057665E-1);
1529     const_f32_as_f32x8!(P4, 1.4249322787E-1);
1530     const_f32_as_f32x8!(P5, -1.2420140846E-1);
1531     const_f32_as_f32x8!(P6, 1.1676998740E-1);
1532     const_f32_as_f32x8!(P7, -1.1514610310E-1);
1533     const_f32_as_f32x8!(P8, 7.0376836292E-2);
1534     const_f32_as_f32x8!(LN2F_HI, 0.693359375);
1535     const_f32_as_f32x8!(LN2F_LO, -2.12194440e-4);
1536     const_f32_as_f32x8!(VM_SMALLEST_NORMAL, 1.17549435E-38);
1537 
1538     let x1 = self;
1539     let x = Self::fraction_2(x1);
1540     let e = Self::exponent(x1);
1541     let mask = x.cmp_gt(Self::SQRT_2 * HALF);
1542     let x = (!mask).blend(x + x, x);
1543     let fe = mask.blend(e + Self::ONE, e);
1544     let x = x - Self::ONE;
1545     let res = polynomial_8!(x, P0, P1, P2, P3, P4, P5, P6, P7, P8);
1546     let x2 = x * x;
1547     let res = x2 * x * res;
1548     let res = fe.mul_add(LN2F_LO, res);
1549     let res = res + x2.mul_neg_add(HALF, x);
1550     let res = fe.mul_add(LN2F_HI, res);
1551     let overflow = !self.is_finite();
1552     let underflow = x1.cmp_lt(VM_SMALLEST_NORMAL);
1553     let mask = overflow | underflow;
1554     if !mask.any() {
1555       res
1556     } else {
1557       let is_zero = self.is_zero_or_subnormal();
1558       let res = underflow.blend(Self::nan_log(), res);
1559       let res = is_zero.blend(Self::infinity(), res);
1560       let res = overflow.blend(self, res);
1561       res
1562     }
1563   }
1564 
1565   #[inline]
1566   #[must_use]
log2(self) -> Self1567   pub fn log2(self) -> Self {
1568     Self::ln(self) * Self::LOG2_E
1569   }
1570   #[inline]
1571   #[must_use]
log10(self) -> Self1572   pub fn log10(self) -> Self {
1573     Self::ln(self) * Self::LOG10_E
1574   }
1575 
1576   #[inline]
1577   #[must_use]
1578   #[allow(non_upper_case_globals)]
pow_f32x8(self, y: Self) -> Self1579   pub fn pow_f32x8(self, y: Self) -> Self {
1580     const_f32_as_f32x8!(ln2f_hi, 0.693359375);
1581     const_f32_as_f32x8!(ln2f_lo, -2.12194440e-4);
1582     const_f32_as_f32x8!(P0logf, 3.3333331174E-1);
1583     const_f32_as_f32x8!(P1logf, -2.4999993993E-1);
1584     const_f32_as_f32x8!(P2logf, 2.0000714765E-1);
1585     const_f32_as_f32x8!(P3logf, -1.6668057665E-1);
1586     const_f32_as_f32x8!(P4logf, 1.4249322787E-1);
1587     const_f32_as_f32x8!(P5logf, -1.2420140846E-1);
1588     const_f32_as_f32x8!(P6logf, 1.1676998740E-1);
1589     const_f32_as_f32x8!(P7logf, -1.1514610310E-1);
1590     const_f32_as_f32x8!(P8logf, 7.0376836292E-2);
1591 
1592     const_f32_as_f32x8!(p2expf, 1.0 / 2.0); // coefficients for Taylor expansion of exp
1593     const_f32_as_f32x8!(p3expf, 1.0 / 6.0);
1594     const_f32_as_f32x8!(p4expf, 1.0 / 24.0);
1595     const_f32_as_f32x8!(p5expf, 1.0 / 120.0);
1596     const_f32_as_f32x8!(p6expf, 1.0 / 720.0);
1597     const_f32_as_f32x8!(p7expf, 1.0 / 5040.0);
1598 
1599     let x1 = self.abs();
1600     let x = x1.fraction_2();
1601     let mask = x.cmp_gt(f32x8::SQRT_2 * f32x8::HALF);
1602     let x = (!mask).blend(x + x, x);
1603 
1604     let x = x - f32x8::ONE;
1605     let x2 = x * x;
1606     let lg1 = polynomial_8!(
1607       x, P0logf, P1logf, P2logf, P3logf, P4logf, P5logf, P6logf, P7logf, P8logf
1608     );
1609     let lg1 = lg1 * x2 * x;
1610 
1611     let ef = x1.exponent();
1612     let ef = mask.blend(ef + f32x8::ONE, ef);
1613     let e1 = (ef * y).round();
1614     let yr = ef.mul_sub(y, e1);
1615 
1616     let lg = f32x8::HALF.mul_neg_add(x2, x) + lg1;
1617     let x2_err = (f32x8::HALF * x).mul_sub(x, f32x8::HALF * x2);
1618     let lg_err = f32x8::HALF.mul_add(x2, lg - x) - lg1;
1619 
1620     let e2 = (lg * y * f32x8::LOG2_E).round();
1621     let v = lg.mul_sub(y, e2 * ln2f_hi);
1622     let v = e2.mul_neg_add(ln2f_lo, v);
1623     let v = v - (lg_err + x2_err).mul_sub(y, yr * f32x8::LN_2);
1624 
1625     let x = v;
1626     let e3 = (x * f32x8::LOG2_E).round();
1627     let x = e3.mul_neg_add(f32x8::LN_2, x);
1628     let x2 = x * x;
1629     let z = x2.mul_add(
1630       polynomial_5!(x, p2expf, p3expf, p4expf, p5expf, p6expf, p7expf),
1631       x + f32x8::ONE,
1632     );
1633 
1634     let ee = e1 + e2 + e3;
1635     let ei = cast::<_, i32x8>(ee.round_int());
1636     let ej = cast::<_, i32x8>(ei + (cast::<_, i32x8>(z) >> 23));
1637 
1638     let overflow = cast::<_, f32x8>(ej.cmp_gt(i32x8::splat(0x0FF)))
1639       | (ee.cmp_gt(f32x8::splat(300.0)));
1640     let underflow = cast::<_, f32x8>(ej.cmp_lt(i32x8::splat(0x000)))
1641       | (ee.cmp_lt(f32x8::splat(-300.0)));
1642 
1643     // Add exponent by integer addition
1644     let z = cast::<_, f32x8>(cast::<_, i32x8>(z) + (ei << 23));
1645     // Check for overflow/underflow
1646     let z = underflow.blend(f32x8::ZERO, z);
1647     let z = overflow.blend(Self::infinity(), z);
1648 
1649     // Check for self == 0
1650     let x_zero = self.is_zero_or_subnormal();
1651     let z = x_zero.blend(
1652       y.cmp_lt(f32x8::ZERO).blend(
1653         Self::infinity(),
1654         y.cmp_eq(f32x8::ZERO).blend(f32x8::ONE, f32x8::ZERO),
1655       ),
1656       z,
1657     );
1658 
1659     let x_sign = self.sign_bit();
1660     let z = if x_sign.any() {
1661       // Y into an integer
1662       let yi = y.cmp_eq(y.round());
1663 
1664       // Is y odd?
1665       let y_odd = cast::<_, i32x8>(y.round_int() << 31).round_float();
1666 
1667       let z1 =
1668         yi.blend(z | y_odd, self.cmp_eq(Self::ZERO).blend(z, Self::nan_pow()));
1669 
1670       x_sign.blend(z1, z)
1671     } else {
1672       z
1673     };
1674 
1675     let x_finite = self.is_finite();
1676     let y_finite = y.is_finite();
1677     let e_finite = ee.is_finite();
1678     if (x_finite & y_finite & (e_finite | x_zero)).all() {
1679       return z;
1680     }
1681 
1682     (self.is_nan() | y.is_nan()).blend(self + y, z)
1683   }
1684 
powf(self, y: f32) -> Self1685   pub fn powf(self, y: f32) -> Self {
1686     Self::pow_f32x8(self, f32x8::splat(y))
1687   }
1688 
to_array(self) -> [f32; 8]1689   pub fn to_array(self) -> [f32; 8] {
1690     cast(self)
1691   }
1692 
as_array_ref(&self) -> &[f32; 8]1693   pub fn as_array_ref(&self) -> &[f32; 8] {
1694     cast_ref(self)
1695   }
1696 }
1697 
1698 impl Not for f32x8 {
1699   type Output = Self;
not(self) -> Self1700   fn not(self) -> Self {
1701     pick! {
1702       if #[cfg(target_feature="avx")] {
1703         Self { avx: self.avx.not()  }
1704       } else if #[cfg(target_feature="sse2")] {
1705         Self { sse0: self.sse0.not() , sse1: self.sse1.not() }
1706       } else if #[cfg(target_feature="simd128")] {
1707         Self { simd0: v128_not(self.simd0), simd1: v128_not(self.simd1) }
1708       } else {
1709         Self { arr: [
1710           f32::from_bits(!self.arr[0].to_bits()),
1711           f32::from_bits(!self.arr[1].to_bits()),
1712           f32::from_bits(!self.arr[2].to_bits()),
1713           f32::from_bits(!self.arr[3].to_bits()),
1714           f32::from_bits(!self.arr[4].to_bits()),
1715           f32::from_bits(!self.arr[5].to_bits()),
1716           f32::from_bits(!self.arr[6].to_bits()),
1717           f32::from_bits(!self.arr[7].to_bits()),
1718         ]}
1719       }
1720     }
1721   }
1722 }
1723