use super::*;

pick! {
  if #[cfg(target_feature="avx")] {
    #[derive(Default, Clone, Copy, PartialEq)]
    #[repr(C, align(32))]
    pub struct f32x8 { avx: m256 }
  } else if #[cfg(target_feature="sse2")] {
    #[derive(Default, Clone, Copy, PartialEq)]
    #[repr(C, align(32))]
    pub struct f32x8 { sse0: m128, sse1: m128 }
  } else if #[cfg(target_feature="simd128")] {
    use core::arch::wasm32::*;

    #[derive(Clone, Copy)]
    #[repr(C, align(32))]
    pub struct f32x8 { simd0: v128, simd1: v128 }

    impl Default for f32x8 {
      fn default() -> Self {
        Self::splat(0.0)
      }
    }

    impl PartialEq for f32x8 {
      fn eq(&self, other: &Self) -> bool {
        u32x4_all_true(f32x4_eq(self.simd0, other.simd0)) &
          u32x4_all_true(f32x4_eq(self.simd1, other.simd1))
      }
    }
  } else {
    #[derive(Default, Clone, Copy, PartialEq)]
    #[repr(C, align(32))]
    pub struct f32x8 { arr: [f32;8] }
  }
}

macro_rules! const_f32_as_f32x8 {
  ($i:ident, $f:expr) => {
    pub const $i: f32x8 =
      unsafe { ConstUnionHack256bit { f32a8: [$f; 8] }.f32x8 };
  };
}

impl f32x8 {
  const_f32_as_f32x8!(ONE, 1.0);
  const_f32_as_f32x8!(HALF, 0.5);
  const_f32_as_f32x8!(ZERO, 0.0);
  const_f32_as_f32x8!(E, core::f32::consts::E);
  const_f32_as_f32x8!(FRAC_1_PI, core::f32::consts::FRAC_1_PI);
  const_f32_as_f32x8!(FRAC_2_PI, core::f32::consts::FRAC_2_PI);
  const_f32_as_f32x8!(FRAC_2_SQRT_PI, core::f32::consts::FRAC_2_SQRT_PI);
  const_f32_as_f32x8!(FRAC_1_SQRT_2, core::f32::consts::FRAC_1_SQRT_2);
  const_f32_as_f32x8!(FRAC_PI_2, core::f32::consts::FRAC_PI_2);
  const_f32_as_f32x8!(FRAC_PI_3, core::f32::consts::FRAC_PI_3);
  const_f32_as_f32x8!(FRAC_PI_4, core::f32::consts::FRAC_PI_4);
  const_f32_as_f32x8!(FRAC_PI_6, core::f32::consts::FRAC_PI_6);
  const_f32_as_f32x8!(FRAC_PI_8, core::f32::consts::FRAC_PI_8);
  const_f32_as_f32x8!(LN_2, core::f32::consts::LN_2);
  const_f32_as_f32x8!(LN_10, core::f32::consts::LN_10);
  const_f32_as_f32x8!(LOG2_E, core::f32::consts::LOG2_E);
  const_f32_as_f32x8!(LOG10_E, core::f32::consts::LOG10_E);
  const_f32_as_f32x8!(LOG10_2, core::f32::consts::LOG10_2);
  const_f32_as_f32x8!(LOG2_10, core::f32::consts::LOG2_10);
  const_f32_as_f32x8!(PI, core::f32::consts::PI);
  const_f32_as_f32x8!(SQRT_2, core::f32::consts::SQRT_2);
  const_f32_as_f32x8!(TAU, core::f32::consts::TAU);
}

unsafe impl Zeroable for f32x8 {}
unsafe impl Pod for f32x8 {}

impl Add for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn add(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: add_m256(self.avx, rhs.avx) }
      } else if #[cfg(target_feature="sse2")] {
        Self { sse0: add_m128(self.sse0, rhs.sse0), sse1: add_m128(self.sse1, rhs.sse1) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd0: f32x4_add(self.simd0, rhs.simd0), simd1: f32x4_add(self.simd1, rhs.simd1) }
      } else {
        Self { arr: [
          self.arr[0] + rhs.arr[0],
          self.arr[1] + rhs.arr[1],
          self.arr[2] + rhs.arr[2],
          self.arr[3] + rhs.arr[3],
          self.arr[4] + rhs.arr[4],
          self.arr[5] + rhs.arr[5],
          self.arr[6] + rhs.arr[6],
          self.arr[7] + rhs.arr[7],
        ]}
      }
    }
  }
}
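// Illustrative check (added here, not part of the upstream crate): the
// operator impls are lanewise, so `+` behaves like eight independent `f32`
// additions no matter which backend `pick!` selected.
#[cfg(test)]
#[test]
fn demo_f32x8_add_is_lanewise() {
  let a = f32x8::new([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]);
  let b = f32x8::new([8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]);
  assert_eq!((a + b).to_array(), [8.0; 8]);
}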
impl Sub for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn sub(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: sub_m256(self.avx, rhs.avx) }
      } else if #[cfg(target_feature="sse2")] {
        Self { sse0: sub_m128(self.sse0, rhs.sse0), sse1: sub_m128(self.sse1, rhs.sse1) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd0: f32x4_sub(self.simd0, rhs.simd0), simd1: f32x4_sub(self.simd1, rhs.simd1) }
      } else {
        Self { arr: [
          self.arr[0] - rhs.arr[0],
          self.arr[1] - rhs.arr[1],
          self.arr[2] - rhs.arr[2],
          self.arr[3] - rhs.arr[3],
          self.arr[4] - rhs.arr[4],
          self.arr[5] - rhs.arr[5],
          self.arr[6] - rhs.arr[6],
          self.arr[7] - rhs.arr[7],
        ]}
      }
    }
  }
}

impl Mul for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn mul(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: mul_m256(self.avx, rhs.avx) }
      } else if #[cfg(target_feature="sse2")] {
        Self { sse0: mul_m128(self.sse0, rhs.sse0), sse1: mul_m128(self.sse1, rhs.sse1) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd0: f32x4_mul(self.simd0, rhs.simd0), simd1: f32x4_mul(self.simd1, rhs.simd1) }
      } else {
        Self { arr: [
          self.arr[0] * rhs.arr[0],
          self.arr[1] * rhs.arr[1],
          self.arr[2] * rhs.arr[2],
          self.arr[3] * rhs.arr[3],
          self.arr[4] * rhs.arr[4],
          self.arr[5] * rhs.arr[5],
          self.arr[6] * rhs.arr[6],
          self.arr[7] * rhs.arr[7],
        ]}
      }
    }
  }
}

impl Div for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn div(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: div_m256(self.avx, rhs.avx) }
      } else if #[cfg(target_feature="sse2")] {
        Self { sse0: div_m128(self.sse0, rhs.sse0), sse1: div_m128(self.sse1, rhs.sse1) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd0: f32x4_div(self.simd0, rhs.simd0), simd1: f32x4_div(self.simd1, rhs.simd1) }
      } else {
        Self { arr: [
          self.arr[0] / rhs.arr[0],
          self.arr[1] / rhs.arr[1],
          self.arr[2] / rhs.arr[2],
          self.arr[3] / rhs.arr[3],
          self.arr[4] / rhs.arr[4],
          self.arr[5] / rhs.arr[5],
          self.arr[6] / rhs.arr[6],
          self.arr[7] / rhs.arr[7],
        ]}
      }
    }
  }
}

impl Add<f32> for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn add(self, rhs: f32) -> Self::Output {
    self.add(Self::splat(rhs))
  }
}

impl Sub<f32> for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn sub(self, rhs: f32) -> Self::Output {
    self.sub(Self::splat(rhs))
  }
}

impl Mul<f32> for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn mul(self, rhs: f32) -> Self::Output {
    self.mul(Self::splat(rhs))
  }
}

impl Div<f32> for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn div(self, rhs: f32) -> Self::Output {
    self.div(Self::splat(rhs))
  }
}

impl Add<f32x8> for f32 {
  type Output = f32x8;
  #[inline]
  #[must_use]
  fn add(self, rhs: f32x8) -> Self::Output {
    f32x8::splat(self).add(rhs)
  }
}

impl Sub<f32x8> for f32 {
  type Output = f32x8;
  #[inline]
  #[must_use]
  fn sub(self, rhs: f32x8) -> Self::Output {
    f32x8::splat(self).sub(rhs)
  }
}

impl Mul<f32x8> for f32 {
  type Output = f32x8;
  #[inline]
  #[must_use]
  fn mul(self, rhs: f32x8) -> Self::Output {
    f32x8::splat(self).mul(rhs)
  }
}

impl Div<f32x8> for f32 {
  type Output = f32x8;
  #[inline]
  #[must_use]
  fn div(self, rhs: f32x8) -> Self::Output {
    f32x8::splat(self).div(rhs)
  }
}
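// Illustrative check (not in the upstream source): scalar operands are
// broadcast with `splat`, so `f32 op f32x8` and `f32x8 op f32` both work.
#[cfg(test)]
#[test]
fn demo_f32x8_scalar_broadcast() {
  let v = f32x8::new([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
  assert_eq!((v + 1.0).to_array(), (1.0 + v).to_array());
  assert_eq!((v * 2.0).to_array(), [2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0]);
}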
impl BitAnd for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitand(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: bitand_m256(self.avx, rhs.avx) }
      } else if #[cfg(target_feature="sse2")] {
        Self { sse0: bitand_m128(self.sse0, rhs.sse0), sse1: bitand_m128(self.sse1, rhs.sse1) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd0: v128_and(self.simd0, rhs.simd0), simd1: v128_and(self.simd1, rhs.simd1) }
      } else {
        Self { arr: [
          f32::from_bits(self.arr[0].to_bits() & rhs.arr[0].to_bits()),
          f32::from_bits(self.arr[1].to_bits() & rhs.arr[1].to_bits()),
          f32::from_bits(self.arr[2].to_bits() & rhs.arr[2].to_bits()),
          f32::from_bits(self.arr[3].to_bits() & rhs.arr[3].to_bits()),
          f32::from_bits(self.arr[4].to_bits() & rhs.arr[4].to_bits()),
          f32::from_bits(self.arr[5].to_bits() & rhs.arr[5].to_bits()),
          f32::from_bits(self.arr[6].to_bits() & rhs.arr[6].to_bits()),
          f32::from_bits(self.arr[7].to_bits() & rhs.arr[7].to_bits()),
        ]}
      }
    }
  }
}

impl BitOr for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: bitor_m256(self.avx, rhs.avx) }
      } else if #[cfg(target_feature="sse2")] {
        Self { sse0: bitor_m128(self.sse0, rhs.sse0), sse1: bitor_m128(self.sse1, rhs.sse1) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd0: v128_or(self.simd0, rhs.simd0), simd1: v128_or(self.simd1, rhs.simd1) }
      } else {
        Self { arr: [
          f32::from_bits(self.arr[0].to_bits() | rhs.arr[0].to_bits()),
          f32::from_bits(self.arr[1].to_bits() | rhs.arr[1].to_bits()),
          f32::from_bits(self.arr[2].to_bits() | rhs.arr[2].to_bits()),
          f32::from_bits(self.arr[3].to_bits() | rhs.arr[3].to_bits()),
          f32::from_bits(self.arr[4].to_bits() | rhs.arr[4].to_bits()),
          f32::from_bits(self.arr[5].to_bits() | rhs.arr[5].to_bits()),
          f32::from_bits(self.arr[6].to_bits() | rhs.arr[6].to_bits()),
          f32::from_bits(self.arr[7].to_bits() | rhs.arr[7].to_bits()),
        ]}
      }
    }
  }
}

impl BitXor for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitxor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: bitxor_m256(self.avx, rhs.avx) }
      } else if #[cfg(target_feature="sse2")] {
        Self { sse0: bitxor_m128(self.sse0, rhs.sse0), sse1: bitxor_m128(self.sse1, rhs.sse1) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd0: v128_xor(self.simd0, rhs.simd0), simd1: v128_xor(self.simd1, rhs.simd1) }
      } else {
        Self { arr: [
          f32::from_bits(self.arr[0].to_bits() ^ rhs.arr[0].to_bits()),
          f32::from_bits(self.arr[1].to_bits() ^ rhs.arr[1].to_bits()),
          f32::from_bits(self.arr[2].to_bits() ^ rhs.arr[2].to_bits()),
          f32::from_bits(self.arr[3].to_bits() ^ rhs.arr[3].to_bits()),
          f32::from_bits(self.arr[4].to_bits() ^ rhs.arr[4].to_bits()),
          f32::from_bits(self.arr[5].to_bits() ^ rhs.arr[5].to_bits()),
          f32::from_bits(self.arr[6].to_bits() ^ rhs.arr[6].to_bits()),
          f32::from_bits(self.arr[7].to_bits() ^ rhs.arr[7].to_bits()),
        ]}
      }
    }
  }
}
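// Illustrative check (not in the upstream source): the bitwise ops work on
// the raw IEEE-754 bit patterns of each lane, which is how this module builds
// masks and sign tricks out of float data.
#[cfg(test)]
#[test]
fn demo_f32x8_bitwise_is_on_bit_patterns() {
  assert_eq!((f32x8::ONE ^ f32x8::ONE).to_array(), [0.0; 8]);
  assert_eq!((f32x8::ONE | f32x8::ZERO).to_array(), [1.0; 8]);
  assert_eq!((f32x8::ONE & f32x8::ZERO).to_array(), [0.0; 8]);
}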
impl CmpEq for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_eq(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(EqualOrdered)}>(self.avx, rhs.avx) }
      } else if #[cfg(target_feature="sse2")] {
        Self { sse0: cmp_eq_mask_m128(self.sse0, rhs.sse0), sse1: cmp_eq_mask_m128(self.sse1, rhs.sse1) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd0: f32x4_eq(self.simd0, rhs.simd0), simd1: f32x4_eq(self.simd1, rhs.simd1) }
      } else {
        Self { arr: [
          if self.arr[0] == rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[1] == rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[2] == rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[3] == rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[4] == rhs.arr[4] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[5] == rhs.arr[5] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[6] == rhs.arr[6] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[7] == rhs.arr[7] { f32::from_bits(u32::MAX) } else { 0.0 },
        ]}
      }
    }
  }
}

impl CmpGe for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_ge(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(GreaterEqualOrdered)}>(self.avx, rhs.avx) }
      } else if #[cfg(target_feature="sse2")] {
        Self { sse0: cmp_ge_mask_m128(self.sse0, rhs.sse0), sse1: cmp_ge_mask_m128(self.sse1, rhs.sse1) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd0: f32x4_ge(self.simd0, rhs.simd0), simd1: f32x4_ge(self.simd1, rhs.simd1) }
      } else {
        Self { arr: [
          if self.arr[0] >= rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[1] >= rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[2] >= rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[3] >= rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[4] >= rhs.arr[4] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[5] >= rhs.arr[5] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[6] >= rhs.arr[6] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[7] >= rhs.arr[7] { f32::from_bits(u32::MAX) } else { 0.0 },
        ]}
      }
    }
  }
}
impl CmpGt for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_gt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(GreaterThanOrdered)}>(self.avx, rhs.avx) }
      } else if #[cfg(target_feature="sse2")] {
        Self { sse0: cmp_gt_mask_m128(self.sse0, rhs.sse0), sse1: cmp_gt_mask_m128(self.sse1, rhs.sse1) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd0: f32x4_gt(self.simd0, rhs.simd0), simd1: f32x4_gt(self.simd1, rhs.simd1) }
      } else {
        Self { arr: [
          if self.arr[0] > rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[1] > rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[2] > rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[3] > rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[4] > rhs.arr[4] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[5] > rhs.arr[5] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[6] > rhs.arr[6] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[7] > rhs.arr[7] { f32::from_bits(u32::MAX) } else { 0.0 },
        ]}
      }
    }
  }
}

impl CmpNe for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_ne(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(NotEqualOrdered)}>(self.avx, rhs.avx) }
      } else if #[cfg(target_feature="sse2")] {
        Self { sse0: cmp_neq_mask_m128(self.sse0, rhs.sse0), sse1: cmp_neq_mask_m128(self.sse1, rhs.sse1) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd0: f32x4_ne(self.simd0, rhs.simd0), simd1: f32x4_ne(self.simd1, rhs.simd1) }
      } else {
        Self { arr: [
          if self.arr[0] != rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[1] != rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[2] != rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[3] != rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[4] != rhs.arr[4] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[5] != rhs.arr[5] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[6] != rhs.arr[6] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[7] != rhs.arr[7] { f32::from_bits(u32::MAX) } else { 0.0 },
        ]}
      }
    }
  }
}
impl CmpLe for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_le(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(LessEqualOrdered)}>(self.avx, rhs.avx) }
      } else if #[cfg(target_feature="sse2")] {
        Self { sse0: cmp_le_mask_m128(self.sse0, rhs.sse0), sse1: cmp_le_mask_m128(self.sse1, rhs.sse1) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd0: f32x4_le(self.simd0, rhs.simd0), simd1: f32x4_le(self.simd1, rhs.simd1) }
      } else {
        Self { arr: [
          if self.arr[0] <= rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[1] <= rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[2] <= rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[3] <= rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[4] <= rhs.arr[4] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[5] <= rhs.arr[5] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[6] <= rhs.arr[6] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[7] <= rhs.arr[7] { f32::from_bits(u32::MAX) } else { 0.0 },
        ]}
      }
    }
  }
}

impl CmpLt for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_lt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(LessThanOrdered)}>(self.avx, rhs.avx) }
      } else if #[cfg(target_feature="sse2")] {
        Self { sse0: cmp_lt_mask_m128(self.sse0, rhs.sse0), sse1: cmp_lt_mask_m128(self.sse1, rhs.sse1) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd0: f32x4_lt(self.simd0, rhs.simd0), simd1: f32x4_lt(self.simd1, rhs.simd1) }
      } else {
        Self { arr: [
          if self.arr[0] < rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[1] < rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[2] < rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[3] < rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[4] < rhs.arr[4] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[5] < rhs.arr[5] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[6] < rhs.arr[6] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[7] < rhs.arr[7] { f32::from_bits(u32::MAX) } else { 0.0 },
        ]}
      }
    }
  }
}
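// Illustrative check (not in the upstream source): comparisons return a lane
// mask, all bits set for true lanes and all clear for false lanes, which is
// exactly the form `blend` expects.
#[cfg(test)]
#[test]
fn demo_f32x8_cmp_returns_lane_masks() {
  let a = f32x8::new([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
  let bits: [u32; 8] = cast(a.cmp_lt(f32x8::splat(4.5)));
  assert_eq!(bits, [u32::MAX, u32::MAX, u32::MAX, u32::MAX, 0, 0, 0, 0]);
}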
impl f32x8 {
  #[inline]
  #[must_use]
  pub fn new(array: [f32; 8]) -> Self {
    Self::from(array)
  }
  #[inline]
  #[must_use]
  pub fn blend(self, t: Self, f: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: blend_varying_m256(f.avx, t.avx, self.avx) }
      } else if #[cfg(target_feature="sse4.1")] {
        Self { sse0: blend_varying_m128(f.sse0, t.sse0, self.sse0), sse1: blend_varying_m128(f.sse1, t.sse1, self.sse1) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd0: v128_bitselect(t.simd0, f.simd0, self.simd0), simd1: v128_bitselect(t.simd1, f.simd1, self.simd1) }
      } else {
        generic_bit_blend(self, t, f)
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn abs(self) -> Self {
    pick! {
      if #[cfg(target_feature="simd128")] {
        Self { simd0: f32x4_abs(self.simd0), simd1: f32x4_abs(self.simd1) }
      } else {
        let non_sign_bits = f32x8::from(f32::from_bits(i32::MAX as u32));
        self & non_sign_bits
      }
    }
  }

  /// Calculates the lanewise maximum of both vectors. This is a faster
  /// implementation than `max`, but it doesn't specify any behavior if NaNs
  /// are involved.
  #[inline]
  #[must_use]
  pub fn fast_max(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: max_m256(self.avx, rhs.avx) }
      } else if #[cfg(target_feature="sse2")] {
        Self { sse0: max_m128(self.sse0, rhs.sse0), sse1: max_m128(self.sse1, rhs.sse1) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd0: f32x4_pmax(self.simd0, rhs.simd0), simd1: f32x4_pmax(self.simd1, rhs.simd1) }
      } else {
        Self { arr: [
          if self.arr[0] < rhs.arr[0] { rhs.arr[0] } else { self.arr[0] },
          if self.arr[1] < rhs.arr[1] { rhs.arr[1] } else { self.arr[1] },
          if self.arr[2] < rhs.arr[2] { rhs.arr[2] } else { self.arr[2] },
          if self.arr[3] < rhs.arr[3] { rhs.arr[3] } else { self.arr[3] },
          if self.arr[4] < rhs.arr[4] { rhs.arr[4] } else { self.arr[4] },
          if self.arr[5] < rhs.arr[5] { rhs.arr[5] } else { self.arr[5] },
          if self.arr[6] < rhs.arr[6] { rhs.arr[6] } else { self.arr[6] },
          if self.arr[7] < rhs.arr[7] { rhs.arr[7] } else { self.arr[7] },
        ]}
      }
    }
  }
}
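// Illustrative check (not in the upstream source): `blend` routes each lane
// through `t` or `f` based on the mask in `self`, here built from a compare.
#[cfg(test)]
#[test]
fn demo_f32x8_blend_select() {
  let x = f32x8::new([-3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 4.0]);
  // Branchless "absolute value" via select: where x < 0 take -x, else x.
  let picked = x.cmp_lt(f32x8::ZERO).blend(-x, x);
  assert_eq!(picked.to_array(), [3.0, 2.0, 1.0, 0.0, 1.0, 2.0, 3.0, 4.0]);
}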
impl f32x8 {
  /// Calculates the lanewise maximum of both vectors. If either lane is NaN,
  /// the other lane gets chosen. Use `fast_max` for a faster implementation
  /// that doesn't handle NaNs.
  #[inline]
  #[must_use]
  pub fn max(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        // max_m256 seems to do rhs < self ? self : rhs. So if there's any NaN
        // involved, it chooses rhs, so we need to specifically check rhs for
        // NaN.
        rhs.is_nan().blend(self, Self { avx: max_m256(self.avx, rhs.avx) })
      } else if #[cfg(target_feature="sse2")] {
        // max_m128 seems to do rhs < self ? self : rhs. So if there's any NaN
        // involved, it chooses rhs, so we need to specifically check rhs for
        // NaN.
        rhs.is_nan().blend(self, Self { sse0: max_m128(self.sse0, rhs.sse0), sse1: max_m128(self.sse1, rhs.sse1) })
      } else if #[cfg(target_feature="simd128")] {
        // WASM has two max intrinsics:
        // - max: This propagates NaN, that's the opposite of what we need.
        // - pmax: This is defined as self < rhs ? rhs : self, which basically
        //   chooses self if either is NaN.
        //
        // pmax is what we want, but we need to specifically check self for NaN.
        Self {
          simd0: v128_bitselect(
            rhs.simd0,
            f32x4_pmax(self.simd0, rhs.simd0),
            f32x4_ne(self.simd0, self.simd0), // NaN check
          ),
          simd1: v128_bitselect(
            rhs.simd1,
            f32x4_pmax(self.simd1, rhs.simd1),
            f32x4_ne(self.simd1, self.simd1), // NaN check
          ),
        }
      } else {
        Self { arr: [
          self.arr[0].max(rhs.arr[0]),
          self.arr[1].max(rhs.arr[1]),
          self.arr[2].max(rhs.arr[2]),
          self.arr[3].max(rhs.arr[3]),
          self.arr[4].max(rhs.arr[4]),
          self.arr[5].max(rhs.arr[5]),
          self.arr[6].max(rhs.arr[6]),
          self.arr[7].max(rhs.arr[7]),
        ]}
      }
    }
  }

  /// Calculates the lanewise minimum of both vectors. This is a faster
  /// implementation than `min`, but it doesn't specify any behavior if NaNs
  /// are involved.
  #[inline]
  #[must_use]
  pub fn fast_min(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: min_m256(self.avx, rhs.avx) }
      } else if #[cfg(target_feature="sse2")] {
        Self { sse0: min_m128(self.sse0, rhs.sse0), sse1: min_m128(self.sse1, rhs.sse1) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd0: f32x4_pmin(self.simd0, rhs.simd0), simd1: f32x4_pmin(self.simd1, rhs.simd1) }
      } else {
        Self { arr: [
          if self.arr[0] > rhs.arr[0] { rhs.arr[0] } else { self.arr[0] },
          if self.arr[1] > rhs.arr[1] { rhs.arr[1] } else { self.arr[1] },
          if self.arr[2] > rhs.arr[2] { rhs.arr[2] } else { self.arr[2] },
          if self.arr[3] > rhs.arr[3] { rhs.arr[3] } else { self.arr[3] },
          if self.arr[4] > rhs.arr[4] { rhs.arr[4] } else { self.arr[4] },
          if self.arr[5] > rhs.arr[5] { rhs.arr[5] } else { self.arr[5] },
          if self.arr[6] > rhs.arr[6] { rhs.arr[6] } else { self.arr[6] },
          if self.arr[7] > rhs.arr[7] { rhs.arr[7] } else { self.arr[7] },
        ]}
      }
    }
  }

  /// Calculates the lanewise minimum of both vectors. If either lane is NaN,
  /// the other lane gets chosen. Use `fast_min` for a faster implementation
  /// that doesn't handle NaNs.
  #[inline]
  #[must_use]
  pub fn min(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        // min_m256 seems to do rhs > self ? self : rhs. So if there's any NaN
        // involved, it chooses rhs, so we need to specifically check rhs for
        // NaN.
        rhs.is_nan().blend(self, Self { avx: min_m256(self.avx, rhs.avx) })
      } else if #[cfg(target_feature="sse2")] {
        // min_m128 seems to do rhs > self ? self : rhs. So if there's any NaN
        // involved, it chooses rhs, so we need to specifically check rhs for
        // NaN.
        rhs.is_nan().blend(self, Self { sse0: min_m128(self.sse0, rhs.sse0), sse1: min_m128(self.sse1, rhs.sse1) })
      } else if #[cfg(target_feature="simd128")] {
        // WASM has two min intrinsics:
        // - min: This propagates NaN, that's the opposite of what we need.
        // - pmin: This is defined as rhs < self ? rhs : self, which basically
        //   chooses self if either is NaN.
        //
        // pmin is what we want, but we need to specifically check self for NaN.
        Self {
          simd0: v128_bitselect(
            rhs.simd0,
            f32x4_pmin(self.simd0, rhs.simd0),
            f32x4_ne(self.simd0, self.simd0), // NaN check
          ),
          simd1: v128_bitselect(
            rhs.simd1,
            f32x4_pmin(self.simd1, rhs.simd1),
            f32x4_ne(self.simd1, self.simd1), // NaN check
          ),
        }
      } else {
        Self { arr: [
          self.arr[0].min(rhs.arr[0]),
          self.arr[1].min(rhs.arr[1]),
          self.arr[2].min(rhs.arr[2]),
          self.arr[3].min(rhs.arr[3]),
          self.arr[4].min(rhs.arr[4]),
          self.arr[5].min(rhs.arr[5]),
          self.arr[6].min(rhs.arr[6]),
          self.arr[7].min(rhs.arr[7]),
        ]}
      }
    }
  }
}
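// Illustrative check (not in the upstream source): unlike `fast_max`, `max`
// resolves NaN lanes by picking whichever side is not NaN.
#[cfg(test)]
#[test]
fn demo_f32x8_max_prefers_non_nan() {
  let a = f32x8::new([f32::NAN, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0]);
  let m = a.max(f32x8::ZERO).to_array();
  assert_eq!(m, [0.0, 1.0, 0.0, 3.0, 0.0, 5.0, 0.0, 7.0]);
}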
impl f32x8 {
  #[inline]
  #[must_use]
  pub fn is_nan(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(Unordered)}>(self.avx, self.avx) }
      } else if #[cfg(target_feature="sse2")] {
        Self { sse0: cmp_unord_mask_m128(self.sse0, self.sse0), sse1: cmp_unord_mask_m128(self.sse1, self.sse1) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd0: f32x4_ne(self.simd0, self.simd0), simd1: f32x4_ne(self.simd1, self.simd1) }
      } else {
        Self { arr: [
          if self.arr[0].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[1].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[2].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[3].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[4].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[5].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[6].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[7].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
        ]}
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn is_finite(self) -> Self {
    let shifted_exp_mask = u32x8::from(0xFF000000);
    let u: u32x8 = cast(self);
    let shift_u = u << 1_u64;
    let out = !(shift_u & shifted_exp_mask).cmp_eq(shifted_exp_mask);
    cast(out)
  }
  #[inline]
  #[must_use]
  pub fn is_inf(self) -> Self {
    let shifted_inf = u32x8::from(0xFF000000);
    let u: u32x8 = cast(self);
    let shift_u = u << 1_u64;
    let out = (shift_u).cmp_eq(shifted_inf);
    cast(out)
  }
}
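// Illustrative check (not in the upstream source): the exponent-bit tricks
// above classify lanes without any lane-by-lane branching.
#[cfg(test)]
#[test]
fn demo_f32x8_classify() {
  let v = f32x8::new([1.0, f32::NAN, f32::INFINITY, f32::NEG_INFINITY, 0.0, -1.0, 1e30, -1e30]);
  let finite: [u32; 8] = cast(v.is_finite());
  assert_eq!(finite, [u32::MAX, 0, 0, 0, u32::MAX, u32::MAX, u32::MAX, u32::MAX]);
  let inf: [u32; 8] = cast(v.is_inf());
  assert_eq!(inf, [0, 0, u32::MAX, u32::MAX, 0, 0, 0, 0]);
}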
impl f32x8 {
  #[inline]
  #[must_use]
  pub fn round(self) -> Self {
    pick! {
      // NOTE: Is there an SSE2 version of this? The f32x4 version probably
      // translates, but I've not had time to figure it out.
      if #[cfg(target_feature="avx")] {
        Self { avx: round_m256::<{round_op!(Nearest)}>(self.avx) }
      } else if #[cfg(target_feature="sse4.1")] {
        Self { sse0: round_m128::<{round_op!(Nearest)}>(self.sse0), sse1: round_m128::<{round_op!(Nearest)}>(self.sse1) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd0: f32x4_nearest(self.simd0), simd1: f32x4_nearest(self.simd1) }
      } else {
        // Note(Lokathor): This software fallback is probably very slow compared
        // to having a hardware option available, even just the sse2 version is
        // better than this. Oh well.
        let to_int = f32x8::from(1.0 / f32::EPSILON);
        let u: u32x8 = cast(self);
        let e: i32x8 = cast((u >> 23) & u32x8::from(0xff));
        let mut y: f32x8;

        let no_op_magic = i32x8::from(0x7f + 23);
        let no_op_mask: f32x8 = cast(e.cmp_gt(no_op_magic) | e.cmp_eq(no_op_magic));
        let no_op_val: f32x8 = self;

        let zero_magic = i32x8::from(0x7f - 1);
        let zero_mask: f32x8 = cast(e.cmp_lt(zero_magic));
        let zero_val: f32x8 = self * f32x8::from(0.0);

        let neg_bit: f32x8 = cast(cast::<u32x8, i32x8>(u).cmp_lt(i32x8::default()));
        let x: f32x8 = neg_bit.blend(-self, self);
        y = x + to_int - to_int - x;
        y = y.cmp_gt(f32x8::from(0.5)).blend(
          y + x - f32x8::from(-1.0),
          y.cmp_lt(f32x8::from(-0.5)).blend(y + x + f32x8::from(1.0), y + x),
        );
        y = neg_bit.blend(-y, y);

        no_op_mask.blend(no_op_val, zero_mask.blend(zero_val, y))
      }
    }
  }

  /// Rounds each lane into an integer. This is a faster implementation than
  /// `round_int`, but it doesn't handle out of range values or NaNs. For those
  /// values you get implementation defined behavior.
  #[inline]
  #[must_use]
  pub fn fast_round_int(self) -> i32x8 {
    pick! {
      if #[cfg(target_feature="avx")] {
        cast(convert_to_i32_m256i_from_m256(self.avx))
      } else if #[cfg(target_feature="sse2")] {
        i32x8 { sse0: convert_to_i32_m128i_from_m128(self.sse0), sse1: convert_to_i32_m128i_from_m128(self.sse1) }
      } else {
        self.round_int()
      }
    }
  }
}
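// Illustrative check (not in the upstream source): `round` rounds half-way
// cases to even, matching the hardware "nearest" rounding mode used by the
// AVX, SSE4.1, and simd128 branches.
#[cfg(test)]
#[test]
fn demo_f32x8_round_ties_to_even() {
  let v = f32x8::new([1.5, 2.5, -1.5, -2.5, 0.25, -0.25, 7.0, -7.0]);
  assert_eq!(v.round().to_array(), [2.0, 2.0, -2.0, -2.0, 0.0, -0.0, 7.0, -7.0]);
}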
impl f32x8 {
  /// Rounds each lane into an integer. This saturates out of range values and
  /// turns NaNs into 0. Use `fast_round_int` for a faster implementation that
  /// doesn't handle out of range values or NaNs.
  #[inline]
  #[must_use]
  pub fn round_int(self) -> i32x8 {
    pick! {
      if #[cfg(target_feature="avx")] {
        // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
        let non_nan_mask = self.cmp_eq(self);
        let non_nan = self & non_nan_mask;
        let flip_to_max: i32x8 = cast(self.cmp_ge(Self::splat(2147483648.0)));
        let cast: i32x8 = cast(convert_to_i32_m256i_from_m256(non_nan.avx));
        flip_to_max ^ cast
      } else if #[cfg(target_feature="sse2")] {
        // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
        let non_nan_mask = self.cmp_eq(self);
        let non_nan = self & non_nan_mask;
        let flip_to_max: i32x8 = cast(self.cmp_ge(Self::splat(2147483648.0)));
        let cast: i32x8 = i32x8 { sse0: convert_to_i32_m128i_from_m128(non_nan.sse0), sse1: convert_to_i32_m128i_from_m128(non_nan.sse1) };
        flip_to_max ^ cast
      } else if #[cfg(target_feature="simd128")] {
        cast(Self {
          simd0: i32x4_trunc_sat_f32x4(f32x4_nearest(self.simd0)),
          simd1: i32x4_trunc_sat_f32x4(f32x4_nearest(self.simd1)),
        })
      } else {
        let rounded: [f32; 8] = cast(self.round());
        cast([
          rounded[0] as i32,
          rounded[1] as i32,
          rounded[2] as i32,
          rounded[3] as i32,
          rounded[4] as i32,
          rounded[5] as i32,
          rounded[6] as i32,
          rounded[7] as i32,
        ])
      }
    }
  }

  /// Truncates each lane into an integer. This is a faster implementation than
  /// `trunc_int`, but it doesn't handle out of range values or NaNs. For those
  /// values you get implementation defined behavior.
  #[inline]
  #[must_use]
  pub fn fast_trunc_int(self) -> i32x8 {
    pick! {
      if #[cfg(all(target_feature="avx"))] {
        cast(convert_truncate_to_i32_m256i_from_m256(self.avx))
      } else if #[cfg(target_feature="sse2")] {
        i32x8 { sse0: truncate_m128_to_m128i(self.sse0), sse1: truncate_m128_to_m128i(self.sse1) }
      } else {
        self.trunc_int()
      }
    }
  }
}
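// Illustrative check (not in the upstream source): `round_int` saturates
// out-of-range lanes and maps NaN to 0, mirroring the v8-derived fixups above.
#[cfg(test)]
#[test]
fn demo_f32x8_round_int_saturates() {
  let v = f32x8::new([f32::NAN, 3e9, -3e9, 1.5, -1.5, 0.4, -0.4, 100.0]);
  let r: [i32; 8] = cast(v.round_int());
  assert_eq!(r, [0, i32::MAX, i32::MIN, 2, -2, 0, 0, 100]);
}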
impl f32x8 {
  /// Truncates each lane into an integer. This saturates out of range values
  /// and turns NaNs into 0. Use `fast_trunc_int` for a faster implementation
  /// that doesn't handle out of range values or NaNs.
  #[inline]
  #[must_use]
  pub fn trunc_int(self) -> i32x8 {
    pick! {
      if #[cfg(target_feature="avx")] {
        // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
        let non_nan_mask = self.cmp_eq(self);
        let non_nan = self & non_nan_mask;
        let flip_to_max: i32x8 = cast(self.cmp_ge(Self::splat(2147483648.0)));
        let cast: i32x8 = cast(convert_truncate_to_i32_m256i_from_m256(non_nan.avx));
        flip_to_max ^ cast
      } else if #[cfg(target_feature="sse2")] {
        // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
        let non_nan_mask = self.cmp_eq(self);
        let non_nan = self & non_nan_mask;
        let flip_to_max: i32x8 = cast(self.cmp_ge(Self::splat(2147483648.0)));
        let cast: i32x8 = i32x8 { sse0: truncate_m128_to_m128i(non_nan.sse0), sse1: truncate_m128_to_m128i(non_nan.sse1) };
        flip_to_max ^ cast
      } else if #[cfg(target_feature="simd128")] {
        cast(Self {
          simd0: i32x4_trunc_sat_f32x4(self.simd0),
          simd1: i32x4_trunc_sat_f32x4(self.simd1),
        })
      } else {
        let n: [f32; 8] = cast(self);
        cast([
          n[0] as i32,
          n[1] as i32,
          n[2] as i32,
          n[3] as i32,
          n[4] as i32,
          n[5] as i32,
          n[6] as i32,
          n[7] as i32,
        ])
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn mul_add(self, m: Self, a: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
        Self { avx: fused_mul_add_m256(self.avx, m.avx, a.avx) }
      } else if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
        Self { sse0: fused_mul_add_m128(self.sse0, m.sse0, a.sse0), sse1: fused_mul_add_m128(self.sse1, m.sse1, a.sse1) }
      } else {
        (self * m) + a
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn mul_sub(self, m: Self, a: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
        Self { avx: fused_mul_sub_m256(self.avx, m.avx, a.avx) }
      } else if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
        Self { sse0: fused_mul_sub_m128(self.sse0, m.sse0, a.sse0), sse1: fused_mul_sub_m128(self.sse1, m.sse1, a.sse1) }
      } else {
        (self * m) - a
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn mul_neg_add(self, m: Self, a: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
        Self { avx: fused_mul_neg_add_m256(self.avx, m.avx, a.avx) }
      } else if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
        Self { sse0: fused_mul_neg_add_m128(self.sse0, m.sse0, a.sse0), sse1: fused_mul_neg_add_m128(self.sse1, m.sse1, a.sse1) }
      } else {
        a - (self * m)
      }
    }
  }
}
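// Illustrative check (not in the upstream source): with or without the FMA
// branches, `mul_add` computes (self * m) + a lanewise; the result is exact
// for these small integer values either way.
#[cfg(test)]
#[test]
fn demo_f32x8_mul_add() {
  let out = f32x8::splat(2.0).mul_add(f32x8::splat(3.0), f32x8::splat(4.0));
  assert_eq!(out.to_array(), [10.0; 8]);
}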
impl f32x8 {
  #[inline]
  #[must_use]
  pub fn mul_neg_sub(self, m: Self, a: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
        Self { avx: fused_mul_neg_sub_m256(self.avx, m.avx, a.avx) }
      } else if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
        Self { sse0: fused_mul_neg_sub_m128(self.sse0, m.sse0, a.sse0), sse1: fused_mul_neg_sub_m128(self.sse1, m.sse1, a.sse1) }
      } else {
        -(self * m) - a
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn flip_signs(self, signs: Self) -> Self {
    self ^ (signs & Self::from(-0.0))
  }

  #[inline]
  #[must_use]
  pub fn copysign(self, sign: Self) -> Self {
    let magnitude_mask = Self::from(f32::from_bits(u32::MAX >> 1));
    (self & magnitude_mask) | (sign & Self::from(-0.0))
  }

  #[allow(non_upper_case_globals)]
  pub fn asin_acos(self) -> (Self, Self) {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f32_as_f32x8!(P4asinf, 4.2163199048E-2);
    const_f32_as_f32x8!(P3asinf, 2.4181311049E-2);
    const_f32_as_f32x8!(P2asinf, 4.5470025998E-2);
    const_f32_as_f32x8!(P1asinf, 7.4953002686E-2);
    const_f32_as_f32x8!(P0asinf, 1.6666752422E-1);

    let xa = self.abs();
    let big = xa.cmp_ge(f32x8::splat(0.5));

    let x1 = f32x8::splat(0.5) * (f32x8::ONE - xa);
    let x2 = xa * xa;
    let x3 = big.blend(x1, x2);

    let xb = x1.sqrt();

    let x4 = big.blend(xb, xa);

    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
    let z = z.mul_add(x3 * x4, x4);

    let z1 = z + z;

    // acos
    let z3 = self.cmp_lt(f32x8::ZERO).blend(f32x8::PI - z1, z1);
    let z4 = f32x8::FRAC_PI_2 - z.flip_signs(self);
    let acos = big.blend(z3, z4);

    // asin
    let z3 = f32x8::FRAC_PI_2 - z1;
    let asin = big.blend(z3, z);
    let asin = asin.flip_signs(self);

    (asin, acos)
  }

  #[inline]
  #[must_use]
  #[allow(non_upper_case_globals)]
  pub fn asin(self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f32_as_f32x8!(P4asinf, 4.2163199048E-2);
    const_f32_as_f32x8!(P3asinf, 2.4181311049E-2);
    const_f32_as_f32x8!(P2asinf, 4.5470025998E-2);
    const_f32_as_f32x8!(P1asinf, 7.4953002686E-2);
    const_f32_as_f32x8!(P0asinf, 1.6666752422E-1);

    let xa = self.abs();
    let big = xa.cmp_ge(f32x8::splat(0.5));

    let x1 = f32x8::splat(0.5) * (f32x8::ONE - xa);
    let x2 = xa * xa;
    let x3 = big.blend(x1, x2);

    let xb = x1.sqrt();

    let x4 = big.blend(xb, xa);

    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
    let z = z.mul_add(x3 * x4, x4);

    let z1 = z + z;

    // asin
    let z3 = f32x8::FRAC_PI_2 - z1;
    let asin = big.blend(z3, z);
    let asin = asin.flip_signs(self);

    asin
  }
}
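// Illustrative check (not in the upstream source): `copysign` keeps the
// magnitude of `self` and takes only the sign bit of `sign`, all with the
// bit-mask operators defined earlier.
#[cfg(test)]
#[test]
fn demo_f32x8_copysign() {
  let mag = f32x8::new([3.0, -3.0, 0.5, -0.5, 1.0, -1.0, 2.0, -2.0]);
  let sign = f32x8::new([-1.0, 1.0, -1.0, 1.0, -0.0, 0.0, -5.0, 5.0]);
  assert_eq!(mag.copysign(sign).to_array(), [-3.0, 3.0, -0.5, 0.5, -1.0, 1.0, -2.0, 2.0]);
}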
impl f32x8 {
  #[inline]
  #[must_use]
  #[allow(non_upper_case_globals)]
  pub fn acos(self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f32_as_f32x8!(P4asinf, 4.2163199048E-2);
    const_f32_as_f32x8!(P3asinf, 2.4181311049E-2);
    const_f32_as_f32x8!(P2asinf, 4.5470025998E-2);
    const_f32_as_f32x8!(P1asinf, 7.4953002686E-2);
    const_f32_as_f32x8!(P0asinf, 1.6666752422E-1);

    let xa = self.abs();
    let big = xa.cmp_ge(f32x8::splat(0.5));

    let x1 = f32x8::splat(0.5) * (f32x8::ONE - xa);
    let x2 = xa * xa;
    let x3 = big.blend(x1, x2);

    let xb = x1.sqrt();

    let x4 = big.blend(xb, xa);

    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
    let z = z.mul_add(x3 * x4, x4);

    let z1 = z + z;

    // acos
    let z3 = self.cmp_lt(f32x8::ZERO).blend(f32x8::PI - z1, z1);
    let z4 = f32x8::FRAC_PI_2 - z.flip_signs(self);
    let acos = big.blend(z3, z4);

    acos
  }

  #[allow(non_upper_case_globals)]
  pub fn atan(self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f32_as_f32x8!(P3atanf, 8.05374449538E-2);
    const_f32_as_f32x8!(P2atanf, -1.38776856032E-1);
    const_f32_as_f32x8!(P1atanf, 1.99777106478E-1);
    const_f32_as_f32x8!(P0atanf, -3.33329491539E-1);

    let t = self.abs();

    // small:  z = t / 1.0;
    // medium: z = (t-1.0) / (t+1.0);
    // big:    z = -1.0 / t;
    let notsmal = t.cmp_ge(Self::SQRT_2 - Self::ONE);
    let notbig = t.cmp_le(Self::SQRT_2 + Self::ONE);

    let mut s = notbig.blend(Self::FRAC_PI_4, Self::FRAC_PI_2);
    s = notsmal & s;

    let mut a = notbig & t;
    a = notsmal.blend(a - Self::ONE, a);
    let mut b = notbig & Self::ONE;
    b = notsmal.blend(b + t, b);
    let z = a / b;

    let zz = z * z;

    // Taylor expansion
    let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
    re = re.mul_add(zz * z, z) + s;

    // get sign bit
    re = (self.sign_bit()).blend(-re, re);

    re
  }
}
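// Illustrative check (not in the upstream source): the polynomial `atan`
// approximation should land within a small tolerance of the exact values at
// a few well-known points.
#[cfg(test)]
#[test]
fn demo_f32x8_atan() {
  use core::f32::consts::FRAC_PI_4;
  let v = f32x8::new([0.0, 1.0, -1.0, 0.0, 1.0, -1.0, 0.0, 1.0]);
  let expected =
    f32x8::new([0.0, FRAC_PI_4, -FRAC_PI_4, 0.0, FRAC_PI_4, -FRAC_PI_4, 0.0, FRAC_PI_4]);
  let err = (v.atan() - expected).abs().to_array();
  for e in err {
    assert!(e < 1e-5);
  }
}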
impl f32x8 {
  #[allow(non_upper_case_globals)]
  pub fn atan2(self, x: Self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f32_as_f32x8!(P3atanf, 8.05374449538E-2);
    const_f32_as_f32x8!(P2atanf, -1.38776856032E-1);
    const_f32_as_f32x8!(P1atanf, 1.99777106478E-1);
    const_f32_as_f32x8!(P0atanf, -3.33329491539E-1);

    let y = self;

    // move in first octant
    let x1 = x.abs();
    let y1 = y.abs();
    let swapxy = y1.cmp_gt(x1);
    // swap x and y if y1 > x1
    let mut x2 = swapxy.blend(y1, x1);
    let mut y2 = swapxy.blend(x1, y1);

    // check for special case: x and y are both +/- INF
    let both_infinite = x.is_inf() & y.is_inf();
    if both_infinite.any() {
      let mone = -Self::ONE;
      x2 = both_infinite.blend(x2 & mone, x2);
      y2 = both_infinite.blend(y2 & mone, y2);
    }

    // x = y = 0 will produce NaN; no problem, it's fixed below.
    let t = y2 / x2;

    // small:  z = t / 1.0;
    // medium: z = (t-1.0) / (t+1.0);
    let notsmal = t.cmp_ge(Self::SQRT_2 - Self::ONE);

    let a = notsmal.blend(t - Self::ONE, t);
    let b = notsmal.blend(t + Self::ONE, Self::ONE);
    let s = notsmal & Self::FRAC_PI_4;
    let z = a / b;

    let zz = z * z;

    // Taylor expansion
    let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
    re = re.mul_add(zz * z, z) + s;

    // move back in place
    re = swapxy.blend(Self::FRAC_PI_2 - re, re);
    re = ((x | y).cmp_eq(Self::ZERO)).blend(Self::ZERO, re);
    re = (x.sign_bit()).blend(Self::PI - re, re);

    // get sign bit
    re = (y.sign_bit()).blend(-re, re);

    re
  }

  #[inline]
  #[must_use]
  #[allow(non_upper_case_globals)]
  pub fn sin_cos(self) -> (Self, Self) {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h

    const_f32_as_f32x8!(DP1F, 0.78515625_f32 * 2.0);
    const_f32_as_f32x8!(DP2F, 2.4187564849853515625E-4_f32 * 2.0);
    const_f32_as_f32x8!(DP3F, 3.77489497744594108E-8_f32 * 2.0);

    const_f32_as_f32x8!(P0sinf, -1.6666654611E-1);
    const_f32_as_f32x8!(P1sinf, 8.3321608736E-3);
    const_f32_as_f32x8!(P2sinf, -1.9515295891E-4);

    const_f32_as_f32x8!(P0cosf, 4.166664568298827E-2);
    const_f32_as_f32x8!(P1cosf, -1.388731625493765E-3);
    const_f32_as_f32x8!(P2cosf, 2.443315711809948E-5);

    const_f32_as_f32x8!(TWO_OVER_PI, 2.0 / core::f32::consts::PI);

    let xa = self.abs();

    // Find quadrant
    let y = (xa * TWO_OVER_PI).round();
    let q: i32x8 = y.round_int();

    let x = y.mul_neg_add(DP3F, y.mul_neg_add(DP2F, y.mul_neg_add(DP1F, xa)));

    let x2 = x * x;
    let mut s = polynomial_2!(x2, P0sinf, P1sinf, P2sinf) * (x * x2) + x;
    let mut c = polynomial_2!(x2, P0cosf, P1cosf, P2cosf) * (x2 * x2)
      + f32x8::from(0.5).mul_neg_add(x2, f32x8::from(1.0));

    let swap = !(q & i32x8::from(1)).cmp_eq(i32x8::from(0));

    let mut overflow: f32x8 = cast(q.cmp_gt(i32x8::from(0x2000000)));
    overflow &= xa.is_finite();
    s = overflow.blend(f32x8::from(0.0), s);
    c = overflow.blend(f32x8::from(1.0), c);

    // calc sin
    let mut sin1 = cast::<_, f32x8>(swap).blend(c, s);
    let sign_sin: i32x8 = (q << 30) ^ cast::<_, i32x8>(self);
    sin1 = sin1.flip_signs(cast(sign_sin));

    // calc cos
    let mut cos1 = cast::<_, f32x8>(swap).blend(s, c);
    let sign_cos: i32x8 = ((q + i32x8::from(1)) & i32x8::from(2)) << 30;
    cos1 ^= cast::<_, f32x8>(sign_cos);

    (sin1, cos1)
  }
  #[inline]
  #[must_use]
  pub fn sin(self) -> Self {
    let (s, _) = self.sin_cos();
    s
  }
  #[inline]
  #[must_use]
  pub fn cos(self) -> Self {
    let (_, c) = self.sin_cos();
    c
  }
  #[inline]
  #[must_use]
  pub fn tan(self) -> Self {
    let (s, c) = self.sin_cos();
    s / c
  }
  #[inline]
  #[must_use]
  pub fn to_degrees(self) -> Self {
    const_f32_as_f32x8!(RAD_TO_DEG_RATIO, 180.0_f32 / core::f32::consts::PI);
    self * RAD_TO_DEG_RATIO
  }
  #[inline]
  #[must_use]
  pub fn to_radians(self) -> Self {
    const_f32_as_f32x8!(DEG_TO_RAD_RATIO, core::f32::consts::PI / 180.0_f32);
    self * DEG_TO_RAD_RATIO
  }
}
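// Illustrative check (not in the upstream source): `sin_cos` returns both
// values from one argument reduction, and sin^2 + cos^2 should stay close
// to 1 across a range of inputs.
#[cfg(test)]
#[test]
fn demo_f32x8_sin_cos_identity() {
  let (s, c) = f32x8::new([-3.0, -1.5, -0.5, 0.0, 0.5, 1.5, 3.0, 6.0]).sin_cos();
  let err = (s * s + c * c - f32x8::ONE).abs().to_array();
  for e in err {
    assert!(e < 1e-5);
  }
}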
impl f32x8 {
  #[inline]
  #[must_use]
  pub fn recip(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: reciprocal_m256(self.avx) }
      } else if #[cfg(target_feature="sse2")] {
        Self { sse0: reciprocal_m128(self.sse0), sse1: reciprocal_m128(self.sse1) }
      } else if #[cfg(target_feature="simd128")] {
        let one = f32x4_splat(1.0);
        Self { simd0: f32x4_div(one, self.simd0), simd1: f32x4_div(one, self.simd1) }
      } else {
        Self { arr: [
          1.0 / self.arr[0],
          1.0 / self.arr[1],
          1.0 / self.arr[2],
          1.0 / self.arr[3],
          1.0 / self.arr[4],
          1.0 / self.arr[5],
          1.0 / self.arr[6],
          1.0 / self.arr[7],
        ]}
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn recip_sqrt(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: reciprocal_sqrt_m256(self.avx) }
      } else if #[cfg(target_feature="sse2")] {
        Self { sse0: reciprocal_sqrt_m128(self.sse0), sse1: reciprocal_sqrt_m128(self.sse1) }
      } else if #[cfg(target_feature="simd128")] {
        let one = f32x4_splat(1.0);
        Self { simd0: f32x4_div(one, f32x4_sqrt(self.simd0)), simd1: f32x4_div(one, f32x4_sqrt(self.simd1)) }
      } else if #[cfg(feature="std")] {
        Self { arr: [
          1.0 / self.arr[0].sqrt(),
          1.0 / self.arr[1].sqrt(),
          1.0 / self.arr[2].sqrt(),
          1.0 / self.arr[3].sqrt(),
          1.0 / self.arr[4].sqrt(),
          1.0 / self.arr[5].sqrt(),
          1.0 / self.arr[6].sqrt(),
          1.0 / self.arr[7].sqrt(),
        ]}
      } else {
        Self { arr: [
          1.0 / software_sqrt(self.arr[0] as f64) as f32,
          1.0 / software_sqrt(self.arr[1] as f64) as f32,
          1.0 / software_sqrt(self.arr[2] as f64) as f32,
          1.0 / software_sqrt(self.arr[3] as f64) as f32,
          1.0 / software_sqrt(self.arr[4] as f64) as f32,
          1.0 / software_sqrt(self.arr[5] as f64) as f32,
          1.0 / software_sqrt(self.arr[6] as f64) as f32,
          1.0 / software_sqrt(self.arr[7] as f64) as f32,
        ]}
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn sqrt(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: sqrt_m256(self.avx) }
      } else if #[cfg(target_feature="sse2")] {
        Self { sse0: sqrt_m128(self.sse0), sse1: sqrt_m128(self.sse1) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd0: f32x4_sqrt(self.simd0), simd1: f32x4_sqrt(self.simd1) }
      } else if #[cfg(feature="std")] {
        Self { arr: [
          self.arr[0].sqrt(),
          self.arr[1].sqrt(),
          self.arr[2].sqrt(),
          self.arr[3].sqrt(),
          self.arr[4].sqrt(),
          self.arr[5].sqrt(),
          self.arr[6].sqrt(),
          self.arr[7].sqrt(),
        ]}
      } else {
        Self { arr: [
          software_sqrt(self.arr[0] as f64) as f32,
          software_sqrt(self.arr[1] as f64) as f32,
          software_sqrt(self.arr[2] as f64) as f32,
          software_sqrt(self.arr[3] as f64) as f32,
          software_sqrt(self.arr[4] as f64) as f32,
          software_sqrt(self.arr[5] as f64) as f32,
          software_sqrt(self.arr[6] as f64) as f32,
          software_sqrt(self.arr[7] as f64) as f32,
        ]}
      }
    }
  }
}
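// Illustrative check (not in the upstream source): `sqrt` is exact for
// perfect squares on every backend. Note that `recip` and `recip_sqrt` use
// fast approximate instructions on x86, so don't expect exact results there.
#[cfg(test)]
#[test]
fn demo_f32x8_sqrt() {
  let v = f32x8::new([0.0, 1.0, 4.0, 9.0, 16.0, 25.0, 36.0, 49.0]);
  assert_eq!(v.sqrt().to_array(), [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]);
}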
impl f32x8 {
  #[inline]
  #[must_use]
  pub fn move_mask(self) -> i32 {
    pick! {
      if #[cfg(target_feature="avx")] {
        move_mask_m256(self.avx)
      } else if #[cfg(target_feature="sse2")] {
        (move_mask_m128(self.sse1) << 4) ^ move_mask_m128(self.sse0)
      } else if #[cfg(target_feature="simd128")] {
        ((u32x4_bitmask(self.simd1) as i32) << 4) ^ u32x4_bitmask(self.simd0) as i32
      } else {
        (((self.arr[0].to_bits() as i32) < 0) as i32) << 0 |
        (((self.arr[1].to_bits() as i32) < 0) as i32) << 1 |
        (((self.arr[2].to_bits() as i32) < 0) as i32) << 2 |
        (((self.arr[3].to_bits() as i32) < 0) as i32) << 3 |
        (((self.arr[4].to_bits() as i32) < 0) as i32) << 4 |
        (((self.arr[5].to_bits() as i32) < 0) as i32) << 5 |
        (((self.arr[6].to_bits() as i32) < 0) as i32) << 6 |
        (((self.arr[7].to_bits() as i32) < 0) as i32) << 7
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn any(self) -> bool {
    pick! {
      if #[cfg(target_feature="simd128")] {
        v128_any_true(self.simd0) | v128_any_true(self.simd1)
      } else {
        self.move_mask() != 0
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn all(self) -> bool {
    pick! {
      if #[cfg(target_feature="simd128")] {
        u32x4_all_true(self.simd0) & u32x4_all_true(self.simd1)
      } else {
        // eight lanes
        self.move_mask() == 0b11111111
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn none(self) -> bool {
    !self.any()
  }
}
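// Illustrative check (not in the upstream source): `move_mask` packs each
// lane's sign bit into one integer, lane 0 in bit 0.
#[cfg(test)]
#[test]
fn demo_f32x8_move_mask() {
  let v = f32x8::new([-1.0, 1.0, -2.0, 2.0, -3.0, 3.0, -4.0, 4.0]);
  assert_eq!(v.move_mask(), 0b0101_0101);
  assert!(v.any());
  assert!(!v.all());
}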
impl f32x8 {
  #[inline]
  #[allow(non_upper_case_globals)]
  fn vm_pow2n(self) -> Self {
    const_f32_as_f32x8!(pow2_23, 8388608.0);
    const_f32_as_f32x8!(bias, 127.0);
    let a = self + (bias + pow2_23);
    let c = cast::<_, i32x8>(a) << 23;
    cast::<_, f32x8>(c)
  }

  /// Calculates `e` raised to the power of each lane (the exponential
  /// function) of a packed f32x8.
  #[inline]
  #[must_use]
  #[allow(non_upper_case_globals)]
  pub fn exp(self) -> Self {
    const_f32_as_f32x8!(P0, 1.0 / 2.0);
    const_f32_as_f32x8!(P1, 1.0 / 6.0);
    const_f32_as_f32x8!(P2, 1. / 24.);
    const_f32_as_f32x8!(P3, 1. / 120.);
    const_f32_as_f32x8!(P4, 1. / 720.);
    const_f32_as_f32x8!(P5, 1. / 5040.);
    const_f32_as_f32x8!(LN2D_HI, 0.693359375);
    const_f32_as_f32x8!(LN2D_LO, -2.12194440e-4);
    let max_x = f32x8::from(87.3);
    let r = (self * Self::LOG2_E).round();
    let x = r.mul_neg_add(LN2D_HI, self);
    let x = r.mul_neg_add(LN2D_LO, x);
    let z = polynomial_5!(x, P0, P1, P2, P3, P4, P5);
    let x2 = x * x;
    let z = z.mul_add(x2, x);
    let n2 = Self::vm_pow2n(r);
    let z = (z + Self::ONE) * n2;
    // check for overflow
    let in_range = self.abs().cmp_lt(max_x);
    let in_range = in_range & self.is_finite();
    in_range.blend(z, Self::ZERO)
  }

  #[inline]
  #[allow(non_upper_case_globals)]
  fn exponent(self) -> f32x8 {
    const_f32_as_f32x8!(pow2_23, 8388608.0);
    const_f32_as_f32x8!(bias, 127.0);
    let a = cast::<_, u32x8>(self);
    let b = a >> 23;
    let c = b | cast::<_, u32x8>(pow2_23);
    let d = cast::<_, f32x8>(c);
    let e = d - (pow2_23 + bias);
    e
  }

  #[inline]
  #[allow(non_upper_case_globals)]
  fn fraction_2(self) -> Self {
    let t1 = cast::<_, u32x8>(self);
    let t2 = cast::<_, u32x8>(
      (t1 & u32x8::from(0x007FFFFF)) | u32x8::from(0x3F000000),
    );
    cast::<_, f32x8>(t2)
  }

  fn is_zero_or_subnormal(self) -> Self {
    let t = cast::<_, i32x8>(self);
    let t = t & i32x8::splat(0x7F800000);
    i32x8::round_float(t.cmp_eq(i32x8::splat(0)))
  }

  fn infinity() -> Self {
    cast::<_, f32x8>(i32x8::splat(0x7F800000))
  }

  fn nan_log() -> Self {
    cast::<_, f32x8>(i32x8::splat(0x7FC00000 | 0x101 & 0x003FFFFF))
  }

  fn nan_pow() -> Self {
    cast::<_, f32x8>(i32x8::splat(0x7FC00000 | 0x101 & 0x003FFFFF))
  }

  pub fn sign_bit(self) -> Self {
    let t1 = cast::<_, i32x8>(self);
    let t2 = t1 >> 31;
    !cast::<_, f32x8>(t2).cmp_eq(f32x8::ZERO)
  }
}
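// Illustrative check (not in the upstream source): `exp` is built from a
// polynomial plus the power-of-two trick in `vm_pow2n`; it is exact at 0 and
// close at 1.
#[cfg(test)]
#[test]
fn demo_f32x8_exp() {
  assert_eq!(f32x8::ZERO.exp().to_array(), [1.0; 8]);
  let err = (f32x8::ONE.exp() - f32x8::E).abs().to_array();
  for e in err {
    assert!(e < 1e-5);
  }
}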
impl f32x8 {
  pub fn reduce_add(self) -> f32 {
    pick! {
      // From https://stackoverflow.com/questions/13219146/how-to-sum-m256-horizontally
      if #[cfg(target_feature="avx")] {
        let hi_quad = extract_m128_from_m256::<1>(self.avx);
        let lo_quad = cast_to_m128_from_m256(self.avx);
        let sum_quad = add_m128(lo_quad, hi_quad);
        let lo_dual = sum_quad;
        let hi_dual = move_high_low_m128(sum_quad, sum_quad);
        let sum_dual = add_m128(lo_dual, hi_dual);
        let lo = sum_dual;
        let hi = shuffle_abi_f32_all_m128::<0b_01>(sum_dual, sum_dual);
        let sum = add_m128_s(lo, hi);
        get_f32_from_m128_s(sum)
      } else if #[cfg(target_feature="sse3")] {
        let a = add_horizontal_m128(self.sse0, self.sse0);
        let b = add_horizontal_m128(a, a);
        let c = add_horizontal_m128(self.sse1, self.sse1);
        let d = add_horizontal_m128(c, c);
        let sum = add_m128_s(b, d);
        get_f32_from_m128_s(sum)
      } else {
        let arr: [f32; 8] = cast(self);
        arr.iter().sum()
      }
    }
  }

  /// Natural log (ln(x))
  #[inline]
  #[must_use]
  #[allow(non_upper_case_globals)]
  pub fn ln(self) -> Self {
    const_f32_as_f32x8!(HALF, 0.5);
    const_f32_as_f32x8!(P0, 3.3333331174E-1);
    const_f32_as_f32x8!(P1, -2.4999993993E-1);
    const_f32_as_f32x8!(P2, 2.0000714765E-1);
    const_f32_as_f32x8!(P3, -1.6668057665E-1);
    const_f32_as_f32x8!(P4, 1.4249322787E-1);
    const_f32_as_f32x8!(P5, -1.2420140846E-1);
    const_f32_as_f32x8!(P6, 1.1676998740E-1);
    const_f32_as_f32x8!(P7, -1.1514610310E-1);
    const_f32_as_f32x8!(P8, 7.0376836292E-2);
    const_f32_as_f32x8!(LN2F_HI, 0.693359375);
    const_f32_as_f32x8!(LN2F_LO, -2.12194440e-4);
    const_f32_as_f32x8!(VM_SMALLEST_NORMAL, 1.17549435E-38);

    let x1 = self;
    let x = Self::fraction_2(x1);
    let e = Self::exponent(x1);
    let mask = x.cmp_gt(Self::SQRT_2 * HALF);
    let x = (!mask).blend(x + x, x);
    let fe = mask.blend(e + Self::ONE, e);
    let x = x - Self::ONE;
    let res = polynomial_8!(x, P0, P1, P2, P3, P4, P5, P6, P7, P8);
    let x2 = x * x;
    let res = x2 * x * res;
    let res = fe.mul_add(LN2F_LO, res);
    let res = res + x2.mul_neg_add(HALF, x);
    let res = fe.mul_add(LN2F_HI, res);
    let overflow = !self.is_finite();
    let underflow = x1.cmp_lt(VM_SMALLEST_NORMAL);
    let mask = overflow | underflow;
    if !mask.any() {
      res
    } else {
      let is_zero = self.is_zero_or_subnormal();
      let res = underflow.blend(Self::nan_log(), res);
      let res = is_zero.blend(Self::infinity(), res);
      let res = overflow.blend(self, res);
      res
    }
  }

  #[inline]
  #[must_use]
  pub fn log2(self) -> Self {
    Self::ln(self) * Self::LOG2_E
  }
  #[inline]
  #[must_use]
  pub fn log10(self) -> Self {
    Self::ln(self) * Self::LOG10_E
  }
}
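// Illustrative check (not in the upstream source): `reduce_add` sums the
// lanes horizontally, and `ln` is the natural log, so ln(1) is exactly 0.
#[cfg(test)]
#[test]
fn demo_f32x8_reduce_add_and_ln() {
  let v = f32x8::new([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
  assert_eq!(v.reduce_add(), 36.0);
  assert_eq!(f32x8::ONE.ln().to_array(), [0.0; 8]);
}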
impl f32x8 {
  #[inline]
  #[must_use]
  #[allow(non_upper_case_globals)]
  pub fn pow_f32x8(self, y: Self) -> Self {
    const_f32_as_f32x8!(ln2f_hi, 0.693359375);
    const_f32_as_f32x8!(ln2f_lo, -2.12194440e-4);
    const_f32_as_f32x8!(P0logf, 3.3333331174E-1);
    const_f32_as_f32x8!(P1logf, -2.4999993993E-1);
    const_f32_as_f32x8!(P2logf, 2.0000714765E-1);
    const_f32_as_f32x8!(P3logf, -1.6668057665E-1);
    const_f32_as_f32x8!(P4logf, 1.4249322787E-1);
    const_f32_as_f32x8!(P5logf, -1.2420140846E-1);
    const_f32_as_f32x8!(P6logf, 1.1676998740E-1);
    const_f32_as_f32x8!(P7logf, -1.1514610310E-1);
    const_f32_as_f32x8!(P8logf, 7.0376836292E-2);

    const_f32_as_f32x8!(p2expf, 1.0 / 2.0); // coefficients for Taylor expansion of exp
    const_f32_as_f32x8!(p3expf, 1.0 / 6.0);
    const_f32_as_f32x8!(p4expf, 1.0 / 24.0);
    const_f32_as_f32x8!(p5expf, 1.0 / 120.0);
    const_f32_as_f32x8!(p6expf, 1.0 / 720.0);
    const_f32_as_f32x8!(p7expf, 1.0 / 5040.0);

    let x1 = self.abs();
    let x = x1.fraction_2();
    let mask = x.cmp_gt(f32x8::SQRT_2 * f32x8::HALF);
    let x = (!mask).blend(x + x, x);

    let x = x - f32x8::ONE;
    let x2 = x * x;
    let lg1 = polynomial_8!(
      x, P0logf, P1logf, P2logf, P3logf, P4logf, P5logf, P6logf, P7logf, P8logf
    );
    let lg1 = lg1 * x2 * x;

    let ef = x1.exponent();
    let ef = mask.blend(ef + f32x8::ONE, ef);
    let e1 = (ef * y).round();
    let yr = ef.mul_sub(y, e1);

    let lg = f32x8::HALF.mul_neg_add(x2, x) + lg1;
    let x2_err = (f32x8::HALF * x).mul_sub(x, f32x8::HALF * x2);
    let lg_err = f32x8::HALF.mul_add(x2, lg - x) - lg1;

    let e2 = (lg * y * f32x8::LOG2_E).round();
    let v = lg.mul_sub(y, e2 * ln2f_hi);
    let v = e2.mul_neg_add(ln2f_lo, v);
    let v = v - (lg_err + x2_err).mul_sub(y, yr * f32x8::LN_2);

    let x = v;
    let e3 = (x * f32x8::LOG2_E).round();
    let x = e3.mul_neg_add(f32x8::LN_2, x);
    let x2 = x * x;
    let z = x2.mul_add(
      polynomial_5!(x, p2expf, p3expf, p4expf, p5expf, p6expf, p7expf),
      x + f32x8::ONE,
    );

    let ee = e1 + e2 + e3;
    let ei = cast::<_, i32x8>(ee.round_int());
    let ej = cast::<_, i32x8>(ei + (cast::<_, i32x8>(z) >> 23));

    let overflow = cast::<_, f32x8>(ej.cmp_gt(i32x8::splat(0x0FF)))
      | (ee.cmp_gt(f32x8::splat(300.0)));
    let underflow = cast::<_, f32x8>(ej.cmp_lt(i32x8::splat(0x000)))
      | (ee.cmp_lt(f32x8::splat(-300.0)));

    // Add exponent by integer addition
    let z = cast::<_, f32x8>(cast::<_, i32x8>(z) + (ei << 23));
    // Check for overflow/underflow
    let z = underflow.blend(f32x8::ZERO, z);
    let z = overflow.blend(Self::infinity(), z);

    // Check for self == 0
    let x_zero = self.is_zero_or_subnormal();
    let z = x_zero.blend(
      y.cmp_lt(f32x8::ZERO).blend(
        Self::infinity(),
        y.cmp_eq(f32x8::ZERO).blend(f32x8::ONE, f32x8::ZERO),
      ),
      z,
    );

    let x_sign = self.sign_bit();
    let z = if x_sign.any() {
      // Is y an integer?
      let yi = y.cmp_eq(y.round());

      // Is y odd?
      let y_odd = cast::<_, i32x8>(y.round_int() << 31).round_float();

      let z1 =
        yi.blend(z | y_odd, self.cmp_eq(Self::ZERO).blend(z, Self::nan_pow()));

      x_sign.blend(z1, z)
    } else {
      z
    };

    let x_finite = self.is_finite();
    let y_finite = y.is_finite();
    let e_finite = ee.is_finite();
    if (x_finite & y_finite & (e_finite | x_zero)).all() {
      return z;
    }

    (self.is_nan() | y.is_nan()).blend(self + y, z)
  }

  pub fn powf(self, y: f32) -> Self {
    Self::pow_f32x8(self, f32x8::splat(y))
  }

  pub fn to_array(self) -> [f32; 8] {
    cast(self)
  }

  pub fn as_array_ref(&self) -> &[f32; 8] {
    cast_ref(self)
  }
}
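// Illustrative check (not in the upstream source): `powf` goes through the
// log/exp machinery in `pow_f32x8`, so compare with a tolerance rather than
// exactly.
#[cfg(test)]
#[test]
fn demo_f32x8_powf() {
  let err = (f32x8::splat(2.0).powf(10.0) - f32x8::splat(1024.0)).abs().to_array();
  for e in err {
    assert!(e < 1e-2);
  }
}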
impl Not for f32x8 {
  type Output = Self;
  fn not(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: self.avx.not() }
      } else if #[cfg(target_feature="sse2")] {
        Self { sse0: self.sse0.not(), sse1: self.sse1.not() }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd0: v128_not(self.simd0), simd1: v128_not(self.simd1) }
      } else {
        Self { arr: [
          f32::from_bits(!self.arr[0].to_bits()),
          f32::from_bits(!self.arr[1].to_bits()),
          f32::from_bits(!self.arr[2].to_bits()),
          f32::from_bits(!self.arr[3].to_bits()),
          f32::from_bits(!self.arr[4].to_bits()),
          f32::from_bits(!self.arr[5].to_bits()),
          f32::from_bits(!self.arr[6].to_bits()),
          f32::from_bits(!self.arr[7].to_bits()),
        ]}
      }
    }
  }
}
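// Illustrative check (not in the upstream source): `!` flips every bit of
// every lane, turning an all-zero mask into an all-ones mask.
#[cfg(test)]
#[test]
fn demo_f32x8_not() {
  let bits: [u32; 8] = cast(!f32x8::ZERO);
  assert_eq!(bits, [u32::MAX; 8]);
}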