// pathfinder/simd/src/x86.rs
//
// Copyright © 2019 The Pathfinder Project Developers.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use std::cmp::PartialEq;
use std::fmt::{self, Debug, Formatter};
use std::mem;
use std::ops::{Add, BitAnd, BitOr, BitXor, Div, Index, IndexMut, Mul, Not, Shr, Sub};

#[cfg(target_pointer_width = "32")]
use std::arch::x86::{__m128, __m128i};
#[cfg(target_pointer_width = "32")]
use std::arch::x86;
#[cfg(target_pointer_width = "64")]
use std::arch::x86_64::{__m128, __m128i};
#[cfg(target_pointer_width = "64")]
use std::arch::x86_64 as x86;

mod swizzle_f32x4;
mod swizzle_i32x4;

// Two 32-bit floats

#[derive(Clone, Copy)]
pub struct F32x2(pub u64);

impl F32x2 {
    // Constructors

    #[inline]
    pub fn new(a: f32, b: f32) -> F32x2 {
        unsafe {
            let a = mem::transmute::<*const f32, *const u32>(&a);
            let b = mem::transmute::<*const f32, *const u32>(&b);
            F32x2((*a as u64) | ((*b as u64) << 32))
        }
    }

    #[inline]
    pub fn splat(x: f32) -> F32x2 {
        F32x2::new(x, x)
    }

    // Basic operations

    #[inline]
    pub fn approx_recip(self) -> F32x2 {
        self.to_f32x4().approx_recip().xy()
    }

    #[inline]
    pub fn min(self, other: F32x2) -> F32x2 {
        self.to_f32x4().min(other.to_f32x4()).xy()
    }

    #[inline]
    pub fn max(self, other: F32x2) -> F32x2 {
        self.to_f32x4().max(other.to_f32x4()).xy()
    }

    #[inline]
    pub fn clamp(self, min: F32x2, max: F32x2) -> F32x2 {
        self.to_f32x4().clamp(min.to_f32x4(), max.to_f32x4()).xy()
    }

    #[inline]
    pub fn abs(self) -> F32x2 {
        self.to_f32x4().abs().xy()
    }

    #[inline]
    pub fn floor(self) -> F32x2 {
        self.to_f32x4().floor().xy()
    }

    #[inline]
    pub fn ceil(self) -> F32x2 {
        self.to_f32x4().ceil().xy()
    }

    #[inline]
    pub fn sqrt(self) -> F32x2 {
        self.to_f32x4().sqrt().xy()
    }

    // Packed comparisons

    #[inline]
    pub fn packed_eq(self, other: F32x2) -> U32x2 {
        self.to_f32x4().packed_eq(other.to_f32x4()).xy()
    }

    #[inline]
    pub fn packed_gt(self, other: F32x2) -> U32x2 {
        self.to_f32x4().packed_gt(other.to_f32x4()).xy()
    }

    #[inline]
    pub fn packed_lt(self, other: F32x2) -> U32x2 {
        self.to_f32x4().packed_lt(other.to_f32x4()).xy()
    }

    #[inline]
    pub fn packed_le(self, other: F32x2) -> U32x2 {
        self.to_f32x4().packed_le(other.to_f32x4()).xy()
    }

    // Conversions

    #[inline]
    pub fn to_f32x4(self) -> F32x4 {
        unsafe {
            let mut result = F32x4::default();
            *mem::transmute::<&mut __m128, &mut u64>(&mut result.0) = self.0;
            result
        }
    }

    #[inline]
    pub fn to_i32x2(self) -> I32x2 {
        self.to_i32x4().xy()
    }
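    /// Converts these packed floats to integers by way of `F32x4::to_i32x4`,
    /// which rounds to nearest in the default rounding mode; the two high
    /// lanes are zero-filled. Illustrative sketch (marked `ignore`; assumes
    /// the vector types are in scope):
    ///
    /// ```ignore
    /// assert_eq!(F32x2::new(1.25, -2.75).to_i32x4(), I32x4::new(1, -3, 0, 0));
    /// ```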
    #[inline]
    pub fn to_i32x4(self) -> I32x4 {
        self.to_f32x4().to_i32x4()
    }

    // Swizzle

    #[inline]
    pub fn yx(self) -> F32x2 {
        self.to_f32x4().yx()
    }

    // Concatenations

    #[inline]
    pub fn concat_xy_xy(self, other: F32x2) -> F32x4 {
        self.to_f32x4().concat_xy_xy(other.to_f32x4())
    }
}

impl Default for F32x2 {
    #[inline]
    fn default() -> F32x2 {
        F32x2(0)
    }
}

impl Index<usize> for F32x2 {
    type Output = f32;
    #[inline]
    fn index(&self, index: usize) -> &f32 {
        unsafe { &mem::transmute::<&u64, &[f32; 2]>(&self.0)[index] }
    }
}

impl IndexMut<usize> for F32x2 {
    #[inline]
    fn index_mut(&mut self, index: usize) -> &mut f32 {
        unsafe { &mut mem::transmute::<&mut u64, &mut [f32; 2]>(&mut self.0)[index] }
    }
}

impl Debug for F32x2 {
    #[inline]
    fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
        write!(f, "<{}, {}>", self[0], self[1])
    }
}

impl PartialEq for F32x2 {
    #[inline]
    fn eq(&self, other: &F32x2) -> bool {
        self.packed_eq(*other).all_true()
    }
}

impl Add<F32x2> for F32x2 {
    type Output = F32x2;
    #[inline]
    fn add(self, other: F32x2) -> F32x2 {
        (self.to_f32x4() + other.to_f32x4()).xy()
    }
}

impl Div<F32x2> for F32x2 {
    type Output = F32x2;
    #[inline]
    fn div(self, other: F32x2) -> F32x2 {
        (self.to_f32x4() / other.to_f32x4()).xy()
    }
}

impl Mul<F32x2> for F32x2 {
    type Output = F32x2;
    #[inline]
    fn mul(self, other: F32x2) -> F32x2 {
        (self.to_f32x4() * other.to_f32x4()).xy()
    }
}

impl Sub<F32x2> for F32x2 {
    type Output = F32x2;
    #[inline]
    fn sub(self, other: F32x2) -> F32x2 {
        (self.to_f32x4() - other.to_f32x4()).xy()
    }
}

// Four 32-bit floats

#[derive(Clone, Copy)]
pub struct F32x4(pub __m128);

impl F32x4 {
    // Constructors

    #[inline]
    pub fn new(a: f32, b: f32, c: f32, d: f32) -> F32x4 {
        unsafe {
            let vector = [a, b, c, d];
            F32x4(x86::_mm_loadu_ps(vector.as_ptr()))
        }
    }

    #[inline]
    pub fn splat(x: f32) -> F32x4 {
        unsafe { F32x4(x86::_mm_set1_ps(x)) }
    }

    // Basic operations

    #[inline]
    pub fn approx_recip(self) -> F32x4 {
        unsafe { F32x4(x86::_mm_rcp_ps(self.0)) }
    }

    #[inline]
    pub fn min(self, other: F32x4) -> F32x4 {
        unsafe { F32x4(x86::_mm_min_ps(self.0, other.0)) }
    }

    #[inline]
    pub fn max(self, other: F32x4) -> F32x4 {
        unsafe { F32x4(x86::_mm_max_ps(self.0, other.0)) }
    }

    #[inline]
    pub fn clamp(self, min: F32x4, max: F32x4) -> F32x4 {
        self.max(min).min(max)
    }

    #[inline]
    pub fn abs(self) -> F32x4 {
        unsafe {
            // Shift all-ones lanes right by one to build the 0x7fffffff mask,
            // then clear each lane's sign bit.
            let tmp = x86::_mm_srli_epi32(I32x4::splat(-1).0, 1);
            F32x4(x86::_mm_and_ps(x86::_mm_castsi128_ps(tmp), self.0))
        }
    }
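    /// Rounds each lane down to the nearest integer value (`_mm_floor_ps`, an
    /// SSE4.1 instruction). Illustrative sketch (marked `ignore`; assumes the
    /// vector types are in scope):
    ///
    /// ```ignore
    /// assert_eq!(F32x4::new(1.9, -1.1, 0.5, 2.0).floor(),
    ///            F32x4::new(1.0, -2.0, 0.0, 2.0));
    /// ```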
    #[inline]
    pub fn floor(self) -> F32x4 {
        unsafe { F32x4(x86::_mm_floor_ps(self.0)) }
    }

    #[inline]
    pub fn ceil(self) -> F32x4 {
        unsafe { F32x4(x86::_mm_ceil_ps(self.0)) }
    }

    #[inline]
    pub fn sqrt(self) -> F32x4 {
        unsafe { F32x4(x86::_mm_sqrt_ps(self.0)) }
    }

    // Packed comparisons

    #[inline]
    pub fn packed_eq(self, other: F32x4) -> U32x4 {
        unsafe { U32x4(x86::_mm_castps_si128(x86::_mm_cmpeq_ps(self.0, other.0))) }
    }

    #[inline]
    pub fn packed_gt(self, other: F32x4) -> U32x4 {
        unsafe { U32x4(x86::_mm_castps_si128(x86::_mm_cmpgt_ps(self.0, other.0))) }
    }

    #[inline]
    pub fn packed_lt(self, other: F32x4) -> U32x4 {
        other.packed_gt(self)
    }

    #[inline]
    pub fn packed_le(self, other: F32x4) -> U32x4 {
        !self.packed_gt(other)
    }

    // Conversions

    /// Converts these packed floats to integers via rounding.
    #[inline]
    pub fn to_i32x4(self) -> I32x4 {
        unsafe { I32x4(x86::_mm_cvtps_epi32(self.0)) }
    }

    // Extraction

    #[inline]
    pub fn xy(self) -> F32x2 {
        unsafe {
            let swizzled = self.0;
            F32x2(*mem::transmute::<&__m128, &u64>(&swizzled))
        }
    }

    #[inline]
    pub fn xw(self) -> F32x2 {
        self.xwyz().xy()
    }

    #[inline]
    pub fn yx(self) -> F32x2 {
        self.yxwz().xy()
    }

    #[inline]
    pub fn zy(self) -> F32x2 {
        self.zyxw().xy()
    }

    #[inline]
    pub fn zw(self) -> F32x2 {
        self.zwxy().xy()
    }

    // Concatenations

    #[inline]
    pub fn concat_xy_xy(self, other: F32x4) -> F32x4 {
        unsafe {
            let this = x86::_mm_castps_pd(self.0);
            let other = x86::_mm_castps_pd(other.0);
            let result = x86::_mm_unpacklo_pd(this, other);
            F32x4(x86::_mm_castpd_ps(result))
        }
    }

    #[inline]
    pub fn concat_xy_zw(self, other: F32x4) -> F32x4 {
        unsafe {
            let this = x86::_mm_castps_pd(self.0);
            let other = x86::_mm_castps_pd(other.0);
            let result = x86::_mm_shuffle_pd(this, other, 0b10);
            F32x4(x86::_mm_castpd_ps(result))
        }
    }

    #[inline]
    pub fn concat_zw_zw(self, other: F32x4) -> F32x4 {
        unsafe {
            let this = x86::_mm_castps_pd(self.0);
            let other = x86::_mm_castps_pd(other.0);
            let result = x86::_mm_unpackhi_pd(this, other);
            F32x4(x86::_mm_castpd_ps(result))
        }
    }

    #[inline]
    pub fn concat_wz_yx(self, other: F32x4) -> F32x4 {
        unsafe { F32x4(x86::_mm_shuffle_ps(self.0, other.0, 0b0001_1011)) }
    }
}

impl Default for F32x4 {
    #[inline]
    fn default() -> F32x4 {
        unsafe { F32x4(x86::_mm_setzero_ps()) }
    }
}

impl Index<usize> for F32x4 {
    type Output = f32;
    #[inline]
    fn index(&self, index: usize) -> &f32 {
        unsafe { &mem::transmute::<&__m128, &[f32; 4]>(&self.0)[index] }
    }
}
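/// Mutable lane access. Illustrative sketch (marked `ignore`; assumes the
/// vector types are in scope):
///
/// ```ignore
/// let mut v = F32x4::splat(0.0);
/// v[2] = 5.0;
/// assert_eq!(v, F32x4::new(0.0, 0.0, 5.0, 0.0));
/// ```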
impl IndexMut<usize> for F32x4 {
    #[inline]
    fn index_mut(&mut self, index: usize) -> &mut f32 {
        unsafe { &mut mem::transmute::<&mut __m128, &mut [f32; 4]>(&mut self.0)[index] }
    }
}

impl Debug for F32x4 {
    #[inline]
    fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
        write!(f, "<{}, {}, {}, {}>", self[0], self[1], self[2], self[3])
    }
}

impl PartialEq for F32x4 {
    #[inline]
    fn eq(&self, other: &F32x4) -> bool {
        self.packed_eq(*other).all_true()
    }
}

impl Add<F32x4> for F32x4 {
    type Output = F32x4;
    #[inline]
    fn add(self, other: F32x4) -> F32x4 {
        unsafe { F32x4(x86::_mm_add_ps(self.0, other.0)) }
    }
}

impl Div<F32x4> for F32x4 {
    type Output = F32x4;
    #[inline]
    fn div(self, other: F32x4) -> F32x4 {
        unsafe { F32x4(x86::_mm_div_ps(self.0, other.0)) }
    }
}

impl Mul<F32x4> for F32x4 {
    type Output = F32x4;
    #[inline]
    fn mul(self, other: F32x4) -> F32x4 {
        unsafe { F32x4(x86::_mm_mul_ps(self.0, other.0)) }
    }
}

impl Sub<F32x4> for F32x4 {
    type Output = F32x4;
    #[inline]
    fn sub(self, other: F32x4) -> F32x4 {
        unsafe { F32x4(x86::_mm_sub_ps(self.0, other.0)) }
    }
}

// Two 32-bit signed integers

#[derive(Clone, Copy)]
pub struct I32x2(pub u64);

impl I32x2 {
    // Constructors

    #[inline]
    pub fn new(a: i32, b: i32) -> I32x2 {
        unsafe {
            let a = mem::transmute::<*const i32, *const u32>(&a);
            let b = mem::transmute::<*const i32, *const u32>(&b);
            I32x2((*a as u64) | ((*b as u64) << 32))
        }
    }

    #[inline]
    pub fn splat(x: i32) -> I32x2 {
        I32x2::new(x, x)
    }

    // Accessors

    #[inline]
    pub fn x(self) -> i32 {
        self[0]
    }

    #[inline]
    pub fn y(self) -> i32 {
        self[1]
    }

    // Concatenations

    #[inline]
    pub fn concat_xy_xy(self, other: I32x2) -> I32x4 {
        self.to_i32x4().concat_xy_xy(other.to_i32x4())
    }

    // Conversions

    #[inline]
    pub fn to_i32x4(self) -> I32x4 {
        unsafe {
            let mut result = I32x4::default();
            *mem::transmute::<&mut __m128i, &mut u64>(&mut result.0) = self.0;
            result
        }
    }

    #[inline]
    pub fn to_f32x4(self) -> F32x4 {
        self.to_i32x4().to_f32x4()
    }

    /// Converts these packed integers to floats.
    #[inline]
    pub fn to_f32x2(self) -> F32x2 {
        self.to_f32x4().xy()
    }

    // Basic operations

    #[inline]
    pub fn max(self, other: I32x2) -> I32x2 {
        self.to_i32x4().max(other.to_i32x4()).xy()
    }

    #[inline]
    pub fn min(self, other: I32x2) -> I32x2 {
        self.to_i32x4().min(other.to_i32x4()).xy()
    }

    // Comparisons

    // TODO(pcwalton): Use the `U32x2` type!
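    /// Lane-wise equality. Note the `U32x4` return type (see the TODO above):
    /// `to_i32x4` zero-fills the two high lanes on both sides, so those lanes
    /// always compare equal. Illustrative sketch (marked `ignore`; assumes the
    /// vector types are in scope):
    ///
    /// ```ignore
    /// assert!(I32x2::new(1, 2).packed_eq(I32x2::new(1, 2)).all_true());
    /// ```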
    #[inline]
    pub fn packed_eq(self, other: I32x2) -> U32x4 {
        self.to_i32x4().packed_eq(other.to_i32x4())
    }

    #[inline]
    pub fn packed_gt(self, other: I32x2) -> U32x4 {
        self.to_i32x4().packed_gt(other.to_i32x4())
    }

    #[inline]
    pub fn packed_le(self, other: I32x2) -> U32x4 {
        self.to_i32x4().packed_le(other.to_i32x4())
    }
}

impl Default for I32x2 {
    #[inline]
    fn default() -> I32x2 {
        I32x2(0)
    }
}

impl Index<usize> for I32x2 {
    type Output = i32;
    #[inline]
    fn index(&self, index: usize) -> &i32 {
        unsafe { &mem::transmute::<&u64, &[i32; 2]>(&self.0)[index] }
    }
}

impl IndexMut<usize> for I32x2 {
    #[inline]
    fn index_mut(&mut self, index: usize) -> &mut i32 {
        unsafe { &mut mem::transmute::<&mut u64, &mut [i32; 2]>(&mut self.0)[index] }
    }
}

impl Add<I32x2> for I32x2 {
    type Output = I32x2;
    #[inline]
    fn add(self, other: I32x2) -> I32x2 {
        (self.to_i32x4() + other.to_i32x4()).xy()
    }
}

impl Sub<I32x2> for I32x2 {
    type Output = I32x2;
    #[inline]
    fn sub(self, other: I32x2) -> I32x2 {
        (self.to_i32x4() - other.to_i32x4()).xy()
    }
}

impl Mul<I32x2> for I32x2 {
    type Output = I32x2;
    #[inline]
    fn mul(self, other: I32x2) -> I32x2 {
        (self.to_i32x4() * other.to_i32x4()).xy()
    }
}

impl Debug for I32x2 {
    #[inline]
    fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
        write!(f, "<{}, {}>", self[0], self[1])
    }
}

impl PartialEq for I32x2 {
    #[inline]
    fn eq(&self, other: &I32x2) -> bool {
        self.packed_eq(*other).all_true()
    }
}

// Four 32-bit signed integers

#[derive(Clone, Copy)]
pub struct I32x4(pub __m128i);

impl I32x4 {
    // Constructors

    #[inline]
    pub fn new(a: i32, b: i32, c: i32, d: i32) -> I32x4 {
        unsafe {
            let vector = [a, b, c, d];
            I32x4(x86::_mm_loadu_si128(vector.as_ptr() as *const __m128i))
        }
    }

    #[inline]
    pub fn splat(x: i32) -> I32x4 {
        unsafe { I32x4(x86::_mm_set1_epi32(x)) }
    }

    // Extraction

    #[inline]
    pub fn xy(self) -> I32x2 {
        unsafe {
            let swizzled = self.0;
            I32x2(*mem::transmute::<&__m128i, &u64>(&swizzled))
        }
    }

    #[inline]
    pub fn xw(self) -> I32x2 {
        self.xwyz().xy()
    }

    #[inline]
    pub fn yx(self) -> I32x2 {
        self.yxwz().xy()
    }

    #[inline]
    pub fn zy(self) -> I32x2 {
        self.zyxw().xy()
    }

    #[inline]
    pub fn zw(self) -> I32x2 {
        self.zwxy().xy()
    }

    // Concatenations

    #[inline]
    pub fn concat_xy_xy(self, other: I32x4) -> I32x4 {
        unsafe {
            let this = x86::_mm_castsi128_pd(self.0);
            let other = x86::_mm_castsi128_pd(other.0);
            let result = x86::_mm_unpacklo_pd(this, other);
            I32x4(x86::_mm_castpd_si128(result))
        }
    }
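    /// Concatenates the two high lanes of `self` with the two high lanes of
    /// `other`. Illustrative sketch (marked `ignore`; assumes the vector types
    /// are in scope):
    ///
    /// ```ignore
    /// let (a, b) = (I32x4::new(1, 2, 3, 4), I32x4::new(5, 6, 7, 8));
    /// assert_eq!(a.concat_zw_zw(b), I32x4::new(3, 4, 7, 8));
    /// ```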
    #[inline]
    pub fn concat_zw_zw(self, other: I32x4) -> I32x4 {
        unsafe {
            let this = x86::_mm_castsi128_pd(self.0);
            let other = x86::_mm_castsi128_pd(other.0);
            let result = x86::_mm_unpackhi_pd(this, other);
            I32x4(x86::_mm_castpd_si128(result))
        }
    }

    // Conversions

    /// Converts these packed integers to floats.
    #[inline]
    pub fn to_f32x4(self) -> F32x4 {
        unsafe { F32x4(x86::_mm_cvtepi32_ps(self.0)) }
    }

    /// Converts these packed signed integers to unsigned integers.
    ///
    /// Overflowing values will wrap around.
    #[inline]
    pub fn to_u32x4(self) -> U32x4 {
        U32x4(self.0)
    }

    // Basic operations

    #[inline]
    pub fn max(self, other: I32x4) -> I32x4 {
        unsafe { I32x4(x86::_mm_max_epi32(self.0, other.0)) }
    }

    #[inline]
    pub fn min(self, other: I32x4) -> I32x4 {
        unsafe { I32x4(x86::_mm_min_epi32(self.0, other.0)) }
    }

    // Packed comparisons

    #[inline]
    pub fn packed_eq(self, other: I32x4) -> U32x4 {
        unsafe { U32x4(x86::_mm_cmpeq_epi32(self.0, other.0)) }
    }

    #[inline]
    pub fn packed_gt(self, other: I32x4) -> U32x4 {
        unsafe { U32x4(x86::_mm_cmpgt_epi32(self.0, other.0)) }
    }

    #[inline]
    pub fn packed_lt(self, other: I32x4) -> U32x4 {
        other.packed_gt(self)
    }

    #[inline]
    pub fn packed_le(self, other: I32x4) -> U32x4 {
        !self.packed_gt(other)
    }
}

impl Default for I32x4 {
    #[inline]
    fn default() -> I32x4 {
        unsafe { I32x4(x86::_mm_setzero_si128()) }
    }
}

impl Index<usize> for I32x4 {
    type Output = i32;
    #[inline]
    fn index(&self, index: usize) -> &i32 {
        unsafe { &mem::transmute::<&__m128i, &[i32; 4]>(&self.0)[index] }
    }
}

impl IndexMut<usize> for I32x4 {
    #[inline]
    fn index_mut(&mut self, index: usize) -> &mut i32 {
        unsafe { &mut mem::transmute::<&mut __m128i, &mut [i32; 4]>(&mut self.0)[index] }
    }
}

impl Add<I32x4> for I32x4 {
    type Output = I32x4;
    #[inline]
    fn add(self, other: I32x4) -> I32x4 {
        unsafe { I32x4(x86::_mm_add_epi32(self.0, other.0)) }
    }
}

impl Sub<I32x4> for I32x4 {
    type Output = I32x4;
    #[inline]
    fn sub(self, other: I32x4) -> I32x4 {
        unsafe { I32x4(x86::_mm_sub_epi32(self.0, other.0)) }
    }
}

impl Mul<I32x4> for I32x4 {
    type Output = I32x4;
    #[inline]
    fn mul(self, other: I32x4) -> I32x4 {
        unsafe { I32x4(x86::_mm_mullo_epi32(self.0, other.0)) }
    }
}

impl BitAnd<I32x4> for I32x4 {
    type Output = I32x4;
    #[inline]
    fn bitand(self, other: I32x4) -> I32x4 {
        unsafe { I32x4(x86::_mm_and_si128(self.0, other.0)) }
    }
}

impl BitOr<I32x4> for I32x4 {
    type Output = I32x4;
    #[inline]
    fn bitor(self, other: I32x4) -> I32x4 {
        unsafe { I32x4(x86::_mm_or_si128(self.0, other.0)) }
    }
}
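/// Formats as `<x, y, z, w>`. Illustrative sketch (marked `ignore`; assumes
/// the vector types are in scope):
///
/// ```ignore
/// assert_eq!(format!("{:?}", I32x4::new(1, 2, 3, 4)), "<1, 2, 3, 4>");
/// ```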
impl Debug for I32x4 {
    #[inline]
    fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
        write!(f, "<{}, {}, {}, {}>", self[0], self[1], self[2], self[3])
    }
}

impl PartialEq for I32x4 {
    #[inline]
    fn eq(&self, other: &I32x4) -> bool {
        self.packed_eq(*other).all_true()
    }
}

// Two 32-bit unsigned integers

#[derive(Clone, Copy)]
pub struct U32x2(pub u64);

impl U32x2 {
    /// Returns true if both booleans in this vector are true.
    ///
    /// The result is *undefined* if the two values in this vector are not both
    /// booleans. A boolean is a value with all bits set or all bits clear
    /// (i.e. `!0` or `0`).
    #[inline]
    pub fn all_true(self) -> bool {
        self.0 == !0
    }

    /// Returns true if both booleans in this vector are false.
    ///
    /// The result is *undefined* if the two values in this vector are not both
    /// booleans. A boolean is a value with all bits set or all bits clear
    /// (i.e. `!0` or `0`).
    #[inline]
    pub fn all_false(self) -> bool {
        self.0 == 0
    }
}

// Four 32-bit unsigned integers

#[derive(Clone, Copy)]
pub struct U32x4(pub __m128i);

impl U32x4 {
    // Constructors

    #[inline]
    pub fn new(a: u32, b: u32, c: u32, d: u32) -> U32x4 {
        unsafe {
            let vector = [a, b, c, d];
            U32x4(x86::_mm_loadu_si128(vector.as_ptr() as *const __m128i))
        }
    }

    #[inline]
    pub fn splat(x: u32) -> U32x4 {
        unsafe { U32x4(x86::_mm_set1_epi32(x as i32)) }
    }

    // Conversions

    /// Converts these packed unsigned integers to signed integers.
    ///
    /// Overflowing values will wrap around.
    #[inline]
    pub fn to_i32x4(self) -> I32x4 {
        I32x4(self.0)
    }

    // Basic operations

    /// Returns true if all four booleans in this vector are true.
    ///
    /// The result is *undefined* if the four values in this vector are not all
    /// booleans. A boolean is a value with all bits set or all bits clear
    /// (i.e. `!0` or `0`).
    #[inline]
    pub fn all_true(self) -> bool {
        unsafe { x86::_mm_movemask_ps(x86::_mm_castsi128_ps(self.0)) == 0x0f }
    }

    /// Returns true if all four booleans in this vector are false.
    ///
    /// The result is *undefined* if the four values in this vector are not all
    /// booleans. A boolean is a value with all bits set or all bits clear
    /// (i.e. `!0` or `0`).
    #[inline]
    pub fn all_false(self) -> bool {
        unsafe { x86::_mm_movemask_ps(x86::_mm_castsi128_ps(self.0)) == 0x00 }
    }

    // Extraction
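    /// Extracts the two low lanes. Illustrative sketch (marked `ignore`;
    /// assumes the vector types are in scope):
    ///
    /// ```ignore
    /// let lo = U32x4::new(10, 20, 30, 40).xy();
    /// // `U32x2` packs both lanes into one `u64`, low lane in the low bits.
    /// assert_eq!(lo.0, 10 | (20 << 32));
    /// ```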
    #[inline]
    pub fn xy(self) -> U32x2 {
        unsafe {
            let swizzled = self.0;
            U32x2(*mem::transmute::<&__m128i, &u64>(&swizzled))
        }
    }

    // Packed comparisons

    #[inline]
    pub fn packed_eq(self, other: U32x4) -> U32x4 {
        unsafe { U32x4(x86::_mm_cmpeq_epi32(self.0, other.0)) }
    }
}

impl Debug for U32x4 {
    #[inline]
    fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
        write!(f, "<{}, {}, {}, {}>", self[0], self[1], self[2], self[3])
    }
}

impl Index<usize> for U32x4 {
    type Output = u32;
    #[inline]
    fn index(&self, index: usize) -> &u32 {
        unsafe { &mem::transmute::<&__m128i, &[u32; 4]>(&self.0)[index] }
    }
}

impl PartialEq for U32x4 {
    #[inline]
    fn eq(&self, other: &U32x4) -> bool {
        self.packed_eq(*other).all_true()
    }
}

impl Not for U32x4 {
    type Output = U32x4;
    #[inline]
    fn not(self) -> U32x4 {
        self ^ U32x4::splat(!0)
    }
}

impl BitXor<U32x4> for U32x4 {
    type Output = U32x4;
    #[inline]
    fn bitxor(self, other: U32x4) -> U32x4 {
        unsafe { U32x4(x86::_mm_xor_si128(self.0, other.0)) }
    }
}

impl Shr<u32> for U32x4 {
    type Output = U32x4;
    #[inline]
    fn shr(self, amount: u32) -> U32x4 {
        unsafe { U32x4(x86::_mm_srl_epi32(self.0, U32x4::new(amount, 0, 0, 0).0)) }
    }
}
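
// A minimal smoke-test sketch; assumes an x86/x86_64 target with SSE4.1
// available at runtime, since `Mul` for `I32x4` and `min`/`max` lower to
// SSE4.1 instructions.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn f32x4_arithmetic() {
        let a = F32x4::new(1.0, 2.0, 3.0, 4.0);
        let b = F32x4::splat(2.0);
        assert_eq!(a + b, F32x4::new(3.0, 4.0, 5.0, 6.0));
        assert_eq!(a * b, F32x4::new(2.0, 4.0, 6.0, 8.0));
        assert_eq!(a - b, F32x4::new(-1.0, 0.0, 1.0, 2.0));
    }

    #[test]
    fn i32x4_min_max() {
        let a = I32x4::new(1, 5, -3, 7);
        let b = I32x4::new(2, 4, -6, 8);
        assert_eq!(a.min(b), I32x4::new(1, 4, -6, 7));
        assert_eq!(a.max(b), I32x4::new(2, 5, -3, 8));
    }
}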