1 // pathfinder/simd/src/x86.rs
2 //
3 // Copyright © 2019 The Pathfinder Project Developers.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10 
11 use std::cmp::PartialEq;
12 use std::fmt::{self, Debug, Formatter};
13 use std::mem;
14 use std::ops::{Add, BitAnd, BitOr, BitXor, Div, Index, IndexMut, Mul, Not, Shr, Sub};
15 
16 #[cfg(target_pointer_width = "32")]
17 use std::arch::x86::{__m128, __m128i};
18 #[cfg(target_pointer_width = "32")]
19 use std::arch::x86;
20 #[cfg(target_pointer_width = "64")]
21 use std::arch::x86_64::{__m128, __m128i};
22 #[cfg(target_pointer_width = "64")]
23 use std::arch::x86_64 as x86;
24 
25 mod swizzle_f32x4;
26 mod swizzle_i32x4;
27 
28 // Two 32-bit floats
29 
30 #[derive(Clone, Copy)]
31 pub struct F32x2(pub u64);
32 
33 impl F32x2 {
34     // Constructors
35 
36     #[inline]
new(a: f32, b: f32) -> F32x237     pub fn new(a: f32, b: f32) -> F32x2 {
38         unsafe {
39             let a = mem::transmute::<*const f32, *const u32>(&a);
40             let b = mem::transmute::<*const f32, *const u32>(&b);
41             F32x2((*a as u64) | ((*b as u64) << 32))
42         }
43     }
44 
45     #[inline]
splat(x: f32) -> F32x246     pub fn splat(x: f32) -> F32x2 {
47         F32x2::new(x, x)
48     }
49 
50     // Basic operations
51 
52     #[inline]
approx_recip(self) -> F32x253     pub fn approx_recip(self) -> F32x2 {
54         self.to_f32x4().approx_recip().xy()
55     }
56 
57     #[inline]
min(self, other: F32x2) -> F32x258     pub fn min(self, other: F32x2) -> F32x2 {
59         self.to_f32x4().min(other.to_f32x4()).xy()
60     }
61 
62     #[inline]
max(self, other: F32x2) -> F32x263     pub fn max(self, other: F32x2) -> F32x2 {
64         self.to_f32x4().max(other.to_f32x4()).xy()
65     }
66 
67     #[inline]
clamp(self, min: F32x2, max: F32x2) -> F32x268     pub fn clamp(self, min: F32x2, max: F32x2) -> F32x2 {
69         self.to_f32x4().clamp(min.to_f32x4(), max.to_f32x4()).xy()
70     }
71 
72     #[inline]
abs(self) -> F32x273     pub fn abs(self) -> F32x2 {
74         self.to_f32x4().abs().xy()
75     }
76 
77     #[inline]
floor(self) -> F32x278     pub fn floor(self) -> F32x2 {
79         self.to_f32x4().floor().xy()
80     }
81 
82     #[inline]
ceil(self) -> F32x283     pub fn ceil(self) -> F32x2 {
84         self.to_f32x4().ceil().xy()
85     }
86 
87     #[inline]
sqrt(self) -> F32x288     pub fn sqrt(self) -> F32x2 {
89         self.to_f32x4().sqrt().xy()
90     }
91 
92     // Packed comparisons
93 
94     #[inline]
packed_eq(self, other: F32x2) -> U32x295     pub fn packed_eq(self, other: F32x2) -> U32x2 {
96         self.to_f32x4().packed_eq(other.to_f32x4()).xy()
97     }
98 
99     #[inline]
packed_gt(self, other: F32x2) -> U32x2100     pub fn packed_gt(self, other: F32x2) -> U32x2 {
101         self.to_f32x4().packed_gt(other.to_f32x4()).xy()
102     }
103 
104     #[inline]
packed_lt(self, other: F32x2) -> U32x2105     pub fn packed_lt(self, other: F32x2) -> U32x2 {
106         self.to_f32x4().packed_lt(other.to_f32x4()).xy()
107     }
108 
109     #[inline]
packed_le(self, other: F32x2) -> U32x2110     pub fn packed_le(self, other: F32x2) -> U32x2 {
111         self.to_f32x4().packed_le(other.to_f32x4()).xy()
112     }
113 
114     // Conversions
115 
116     #[inline]
to_f32x4(self) -> F32x4117     pub fn to_f32x4(self) -> F32x4 {
118         unsafe {
119             let mut result = F32x4::default();
120             *mem::transmute::<&mut __m128, &mut u64>(&mut result.0) = self.0;
121             result
122         }
123     }
124 
125     #[inline]
to_i32x2(self) -> I32x2126     pub fn to_i32x2(self) -> I32x2 {
127         self.to_i32x4().xy()
128     }
129 
130     #[inline]
to_i32x4(self) -> I32x4131     pub fn to_i32x4(self) -> I32x4 {
132         self.to_f32x4().to_i32x4()
133     }
134 
135     // Swizzle
136 
137     #[inline]
yx(self) -> F32x2138     pub fn yx(self) -> F32x2 {
139         self.to_f32x4().yx()
140     }
141 
142     // Concatenations
143 
144     #[inline]
concat_xy_xy(self, other: F32x2) -> F32x4145     pub fn concat_xy_xy(self, other: F32x2) -> F32x4 {
146         self.to_f32x4().concat_xy_xy(other.to_f32x4())
147     }
148 }
149 
impl Default for F32x2 {
    /// Returns the zero vector `<0.0, 0.0>` (the all-zero bit pattern).
    #[inline]
    fn default() -> F32x2 {
        F32x2(0)
    }
}

impl Index<usize> for F32x2 {
    type Output = f32;
    /// Returns lane `index` (0 or 1); out-of-range indices panic via the
    /// array bounds check.
    #[inline]
    fn index(&self, index: usize) -> &f32 {
        // SAFETY: the `u64` holds exactly two packed `f32` bit patterns, so
        // it may be viewed as `[f32; 2]` (same size; `f32`'s alignment
        // requirement is not stricter than `u64`'s).
        unsafe { &mem::transmute::<&u64, &[f32; 2]>(&self.0)[index] }
    }
}

impl IndexMut<usize> for F32x2 {
    /// Returns a mutable reference to lane `index` (0 or 1).
    #[inline]
    fn index_mut(&mut self, index: usize) -> &mut f32 {
        // SAFETY: same layout argument as the `Index` impl above.
        unsafe { &mut mem::transmute::<&mut u64, &mut [f32; 2]>(&mut self.0)[index] }
    }
}

impl Debug for F32x2 {
    /// Formats as `<x, y>`.
    #[inline]
    fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
        write!(f, "<{}, {}>", self[0], self[1])
    }
}

impl PartialEq for F32x2 {
    /// True iff the packed float comparison holds in both lanes.
    #[inline]
    fn eq(&self, other: &F32x2) -> bool {
        self.packed_eq(*other).all_true()
    }
}

impl Add<F32x2> for F32x2 {
    type Output = F32x2;
    /// Lanewise addition, performed by widening to `F32x4`.
    #[inline]
    fn add(self, other: F32x2) -> F32x2 {
        (self.to_f32x4() + other.to_f32x4()).xy()
    }
}

impl Div<F32x2> for F32x2 {
    type Output = F32x2;
    /// Lanewise division, performed by widening to `F32x4`.
    #[inline]
    fn div(self, other: F32x2) -> F32x2 {
        (self.to_f32x4() / other.to_f32x4()).xy()
    }
}

impl Mul<F32x2> for F32x2 {
    type Output = F32x2;
    /// Lanewise multiplication, performed by widening to `F32x4`.
    #[inline]
    fn mul(self, other: F32x2) -> F32x2 {
        (self.to_f32x4() * other.to_f32x4()).xy()
    }
}

impl Sub<F32x2> for F32x2 {
    type Output = F32x2;
    /// Lanewise subtraction, performed by widening to `F32x4`.
    #[inline]
    fn sub(self, other: F32x2) -> F32x2 {
        (self.to_f32x4() - other.to_f32x4()).xy()
    }
}
217 
218 // Four 32-bit floats
219 
/// Four packed `f32` lanes backed by an SSE `__m128` register.
#[derive(Clone, Copy)]
pub struct F32x4(pub __m128);

impl F32x4 {
    // Constructors

    /// Creates the vector `<a, b, c, d>`.
    #[inline]
    pub fn new(a: f32, b: f32, c: f32, d: f32) -> F32x4 {
        unsafe {
            let vector = [a, b, c, d];
            // `_mm_loadu_ps` is an unaligned load, so the stack array needs
            // no special alignment.
            F32x4(x86::_mm_loadu_ps(vector.as_ptr()))
        }
    }

    /// Creates a vector with all four lanes set to `x`.
    #[inline]
    pub fn splat(x: f32) -> F32x4 {
        unsafe { F32x4(x86::_mm_set1_ps(x)) }
    }

    // Basic operations

    /// Returns a fast, approximate reciprocal of each lane
    /// (`_mm_rcp_ps`; not a fully precise division).
    #[inline]
    pub fn approx_recip(self) -> F32x4 {
        unsafe { F32x4(x86::_mm_rcp_ps(self.0)) }
    }

    /// Returns the lanewise minimum of `self` and `other`.
    #[inline]
    pub fn min(self, other: F32x4) -> F32x4 {
        unsafe { F32x4(x86::_mm_min_ps(self.0, other.0)) }
    }

    /// Returns the lanewise maximum of `self` and `other`.
    #[inline]
    pub fn max(self, other: F32x4) -> F32x4 {
        unsafe { F32x4(x86::_mm_max_ps(self.0, other.0)) }
    }

    /// Clamps each lane to `[min, max]` (applies `max` first, then `min`).
    #[inline]
    pub fn clamp(self, min: F32x4, max: F32x4) -> F32x4 {
        self.max(min).min(max)
    }

    /// Returns the absolute value of each lane by masking off the sign bit.
    #[inline]
    pub fn abs(self) -> F32x4 {
        unsafe {
            // All-ones logically shifted right by 1 gives 0x7fff_ffff in
            // every lane: a mask of every bit except the sign bit.
            let tmp = x86::_mm_srli_epi32(I32x4::splat(-1).0, 1);
            F32x4(x86::_mm_and_ps(x86::_mm_castsi128_ps(tmp), self.0))
        }
    }

    /// Rounds each lane toward negative infinity (SSE4.1).
    #[inline]
    pub fn floor(self) -> F32x4 {
        unsafe { F32x4(x86::_mm_floor_ps(self.0)) }
    }

    /// Rounds each lane toward positive infinity (SSE4.1).
    #[inline]
    pub fn ceil(self) -> F32x4 {
        unsafe { F32x4(x86::_mm_ceil_ps(self.0)) }
    }

    /// Returns the square root of each lane.
    #[inline]
    pub fn sqrt(self) -> F32x4 {
        unsafe { F32x4(x86::_mm_sqrt_ps(self.0)) }
    }

    // Packed comparisons
    //
    // Each result lane is all-ones when the comparison holds and all-zeros
    // otherwise.

    #[inline]
    pub fn packed_eq(self, other: F32x4) -> U32x4 {
        unsafe {
            U32x4(x86::_mm_castps_si128(x86::_mm_cmpeq_ps(
                self.0, other.0,
            )))
        }
    }

    #[inline]
    pub fn packed_gt(self, other: F32x4) -> U32x4 {
        unsafe {
            U32x4(x86::_mm_castps_si128(x86::_mm_cmpgt_ps(
                self.0, other.0,
            )))
        }
    }

    // `a < b` is `b > a` with the operands swapped.
    #[inline]
    pub fn packed_lt(self, other: F32x4) -> U32x4 {
        other.packed_gt(self)
    }

    // `a <= b` is the lanewise complement of `a > b`.
    #[inline]
    pub fn packed_le(self, other: F32x4) -> U32x4 {
        !self.packed_gt(other)
    }

    // Conversions

    /// Converts these packed floats to integers via rounding.
    #[inline]
    pub fn to_i32x4(self) -> I32x4 {
        unsafe { I32x4(x86::_mm_cvtps_epi32(self.0)) }
    }

    // Extraction

    /// Extracts the low two lanes; on x86 the low 8 bytes of an `__m128`
    /// hold lanes 0 and 1.
    #[inline]
    pub fn xy(self) -> F32x2 {
        unsafe {
            let swizzled = self.0;
            // SAFETY: reinterprets the first 8 of the vector's 16 bytes as a
            // `u64`.
            F32x2(*mem::transmute::<&__m128, &u64>(&swizzled))
        }
    }

    /// Returns `<x, w>`. (`xwyz` and the other 4-lane swizzles are supplied
    /// by the `swizzle_f32x4` submodule.)
    #[inline]
    pub fn xw(self) -> F32x2 {
        self.xwyz().xy()
    }

    /// Returns `<y, x>`.
    #[inline]
    pub fn yx(self) -> F32x2 {
        self.yxwz().xy()
    }

    /// Returns `<z, y>`.
    #[inline]
    pub fn zy(self) -> F32x2 {
        self.zyxw().xy()
    }

    /// Returns `<z, w>`.
    #[inline]
    pub fn zw(self) -> F32x2 {
        self.zwxy().xy()
    }

    // Concatenations

    /// Returns `<self.x, self.y, other.x, other.y>` by interleaving the low
    /// 64-bit halves of the two vectors.
    #[inline]
    pub fn concat_xy_xy(self, other: F32x4) -> F32x4 {
        unsafe {
            let this = x86::_mm_castps_pd(self.0);
            let other = x86::_mm_castps_pd(other.0);
            let result = x86::_mm_unpacklo_pd(this, other);
            F32x4(x86::_mm_castpd_ps(result))
        }
    }

    /// Returns `<self.x, self.y, other.z, other.w>`.
    #[inline]
    pub fn concat_xy_zw(self, other: F32x4) -> F32x4 {
        unsafe {
            let this = x86::_mm_castps_pd(self.0);
            let other = x86::_mm_castps_pd(other.0);
            // Immediate 0b10 selects qword 0 of `this` (lanes x, y) and
            // qword 1 of `other` (lanes z, w).
            let result = x86::_mm_shuffle_pd(this, other, 0b10);
            F32x4(x86::_mm_castpd_ps(result))
        }
    }

    /// Returns `<self.z, self.w, other.z, other.w>` by interleaving the high
    /// 64-bit halves of the two vectors.
    #[inline]
    pub fn concat_zw_zw(self, other: F32x4) -> F32x4 {
        unsafe {
            let this = x86::_mm_castps_pd(self.0);
            let other = x86::_mm_castps_pd(other.0);
            let result = x86::_mm_unpackhi_pd(this, other);
            F32x4(x86::_mm_castpd_ps(result))
        }
    }

    /// Returns `<self.w, self.z, other.y, other.x>`: shuffle immediate
    /// 0b0001_1011 picks lanes 3, 2 from `self` and lanes 1, 0 from `other`.
    #[inline]
    pub fn concat_wz_yx(self, other: F32x4) -> F32x4 {
        unsafe { F32x4(x86::_mm_shuffle_ps(self.0, other.0, 0b0001_1011)) }
    }
}
389 
impl Default for F32x4 {
    /// Returns the zero vector.
    #[inline]
    fn default() -> F32x4 {
        unsafe { F32x4(x86::_mm_setzero_ps()) }
    }
}

impl Index<usize> for F32x4 {
    type Output = f32;
    /// Returns lane `index` (0..=3); out-of-range indices panic via the
    /// array bounds check.
    #[inline]
    fn index(&self, index: usize) -> &f32 {
        // SAFETY: `__m128` is 16 bytes holding four packed `f32`s, so it may
        // be viewed as `[f32; 4]`.
        unsafe { &mem::transmute::<&__m128, &[f32; 4]>(&self.0)[index] }
    }
}

impl IndexMut<usize> for F32x4 {
    /// Returns a mutable reference to lane `index` (0..=3).
    #[inline]
    fn index_mut(&mut self, index: usize) -> &mut f32 {
        // SAFETY: same layout argument as the `Index` impl above.
        unsafe { &mut mem::transmute::<&mut __m128, &mut [f32; 4]>(&mut self.0)[index] }
    }
}

impl Debug for F32x4 {
    /// Formats as `<x, y, z, w>`.
    #[inline]
    fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
        write!(f, "<{}, {}, {}, {}>", self[0], self[1], self[2], self[3])
    }
}

impl PartialEq for F32x4 {
    /// True iff the packed float comparison holds in all four lanes.
    #[inline]
    fn eq(&self, other: &F32x4) -> bool {
        self.packed_eq(*other).all_true()
    }
}

impl Add<F32x4> for F32x4 {
    type Output = F32x4;
    /// Lanewise addition.
    #[inline]
    fn add(self, other: F32x4) -> F32x4 {
        unsafe { F32x4(x86::_mm_add_ps(self.0, other.0)) }
    }
}

impl Div<F32x4> for F32x4 {
    type Output = F32x4;
    /// Lanewise division.
    #[inline]
    fn div(self, other: F32x4) -> F32x4 {
        unsafe { F32x4(x86::_mm_div_ps(self.0, other.0)) }
    }
}

impl Mul<F32x4> for F32x4 {
    type Output = F32x4;
    /// Lanewise multiplication.
    #[inline]
    fn mul(self, other: F32x4) -> F32x4 {
        unsafe { F32x4(x86::_mm_mul_ps(self.0, other.0)) }
    }
}

impl Sub<F32x4> for F32x4 {
    type Output = F32x4;
    /// Lanewise subtraction.
    #[inline]
    fn sub(self, other: F32x4) -> F32x4 {
        unsafe { F32x4(x86::_mm_sub_ps(self.0, other.0)) }
    }
}
457 
458 // Two 32-bit signed integers
459 
460 #[derive(Clone, Copy)]
461 pub struct I32x2(pub u64);
462 
463 impl I32x2 {
464     // Constructors
465 
466     #[inline]
new(a: i32, b: i32) -> I32x2467     pub fn new(a: i32, b: i32) -> I32x2 {
468         unsafe {
469             let a = mem::transmute::<*const i32, *const u32>(&a);
470             let b = mem::transmute::<*const i32, *const u32>(&b);
471             I32x2((*a as u64) | ((*b as u64) << 32))
472         }
473     }
474 
475     #[inline]
splat(x: i32) -> I32x2476     pub fn splat(x: i32) -> I32x2 {
477         I32x2::new(x, x)
478     }
479 
480     // Accessors
481 
482     #[inline]
x(self) -> i32483     pub fn x(self) -> i32 {
484         self[0]
485     }
486 
487     #[inline]
y(self) -> i32488     pub fn y(self) -> i32 {
489         self[1]
490     }
491 
492     // Concatenations
493 
494     #[inline]
concat_xy_xy(self, other: I32x2) -> I32x4495     pub fn concat_xy_xy(self, other: I32x2) -> I32x4 {
496         self.to_i32x4().concat_xy_xy(other.to_i32x4())
497     }
498 
499     // Conversions
500 
501     #[inline]
to_i32x4(self) -> I32x4502     pub fn to_i32x4(self) -> I32x4 {
503         unsafe {
504             let mut result = I32x4::default();
505             *mem::transmute::<&mut __m128i, &mut u64>(&mut result.0) = self.0;
506             result
507         }
508     }
509 
510     #[inline]
to_f32x4(self) -> F32x4511     pub fn to_f32x4(self) -> F32x4 {
512         self.to_i32x4().to_f32x4()
513     }
514 
515     /// Converts these packed integers to floats.
516     #[inline]
to_f32x2(self) -> F32x2517     pub fn to_f32x2(self) -> F32x2 {
518         self.to_f32x4().xy()
519     }
520 
521     // Basic operations
522 
523     #[inline]
max(self, other: I32x2) -> I32x2524     pub fn max(self, other: I32x2) -> I32x2 {
525         self.to_i32x4().max(other.to_i32x4()).xy()
526     }
527 
528     #[inline]
min(self, other: I32x2) -> I32x2529     pub fn min(self, other: I32x2) -> I32x2 {
530         self.to_i32x4().min(other.to_i32x4()).xy()
531     }
532 
533     // Comparisons
534 
535     // TODO(pcwalton): Use the `U32x2` type!
536     #[inline]
packed_eq(self, other: I32x2) -> U32x4537     pub fn packed_eq(self, other: I32x2) -> U32x4 {
538         self.to_i32x4().packed_eq(other.to_i32x4())
539     }
540 
541     #[inline]
packed_gt(self, other: I32x2) -> U32x4542     pub fn packed_gt(self, other: I32x2) -> U32x4 {
543         self.to_i32x4().packed_gt(other.to_i32x4())
544     }
545 
546     #[inline]
packed_le(self, other: I32x2) -> U32x4547     pub fn packed_le(self, other: I32x2) -> U32x4 {
548         self.to_i32x4().packed_le(other.to_i32x4())
549     }
550 }
551 
impl Default for I32x2 {
    /// Returns the zero vector `<0, 0>`.
    #[inline]
    fn default() -> I32x2 {
        I32x2(0)
    }
}

impl Index<usize> for I32x2 {
    type Output = i32;
    /// Returns lane `index` (0 or 1); out-of-range indices panic via the
    /// array bounds check.
    #[inline]
    fn index(&self, index: usize) -> &i32 {
        // SAFETY: the `u64` holds exactly two packed `i32` lanes, so it may
        // be viewed as `[i32; 2]`.
        unsafe { &mem::transmute::<&u64, &[i32; 2]>(&self.0)[index] }
    }
}

impl IndexMut<usize> for I32x2 {
    /// Returns a mutable reference to lane `index` (0 or 1).
    #[inline]
    fn index_mut(&mut self, index: usize) -> &mut i32 {
        // SAFETY: same layout argument as the `Index` impl above.
        unsafe { &mut mem::transmute::<&mut u64, &mut [i32; 2]>(&mut self.0)[index] }
    }
}

impl Add<I32x2> for I32x2 {
    type Output = I32x2;
    /// Lanewise wrapping addition, performed by widening to `I32x4`.
    #[inline]
    fn add(self, other: I32x2) -> I32x2 {
        (self.to_i32x4() + other.to_i32x4()).xy()
    }
}

impl Sub<I32x2> for I32x2 {
    type Output = I32x2;
    /// Lanewise wrapping subtraction, performed by widening to `I32x4`.
    #[inline]
    fn sub(self, other: I32x2) -> I32x2 {
        (self.to_i32x4() - other.to_i32x4()).xy()
    }
}

impl Mul<I32x2> for I32x2 {
    type Output = I32x2;
    /// Lanewise wrapping multiplication, performed by widening to `I32x4`.
    #[inline]
    fn mul(self, other: I32x2) -> I32x2 {
        (self.to_i32x4() * other.to_i32x4()).xy()
    }
}

impl Debug for I32x2 {
    /// Formats as `<x, y>`.
    #[inline]
    fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
        write!(f, "<{}, {}>", self[0], self[1])
    }
}

impl PartialEq for I32x2 {
    /// True iff both lanes are equal. The widened comparison's upper two
    /// lanes compare zero padding against zero padding and are thus always
    /// all-ones, so `all_true` reduces to the low two lanes.
    #[inline]
    fn eq(&self, other: &I32x2) -> bool {
        self.packed_eq(*other).all_true()
    }
}
611 
612 // Four 32-bit signed integers
613 
/// Four packed `i32` lanes backed by an SSE `__m128i` register.
#[derive(Clone, Copy)]
pub struct I32x4(pub __m128i);

impl I32x4 {
    // Constructors

    /// Creates the vector `<a, b, c, d>`.
    #[inline]
    pub fn new(a: i32, b: i32, c: i32, d: i32) -> I32x4 {
        unsafe {
            let vector = [a, b, c, d];
            // `_mm_loadu_si128` is an unaligned load, so the stack array
            // needs no special alignment.
            I32x4(x86::_mm_loadu_si128(vector.as_ptr() as *const __m128i))
        }
    }

    /// Creates a vector with all four lanes set to `x`.
    #[inline]
    pub fn splat(x: i32) -> I32x4 {
        unsafe { I32x4(x86::_mm_set1_epi32(x)) }
    }

    // Extraction

    /// Extracts the low two lanes; on x86 the low 8 bytes of an `__m128i`
    /// hold lanes 0 and 1.
    #[inline]
    pub fn xy(self) -> I32x2 {
        unsafe {
            let swizzled = self.0;
            // SAFETY: reinterprets the first 8 of the vector's 16 bytes as a
            // `u64`.
            I32x2(*mem::transmute::<&__m128i, &u64>(&swizzled))
        }
    }

    /// Returns `<x, w>`. (`xwyz` and the other 4-lane swizzles are supplied
    /// by the `swizzle_i32x4` submodule.)
    #[inline]
    pub fn xw(self) -> I32x2 {
        self.xwyz().xy()
    }

    /// Returns `<y, x>`.
    #[inline]
    pub fn yx(self) -> I32x2 {
        self.yxwz().xy()
    }

    /// Returns `<z, y>`.
    #[inline]
    pub fn zy(self) -> I32x2 {
        self.zyxw().xy()
    }

    /// Returns `<z, w>`.
    #[inline]
    pub fn zw(self) -> I32x2 {
        self.zwxy().xy()
    }

    // Concatenations

    /// Returns `<self.x, self.y, other.x, other.y>` by interleaving the low
    /// 64-bit halves of the two vectors.
    #[inline]
    pub fn concat_xy_xy(self, other: I32x4) -> I32x4 {
        unsafe {
            let this = x86::_mm_castsi128_pd(self.0);
            let other = x86::_mm_castsi128_pd(other.0);
            let result = x86::_mm_unpacklo_pd(this, other);
            I32x4(x86::_mm_castpd_si128(result))
        }
    }

    /// Returns `<self.z, self.w, other.z, other.w>` by interleaving the high
    /// 64-bit halves of the two vectors.
    #[inline]
    pub fn concat_zw_zw(self, other: I32x4) -> I32x4 {
        unsafe {
            let this = x86::_mm_castsi128_pd(self.0);
            let other = x86::_mm_castsi128_pd(other.0);
            let result = x86::_mm_unpackhi_pd(this, other);
            I32x4(x86::_mm_castpd_si128(result))
        }
    }

    // Conversions

    /// Converts these packed integers to floats.
    #[inline]
    pub fn to_f32x4(self) -> F32x4 {
        unsafe { F32x4(x86::_mm_cvtepi32_ps(self.0)) }
    }

    /// Converts these packed signed integers to unsigned integers.
    ///
    /// Overflowing values will wrap around.
    #[inline]
    pub fn to_u32x4(self) -> U32x4 {
        // Pure reinterpretation: the register contents are unchanged.
        U32x4(self.0)
    }

    // Basic operations

    /// Returns the lanewise signed maximum (SSE4.1).
    #[inline]
    pub fn max(self, other: I32x4) -> I32x4 {
        unsafe { I32x4(x86::_mm_max_epi32(self.0, other.0)) }
    }

    /// Returns the lanewise signed minimum (SSE4.1).
    #[inline]
    pub fn min(self, other: I32x4) -> I32x4 {
        unsafe { I32x4(x86::_mm_min_epi32(self.0, other.0)) }
    }

    // Packed comparisons
    //
    // Each result lane is all-ones when the comparison holds and all-zeros
    // otherwise.

    #[inline]
    pub fn packed_eq(self, other: I32x4) -> U32x4 {
        unsafe { U32x4(x86::_mm_cmpeq_epi32(self.0, other.0)) }
    }

    // Comparisons

    /// Lanewise signed greater-than.
    #[inline]
    pub fn packed_gt(self, other: I32x4) -> U32x4 {
        unsafe { U32x4(x86::_mm_cmpgt_epi32(self.0, other.0)) }
    }

    // `a < b` is `b > a` with the operands swapped.
    #[inline]
    pub fn packed_lt(self, other: I32x4) -> U32x4 {
        other.packed_gt(self)
    }

    // `a <= b` is the lanewise complement of `a > b`.
    #[inline]
    pub fn packed_le(self, other: I32x4) -> U32x4 {
        !self.packed_gt(other)
    }
}
737 
impl Default for I32x4 {
    /// Returns the zero vector.
    #[inline]
    fn default() -> I32x4 {
        unsafe { I32x4(x86::_mm_setzero_si128()) }
    }
}

impl Index<usize> for I32x4 {
    type Output = i32;
    /// Returns lane `index` (0..=3); out-of-range indices panic via the
    /// array bounds check.
    #[inline]
    fn index(&self, index: usize) -> &i32 {
        // SAFETY: `__m128i` is 16 bytes holding four packed 32-bit lanes, so
        // it may be viewed as `[i32; 4]`.
        unsafe { &mem::transmute::<&__m128i, &[i32; 4]>(&self.0)[index] }
    }
}

impl IndexMut<usize> for I32x4 {
    /// Returns a mutable reference to lane `index` (0..=3).
    #[inline]
    fn index_mut(&mut self, index: usize) -> &mut i32 {
        // SAFETY: same layout argument as the `Index` impl above.
        unsafe { &mut mem::transmute::<&mut __m128i, &mut [i32; 4]>(&mut self.0)[index] }
    }
}

impl Add<I32x4> for I32x4 {
    type Output = I32x4;
    /// Lanewise wrapping addition.
    #[inline]
    fn add(self, other: I32x4) -> I32x4 {
        unsafe { I32x4(x86::_mm_add_epi32(self.0, other.0)) }
    }
}

impl Sub<I32x4> for I32x4 {
    type Output = I32x4;
    /// Lanewise wrapping subtraction.
    #[inline]
    fn sub(self, other: I32x4) -> I32x4 {
        unsafe { I32x4(x86::_mm_sub_epi32(self.0, other.0)) }
    }
}

impl Mul<I32x4> for I32x4 {
    type Output = I32x4;
    /// Lanewise multiplication keeping the low 32 bits of each product
    /// (`_mm_mullo_epi32`, SSE4.1).
    #[inline]
    fn mul(self, other: I32x4) -> I32x4 {
        unsafe { I32x4(x86::_mm_mullo_epi32(self.0, other.0)) }
    }
}

impl BitAnd<I32x4> for I32x4 {
    type Output = I32x4;
    /// Lanewise bitwise AND.
    #[inline]
    fn bitand(self, other: I32x4) -> I32x4 {
        unsafe { I32x4(x86::_mm_and_si128(self.0, other.0)) }
    }
}

impl BitOr<I32x4> for I32x4 {
    type Output = I32x4;
    /// Lanewise bitwise OR.
    #[inline]
    fn bitor(self, other: I32x4) -> I32x4 {
        unsafe { I32x4(x86::_mm_or_si128(self.0, other.0)) }
    }
}

impl Debug for I32x4 {
    /// Formats as `<x, y, z, w>`.
    #[inline]
    fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
        write!(f, "<{}, {}, {}, {}>", self[0], self[1], self[2], self[3])
    }
}

impl PartialEq for I32x4 {
    /// True iff all four lanes are equal.
    #[inline]
    fn eq(&self, other: &I32x4) -> bool {
        self.packed_eq(*other).all_true()
    }
}
813 
814 // Two 32-bit unsigned integers
815 
/// Two packed 32-bit lanes in a `u64`, typically holding comparison masks.
#[derive(Clone, Copy)]
pub struct U32x2(pub u64);

impl U32x2 {
    /// Returns true when every lane is the all-ones boolean mask.
    ///
    /// Each lane must be a comparison-style boolean — all bits set (`!0`) or
    /// all bits clear (`0`); for any other contents the answer is undefined.
    #[inline]
    pub fn all_true(self) -> bool {
        self.0 == u64::MAX
    }

    /// Returns true when every lane is the all-zeros boolean mask.
    ///
    /// Each lane must be a comparison-style boolean — all bits set (`!0`) or
    /// all bits clear (`0`); for any other contents the answer is undefined.
    #[inline]
    pub fn all_false(self) -> bool {
        self.0 == u64::MIN
    }
}
838 
839 // Four 32-bit unsigned integers
840 
/// Four packed `u32` lanes backed by an SSE `__m128i` register; typically
/// holds comparison masks.
#[derive(Clone, Copy)]
pub struct U32x4(pub __m128i);

impl U32x4 {
    // Constructors

    /// Creates the vector `<a, b, c, d>`.
    #[inline]
    pub fn new(a: u32, b: u32, c: u32, d: u32) -> U32x4 {
        unsafe {
            let vector = [a, b, c, d];
            // `_mm_loadu_si128` is an unaligned load, so the stack array
            // needs no special alignment.
            U32x4(x86::_mm_loadu_si128(vector.as_ptr() as *const __m128i))
        }
    }

    /// Creates a vector with all four lanes set to `x`.
    #[inline]
    pub fn splat(x: u32) -> U32x4 {
        // `_mm_set1_epi32` takes a signed argument; `as i32` reinterprets
        // the bits without changing them.
        unsafe { U32x4(x86::_mm_set1_epi32(x as i32)) }
    }

    // Conversions

    /// Converts these packed unsigned integers to signed integers.
    ///
    /// Overflowing values will wrap around.
    #[inline]
    pub fn to_i32x4(self) -> I32x4 {
        // Pure reinterpretation: the register contents are unchanged.
        I32x4(self.0)
    }

    // Basic operations

    /// Returns true if all four booleans in this vector are true.
    ///
    /// The result is *undefined* if all four values in this vector are not booleans. A boolean is
    /// a value with all bits set or all bits clear (i.e. !0 or 0).
    #[inline]
    pub fn all_true(self) -> bool {
        // `_mm_movemask_ps` gathers the top (sign) bit of each lane; 0x0f
        // means all four were set.
        unsafe { x86::_mm_movemask_ps(x86::_mm_castsi128_ps(self.0)) == 0x0f }
    }

    /// Returns true if all four booleans in this vector are false.
    ///
    /// The result is *undefined* if all four values in this vector are not booleans. A boolean is
    /// a value with all bits set or all bits clear (i.e. !0 or 0).
    #[inline]
    pub fn all_false(self) -> bool {
        // 0x00 means no lane had its top bit set.
        unsafe { x86::_mm_movemask_ps(x86::_mm_castsi128_ps(self.0)) == 0x00 }
    }

    // Extraction

    /// Extracts the low two lanes; on x86 the low 8 bytes of an `__m128i`
    /// hold lanes 0 and 1.
    #[inline]
    pub fn xy(self) -> U32x2 {
        unsafe {
            let swizzled = self.0;
            // SAFETY: reinterprets the first 8 of the vector's 16 bytes as a
            // `u64`.
            U32x2(*mem::transmute::<&__m128i, &u64>(&swizzled))
        }
    }

    // Packed comparisons

    /// Lanewise equality: all-ones where equal, all-zeros otherwise.
    #[inline]
    pub fn packed_eq(self, other: U32x4) -> U32x4 {
        unsafe { U32x4(x86::_mm_cmpeq_epi32(self.0, other.0)) }
    }
}
907 
impl Debug for U32x4 {
    /// Formats as `<x, y, z, w>`.
    #[inline]
    fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
        write!(f, "<{}, {}, {}, {}>", self[0], self[1], self[2], self[3])
    }
}

impl Index<usize> for U32x4 {
    type Output = u32;
    /// Returns lane `index` (0..=3); out-of-range indices panic via the
    /// array bounds check.
    #[inline]
    fn index(&self, index: usize) -> &u32 {
        // SAFETY: `__m128i` is 16 bytes holding four packed 32-bit lanes, so
        // it may be viewed as `[u32; 4]`.
        unsafe { &mem::transmute::<&__m128i, &[u32; 4]>(&self.0)[index] }
    }
}

impl PartialEq for U32x4 {
    /// True iff all four lanes are equal.
    #[inline]
    fn eq(&self, other: &U32x4) -> bool {
        self.packed_eq(*other).all_true()
    }
}

impl Not for U32x4 {
    type Output = U32x4;
    /// Lanewise bitwise complement (implemented as XOR with all-ones).
    #[inline]
    fn not(self) -> U32x4 {
        self ^ U32x4::splat(!0)
    }
}

impl BitXor<U32x4> for U32x4 {
    type Output = U32x4;
    /// Lanewise bitwise XOR.
    #[inline]
    fn bitxor(self, other: U32x4) -> U32x4 {
        unsafe { U32x4(x86::_mm_xor_si128(self.0, other.0)) }
    }
}

impl Shr<u32> for U32x4 {
    type Output = U32x4;
    /// Logically shifts every lane right by `amount` bits.
    #[inline]
    fn shr(self, amount: u32) -> U32x4 {
        // `_mm_srl_epi32` takes the shift count from the low 64 bits of the
        // count vector, which `U32x4::new(amount, 0, 0, 0)` sets to exactly
        // `amount`.
        unsafe { U32x4(x86::_mm_srl_epi32(self.0, U32x4::new(amount, 0, 0, 0).0)) }
    }
}
953