1 //! Streaming SIMD Extensions (SSE)
2 
3 use crate::{
4     core_arch::{simd::*, simd_llvm::*, x86::*},
5     intrinsics, mem, ptr,
6 };
7 
8 #[cfg(test)]
9 use stdarch_test::assert_instr;
10 
/// Adds the first component of `a` and `b`, the other components are copied
/// from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(addss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_add_ss(a: __m128, b: __m128) -> __m128 {
    // `addss` is the scalar-add LLVM intrinsic, declared elsewhere in this file.
    addss(a, b)
}
22 
/// Adds __m128 vectors.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(addps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_add_ps(a: __m128, b: __m128) -> __m128 {
    // Lane-wise add via the generic SIMD intrinsic; lowers to `addps`.
    simd_add(a, b)
}
33 
/// Subtracts the first component of `b` from `a`, the other components are
/// copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(subss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sub_ss(a: __m128, b: __m128) -> __m128 {
    // Scalar subtract on lane 0 only; upper lanes pass through from `a`.
    subss(a, b)
}
45 
/// Subtracts __m128 vectors.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(subps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sub_ps(a: __m128, b: __m128) -> __m128 {
    // Lane-wise subtract via the generic SIMD intrinsic; lowers to `subps`.
    simd_sub(a, b)
}
56 
/// Multiplies the first component of `a` and `b`, the other components are
/// copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(mulss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_ss(a: __m128, b: __m128) -> __m128 {
    // Scalar multiply on lane 0 only; upper lanes pass through from `a`.
    mulss(a, b)
}
68 
/// Multiplies __m128 vectors.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(mulps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_ps(a: __m128, b: __m128) -> __m128 {
    // Lane-wise multiply via the generic SIMD intrinsic; lowers to `mulps`.
    simd_mul(a, b)
}
79 
/// Divides the first component of `a` by `b`, the other components are
/// copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(divss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_div_ss(a: __m128, b: __m128) -> __m128 {
    // Scalar divide: lane 0 = a0 / b0 (DIVSS); upper lanes pass through from `a`.
    divss(a, b)
}
91 
/// Divides __m128 vectors.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(divps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_div_ps(a: __m128, b: __m128) -> __m128 {
    // Lane-wise divide via the generic SIMD intrinsic; lowers to `divps`.
    simd_div(a, b)
}
102 
/// Returns the square root of the first single-precision (32-bit)
/// floating-point element in `a`, the other elements are unchanged.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sqrtss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sqrt_ss(a: __m128) -> __m128 {
    sqrtss(a)
}
114 
/// Returns the square root of packed single-precision (32-bit) floating-point
/// elements in `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sqrt_ps(a: __m128) -> __m128 {
    sqrtps(a)
}
126 
/// Returns the approximate reciprocal of the first single-precision
/// (32-bit) floating-point element in `a`, the other elements are unchanged.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rcpss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_rcp_ss(a: __m128) -> __m128 {
    // RCPSS is an approximation (see Intel docs for the relative-error bound);
    // it is not an exact `1.0 / x`.
    rcpss(a)
}
138 
/// Returns the approximate reciprocal of packed single-precision (32-bit)
/// floating-point elements in `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rcpps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_rcp_ps(a: __m128) -> __m128 {
    // RCPPS is an approximation; it is not an exact `1.0 / x` per lane.
    rcpps(a)
}
150 
/// Returns the approximate reciprocal square root of the first single-precision
/// (32-bit) floating-point element in `a`, the other elements are unchanged.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rsqrtss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_rsqrt_ss(a: __m128) -> __m128 {
    // RSQRTSS is an approximation; it is not an exact `1.0 / sqrt(x)`.
    rsqrtss(a)
}
162 
/// Returns the approximate reciprocal square root of packed single-precision
/// (32-bit) floating-point elements in `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_rsqrt_ps(a: __m128) -> __m128 {
    // RSQRTPS is an approximation; it is not an exact `1.0 / sqrt(x)` per lane.
    rsqrtps(a)
}
174 
/// Compares the first single-precision (32-bit) floating-point element of `a`
/// and `b`, and return the minimum value in the first element of the return
/// value, the other elements are copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(minss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_ss(a: __m128, b: __m128) -> __m128 {
    // Uses the MINSS intrinsic directly to get the exact hardware semantics
    // (asymmetric behavior for NaN and signed-zero operands).
    minss(a, b)
}
187 
/// Compares packed single-precision (32-bit) floating-point elements in `a` and
/// `b`, and return the corresponding minimum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(minps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_ps(a: __m128, b: __m128) -> __m128 {
    // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmin`.
    minps(a, b)
}
200 
/// Compares the first single-precision (32-bit) floating-point element of `a`
/// and `b`, and return the maximum value in the first element of the return
/// value, the other elements are copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(maxss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_ss(a: __m128, b: __m128) -> __m128 {
    // Uses the MAXSS intrinsic directly to get the exact hardware semantics
    // (asymmetric behavior for NaN and signed-zero operands).
    maxss(a, b)
}
213 
/// Compares packed single-precision (32-bit) floating-point elements in `a` and
/// `b`, and return the corresponding maximum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(maxps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_ps(a: __m128, b: __m128) -> __m128 {
    // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmax`.
    maxps(a, b)
}
226 
/// Bitwise AND of packed single-precision (32-bit) floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_ps)
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `and` instructions, so ignore it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(andps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_and_ps(a: __m128, b: __m128) -> __m128 {
    // `simd_and` is only defined on integer vectors, so reinterpret the float
    // bits as an integer vector, AND, and reinterpret back.
    let a: __m128i = mem::transmute(a);
    let b: __m128i = mem::transmute(b);
    mem::transmute(simd_and(a, b))
}
243 
/// Bitwise AND-NOT of packed single-precision (32-bit) floating-point
/// elements.
///
/// Computes `!a & b` for each bit in `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_ps)
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `not` and `and` instructions, so ignore
// it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(andnps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 {
    let a: __m128i = mem::transmute(a);
    let b: __m128i = mem::transmute(b);
    // There is no vector NOT intrinsic; `!a` is expressed as `a XOR all-ones`.
    let mask: __m128i = mem::transmute(i32x4::splat(-1));
    mem::transmute(simd_and(simd_xor(mask, a), b))
}
265 
/// Bitwise OR of packed single-precision (32-bit) floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_or_ps)
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `or` instructions, so we ignore it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(orps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_or_ps(a: __m128, b: __m128) -> __m128 {
    // Reinterpret float bits as integers for the bitwise op, then back.
    let a: __m128i = mem::transmute(a);
    let b: __m128i = mem::transmute(b);
    mem::transmute(simd_or(a, b))
}
282 
/// Bitwise exclusive OR of packed single-precision (32-bit) floating-point
/// elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_ps)
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `xor` instructions, so we ignore it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(xorps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_xor_ps(a: __m128, b: __m128) -> __m128 {
    // Reinterpret float bits as integers for the bitwise op, then back.
    let a: __m128i = mem::transmute(a);
    let b: __m128i = mem::transmute(b);
    mem::transmute(simd_xor(a, b))
}
300 
/// Compares the lowest `f32` of both inputs for equality. The lowest 32 bits of
/// the result will be `0xffffffff` if the two inputs are equal, or `0`
/// otherwise. The upper 96 bits of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpeqss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpeq_ss(a: __m128, b: __m128) -> __m128 {
    cmpss(a, b, 0) // imm8 0 = EQ (equal, ordered) predicate of CMPSS
}
313 
/// Compares the lowest `f32` of both inputs for less than. The lowest 32 bits
/// of the result will be `0xffffffff` if `a.extract(0)` is less than
/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
/// upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmplt_ss(a: __m128, b: __m128) -> __m128 {
    cmpss(a, b, 1) // imm8 1 = LT (less-than, ordered) predicate of CMPSS
}
327 
/// Compares the lowest `f32` of both inputs for less than or equal. The lowest
/// 32 bits of the result will be `0xffffffff` if `a.extract(0)` is less than
/// or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
/// are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 {
    cmpss(a, b, 2) // imm8 2 = LE (less-than-or-equal, ordered) predicate of CMPSS
}
341 
/// Compares the lowest `f32` of both inputs for greater than. The lowest 32
/// bits of the result will be `0xffffffff` if `a.extract(0)` is greater
/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
/// are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_ss)
#[inline]
#[target_feature(enable = "sse")]
// CMPSS has no GT predicate, so this compiles to CMPLTSS with swapped
// operands — hence the `cmpltss` codegen assertion.
#[cfg_attr(test, assert_instr(cmpltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 {
    // `a > b` is computed as `b < a`; that swap leaves the upper lanes of the
    // comparison result coming from `b`, so the shuffle takes lane 0 from the
    // comparison (index 4 = lane 0 of the second operand) and lanes 1-3 from `a`.
    simd_shuffle4!(a, cmpss(b, a, 1), [4, 1, 2, 3])
}
355 
/// Compares the lowest `f32` of both inputs for greater than or equal. The
/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is
/// greater than or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits
/// of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_ss)
#[inline]
#[target_feature(enable = "sse")]
// CMPSS has no GE predicate, so this compiles to CMPLESS with swapped operands.
#[cfg_attr(test, assert_instr(cmpless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 {
    // `a >= b` is computed as `b <= a` (imm8 2 = LE); the shuffle restores the
    // upper three lanes from `a` (index 4 = lane 0 of the second operand).
    simd_shuffle4!(a, cmpss(b, a, 2), [4, 1, 2, 3])
}
369 
/// Compares the lowest `f32` of both inputs for inequality. The lowest 32 bits
/// of the result will be `0xffffffff` if `a.extract(0)` is not equal to
/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
/// upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpneqss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpneq_ss(a: __m128, b: __m128) -> __m128 {
    cmpss(a, b, 4) // imm8 4 = NEQ (not-equal, unordered) predicate of CMPSS
}
383 
/// Compares the lowest `f32` of both inputs for not-less-than. The lowest 32
/// bits of the result will be `0xffffffff` if `a.extract(0)` is not less than
/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
/// upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnlt_ss(a: __m128, b: __m128) -> __m128 {
    cmpss(a, b, 5) // imm8 5 = NLT (not-less-than, unordered) predicate of CMPSS
}
397 
/// Compares the lowest `f32` of both inputs for not-less-than-or-equal. The
/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
/// less than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits
/// of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 {
    cmpss(a, b, 6) // imm8 6 = NLE (not-less-than-or-equal, unordered) predicate
}
411 
/// Compares the lowest `f32` of both inputs for not-greater-than. The lowest 32
/// bits of the result will be `0xffffffff` if `a.extract(0)` is not greater
/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are
/// the upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_ss)
#[inline]
#[target_feature(enable = "sse")]
// No NGT predicate exists; compiled as CMPNLTSS with swapped operands.
#[cfg_attr(test, assert_instr(cmpnltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 {
    // `!(a > b)` is computed as `!(b < a)` (imm8 5 = NLT); the shuffle restores
    // the upper three lanes from `a` (index 4 = lane 0 of the second operand).
    simd_shuffle4!(a, cmpss(b, a, 5), [4, 1, 2, 3])
}
425 
/// Compares the lowest `f32` of both inputs for not-greater-than-or-equal. The
/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
/// greater than or equal to `b.extract(0)`, or `0` otherwise. The upper 96
/// bits of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_ss)
#[inline]
#[target_feature(enable = "sse")]
// No NGE predicate exists; compiled as CMPNLESS with swapped operands.
#[cfg_attr(test, assert_instr(cmpnless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 {
    // `!(a >= b)` is computed as `!(b <= a)` (imm8 6 = NLE); the shuffle
    // restores the upper three lanes from `a`.
    simd_shuffle4!(a, cmpss(b, a, 6), [4, 1, 2, 3])
}
439 
/// Checks if the lowest `f32` of both inputs are ordered. The lowest 32 bits of
/// the result will be `0xffffffff` if neither of `a.extract(0)` or
/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
/// are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpordss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpord_ss(a: __m128, b: __m128) -> __m128 {
    cmpss(a, b, 7) // imm8 7 = ORD (ordered, i.e. neither operand is NaN)
}
453 
/// Checks if the lowest `f32` of both inputs are unordered. The lowest 32 bits
/// of the result will be `0xffffffff` if any of `a.extract(0)` or
/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
/// are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpunordss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpunord_ss(a: __m128, b: __m128) -> __m128 {
    cmpss(a, b, 3) // imm8 3 = UNORD (unordered, i.e. at least one operand is NaN)
}
467 
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input elements
/// were equal, or `0` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpeqps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpeq_ps(a: __m128, b: __m128) -> __m128 {
    cmpps(a, b, 0) // imm8 0 = EQ predicate of CMPPS
}
480 
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is less than the corresponding element in `b`, or `0` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpltps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmplt_ps(a: __m128, b: __m128) -> __m128 {
    cmpps(a, b, 1) // imm8 1 = LT predicate of CMPPS
}
493 
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is less than or equal to the corresponding element in `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpleps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmple_ps(a: __m128, b: __m128) -> __m128 {
    cmpps(a, b, 2) // imm8 2 = LE predicate of CMPPS
}
507 
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is greater than the corresponding element in `b`, or `0` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_ps)
#[inline]
#[target_feature(enable = "sse")]
// No GT predicate exists; compiled as CMPLTPS with swapped operands.
#[cfg_attr(test, assert_instr(cmpltps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpgt_ps(a: __m128, b: __m128) -> __m128 {
    // `a > b` per lane is computed as `b < a`. No shuffle is needed here
    // (unlike the `_ss` variant) because every lane is a comparison result.
    cmpps(b, a, 1)
}
520 
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is greater than or equal to the corresponding element in `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_ps)
#[inline]
#[target_feature(enable = "sse")]
// No GE predicate exists; compiled as CMPLEPS with swapped operands.
#[cfg_attr(test, assert_instr(cmpleps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpge_ps(a: __m128, b: __m128) -> __m128 {
    // `a >= b` per lane is computed as `b <= a` (imm8 2 = LE).
    cmpps(b, a, 2)
}
534 
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input elements
/// are **not** equal, or `0` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpneqps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpneq_ps(a: __m128, b: __m128) -> __m128 {
    cmpps(a, b, 4) // imm8 4 = NEQ predicate of CMPPS
}
547 
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is **not** less than the corresponding element in `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnltps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnlt_ps(a: __m128, b: __m128) -> __m128 {
    cmpps(a, b, 5) // imm8 5 = NLT predicate of CMPPS
}
561 
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is **not** less than or equal to the corresponding element in `b`, or
/// `0` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnleps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnle_ps(a: __m128, b: __m128) -> __m128 {
    cmpps(a, b, 6) // imm8 6 = NLE predicate of CMPPS
}
575 
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is **not** greater than the corresponding element in `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_ps)
#[inline]
#[target_feature(enable = "sse")]
// No NGT predicate exists; compiled as CMPNLTPS with swapped operands.
#[cfg_attr(test, assert_instr(cmpnltps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpngt_ps(a: __m128, b: __m128) -> __m128 {
    // `!(a > b)` per lane is computed as `!(b < a)` (imm8 5 = NLT).
    cmpps(b, a, 5)
}
589 
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is **not** greater than or equal to the corresponding element in `b`,
/// or `0` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_ps)
#[inline]
#[target_feature(enable = "sse")]
// No NGE predicate exists; compiled as CMPNLEPS with swapped operands.
#[cfg_attr(test, assert_instr(cmpnleps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnge_ps(a: __m128, b: __m128) -> __m128 {
    // `!(a >= b)` per lane is computed as `!(b <= a)` (imm8 6 = NLE).
    cmpps(b, a, 6)
}
603 
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// Returns four floats that have one of two possible bit patterns. The element
/// in the output vector will be `0xffffffff` if the input elements in `a` and
/// `b` are ordered (i.e., neither of them is a NaN), or 0 otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpordps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpord_ps(a: __m128, b: __m128) -> __m128 {
    // imm8 7 = ORD. The predicate is symmetric, so the operand order is
    // irrelevant to the result.
    cmpps(b, a, 7)
}
617 
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// Returns four floats that have one of two possible bit patterns. The element
/// in the output vector will be `0xffffffff` if the input elements in `a` and
/// `b` are unordered (i.e., at least one of them is a NaN), or 0 otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpunordps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpunord_ps(a: __m128, b: __m128) -> __m128 {
    // imm8 3 = UNORD. The predicate is symmetric, so the operand order is
    // irrelevant to the result.
    cmpps(b, a, 3)
}
631 
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are equal, or `0` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comieq_ss(a: __m128, b: __m128) -> i32 {
    comieq_ss(a, b)
}
643 
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comilt_ss(a: __m128, b: __m128) -> i32 {
    comilt_ss(a, b)
}
655 
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comile_ss(a: __m128, b: __m128) -> i32 {
    comile_ss(a, b)
}
668 
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than the one from `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comigt_ss(a: __m128, b: __m128) -> i32 {
    comigt_ss(a, b)
}
681 
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than or equal to the one from `b`, or
/// `0` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comige_ss(a: __m128, b: __m128) -> i32 {
    comige_ss(a, b)
}
694 
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are **not** equal, or `0` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comineq_ss(a: __m128, b: __m128) -> i32 {
    // Forwards to the `comineq_ss` intrinsic binding (ordered `COMISS` compare).
    comineq_ss(a, b)
}
706 
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are equal, or `0` otherwise. This instruction will not signal
/// an exception if either argument is a quiet NaN.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomieq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomieq_ss(a: __m128, b: __m128) -> i32 {
    // Unordered variant: quiet NaNs do not raise an invalid-operation
    // exception (lowers to `UCOMISS`).
    ucomieq_ss(a, b)
}
719 
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
/// This instruction will not signal an exception if either argument is a quiet
/// NaN.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomilt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomilt_ss(a: __m128, b: __m128) -> i32 {
    // Unordered variant of `_mm_comilt_ss` (lowers to `UCOMISS`).
    ucomilt_ss(a, b)
}
733 
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
/// otherwise. This instruction will not signal an exception if either argument
/// is a quiet NaN.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomile_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomile_ss(a: __m128, b: __m128) -> i32 {
    // Unordered variant of `_mm_comile_ss` (lowers to `UCOMISS`).
    ucomile_ss(a, b)
}
747 
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than the one from `b`, or `0`
/// otherwise. This instruction will not signal an exception if either argument
/// is a quiet NaN.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomigt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomigt_ss(a: __m128, b: __m128) -> i32 {
    // Unordered variant of `_mm_comigt_ss` (lowers to `UCOMISS`).
    ucomigt_ss(a, b)
}
761 
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than or equal to the one from `b`, or
/// `0` otherwise. This instruction will not signal an exception if either
/// argument is a quiet NaN.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomige_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomige_ss(a: __m128, b: __m128) -> i32 {
    // Unordered variant of `_mm_comige_ss` (lowers to `UCOMISS`).
    ucomige_ss(a, b)
}
775 
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are **not** equal, or `0` otherwise. This instruction will not
/// signal an exception if either argument is a quiet NaN.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomineq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 {
    // Unordered variant of `_mm_comineq_ss` (lowers to `UCOMISS`).
    ucomineq_ss(a, b)
}
788 
/// Converts the lowest 32 bit float in the input vector to a 32 bit integer.
///
/// The result is rounded according to the current rounding mode. If the result
/// cannot be represented as a 32 bit integer the result will be `0x8000_0000`
/// (`i32::MIN`) or an invalid operation floating point exception if
/// unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)).
///
/// This corresponds to the `CVTSS2SI` instruction (with 32 bit output).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtss_si32(a: __m128) -> i32 {
    // Rounding behavior follows the current MXCSR rounding mode; see the
    // doc comment above for the out-of-range result.
    cvtss2si(a)
}
806 
/// Alias for [`_mm_cvtss_si32`](fn._mm_cvtss_si32.html).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvt_ss2si(a: __m128) -> i32 {
    // Legacy spelling kept for API compatibility; simply delegates.
    _mm_cvtss_si32(a)
}
817 
/// Converts the lowest 32 bit float in the input vector to a 32 bit integer
/// with truncation.
///
/// The result is rounded always using truncation (round towards zero). If the
/// result cannot be represented as a 32 bit integer the result will be
/// `0x8000_0000` (`i32::MIN`) or an invalid operation floating point
/// exception if unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)).
///
/// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvttss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvttss_si32(a: __m128) -> i32 {
    // Always truncates (rounds toward zero), regardless of MXCSR rounding mode.
    cvttss2si(a)
}
837 
/// Alias for [`_mm_cvttss_si32`](fn._mm_cvttss_si32.html).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvttss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtt_ss2si(a: __m128) -> i32 {
    // Legacy spelling kept for API compatibility; simply delegates.
    _mm_cvttss_si32(a)
}
848 
/// Extracts the lowest 32 bit float from the input vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32)
#[inline]
#[target_feature(enable = "sse")]
// No point in using assert_instrs. In Unix x86_64 calling convention this is a
// no-op, and on Windows it's just a `mov`.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtss_f32(a: __m128) -> f32 {
    // Extract lane 0 (the lowest `f32`) of the vector.
    simd_extract(a, 0)
}
860 
/// Converts a 32 bit integer to a 32 bit float. The result vector is the input
/// vector `a` with the lowest 32 bit float replaced by the converted integer.
///
/// This intrinsic corresponds to the `CVTSI2SS` instruction (with 32 bit
/// input).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtsi2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 {
    // `b` converted to f32 replaces lane 0 of `a`; upper lanes pass through.
    cvtsi2ss(a, b)
}
875 
/// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtsi2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvt_si2ss(a: __m128, b: i32) -> __m128 {
    // Legacy spelling kept for API compatibility; simply delegates.
    _mm_cvtsi32_ss(a, b)
}
886 
/// Construct a `__m128` with the lowest element set to `a` and the rest set to
/// zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set_ss(a: f32) -> __m128 {
    // Lane 0 gets `a`; the remaining three lanes are zeroed.
    __m128(a, 0.0, 0.0, 0.0)
}
898 
/// Construct a `__m128` with all elements set to `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(shufps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set1_ps(a: f32) -> __m128 {
    // Broadcast `a` into all four lanes.
    __m128(a, a, a, a)
}
909 
/// Alias for [`_mm_set1_ps`](fn._mm_set1_ps.html)
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ps1)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(shufps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set_ps1(a: f32) -> __m128 {
    // Legacy spelling kept for API compatibility; simply delegates.
    _mm_set1_ps(a)
}
920 
/// Construct a `__m128` from four floating point values highest to lowest.
///
/// Note that `a` will be the highest 32 bits of the result, and `d` the
/// lowest. This matches the standard way of writing bit patterns on x86:
///
/// ```text
///  bit    127 .. 96  95 .. 64  63 .. 32  31 .. 0
///        +---------+---------+---------+---------+
///        |    a    |    b    |    c    |    d    |   result
///        +---------+---------+---------+---------+
/// ```
///
/// Alternatively:
///
/// ```text
/// let v = _mm_set_ps(d, c, b, a);
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(unpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
    // The tuple constructor takes lanes lowest-first, so the arguments
    // (given highest-first) are reversed here.
    __m128(d, c, b, a)
}
947 
/// Construct a `__m128` from four floating point values lowest to highest.
///
/// This matches the memory order of `__m128`, i.e., `a` will be the lowest 32
/// bits of the result, and `d` the highest.
///
/// ```text
/// assert_eq!(__m128::new(a, b, c, d), _mm_setr_ps(a, b, c, d));
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(
    all(test, any(target_os = "windows", target_arch = "x86_64")),
    assert_instr(unpcklps)
)]
// On a 32-bit architecture on non-Windows it just copies the operands from the stack.
#[cfg_attr(
    all(test, all(not(target_os = "windows"), target_arch = "x86")),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_setr_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
    // Arguments are already in lane (lowest-first) order, so no reversal.
    __m128(a, b, c, d)
}
973 
/// Construct a `__m128` with all elements initialized to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_setzero_ps() -> __m128 {
    // Lowers to `xorps xmm, xmm` (a register XOR-ed with itself is zero).
    __m128(0.0, 0.0, 0.0, 0.0)
}
984 
/// A utility function for creating masks to use with Intel shuffle and
/// permute intrinsics.
///
/// Each argument is a 2-bit lane selector; the result packs them as
/// `z` into bits `7:6`, `y` into `5:4`, `x` into `3:2`, and `w` into `1:0`.
#[inline]
#[allow(non_snake_case)]
#[unstable(feature = "stdarch", issue = "27731")]
pub const fn _MM_SHUFFLE(z: u32, y: u32, x: u32, w: u32) -> i32 {
    ((z << 6) | (y << 4) | (x << 2) | w) as i32
}
993 
/// Shuffles packed single-precision (32-bit) floating-point elements in `a` and
/// `b` using `MASK`.
///
/// The lower half of result takes values from `a` and the higher half from
/// `b`. Mask is split to 2 control bits each to index the element from inputs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_ps)
///
/// Note that there appears to be a mistake within Intel's Intrinsics Guide.
/// `_mm_shuffle_ps` is supposed to take an `i32` instead of a `u32`
/// as is the case for [other shuffle intrinsics](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_).
/// Performing an implicit type conversion between an unsigned integer and a signed integer
/// does not cause a problem in C, however Rust's commitment to strong typing does not allow this.
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(shufps, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_shuffle_ps<const MASK: i32>(a: __m128, b: __m128) -> __m128 {
    // Compile-time check that MASK fits in 8 bits.
    static_assert_imm8!(MASK);
    // The two low 2-bit fields of MASK select output lanes 0-1 from `a`
    // (shuffle indices 0..=3); the two high fields select lanes 2-3 from `b`
    // (indices 4..=7 after the `+ 4` offset).
    simd_shuffle4!(
        a,
        b,
        <const MASK: i32> [
            MASK as u32 & 0b11,
            (MASK as u32 >> 2) & 0b11,
            ((MASK as u32 >> 4) & 0b11) + 4,
            ((MASK as u32 >> 6) & 0b11) + 4,
        ],
    )
}
1025 
/// Unpacks and interleave single-precision (32-bit) floating-point elements
/// from the higher half of `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(unpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 {
    // Result lanes: [a[2], b[2], a[3], b[3]] (indices 4..=7 refer to `b`).
    simd_shuffle4!(a, b, [2, 6, 3, 7])
}
1037 
/// Unpacks and interleave single-precision (32-bit) floating-point elements
/// from the lower half of `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(unpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 {
    // Result lanes: [a[0], b[0], a[1], b[1]] (indices 4..=7 refer to `b`).
    simd_shuffle4!(a, b, [0, 4, 1, 5])
}
1049 
/// Combine higher half of `a` and `b`. The higher half of `b` occupies the
/// lower half of result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehl_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movhlps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 {
    // TODO: figure out why this is a different instruction on Windows?
    // Result lanes: [b[2], b[3], a[2], a[3]] (indices 4..=7 refer to `b`).
    simd_shuffle4!(a, b, [6, 7, 2, 3])
}
1062 
/// Combine lower half of `a` and `b`. The lower half of `b` occupies the
/// higher half of result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movelh_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 {
    // Result lanes: [a[0], a[1], b[0], b[1]] (indices 4..=7 refer to `b`).
    simd_shuffle4!(a, b, [0, 1, 4, 5])
}
1074 
/// Returns a mask of the most significant bit of each element in `a`.
///
/// The mask is stored in the 4 least significant bits of the return value.
/// All other bits are set to `0`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_ps)
#[inline]
#[target_feature(enable = "sse")]
// FIXME: LLVM9 trunk has the following bug:
// https://github.com/rust-lang/stdarch/issues/794
// so we only temporarily test this on i686 and x86_64 but not on i586:
#[cfg_attr(all(test, target_feature = "sse2"), assert_instr(movmskps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 {
    // Forwards to the `movmskps` intrinsic binding declared elsewhere in
    // this module.
    movmskps(a)
}
1091 
/// Construct a `__m128` with the lowest element read from `p` and the other
/// elements set to zero.
///
/// This corresponds to instructions `VMOVSS` / `MOVSS`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load_ss(p: *const f32) -> __m128 {
    // Single f32 read into lane 0; upper lanes zeroed.
    __m128(*p, 0.0, 0.0, 0.0)
}
1105 
/// Construct a `__m128` by duplicating the value read from `p` into all
/// elements.
///
/// This corresponds to instructions `VMOVSS` / `MOVSS` followed by some
/// shuffling.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load1_ps(p: *const f32) -> __m128 {
    // Read the scalar once, then broadcast it into all four lanes.
    let a = *p;
    __m128(a, a, a, a)
}
1121 
/// Alias for [`_mm_load1_ps`](fn._mm_load1_ps.html)
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load_ps1(p: *const f32) -> __m128 {
    // Legacy spelling kept for API compatibility; simply delegates.
    _mm_load1_ps(p)
}
1132 
/// Loads four `f32` values from *aligned* memory into a `__m128`. If the
/// pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Use [`_mm_loadu_ps`](fn._mm_loadu_ps.html) for potentially unaligned
/// memory.
///
/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_load_ps(p: *const f32) -> __m128 {
    // The pointer cast raises the alignment requirement from 4 to 16 bytes;
    // that is sound only because this intrinsic's contract (above) requires
    // the caller to pass 16-byte-aligned memory.
    *(p as *const __m128)
}
1151 
/// Loads four `f32` values from memory into a `__m128`. There are no
/// restrictions on memory alignment. For aligned memory
/// [`_mm_load_ps`](fn._mm_load_ps.html) may be faster.
///
/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movups))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 {
    // Note: Using `*p` would require `f32` alignment, but `movups` has no
    // alignment restrictions, so perform an unaligned byte-wise copy into a
    // local vector instead.
    let mut dst = _mm_undefined_ps();
    ptr::copy_nonoverlapping(
        p as *const u8,
        &mut dst as *mut __m128 as *mut u8,
        mem::size_of::<__m128>(),
    );
    dst
}
1176 
/// Loads four `f32` values from aligned memory into a `__m128` in reverse
/// order.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Functionally equivalent to the following code sequence (assuming `p`
/// satisfies the alignment restrictions):
///
/// ```text
/// let a0 = *p;
/// let a1 = *p.offset(1);
/// let a2 = *p.offset(2);
/// let a3 = *p.offset(3);
/// __m128::new(a3, a2, a1, a0)
/// ```
///
/// This corresponds to instructions `VMOVAPS` / `MOVAPS` followed by some
/// shuffling.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 {
    // Aligned load, then reverse the lane order with a shuffle.
    let a = _mm_load_ps(p);
    simd_shuffle4!(a, a, [3, 2, 1, 0])
}
1206 
/// Loads unaligned 64-bits of integer data from memory into new vector.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64)
#[inline]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86_mm_loadu_si64", since = "1.46.0")]
pub unsafe fn _mm_loadu_si64(mem_addr: *const u8) -> __m128i {
    // Unaligned 64-bit read into the low half of the vector; the high
    // 64 bits are zeroed.
    transmute(i64x2(ptr::read_unaligned(mem_addr as *const i64), 0))
}
1218 
/// Stores the lowest 32 bit float of `a` into memory.
///
/// This intrinsic corresponds to the `MOVSS` instruction.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) {
    // Write lane 0 of `a` through `p` (a single f32 store).
    *p = simd_extract(a, 0);
}
1231 
/// Stores the lowest 32 bit float of `a` repeated four times into *aligned*
/// memory.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Functionally equivalent to the following code sequence (assuming `p`
/// satisfies the alignment restrictions):
///
/// ```text
/// let x = a.extract(0);
/// *p = x;
/// *p.offset(1) = x;
/// *p.offset(2) = x;
/// *p.offset(3) = x;
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) {
    // Broadcast lane 0 into all four lanes, then do a single aligned
    // 128-bit store (the caller guarantees 16-byte alignment).
    let b: __m128 = simd_shuffle4!(a, a, [0, 0, 0, 0]);
    *(p as *mut __m128) = b;
}
1259 
/// Alias for [`_mm_store1_ps`](fn._mm_store1_ps.html)
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) {
    // Legacy spelling kept for API compatibility; simply delegates.
    _mm_store1_ps(p, a);
}
1270 
/// Stores four 32-bit floats into *aligned* memory.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Use [`_mm_storeu_ps`](fn._mm_storeu_ps.html) for potentially unaligned
/// memory.
///
/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_store_ps(p: *mut f32, a: __m128) {
    // Aligned 128-bit store; the pointer cast is sound only because the
    // caller guarantees 16-byte alignment (see contract above).
    *(p as *mut __m128) = a;
}
1290 
/// Stores four 32-bit floats into memory. There are no restrictions on memory
/// alignment. For aligned memory [`_mm_store_ps`](fn._mm_store_ps.html) may be
/// faster.
///
/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movups))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) {
    // Byte-wise copy so no alignment is required on `p`.
    ptr::copy_nonoverlapping(
        &a as *const __m128 as *const u8,
        p as *mut u8,
        mem::size_of::<__m128>(),
    );
}
1309 
/// Stores four 32-bit floats into *aligned* memory in reverse order.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Functionally equivalent to the following code sequence (assuming `p`
/// satisfies the alignment restrictions):
///
/// ```text
/// *p = a.extract(3);
/// *p.offset(1) = a.extract(2);
/// *p.offset(2) = a.extract(1);
/// *p.offset(3) = a.extract(0);
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) {
    // Reverse the lanes with a shuffle, then do one aligned 128-bit store
    // (the caller guarantees 16-byte alignment).
    let b: __m128 = simd_shuffle4!(a, a, [3, 2, 1, 0]);
    *(p as *mut __m128) = b;
}
1335 
/// Returns a `__m128` with the first component from `b` and the remaining
/// components from `a`.
///
/// In other words for any `a` and `b`:
/// ```text
/// _mm_move_ss(a, b) == a.replace(0, b.extract(0))
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_move_ss(a: __m128, b: __m128) -> __m128 {
    // Result lanes: [b[0], a[1], a[2], a[3]] (index 4 refers to `b`).
    simd_shuffle4!(a, b, [4, 1, 2, 3])
}
1352 
/// Performs a serializing operation on all store-to-memory instructions that
/// were issued prior to this instruction.
///
/// Guarantees that every store instruction that precedes, in program order, is
/// globally visible before any store instruction which follows the fence in
/// program order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sfence)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sfence() {
    // Forwards to the `sfence` intrinsic binding declared elsewhere in
    // this module.
    sfence()
}
1368 
/// Gets the unsigned 32-bit value of the MXCSR control and status register.
///
/// For more info see [`_mm_setcsr`](fn._mm_setcsr.html)
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getcsr)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(stmxcsr))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_getcsr() -> u32 {
    // `stmxcsr` stores the 32-bit MXCSR value through the provided pointer;
    // a stack temporary receives it and is returned as `u32`.
    let mut result = 0_i32;
    stmxcsr((&mut result) as *mut _ as *mut i8);
    result as u32
}
1383 
1384 /// Sets the MXCSR register with the 32-bit unsigned integer value.
1385 ///
/// This register controls how SIMD instructions handle floating point
1387 /// operations. Modifying this register only affects the current thread.
1388 ///
1389 /// It contains several groups of flags:
1390 ///
1391 /// * *Exception flags* report which exceptions occurred since last they were
1392 /// reset.
1393 ///
1394 /// * *Masking flags* can be used to mask (ignore) certain exceptions. By
1395 /// default
/// these flags are all set to 1, so all exceptions are masked. When an
/// exception is masked, the processor simply sets the exception flag and
1398 /// continues the operation. If the exception is unmasked, the flag is also set
1399 /// but additionally an exception handler is invoked.
1400 ///
1401 /// * *Rounding mode flags* control the rounding mode of floating point
1402 /// instructions.
1403 ///
1404 /// * The *denormals-are-zero mode flag* turns all numbers which would be
1405 /// denormalized (exponent bits are all zeros) into zeros.
1406 ///
1407 /// ## Exception Flags
1408 ///
1409 /// * `_MM_EXCEPT_INVALID`: An invalid operation was performed (e.g., dividing
1410 ///   Infinity by Infinity).
1411 ///
1412 /// * `_MM_EXCEPT_DENORM`: An operation attempted to operate on a denormalized
1413 ///   number. Mainly this can cause loss of precision.
1414 ///
1415 /// * `_MM_EXCEPT_DIV_ZERO`: Division by zero occured.
1416 ///
1417 /// * `_MM_EXCEPT_OVERFLOW`: A numeric overflow exception occured, i.e., a
1418 /// result was too large to be represented (e.g., an `f32` with absolute
1419 /// value
1420 ///   greater than `2^128`).
1421 ///
1422 /// * `_MM_EXCEPT_UNDERFLOW`: A numeric underflow exception occured, i.e., a
1423 /// result was too small to be represented in a normalized way (e.g., an
1424 /// `f32`
1425 ///   with absulte value smaller than `2^-126`.)
1426 ///
1427 /// * `_MM_EXCEPT_INEXACT`: An inexact-result exception occured (a.k.a.
1428 ///   precision exception). This means some precision was lost due to rounding.
1429 ///   For example, the fraction `1/3` cannot be represented accurately in a
1430 ///   32 or 64 bit float and computing it would cause this exception to be
1431 ///   raised. Precision exceptions are very common, so they are usually masked.
1432 ///
1433 /// Exception flags can be read and set using the convenience functions
1434 /// `_MM_GET_EXCEPTION_STATE` and `_MM_SET_EXCEPTION_STATE`. For example, to
1435 /// check if an operation caused some overflow:
1436 ///
1437 /// ```rust,ignore
1438 /// _MM_SET_EXCEPTION_STATE(0); // clear all exception flags
1439 ///                             // perform calculations
1440 /// if _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_OVERFLOW != 0 {
1441 ///     // handle overflow
1442 /// }
1443 /// ```
1444 ///
1445 /// ## Masking Flags
1446 ///
1447 /// There is one masking flag for each exception flag: `_MM_MASK_INVALID`,
1448 /// `_MM_MASK_DENORM`, `_MM_MASK_DIV_ZERO`, `_MM_MASK_OVERFLOW`,
1449 /// `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
1450 ///
1451 /// A single masking bit can be set via
1452 ///
1453 /// ```rust,ignore
1454 /// _MM_SET_EXCEPTION_MASK(_MM_MASK_UNDERFLOW);
1455 /// ```
1456 ///
1457 /// However, since mask bits are by default all set to 1, it is more common to
1458 /// want to *disable* certain bits. For example, to unmask the underflow
1459 /// exception, use:
1460 ///
1461 /// ```rust,ignore
1462 /// _mm_setcsr(_mm_getcsr() & !_MM_MASK_UNDERFLOW); // unmask underflow
1463 /// exception
1464 /// ```
1465 ///
1466 /// Warning: an unmasked exception will cause an exception handler to be
1467 /// called.
1468 /// The standard handler will simply terminate the process. So, in this case
1469 /// any underflow exception would terminate the current process with something
1470 /// like `signal: 8, SIGFPE: erroneous arithmetic operation`.
1471 ///
1472 /// ## Rounding Mode
1473 ///
1474 /// The rounding mode is describe using two bits. It can be read and set using
1475 /// the convenience wrappers `_MM_GET_ROUNDING_MODE()` and
1476 /// `_MM_SET_ROUNDING_MODE(mode)`.
1477 ///
1478 /// The rounding modes are:
1479 ///
1480 /// * `_MM_ROUND_NEAREST`: (default) Round to closest to the infinite precision
1481 ///   value. If two values are equally close, round to even (i.e., least
1482 ///   significant bit will be zero).
1483 ///
1484 /// * `_MM_ROUND_DOWN`: Round toward negative Infinity.
1485 ///
1486 /// * `_MM_ROUND_UP`: Round toward positive Infinity.
1487 ///
1488 /// * `_MM_ROUND_TOWARD_ZERO`: Round towards zero (truncate).
1489 ///
1490 /// Example:
1491 ///
1492 /// ```rust,ignore
1493 /// _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN)
1494 /// ```
1495 ///
1496 /// ## Denormals-are-zero/Flush-to-zero Mode
1497 ///
1498 /// If this bit is set, values that would be denormalized will be set to zero
1499 /// instead. This is turned off by default.
1500 ///
1501 /// You can read and enable/disable this mode via the helper functions
1502 /// `_MM_GET_FLUSH_ZERO_MODE()` and `_MM_SET_FLUSH_ZERO_MODE()`:
1503 ///
1504 /// ```rust,ignore
1505 /// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); // turn off (default)
1506 /// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // turn on
1507 /// ```
1508 ///
1509 ///
1510 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setcsr)
1511 #[inline]
1512 #[target_feature(enable = "sse")]
1513 #[cfg_attr(test, assert_instr(ldmxcsr))]
1514 #[stable(feature = "simd_x86", since = "1.27.0")]
_mm_setcsr(val: u32)1515 pub unsafe fn _mm_setcsr(val: u32) {
1516     ldmxcsr(&val as *const _ as *const i8);
1517 }
1518 
/// Exception flag: an invalid operation occurred.
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_INVALID: u32 = 0x0001;
/// Exception flag: an operation touched a denormalized number.
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_DENORM: u32 = 0x0002;
/// Exception flag: division by zero occurred.
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_DIV_ZERO: u32 = 0x0004;
/// Exception flag: a numeric overflow occurred.
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_OVERFLOW: u32 = 0x0008;
/// Exception flag: a numeric underflow occurred.
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_UNDERFLOW: u32 = 0x0010;
/// Exception flag: an inexact (precision) result occurred.
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_INEXACT: u32 = 0x0020;
/// Bitmask covering all six exception-state flags above.
///
/// See [`_MM_GET_EXCEPTION_STATE`](fn._MM_GET_EXCEPTION_STATE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_MASK: u32 = 0x003f;

/// Mask bit: ignore invalid-operation exceptions.
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_INVALID: u32 = 0x0080;
/// Mask bit: ignore denormal-operand exceptions.
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_DENORM: u32 = 0x0100;
/// Mask bit: ignore divide-by-zero exceptions.
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_DIV_ZERO: u32 = 0x0200;
/// Mask bit: ignore overflow exceptions.
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_OVERFLOW: u32 = 0x0400;
/// Mask bit: ignore underflow exceptions.
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_UNDERFLOW: u32 = 0x0800;
/// Mask bit: ignore inexact-result exceptions.
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_INEXACT: u32 = 0x1000;
/// Bitmask covering all six exception-mask bits above.
///
/// See [`_MM_GET_EXCEPTION_MASK`](fn._MM_GET_EXCEPTION_MASK.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_MASK: u32 = 0x1f80;

/// Rounding mode: round to nearest, ties to even (hardware default).
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_NEAREST: u32 = 0x0000;
/// Rounding mode: round toward negative infinity.
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_DOWN: u32 = 0x2000;
/// Rounding mode: round toward positive infinity.
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_UP: u32 = 0x4000;
/// Rounding mode: round toward zero (truncate).
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_TOWARD_ZERO: u32 = 0x6000;

/// Bitmask covering the two rounding-mode bits.
///
/// See [`_MM_GET_ROUNDING_MODE`](fn._MM_GET_ROUNDING_MODE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_MASK: u32 = 0x6000;

/// Bitmask covering the flush-to-zero bit.
///
/// See [`_MM_GET_FLUSH_ZERO_MODE`](fn._MM_GET_FLUSH_ZERO_MODE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_MASK: u32 = 0x8000;
/// Flush-to-zero mode enabled: denormal results become zero.
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_ON: u32 = 0x8000;
/// Flush-to-zero mode disabled (default).
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000;
1589 
1590 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1591 ///
1592 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_EXCEPTION_MASK)
1593 #[inline]
1594 #[allow(non_snake_case)]
1595 #[target_feature(enable = "sse")]
1596 #[stable(feature = "simd_x86", since = "1.27.0")]
_MM_GET_EXCEPTION_MASK() -> u321597 pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 {
1598     _mm_getcsr() & _MM_MASK_MASK
1599 }
1600 
1601 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1602 ///
1603 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_EXCEPTION_STATE)
1604 #[inline]
1605 #[allow(non_snake_case)]
1606 #[target_feature(enable = "sse")]
1607 #[stable(feature = "simd_x86", since = "1.27.0")]
_MM_GET_EXCEPTION_STATE() -> u321608 pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 {
1609     _mm_getcsr() & _MM_EXCEPT_MASK
1610 }
1611 
1612 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1613 ///
1614 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_FLUSH_ZERO_MODE)
1615 #[inline]
1616 #[allow(non_snake_case)]
1617 #[target_feature(enable = "sse")]
1618 #[stable(feature = "simd_x86", since = "1.27.0")]
_MM_GET_FLUSH_ZERO_MODE() -> u321619 pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 {
1620     _mm_getcsr() & _MM_FLUSH_ZERO_MASK
1621 }
1622 
1623 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1624 ///
1625 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_ROUNDING_MODE)
1626 #[inline]
1627 #[allow(non_snake_case)]
1628 #[target_feature(enable = "sse")]
1629 #[stable(feature = "simd_x86", since = "1.27.0")]
_MM_GET_ROUNDING_MODE() -> u321630 pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 {
1631     _mm_getcsr() & _MM_ROUND_MASK
1632 }
1633 
1634 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1635 ///
1636 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_EXCEPTION_MASK)
1637 #[inline]
1638 #[allow(non_snake_case)]
1639 #[target_feature(enable = "sse")]
1640 #[stable(feature = "simd_x86", since = "1.27.0")]
_MM_SET_EXCEPTION_MASK(x: u32)1641 pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) {
1642     _mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) | x)
1643 }
1644 
1645 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1646 ///
1647 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_EXCEPTION_STATE)
1648 #[inline]
1649 #[allow(non_snake_case)]
1650 #[target_feature(enable = "sse")]
1651 #[stable(feature = "simd_x86", since = "1.27.0")]
_MM_SET_EXCEPTION_STATE(x: u32)1652 pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) {
1653     _mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) | x)
1654 }
1655 
1656 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1657 ///
1658 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_FLUSH_ZERO_MODE)
1659 #[inline]
1660 #[allow(non_snake_case)]
1661 #[target_feature(enable = "sse")]
1662 #[stable(feature = "simd_x86", since = "1.27.0")]
_MM_SET_FLUSH_ZERO_MODE(x: u32)1663 pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) {
1664     let val = (_mm_getcsr() & !_MM_FLUSH_ZERO_MASK) | x;
1665     // println!("setting csr={:x}", val);
1666     _mm_setcsr(val)
1667 }
1668 
1669 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1670 ///
1671 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE)
1672 #[inline]
1673 #[allow(non_snake_case)]
1674 #[target_feature(enable = "sse")]
1675 #[stable(feature = "simd_x86", since = "1.27.0")]
_MM_SET_ROUNDING_MODE(x: u32)1676 pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) {
1677     _mm_setcsr((_mm_getcsr() & !_MM_ROUND_MASK) | x)
1678 }
1679 
/// Prefetch into all levels of the cache hierarchy.
/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T0: i32 = 3;

/// Prefetch into L2 cache and higher.
/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T1: i32 = 2;

/// Prefetch into L3 cache and higher (or an implementation-specific choice).
/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T2: i32 = 1;

/// Prefetch with a non-temporal-access hint (minimize cache pollution).
/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_NTA: i32 = 0;

/// Like `_MM_HINT_T0`, but anticipating a write (bit 2 set).
/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_ET0: i32 = 7;

/// Like `_MM_HINT_T1`, but anticipating a write (bit 2 set).
/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_ET1: i32 = 6;
1703 
/// Fetch the cache line that contains address `p` using the given `STRATEGY`.
///
/// The `STRATEGY` must be one of:
///
/// * [`_MM_HINT_T0`](constant._MM_HINT_T0.html): Fetch into all levels of the
///   cache hierarchy.
///
/// * [`_MM_HINT_T1`](constant._MM_HINT_T1.html): Fetch into L2 and higher.
///
/// * [`_MM_HINT_T2`](constant._MM_HINT_T2.html): Fetch into L3 and higher or
/// an   implementation-specific choice (e.g., L2 if there is no L3).
///
/// * [`_MM_HINT_NTA`](constant._MM_HINT_NTA.html): Fetch data using the
///   non-temporal access (NTA) hint. It may be a place closer than main memory
///   but outside of the cache hierarchy. This is used to reduce access latency
///   without polluting the cache.
///
/// * [`_MM_HINT_ET0`](constant._MM_HINT_ET0.html) and
///   [`_MM_HINT_ET1`](constant._MM_HINT_ET1.html) are similar to `_MM_HINT_T0`
///   and `_MM_HINT_T1` but indicate an anticipation to write to the address.
///
/// The actual implementation depends on the particular CPU. This instruction
/// is considered a hint, so the CPU is also free to simply ignore the request.
///
/// The amount of prefetched data depends on the cache line size of the
/// specific CPU, but it will be at least 32 bytes.
///
/// Common caveats:
///
/// * Most modern CPUs already automatically prefetch data based on predicted
///   access patterns.
///
/// * Data is usually not fetched if this would cause a TLB miss or a page
///   fault.
///
/// * Too much prefetching can cause unnecessary cache evictions.
///
/// * Prefetching may also fail if there are not enough memory-subsystem
///   resources (e.g., request buffers).
///
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_prefetch)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(prefetcht0, STRATEGY = _MM_HINT_T0))]
#[cfg_attr(test, assert_instr(prefetcht1, STRATEGY = _MM_HINT_T1))]
#[cfg_attr(test, assert_instr(prefetcht2, STRATEGY = _MM_HINT_T2))]
#[cfg_attr(test, assert_instr(prefetchnta, STRATEGY = _MM_HINT_NTA))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_prefetch<const STRATEGY: i32>(p: *const i8) {
    // We use the `llvm.prefetch` intrinsic with `cache type` = 1 (data cache).
    // `locality` and `rw` are derived from `STRATEGY`:
    //  - `rw` = bit 2 of STRATEGY: 0 for read hints (T0/T1/T2/NTA, values
    //    0..=3), 1 for the write-anticipating hints (ET0 = 7, ET1 = 6);
    //  - `locality` = low two bits of STRATEGY: 0 (NTA) .. 3 (T0).
    prefetch(p, (STRATEGY >> 2) & 1, STRATEGY & 3, 1);
}
1759 
/// Returns vector of type __m128 with undefined elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps)
#[inline]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_undefined_ps() -> __m128 {
    // The contract allows any contents, so returning zeros is a valid (and
    // safe) implementation: it avoids exposing uninitialized memory.
    _mm_set1_ps(0.0)
}
1769 
1770 /// Transpose the 4x4 matrix formed by 4 rows of __m128 in place.
1771 ///
1772 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_TRANSPOSE4_PS)
1773 #[inline]
1774 #[allow(non_snake_case)]
1775 #[target_feature(enable = "sse")]
1776 #[stable(feature = "simd_x86", since = "1.27.0")]
_MM_TRANSPOSE4_PS( row0: &mut __m128, row1: &mut __m128, row2: &mut __m128, row3: &mut __m128, )1777 pub unsafe fn _MM_TRANSPOSE4_PS(
1778     row0: &mut __m128,
1779     row1: &mut __m128,
1780     row2: &mut __m128,
1781     row3: &mut __m128,
1782 ) {
1783     let tmp0 = _mm_unpacklo_ps(*row0, *row1);
1784     let tmp2 = _mm_unpacklo_ps(*row2, *row3);
1785     let tmp1 = _mm_unpackhi_ps(*row0, *row1);
1786     let tmp3 = _mm_unpackhi_ps(*row2, *row3);
1787 
1788     *row0 = _mm_movelh_ps(tmp0, tmp2);
1789     *row1 = _mm_movehl_ps(tmp2, tmp0);
1790     *row2 = _mm_movelh_ps(tmp1, tmp3);
1791     *row3 = _mm_movehl_ps(tmp3, tmp1);
1792 }
1793 
// Raw LLVM intrinsics backing the public SSE API above. Each `link_name`
// selects an LLVM intrinsic; the Rust signature must match what LLVM expects.
#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.sse.add.ss"]
    fn addss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.sub.ss"]
    fn subss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.mul.ss"]
    fn mulss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.div.ss"]
    fn divss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.sqrt.ss"]
    fn sqrtss(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.sqrt.ps"]
    fn sqrtps(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rcp.ss"]
    fn rcpss(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rcp.ps"]
    fn rcpps(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rsqrt.ss"]
    fn rsqrtss(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rsqrt.ps"]
    fn rsqrtps(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.min.ss"]
    fn minss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.min.ps"]
    fn minps(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.max.ss"]
    fn maxss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.max.ps"]
    fn maxps(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.movmsk.ps"]
    fn movmskps(a: __m128) -> i32;
    #[link_name = "llvm.x86.sse.cmp.ps"]
    fn cmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
    #[link_name = "llvm.x86.sse.comieq.ss"]
    fn comieq_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comilt.ss"]
    fn comilt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comile.ss"]
    fn comile_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comigt.ss"]
    fn comigt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comige.ss"]
    fn comige_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comineq.ss"]
    fn comineq_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomieq.ss"]
    fn ucomieq_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomilt.ss"]
    fn ucomilt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomile.ss"]
    fn ucomile_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomigt.ss"]
    fn ucomigt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomige.ss"]
    fn ucomige_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomineq.ss"]
    fn ucomineq_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.cvtss2si"]
    fn cvtss2si(a: __m128) -> i32;
    #[link_name = "llvm.x86.sse.cvttss2si"]
    fn cvttss2si(a: __m128) -> i32;
    #[link_name = "llvm.x86.sse.cvtsi2ss"]
    fn cvtsi2ss(a: __m128, b: i32) -> __m128;
    #[link_name = "llvm.x86.sse.sfence"]
    fn sfence();
    #[link_name = "llvm.x86.sse.stmxcsr"]
    fn stmxcsr(p: *mut i8);
    #[link_name = "llvm.x86.sse.ldmxcsr"]
    fn ldmxcsr(p: *const i8);
    #[link_name = "llvm.prefetch"]
    fn prefetch(p: *const i8, rw: i32, loc: i32, ty: i32);
    #[link_name = "llvm.x86.sse.cmp.ss"]
    fn cmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
}
1869 
/// Stores `a` into the memory at `mem_addr` using a non-temporal memory hint.
///
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
/// exception _may_ be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movntps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) {
    // The cast widens the alignment requirement from 4 to 16 bytes; that is
    // the documented contract of this intrinsic (hence `cast_ptr_alignment`
    // is allowed), and the caller must uphold it.
    intrinsics::nontemporal_store(mem_addr as *mut __m128, a);
}
1884 
1885 #[cfg(test)]
1886 mod tests {
1887     use crate::{hint::black_box, mem::transmute};
1888     use std::{boxed, f32::NAN};
1889     use stdarch_test::simd_test;
1890 
1891     use crate::core_arch::{simd::*, x86::*};
1892 
1893     #[simd_test(enable = "sse")]
test_mm_add_ps()1894     unsafe fn test_mm_add_ps() {
1895         let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
1896         let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
1897         let r = _mm_add_ps(a, b);
1898         assert_eq_m128(r, _mm_setr_ps(-101.0, 25.0, 0.0, -15.0));
1899     }
1900 
1901     #[simd_test(enable = "sse")]
test_mm_add_ss()1902     unsafe fn test_mm_add_ss() {
1903         let a = _mm_set_ps(-1.0, 5.0, 0.0, -10.0);
1904         let b = _mm_set_ps(-100.0, 20.0, 0.0, -5.0);
1905         let r = _mm_add_ss(a, b);
1906         assert_eq_m128(r, _mm_set_ps(-1.0, 5.0, 0.0, -15.0));
1907     }
1908 
1909     #[simd_test(enable = "sse")]
test_mm_sub_ps()1910     unsafe fn test_mm_sub_ps() {
1911         let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
1912         let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
1913         let r = _mm_sub_ps(a, b);
1914         assert_eq_m128(r, _mm_setr_ps(99.0, -15.0, 0.0, -5.0));
1915     }
1916 
1917     #[simd_test(enable = "sse")]
test_mm_sub_ss()1918     unsafe fn test_mm_sub_ss() {
1919         let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
1920         let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
1921         let r = _mm_sub_ss(a, b);
1922         assert_eq_m128(r, _mm_setr_ps(99.0, 5.0, 0.0, -10.0));
1923     }
1924 
1925     #[simd_test(enable = "sse")]
test_mm_mul_ps()1926     unsafe fn test_mm_mul_ps() {
1927         let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
1928         let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
1929         let r = _mm_mul_ps(a, b);
1930         assert_eq_m128(r, _mm_setr_ps(100.0, 100.0, 0.0, 50.0));
1931     }
1932 
1933     #[simd_test(enable = "sse")]
test_mm_mul_ss()1934     unsafe fn test_mm_mul_ss() {
1935         let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
1936         let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
1937         let r = _mm_mul_ss(a, b);
1938         assert_eq_m128(r, _mm_setr_ps(100.0, 5.0, 0.0, -10.0));
1939     }
1940 
1941     #[simd_test(enable = "sse")]
test_mm_div_ps()1942     unsafe fn test_mm_div_ps() {
1943         let a = _mm_setr_ps(-1.0, 5.0, 2.0, -10.0);
1944         let b = _mm_setr_ps(-100.0, 20.0, 0.2, -5.0);
1945         let r = _mm_div_ps(a, b);
1946         assert_eq_m128(r, _mm_setr_ps(0.01, 0.25, 10.0, 2.0));
1947     }
1948 
1949     #[simd_test(enable = "sse")]
test_mm_div_ss()1950     unsafe fn test_mm_div_ss() {
1951         let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
1952         let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
1953         let r = _mm_div_ss(a, b);
1954         assert_eq_m128(r, _mm_setr_ps(0.01, 5.0, 0.0, -10.0));
1955     }
1956 
1957     #[simd_test(enable = "sse")]
test_mm_sqrt_ss()1958     unsafe fn test_mm_sqrt_ss() {
1959         let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
1960         let r = _mm_sqrt_ss(a);
1961         let e = _mm_setr_ps(2.0, 13.0, 16.0, 100.0);
1962         assert_eq_m128(r, e);
1963     }
1964 
1965     #[simd_test(enable = "sse")]
test_mm_sqrt_ps()1966     unsafe fn test_mm_sqrt_ps() {
1967         let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
1968         let r = _mm_sqrt_ps(a);
1969         let e = _mm_setr_ps(2.0, 3.6055512, 4.0, 10.0);
1970         assert_eq_m128(r, e);
1971     }
1972 
1973     #[simd_test(enable = "sse")]
test_mm_rcp_ss()1974     unsafe fn test_mm_rcp_ss() {
1975         let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
1976         let r = _mm_rcp_ss(a);
1977         let e = _mm_setr_ps(0.24993896, 13.0, 16.0, 100.0);
1978         assert_eq_m128(r, e);
1979     }
1980 
1981     #[simd_test(enable = "sse")]
test_mm_rcp_ps()1982     unsafe fn test_mm_rcp_ps() {
1983         let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
1984         let r = _mm_rcp_ps(a);
1985         let e = _mm_setr_ps(0.24993896, 0.0769043, 0.06248474, 0.0099983215);
1986         let rel_err = 0.00048828125;
1987         for i in 0..4 {
1988             assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
1989         }
1990     }
1991 
1992     #[simd_test(enable = "sse")]
test_mm_rsqrt_ss()1993     unsafe fn test_mm_rsqrt_ss() {
1994         let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
1995         let r = _mm_rsqrt_ss(a);
1996         let e = _mm_setr_ps(0.49987793, 13.0, 16.0, 100.0);
1997         let rel_err = 0.00048828125;
1998         for i in 0..4 {
1999             assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
2000         }
2001     }
2002 
2003     #[simd_test(enable = "sse")]
test_mm_rsqrt_ps()2004     unsafe fn test_mm_rsqrt_ps() {
2005         let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
2006         let r = _mm_rsqrt_ps(a);
2007         let e = _mm_setr_ps(0.49987793, 0.2772827, 0.24993896, 0.099990845);
2008         let rel_err = 0.00048828125;
2009         for i in 0..4 {
2010             assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
2011         }
2012     }
2013 
2014     #[simd_test(enable = "sse")]
test_mm_min_ss()2015     unsafe fn test_mm_min_ss() {
2016         let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2017         let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2018         let r = _mm_min_ss(a, b);
2019         assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0));
2020     }
2021 
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_min_ps() {
        // Lane-wise minimum over ordinary values.
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_min_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0));

        // `_mm_min_ps` can **not** be implemented using the `simd_min` rust intrinsic. `simd_min`
        // is lowered by the llvm codegen backend to `llvm.minnum.v*` llvm intrinsic. This intrinsic
        // doesn't specify how -0.0 is handled. Unfortunately it happens to behave different from
        // the `minps` x86 instruction on x86. The `llvm.minnum.v*` llvm intrinsic equals
        // `r1` to `a` and `r2` to `b`.
        // The assertions below therefore pin the `minps` behavior for the
        // -0.0 vs 0.0 case: `minps(x, y)` returns its *second* operand when
        // the operands compare equal, so `r1 == b` and `r2 == a` bit-wise.
        let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0);
        let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0);
        let r1: [u8; 16] = transmute(_mm_min_ps(a, b));
        let r2: [u8; 16] = transmute(_mm_min_ps(b, a));
        let a: [u8; 16] = transmute(a);
        let b: [u8; 16] = transmute(b);
        assert_eq!(r1, b);
        assert_eq!(r2, a);
        assert_ne!(a, b); // sanity check that -0.0 is actually present
    }
2044 
2045     #[simd_test(enable = "sse")]
test_mm_max_ss()2046     unsafe fn test_mm_max_ss() {
2047         let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2048         let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2049         let r = _mm_max_ss(a, b);
2050         assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, -10.0));
2051     }
2052 
2053     #[simd_test(enable = "sse")]
test_mm_max_ps()2054     unsafe fn test_mm_max_ps() {
2055         let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2056         let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2057         let r = _mm_max_ps(a, b);
2058         assert_eq_m128(r, _mm_setr_ps(-1.0, 20.0, 0.0, -5.0));
2059     }
2060 
2061     #[simd_test(enable = "sse")]
test_mm_and_ps()2062     unsafe fn test_mm_and_ps() {
2063         let a = transmute(u32x4::splat(0b0011));
2064         let b = transmute(u32x4::splat(0b0101));
2065         let r = _mm_and_ps(*black_box(&a), *black_box(&b));
2066         let e = transmute(u32x4::splat(0b0001));
2067         assert_eq_m128(r, e);
2068     }
2069 
2070     #[simd_test(enable = "sse")]
test_mm_andnot_ps()2071     unsafe fn test_mm_andnot_ps() {
2072         let a = transmute(u32x4::splat(0b0011));
2073         let b = transmute(u32x4::splat(0b0101));
2074         let r = _mm_andnot_ps(*black_box(&a), *black_box(&b));
2075         let e = transmute(u32x4::splat(0b0100));
2076         assert_eq_m128(r, e);
2077     }
2078 
2079     #[simd_test(enable = "sse")]
test_mm_or_ps()2080     unsafe fn test_mm_or_ps() {
2081         let a = transmute(u32x4::splat(0b0011));
2082         let b = transmute(u32x4::splat(0b0101));
2083         let r = _mm_or_ps(*black_box(&a), *black_box(&b));
2084         let e = transmute(u32x4::splat(0b0111));
2085         assert_eq_m128(r, e);
2086     }
2087 
2088     #[simd_test(enable = "sse")]
test_mm_xor_ps()2089     unsafe fn test_mm_xor_ps() {
2090         let a = transmute(u32x4::splat(0b0011));
2091         let b = transmute(u32x4::splat(0b0101));
2092         let r = _mm_xor_ps(*black_box(&a), *black_box(&b));
2093         let e = transmute(u32x4::splat(0b0110));
2094         assert_eq_m128(r, e);
2095     }
2096 
2097     #[simd_test(enable = "sse")]
test_mm_cmpeq_ss()2098     unsafe fn test_mm_cmpeq_ss() {
2099         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2100         let b = _mm_setr_ps(-1.0, 5.0, 6.0, 7.0);
2101         let r: u32x4 = transmute(_mm_cmpeq_ss(a, b));
2102         let e: u32x4 = transmute(_mm_setr_ps(transmute(0u32), 2.0, 3.0, 4.0));
2103         assert_eq!(r, e);
2104 
2105         let b2 = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2106         let r2: u32x4 = transmute(_mm_cmpeq_ss(a, b2));
2107         let e2: u32x4 = transmute(_mm_setr_ps(transmute(0xffffffffu32), 2.0, 3.0, 4.0));
2108         assert_eq!(r2, e2);
2109     }
2110 
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmplt_ss() {
        // Scalar `<` on lane 0 only; lanes 1..4 of the result come from `a`.
        // Expected lane-0 masks are built as u32 bit patterns (0 / !0)
        // because the all-ones mask is a NaN when viewed as f32.
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = 0u32; // a.extract(0) < b.extract(0)
        let c1 = 0u32; // a.extract(0) < c.extract(0)
        let d1 = !0u32; // a.extract(0) < d.extract(0)

        let rb: u32x4 = transmute(_mm_cmplt_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmplt_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmplt_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }
2134 
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmple_ss() {
        // Scalar `<=` on lane 0 only; lanes 1..4 of the result come from `a`.
        // Lane-0 masks are compared as u32 bit patterns (0 / !0).
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = 0u32; // a.extract(0) <= b.extract(0)
        let c1 = !0u32; // a.extract(0) <= c.extract(0)
        let d1 = !0u32; // a.extract(0) <= d.extract(0)

        let rb: u32x4 = transmute(_mm_cmple_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmple_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmple_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }
2158 
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpgt_ss() {
        // Scalar `>` on lane 0 only; lanes 1..4 of the result come from `a`.
        // Lane-0 masks are compared as u32 bit patterns (0 / !0).
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = !0u32; // a.extract(0) > b.extract(0)
        let c1 = 0u32; // a.extract(0) > c.extract(0)
        let d1 = 0u32; // a.extract(0) > d.extract(0)

        let rb: u32x4 = transmute(_mm_cmpgt_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmpgt_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmpgt_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }
2182 
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpge_ss() {
        // Scalar `>=` on lane 0 only; lanes 1..4 of the result come from `a`.
        // Lane-0 masks are compared as u32 bit patterns (0 / !0).
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = !0u32; // a.extract(0) >= b.extract(0)
        let c1 = !0u32; // a.extract(0) >= c.extract(0)
        let d1 = 0u32; // a.extract(0) >= d.extract(0)

        let rb: u32x4 = transmute(_mm_cmpge_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmpge_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmpge_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }
2206 
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpneq_ss() {
        // Scalar `!=` on lane 0 only; lanes 1..4 of the result come from `a`.
        // Lane-0 masks are compared as u32 bit patterns (0 / !0).
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = !0u32; // a.extract(0) != b.extract(0)
        let c1 = 0u32; // a.extract(0) != c.extract(0)
        let d1 = !0u32; // a.extract(0) != d.extract(0)

        let rb: u32x4 = transmute(_mm_cmpneq_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmpneq_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmpneq_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }
2230 
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpnlt_ss() {
        // Scalar "not less than" on lane 0 only. This test mirrors the one
        // for `_mm_cmpge_ss` because the two predicates agree on all
        // non-NaN inputs; they differ only when an operand is NaN, where
        // NLT is true (it is the negation of LT) but GE is false.
        // TODO: add NaN inputs to cover that difference.

        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = !0u32; // a.extract(0) >= b.extract(0)
        let c1 = !0u32; // a.extract(0) >= c.extract(0)
        let d1 = 0u32; // a.extract(0) >= d.extract(0)

        let rb: u32x4 = transmute(_mm_cmpnlt_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmpnlt_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmpnlt_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }
2259 
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpnle_ss() {
        // Scalar "not less than or equal" on lane 0 only. This mirrors the
        // `_mm_cmpgt_ss` test because the predicates agree on all non-NaN
        // inputs; they differ only when an operand is NaN, where NLE is
        // true (negation of LE) but GT is false.
        // TODO: add NaN inputs to cover that difference.

        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = !0u32; // a.extract(0) > b.extract(0)
        let c1 = 0u32; // a.extract(0) > c.extract(0)
        let d1 = 0u32; // a.extract(0) > d.extract(0)

        let rb: u32x4 = transmute(_mm_cmpnle_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmpnle_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmpnle_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }
2288 
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpngt_ss() {
        // Scalar "not greater than" on lane 0 only. This mirrors the
        // `_mm_cmple_ss` test because the predicates agree on all non-NaN
        // inputs; they differ only when an operand is NaN, where NGT is
        // true (negation of GT) but LE is false.
        // TODO: add NaN inputs to cover that difference.

        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = 0u32; // a.extract(0) <= b.extract(0)
        let c1 = !0u32; // a.extract(0) <= c.extract(0)
        let d1 = !0u32; // a.extract(0) <= d.extract(0)

        let rb: u32x4 = transmute(_mm_cmpngt_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmpngt_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmpngt_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }
2317 
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpnge_ss() {
        // Scalar "not greater than or equal" on lane 0 only. This mirrors
        // the `_mm_cmplt_ss` test because the predicates agree on all
        // non-NaN inputs; they differ only when an operand is NaN, where
        // NGE is true (negation of GE) but LT is false.
        // TODO: add NaN inputs to cover that difference.

        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = 0u32; // a.extract(0) < b.extract(0)
        let c1 = 0u32; // a.extract(0) < c.extract(0)
        let d1 = !0u32; // a.extract(0) < d.extract(0)

        let rb: u32x4 = transmute(_mm_cmpnge_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmpnge_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmpnge_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }
2346 
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpord_ss() {
        // Scalar "ordered" on lane 0: true iff neither operand's lane 0 is
        // NaN. Lanes 1..4 of the result come from `a`; lane-0 masks are
        // compared as u32 bit patterns (0 / !0).
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = !0u32; // a.extract(0) ord b.extract(0)
        let c1 = 0u32; // a.extract(0) ord c.extract(0)
        let d1 = !0u32; // a.extract(0) ord d.extract(0)

        let rb: u32x4 = transmute(_mm_cmpord_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmpord_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmpord_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }
2370 
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpunord_ss() {
        // Scalar "unordered" on lane 0: true iff at least one operand's
        // lane 0 is NaN. Lanes 1..4 of the result come from `a`; lane-0
        // masks are compared as u32 bit patterns (0 / !0).
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = 0u32; // a.extract(0) unord b.extract(0)
        let c1 = !0u32; // a.extract(0) unord c.extract(0)
        let d1 = 0u32; // a.extract(0) unord d.extract(0)

        let rb: u32x4 = transmute(_mm_cmpunord_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmpunord_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmpunord_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }
2394 
2395     #[simd_test(enable = "sse")]
test_mm_cmpeq_ps()2396     unsafe fn test_mm_cmpeq_ps() {
2397         let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2398         let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2399         let tru = !0u32;
2400         let fls = 0u32;
2401 
2402         let e = u32x4::new(fls, fls, tru, fls);
2403         let r: u32x4 = transmute(_mm_cmpeq_ps(a, b));
2404         assert_eq!(r, e);
2405     }
2406 
2407     #[simd_test(enable = "sse")]
test_mm_cmplt_ps()2408     unsafe fn test_mm_cmplt_ps() {
2409         let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2410         let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2411         let tru = !0u32;
2412         let fls = 0u32;
2413 
2414         let e = u32x4::new(tru, fls, fls, fls);
2415         let r: u32x4 = transmute(_mm_cmplt_ps(a, b));
2416         assert_eq!(r, e);
2417     }
2418 
2419     #[simd_test(enable = "sse")]
test_mm_cmple_ps()2420     unsafe fn test_mm_cmple_ps() {
2421         let a = _mm_setr_ps(10.0, 50.0, 1.0, 4.0);
2422         let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2423         let tru = !0u32;
2424         let fls = 0u32;
2425 
2426         let e = u32x4::new(tru, fls, tru, fls);
2427         let r: u32x4 = transmute(_mm_cmple_ps(a, b));
2428         assert_eq!(r, e);
2429     }
2430 
2431     #[simd_test(enable = "sse")]
test_mm_cmpgt_ps()2432     unsafe fn test_mm_cmpgt_ps() {
2433         let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2434         let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0);
2435         let tru = !0u32;
2436         let fls = 0u32;
2437 
2438         let e = u32x4::new(fls, tru, fls, fls);
2439         let r: u32x4 = transmute(_mm_cmpgt_ps(a, b));
2440         assert_eq!(r, e);
2441     }
2442 
2443     #[simd_test(enable = "sse")]
test_mm_cmpge_ps()2444     unsafe fn test_mm_cmpge_ps() {
2445         let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2446         let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0);
2447         let tru = !0u32;
2448         let fls = 0u32;
2449 
2450         let e = u32x4::new(fls, tru, tru, fls);
2451         let r: u32x4 = transmute(_mm_cmpge_ps(a, b));
2452         assert_eq!(r, e);
2453     }
2454 
2455     #[simd_test(enable = "sse")]
test_mm_cmpneq_ps()2456     unsafe fn test_mm_cmpneq_ps() {
2457         let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2458         let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2459         let tru = !0u32;
2460         let fls = 0u32;
2461 
2462         let e = u32x4::new(tru, tru, fls, tru);
2463         let r: u32x4 = transmute(_mm_cmpneq_ps(a, b));
2464         assert_eq!(r, e);
2465     }
2466 
2467     #[simd_test(enable = "sse")]
test_mm_cmpnlt_ps()2468     unsafe fn test_mm_cmpnlt_ps() {
2469         let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2470         let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2471         let tru = !0u32;
2472         let fls = 0u32;
2473 
2474         let e = u32x4::new(fls, tru, tru, tru);
2475         let r: u32x4 = transmute(_mm_cmpnlt_ps(a, b));
2476         assert_eq!(r, e);
2477     }
2478 
2479     #[simd_test(enable = "sse")]
test_mm_cmpnle_ps()2480     unsafe fn test_mm_cmpnle_ps() {
2481         let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2482         let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2483         let tru = !0u32;
2484         let fls = 0u32;
2485 
2486         let e = u32x4::new(fls, tru, fls, tru);
2487         let r: u32x4 = transmute(_mm_cmpnle_ps(a, b));
2488         assert_eq!(r, e);
2489     }
2490 
2491     #[simd_test(enable = "sse")]
test_mm_cmpngt_ps()2492     unsafe fn test_mm_cmpngt_ps() {
2493         let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2494         let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2495         let tru = !0u32;
2496         let fls = 0u32;
2497 
2498         let e = u32x4::new(tru, fls, tru, tru);
2499         let r: u32x4 = transmute(_mm_cmpngt_ps(a, b));
2500         assert_eq!(r, e);
2501     }
2502 
2503     #[simd_test(enable = "sse")]
test_mm_cmpnge_ps()2504     unsafe fn test_mm_cmpnge_ps() {
2505         let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2506         let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2507         let tru = !0u32;
2508         let fls = 0u32;
2509 
2510         let e = u32x4::new(tru, fls, fls, tru);
2511         let r: u32x4 = transmute(_mm_cmpnge_ps(a, b));
2512         assert_eq!(r, e);
2513     }
2514 
2515     #[simd_test(enable = "sse")]
test_mm_cmpord_ps()2516     unsafe fn test_mm_cmpord_ps() {
2517         let a = _mm_setr_ps(10.0, 50.0, NAN, NAN);
2518         let b = _mm_setr_ps(15.0, NAN, 1.0, NAN);
2519         let tru = !0u32;
2520         let fls = 0u32;
2521 
2522         let e = u32x4::new(tru, fls, fls, fls);
2523         let r: u32x4 = transmute(_mm_cmpord_ps(a, b));
2524         assert_eq!(r, e);
2525     }
2526 
2527     #[simd_test(enable = "sse")]
test_mm_cmpunord_ps()2528     unsafe fn test_mm_cmpunord_ps() {
2529         let a = _mm_setr_ps(10.0, 50.0, NAN, NAN);
2530         let b = _mm_setr_ps(15.0, NAN, 1.0, NAN);
2531         let tru = !0u32;
2532         let fls = 0u32;
2533 
2534         let e = u32x4::new(fls, tru, tru, tru);
2535         let r: u32x4 = transmute(_mm_cmpunord_ps(a, b));
2536         assert_eq!(r, e);
2537     }
2538 
2539     #[simd_test(enable = "sse")]
test_mm_comieq_ss()2540     unsafe fn test_mm_comieq_ss() {
2541         let aa = &[3.0f32, 12.0, 23.0, NAN];
2542         let bb = &[3.0f32, 47.5, 1.5, NAN];
2543 
2544         let ee = &[1i32, 0, 0, 0];
2545 
2546         for i in 0..4 {
2547             let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2548             let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2549 
2550             let r = _mm_comieq_ss(a, b);
2551 
2552             assert_eq!(
2553                 ee[i], r,
2554                 "_mm_comieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2555                 a, b, r, ee[i], i
2556             );
2557         }
2558     }
2559 
2560     #[simd_test(enable = "sse")]
test_mm_comilt_ss()2561     unsafe fn test_mm_comilt_ss() {
2562         let aa = &[3.0f32, 12.0, 23.0, NAN];
2563         let bb = &[3.0f32, 47.5, 1.5, NAN];
2564 
2565         let ee = &[0i32, 1, 0, 0];
2566 
2567         for i in 0..4 {
2568             let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2569             let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2570 
2571             let r = _mm_comilt_ss(a, b);
2572 
2573             assert_eq!(
2574                 ee[i], r,
2575                 "_mm_comilt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2576                 a, b, r, ee[i], i
2577             );
2578         }
2579     }
2580 
2581     #[simd_test(enable = "sse")]
test_mm_comile_ss()2582     unsafe fn test_mm_comile_ss() {
2583         let aa = &[3.0f32, 12.0, 23.0, NAN];
2584         let bb = &[3.0f32, 47.5, 1.5, NAN];
2585 
2586         let ee = &[1i32, 1, 0, 0];
2587 
2588         for i in 0..4 {
2589             let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2590             let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2591 
2592             let r = _mm_comile_ss(a, b);
2593 
2594             assert_eq!(
2595                 ee[i], r,
2596                 "_mm_comile_ss({:?}, {:?}) = {}, expected: {} (i={})",
2597                 a, b, r, ee[i], i
2598             );
2599         }
2600     }
2601 
2602     #[simd_test(enable = "sse")]
test_mm_comigt_ss()2603     unsafe fn test_mm_comigt_ss() {
2604         let aa = &[3.0f32, 12.0, 23.0, NAN];
2605         let bb = &[3.0f32, 47.5, 1.5, NAN];
2606 
2607         let ee = &[1i32, 0, 1, 0];
2608 
2609         for i in 0..4 {
2610             let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2611             let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2612 
2613             let r = _mm_comige_ss(a, b);
2614 
2615             assert_eq!(
2616                 ee[i], r,
2617                 "_mm_comige_ss({:?}, {:?}) = {}, expected: {} (i={})",
2618                 a, b, r, ee[i], i
2619             );
2620         }
2621     }
2622 
2623     #[simd_test(enable = "sse")]
test_mm_comineq_ss()2624     unsafe fn test_mm_comineq_ss() {
2625         let aa = &[3.0f32, 12.0, 23.0, NAN];
2626         let bb = &[3.0f32, 47.5, 1.5, NAN];
2627 
2628         let ee = &[0i32, 1, 1, 1];
2629 
2630         for i in 0..4 {
2631             let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2632             let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2633 
2634             let r = _mm_comineq_ss(a, b);
2635 
2636             assert_eq!(
2637                 ee[i], r,
2638                 "_mm_comineq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2639                 a, b, r, ee[i], i
2640             );
2641         }
2642     }
2643 
2644     #[simd_test(enable = "sse")]
test_mm_ucomieq_ss()2645     unsafe fn test_mm_ucomieq_ss() {
2646         let aa = &[3.0f32, 12.0, 23.0, NAN];
2647         let bb = &[3.0f32, 47.5, 1.5, NAN];
2648 
2649         let ee = &[1i32, 0, 0, 0];
2650 
2651         for i in 0..4 {
2652             let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2653             let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2654 
2655             let r = _mm_ucomieq_ss(a, b);
2656 
2657             assert_eq!(
2658                 ee[i], r,
2659                 "_mm_ucomieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2660                 a, b, r, ee[i], i
2661             );
2662         }
2663     }
2664 
2665     #[simd_test(enable = "sse")]
test_mm_ucomilt_ss()2666     unsafe fn test_mm_ucomilt_ss() {
2667         let aa = &[3.0f32, 12.0, 23.0, NAN];
2668         let bb = &[3.0f32, 47.5, 1.5, NAN];
2669 
2670         let ee = &[0i32, 1, 0, 0];
2671 
2672         for i in 0..4 {
2673             let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2674             let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2675 
2676             let r = _mm_ucomilt_ss(a, b);
2677 
2678             assert_eq!(
2679                 ee[i], r,
2680                 "_mm_ucomilt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2681                 a, b, r, ee[i], i
2682             );
2683         }
2684     }
2685 
2686     #[simd_test(enable = "sse")]
test_mm_ucomile_ss()2687     unsafe fn test_mm_ucomile_ss() {
2688         let aa = &[3.0f32, 12.0, 23.0, NAN];
2689         let bb = &[3.0f32, 47.5, 1.5, NAN];
2690 
2691         let ee = &[1i32, 1, 0, 0];
2692 
2693         for i in 0..4 {
2694             let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2695             let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2696 
2697             let r = _mm_ucomile_ss(a, b);
2698 
2699             assert_eq!(
2700                 ee[i], r,
2701                 "_mm_ucomile_ss({:?}, {:?}) = {}, expected: {} (i={})",
2702                 a, b, r, ee[i], i
2703             );
2704         }
2705     }
2706 
2707     #[simd_test(enable = "sse")]
test_mm_ucomigt_ss()2708     unsafe fn test_mm_ucomigt_ss() {
2709         let aa = &[3.0f32, 12.0, 23.0, NAN];
2710         let bb = &[3.0f32, 47.5, 1.5, NAN];
2711 
2712         let ee = &[0i32, 0, 1, 0];
2713 
2714         for i in 0..4 {
2715             let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2716             let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2717 
2718             let r = _mm_ucomigt_ss(a, b);
2719 
2720             assert_eq!(
2721                 ee[i], r,
2722                 "_mm_ucomigt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2723                 a, b, r, ee[i], i
2724             );
2725         }
2726     }
2727 
2728     #[simd_test(enable = "sse")]
test_mm_ucomige_ss()2729     unsafe fn test_mm_ucomige_ss() {
2730         let aa = &[3.0f32, 12.0, 23.0, NAN];
2731         let bb = &[3.0f32, 47.5, 1.5, NAN];
2732 
2733         let ee = &[1i32, 0, 1, 0];
2734 
2735         for i in 0..4 {
2736             let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2737             let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2738 
2739             let r = _mm_ucomige_ss(a, b);
2740 
2741             assert_eq!(
2742                 ee[i], r,
2743                 "_mm_ucomige_ss({:?}, {:?}) = {}, expected: {} (i={})",
2744                 a, b, r, ee[i], i
2745             );
2746         }
2747     }
2748 
2749     #[simd_test(enable = "sse")]
test_mm_ucomineq_ss()2750     unsafe fn test_mm_ucomineq_ss() {
2751         let aa = &[3.0f32, 12.0, 23.0, NAN];
2752         let bb = &[3.0f32, 47.5, 1.5, NAN];
2753 
2754         let ee = &[0i32, 1, 1, 1];
2755 
2756         for i in 0..4 {
2757             let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2758             let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2759 
2760             let r = _mm_ucomineq_ss(a, b);
2761 
2762             assert_eq!(
2763                 ee[i], r,
2764                 "_mm_ucomineq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2765                 a, b, r, ee[i], i
2766             );
2767         }
2768     }
2769 
2770     #[simd_test(enable = "sse")]
test_mm_comieq_ss_vs_ucomieq_ss()2771     unsafe fn test_mm_comieq_ss_vs_ucomieq_ss() {
2772         // If one of the arguments is a quiet NaN `comieq_ss` should signal an
2773         // Invalid Operation Exception while `ucomieq_ss` should not.
2774         let aa = &[3.0f32, NAN, 23.0, NAN];
2775         let bb = &[3.0f32, 47.5, NAN, NAN];
2776 
2777         let ee = &[1i32, 0, 0, 0];
2778         let exc = &[0u32, 1, 1, 1]; // Should comieq_ss signal an exception?
2779 
2780         for i in 0..4 {
2781             let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2782             let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2783 
2784             _MM_SET_EXCEPTION_STATE(0);
2785             let r1 = _mm_comieq_ss(*black_box(&a), b);
2786             let s1 = _MM_GET_EXCEPTION_STATE();
2787 
2788             _MM_SET_EXCEPTION_STATE(0);
2789             let r2 = _mm_ucomieq_ss(*black_box(&a), b);
2790             let s2 = _MM_GET_EXCEPTION_STATE();
2791 
2792             assert_eq!(
2793                 ee[i], r1,
2794                 "_mm_comeq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2795                 a, b, r1, ee[i], i
2796             );
2797             assert_eq!(
2798                 ee[i], r2,
2799                 "_mm_ucomeq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2800                 a, b, r2, ee[i], i
2801             );
2802             assert_eq!(
2803                 s1,
2804                 exc[i] * _MM_EXCEPT_INVALID,
2805                 "_mm_comieq_ss() set exception flags: {} (i={})",
2806                 s1,
2807                 i
2808             );
2809             assert_eq!(
2810                 s2,
2811                 0, // ucomieq_ss should not signal an exception
2812                 "_mm_ucomieq_ss() set exception flags: {} (i={})",
2813                 s2,
2814                 i
2815             );
2816         }
2817     }
2818 
2819     #[simd_test(enable = "sse")]
test_mm_cvtss_si32()2820     unsafe fn test_mm_cvtss_si32() {
2821         let inputs = &[42.0f32, -3.1, 4.0e10, 4.0e-20, NAN, 2147483500.1];
2822         let result = &[42i32, -3, i32::MIN, 0, i32::MIN, 2147483520];
2823         for i in 0..inputs.len() {
2824             let x = _mm_setr_ps(inputs[i], 1.0, 3.0, 4.0);
2825             let e = result[i];
2826             let r = _mm_cvtss_si32(x);
2827             assert_eq!(
2828                 e, r,
2829                 "TestCase #{} _mm_cvtss_si32({:?}) = {}, expected: {}",
2830                 i, x, r, e
2831             );
2832         }
2833     }
2834 
2835     #[simd_test(enable = "sse")]
test_mm_cvttss_si32()2836     unsafe fn test_mm_cvttss_si32() {
2837         let inputs = &[
2838             (42.0f32, 42i32),
2839             (-31.4, -31),
2840             (-33.5, -33),
2841             (-34.5, -34),
2842             (10.999, 10),
2843             (-5.99, -5),
2844             (4.0e10, i32::MIN),
2845             (4.0e-10, 0),
2846             (NAN, i32::MIN),
2847             (2147483500.1, 2147483520),
2848         ];
2849         for i in 0..inputs.len() {
2850             let (xi, e) = inputs[i];
2851             let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0);
2852             let r = _mm_cvttss_si32(x);
2853             assert_eq!(
2854                 e, r,
2855                 "TestCase #{} _mm_cvttss_si32({:?}) = {}, expected: {}",
2856                 i, x, r, e
2857             );
2858         }
2859     }
2860 
2861     #[simd_test(enable = "sse")]
test_mm_cvtsi32_ss()2862     unsafe fn test_mm_cvtsi32_ss() {
2863         let inputs = &[
2864             (4555i32, 4555.0f32),
2865             (322223333, 322223330.0),
2866             (-432, -432.0),
2867             (-322223333, -322223330.0),
2868         ];
2869 
2870         for i in 0..inputs.len() {
2871             let (x, f) = inputs[i];
2872             let a = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
2873             let r = _mm_cvtsi32_ss(a, x);
2874             let e = _mm_setr_ps(f, 6.0, 7.0, 8.0);
2875             assert_eq_m128(e, r);
2876         }
2877     }
2878 
2879     #[simd_test(enable = "sse")]
test_mm_cvtss_f32()2880     unsafe fn test_mm_cvtss_f32() {
2881         let a = _mm_setr_ps(312.0134, 5.0, 6.0, 7.0);
2882         assert_eq!(_mm_cvtss_f32(a), 312.0134);
2883     }
2884 
2885     #[simd_test(enable = "sse")]
test_mm_set_ss()2886     unsafe fn test_mm_set_ss() {
2887         let r = _mm_set_ss(black_box(4.25));
2888         assert_eq_m128(r, _mm_setr_ps(4.25, 0.0, 0.0, 0.0));
2889     }
2890 
2891     #[simd_test(enable = "sse")]
test_mm_set1_ps()2892     unsafe fn test_mm_set1_ps() {
2893         let r1 = _mm_set1_ps(black_box(4.25));
2894         let r2 = _mm_set_ps1(black_box(4.25));
2895         assert_eq!(get_m128(r1, 0), 4.25);
2896         assert_eq!(get_m128(r1, 1), 4.25);
2897         assert_eq!(get_m128(r1, 2), 4.25);
2898         assert_eq!(get_m128(r1, 3), 4.25);
2899         assert_eq!(get_m128(r2, 0), 4.25);
2900         assert_eq!(get_m128(r2, 1), 4.25);
2901         assert_eq!(get_m128(r2, 2), 4.25);
2902         assert_eq!(get_m128(r2, 3), 4.25);
2903     }
2904 
2905     #[simd_test(enable = "sse")]
test_mm_set_ps()2906     unsafe fn test_mm_set_ps() {
2907         let r = _mm_set_ps(
2908             black_box(1.0),
2909             black_box(2.0),
2910             black_box(3.0),
2911             black_box(4.0),
2912         );
2913         assert_eq!(get_m128(r, 0), 4.0);
2914         assert_eq!(get_m128(r, 1), 3.0);
2915         assert_eq!(get_m128(r, 2), 2.0);
2916         assert_eq!(get_m128(r, 3), 1.0);
2917     }
2918 
2919     #[simd_test(enable = "sse")]
test_mm_setr_ps()2920     unsafe fn test_mm_setr_ps() {
2921         let r = _mm_setr_ps(
2922             black_box(1.0),
2923             black_box(2.0),
2924             black_box(3.0),
2925             black_box(4.0),
2926         );
2927         assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0));
2928     }
2929 
2930     #[simd_test(enable = "sse")]
test_mm_setzero_ps()2931     unsafe fn test_mm_setzero_ps() {
2932         let r = *black_box(&_mm_setzero_ps());
2933         assert_eq_m128(r, _mm_set1_ps(0.0));
2934     }
2935 
2936     #[simd_test(enable = "sse")]
test_mm_shuffle()2937     unsafe fn test_mm_shuffle() {
2938         assert_eq!(_MM_SHUFFLE(0, 1, 1, 3), 0b00_01_01_11);
2939         assert_eq!(_MM_SHUFFLE(3, 1, 1, 0), 0b11_01_01_00);
2940         assert_eq!(_MM_SHUFFLE(1, 2, 2, 1), 0b01_10_10_01);
2941     }
2942 
2943     #[simd_test(enable = "sse")]
test_mm_shuffle_ps()2944     unsafe fn test_mm_shuffle_ps() {
2945         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2946         let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
2947         let r = _mm_shuffle_ps::<0b00_01_01_11>(a, b);
2948         assert_eq_m128(r, _mm_setr_ps(4.0, 2.0, 6.0, 5.0));
2949     }
2950 
2951     #[simd_test(enable = "sse")]
test_mm_unpackhi_ps()2952     unsafe fn test_mm_unpackhi_ps() {
2953         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2954         let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
2955         let r = _mm_unpackhi_ps(a, b);
2956         assert_eq_m128(r, _mm_setr_ps(3.0, 7.0, 4.0, 8.0));
2957     }
2958 
2959     #[simd_test(enable = "sse")]
test_mm_unpacklo_ps()2960     unsafe fn test_mm_unpacklo_ps() {
2961         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2962         let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
2963         let r = _mm_unpacklo_ps(a, b);
2964         assert_eq_m128(r, _mm_setr_ps(1.0, 5.0, 2.0, 6.0));
2965     }
2966 
2967     #[simd_test(enable = "sse")]
test_mm_movehl_ps()2968     unsafe fn test_mm_movehl_ps() {
2969         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2970         let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
2971         let r = _mm_movehl_ps(a, b);
2972         assert_eq_m128(r, _mm_setr_ps(7.0, 8.0, 3.0, 4.0));
2973     }
2974 
2975     #[simd_test(enable = "sse")]
test_mm_movelh_ps()2976     unsafe fn test_mm_movelh_ps() {
2977         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2978         let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
2979         let r = _mm_movelh_ps(a, b);
2980         assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 5.0, 6.0));
2981     }
2982 
2983     #[simd_test(enable = "sse")]
test_mm_load_ss()2984     unsafe fn test_mm_load_ss() {
2985         let a = 42.0f32;
2986         let r = _mm_load_ss(&a as *const f32);
2987         assert_eq_m128(r, _mm_setr_ps(42.0, 0.0, 0.0, 0.0));
2988     }
2989 
2990     #[simd_test(enable = "sse")]
test_mm_load1_ps()2991     unsafe fn test_mm_load1_ps() {
2992         let a = 42.0f32;
2993         let r = _mm_load1_ps(&a as *const f32);
2994         assert_eq_m128(r, _mm_setr_ps(42.0, 42.0, 42.0, 42.0));
2995     }
2996 
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_load_ps() {
        // Backing array with vals[i] == (i + 1) as f32, so advancing the load
        // pointer by `delta` elements adds `delta` to every loaded lane.
        let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];

        let mut p = vals.as_ptr();
        let mut fixup = 0.0f32;

        // Make sure p is aligned, otherwise we might get a
        // (signal: 11, SIGSEGV: invalid memory reference)

        let unalignment = (p as usize) & 0xf;
        if unalignment != 0 {
            // Number of f32 elements (4 bytes each) skipped to reach the
            // next 16-byte boundary.
            let delta = ((16 - unalignment) >> 2) as isize;
            fixup = delta as f32;
            p = p.offset(delta);
        }

        let r = _mm_load_ps(p);
        // Expected lanes are the base values shifted by how far `p` moved.
        let e = _mm_add_ps(_mm_setr_ps(1.0, 2.0, 3.0, 4.0), _mm_set1_ps(fixup));
        assert_eq_m128(r, e);
    }
3018 
3019     #[simd_test(enable = "sse")]
test_mm_loadu_ps()3020     unsafe fn test_mm_loadu_ps() {
3021         let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
3022         let p = vals.as_ptr().offset(3);
3023         let r = _mm_loadu_ps(black_box(p));
3024         assert_eq_m128(r, _mm_setr_ps(4.0, 5.0, 6.0, 7.0));
3025     }
3026 
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_loadr_ps() {
        // Backing array with vals[i] == (i + 1) as f32; `_mm_loadr_ps` loads
        // four values in reversed order from an aligned address.
        let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];

        let mut p = vals.as_ptr();
        let mut fixup = 0.0f32;

        // Make sure p is aligned, otherwise we might get a
        // (signal: 11, SIGSEGV: invalid memory reference)

        let unalignment = (p as usize) & 0xf;
        if unalignment != 0 {
            // Number of f32 elements (4 bytes each) skipped to reach the
            // next 16-byte boundary.
            let delta = ((16 - unalignment) >> 2) as isize;
            fixup = delta as f32;
            p = p.offset(delta);
        }

        let r = _mm_loadr_ps(p);
        // Reversed base values, each shifted by how far `p` moved.
        let e = _mm_add_ps(_mm_setr_ps(4.0, 3.0, 2.0, 1.0), _mm_set1_ps(fixup));
        assert_eq_m128(r, e);
    }
3048 
3049     #[simd_test(enable = "sse2")]
test_mm_loadu_si64()3050     unsafe fn test_mm_loadu_si64() {
3051         let a = _mm_setr_epi64x(5, 6);
3052         let r = _mm_loadu_si64(&a as *const _ as *const _);
3053         assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
3054     }
3055 
3056     #[simd_test(enable = "sse")]
test_mm_store_ss()3057     unsafe fn test_mm_store_ss() {
3058         let mut vals = [0.0f32; 8];
3059         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3060         _mm_store_ss(vals.as_mut_ptr().offset(1), a);
3061 
3062         assert_eq!(vals[0], 0.0);
3063         assert_eq!(vals[1], 1.0);
3064         assert_eq!(vals[2], 0.0);
3065     }
3066 
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_store1_ps() {
        let mut vals = [0.0f32; 8];
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        // `ofs` counts the f32 elements skipped to reach a 16-byte boundary;
        // `_mm_store1_ps` requires an aligned destination.
        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        if (p as usize) & 0xf != 0 {
            ofs = ((16 - (p as usize)) & 0xf) >> 2;
            p = p.add(ofs);
        }

        _mm_store1_ps(p, *black_box(&a));

        // The element just before the store target (if any) stays untouched.
        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        // All four stored lanes are the broadcast low lane of `a`...
        assert_eq!(vals[ofs + 0], 1.0);
        assert_eq!(vals[ofs + 1], 1.0);
        assert_eq!(vals[ofs + 2], 1.0);
        assert_eq!(vals[ofs + 3], 1.0);
        // ...and the element just past the stored 16 bytes stays untouched.
        assert_eq!(vals[ofs + 4], 0.0);
    }
3091 
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_store_ps() {
        let mut vals = [0.0f32; 8];
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        // `ofs` counts the f32 elements skipped to reach the boundary;
        // `_mm_store_ps` requires a 16-byte-aligned destination.
        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Align p to 16-byte boundary
        if (p as usize) & 0xf != 0 {
            ofs = ((16 - (p as usize)) & 0xf) >> 2;
            p = p.add(ofs);
        }

        _mm_store_ps(p, *black_box(&a));

        // The element just before the store target (if any) stays untouched.
        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        // The four lanes land in memory order...
        assert_eq!(vals[ofs + 0], 1.0);
        assert_eq!(vals[ofs + 1], 2.0);
        assert_eq!(vals[ofs + 2], 3.0);
        assert_eq!(vals[ofs + 3], 4.0);
        // ...and the element just past the stored 16 bytes stays untouched.
        assert_eq!(vals[ofs + 4], 0.0);
    }
3117 
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_storer_ps() {
        let mut vals = [0.0f32; 8];
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        // `ofs` counts the f32 elements skipped to reach the boundary;
        // `_mm_storer_ps` requires a 16-byte-aligned destination.
        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Align p to 16-byte boundary
        if (p as usize) & 0xf != 0 {
            ofs = ((16 - (p as usize)) & 0xf) >> 2;
            p = p.add(ofs);
        }

        _mm_storer_ps(p, *black_box(&a));

        // The element just before the store target (if any) stays untouched.
        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        // The four lanes are written in reversed order...
        assert_eq!(vals[ofs + 0], 4.0);
        assert_eq!(vals[ofs + 1], 3.0);
        assert_eq!(vals[ofs + 2], 2.0);
        assert_eq!(vals[ofs + 3], 1.0);
        // ...and the element just past the stored 16 bytes stays untouched.
        assert_eq!(vals[ofs + 4], 0.0);
    }
3143 
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_storeu_ps() {
        let mut vals = [0.0f32; 8];
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        // `ofs` tracks how many elements we shifted to force misalignment;
        // `_mm_storeu_ps` must work on unaligned destinations.
        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Make sure p is **not** aligned to 16-byte boundary
        if (p as usize) & 0xf == 0 {
            ofs = 1;
            p = p.offset(1);
        }

        _mm_storeu_ps(p, *black_box(&a));

        // The element just before the store target (if any) stays untouched.
        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        // The four lanes land in memory order despite the misalignment...
        assert_eq!(vals[ofs + 0], 1.0);
        assert_eq!(vals[ofs + 1], 2.0);
        assert_eq!(vals[ofs + 2], 3.0);
        assert_eq!(vals[ofs + 3], 4.0);
        // ...and the element just past the stored 16 bytes stays untouched.
        assert_eq!(vals[ofs + 4], 0.0);
    }
3169 
3170     #[simd_test(enable = "sse")]
test_mm_move_ss()3171     unsafe fn test_mm_move_ss() {
3172         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3173         let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3174 
3175         let r = _mm_move_ss(a, b);
3176         let e = _mm_setr_ps(5.0, 2.0, 3.0, 4.0);
3177         assert_eq_m128(e, r);
3178     }
3179 
3180     #[simd_test(enable = "sse")]
test_mm_movemask_ps()3181     unsafe fn test_mm_movemask_ps() {
3182         let r = _mm_movemask_ps(_mm_setr_ps(-1.0, 5.0, -5.0, 0.0));
3183         assert_eq!(r, 0b0101);
3184 
3185         let r = _mm_movemask_ps(_mm_setr_ps(-1.0, -5.0, -5.0, 0.0));
3186         assert_eq!(r, 0b0111);
3187     }
3188 
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_sfence() {
        // Smoke test only: a store fence has no observable result to assert
        // on, so just check that executing it does not fault.
        _mm_sfence();
    }
3193 
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_getcsr_setcsr_1() {
        // Save MXCSR so the flush-to-zero mode change below can be undone.
        let saved_csr = _mm_getcsr();

        // The product a[0] * b[0] is ~1.1e-39, a denormal f32.
        let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0);
        let b = _mm_setr_ps(0.001, 0.0, 0.0, 1.0);

        // With flush-to-zero ON, the denormal product is flushed to 0.0.
        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
        let r = _mm_mul_ps(*black_box(&a), *black_box(&b));

        // Restore the original control/status register before asserting.
        _mm_setcsr(saved_csr);

        let exp = _mm_setr_ps(0.0, 0.0, 0.0, 1.0);
        assert_eq_m128(r, exp); // the denormal first component was flushed to zero
    }
3209 
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_getcsr_setcsr_2() {
        // Same as _mm_setcsr_1 test, but with opposite flag value.

        // Save MXCSR so the flush-to-zero mode change below can be undone.
        let saved_csr = _mm_getcsr();

        // The product a[0] * b[0] is ~1.1e-39, a denormal f32.
        let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0);
        let b = _mm_setr_ps(0.001, 0.0, 0.0, 1.0);

        // With flush-to-zero OFF, the denormal product is preserved.
        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
        let r = _mm_mul_ps(*black_box(&a), *black_box(&b));

        // Restore the original control/status register before asserting.
        _mm_setcsr(saved_csr);

        let exp = _mm_setr_ps(1.1e-39, 0.0, 0.0, 1.0);
        assert_eq_m128(r, exp); // first component is a denormalized f32
    }
3227 
3228     #[simd_test(enable = "sse")]
test_mm_getcsr_setcsr_underflow()3229     unsafe fn test_mm_getcsr_setcsr_underflow() {
3230         _MM_SET_EXCEPTION_STATE(0);
3231 
3232         let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0);
3233         let b = _mm_setr_ps(1e-5, 0.0, 0.0, 1.0);
3234 
3235         assert_eq!(_MM_GET_EXCEPTION_STATE(), 0); // just to be sure
3236 
3237         let r = _mm_mul_ps(*black_box(&a), *black_box(&b));
3238 
3239         let exp = _mm_setr_ps(1.1e-41, 0.0, 0.0, 1.0);
3240         assert_eq_m128(r, exp);
3241 
3242         let underflow = _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_UNDERFLOW != 0;
3243         assert_eq!(underflow, true);
3244     }
3245 
3246     #[simd_test(enable = "sse")]
test_MM_TRANSPOSE4_PS()3247     unsafe fn test_MM_TRANSPOSE4_PS() {
3248         let mut a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3249         let mut b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3250         let mut c = _mm_setr_ps(9.0, 10.0, 11.0, 12.0);
3251         let mut d = _mm_setr_ps(13.0, 14.0, 15.0, 16.0);
3252 
3253         _MM_TRANSPOSE4_PS(&mut a, &mut b, &mut c, &mut d);
3254 
3255         assert_eq_m128(a, _mm_setr_ps(1.0, 5.0, 9.0, 13.0));
3256         assert_eq_m128(b, _mm_setr_ps(2.0, 6.0, 10.0, 14.0));
3257         assert_eq_m128(c, _mm_setr_ps(3.0, 7.0, 11.0, 15.0));
3258         assert_eq_m128(d, _mm_setr_ps(4.0, 8.0, 12.0, 16.0));
3259     }
3260 
    /// 16-byte-aligned backing storage for `test_mm_stream_ps`:
    /// `_mm_stream_ps` requires an aligned destination address.
    #[repr(align(16))]
    struct Memory {
        pub data: [f32; 4],
    }
3265 
3266     #[simd_test(enable = "sse")]
test_mm_stream_ps()3267     unsafe fn test_mm_stream_ps() {
3268         let a = _mm_set1_ps(7.0);
3269         let mut mem = Memory { data: [-1.0; 4] };
3270 
3271         _mm_stream_ps(&mut mem.data[0] as *mut f32, a);
3272         for i in 0..4 {
3273             assert_eq!(mem.data[i], get_m128(a, i));
3274         }
3275     }
3276 }
3277