1 //! Streaming SIMD Extensions 4.1 (SSE4.1)
2 
3 use crate::{
4     core_arch::{simd::*, simd_llvm::*, x86::*},
5     mem::transmute,
6 };
7 
8 #[cfg(test)]
9 use stdarch_test::assert_instr;
10 
// SSE4 rounding constants
12 /// round to nearest
13 #[stable(feature = "simd_x86", since = "1.27.0")]
14 pub const _MM_FROUND_TO_NEAREST_INT: i32 = 0x00;
15 /// round down
16 #[stable(feature = "simd_x86", since = "1.27.0")]
17 pub const _MM_FROUND_TO_NEG_INF: i32 = 0x01;
18 /// round up
19 #[stable(feature = "simd_x86", since = "1.27.0")]
20 pub const _MM_FROUND_TO_POS_INF: i32 = 0x02;
21 /// truncate
22 #[stable(feature = "simd_x86", since = "1.27.0")]
23 pub const _MM_FROUND_TO_ZERO: i32 = 0x03;
24 /// use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`
25 #[stable(feature = "simd_x86", since = "1.27.0")]
26 pub const _MM_FROUND_CUR_DIRECTION: i32 = 0x04;
27 /// do not suppress exceptions
28 #[stable(feature = "simd_x86", since = "1.27.0")]
29 pub const _MM_FROUND_RAISE_EXC: i32 = 0x00;
30 /// suppress exceptions
31 #[stable(feature = "simd_x86", since = "1.27.0")]
32 pub const _MM_FROUND_NO_EXC: i32 = 0x08;
33 /// round to nearest and do not suppress exceptions
34 #[stable(feature = "simd_x86", since = "1.27.0")]
35 pub const _MM_FROUND_NINT: i32 = 0x00;
36 /// round down and do not suppress exceptions
37 #[stable(feature = "simd_x86", since = "1.27.0")]
38 pub const _MM_FROUND_FLOOR: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF;
39 /// round up and do not suppress exceptions
40 #[stable(feature = "simd_x86", since = "1.27.0")]
41 pub const _MM_FROUND_CEIL: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF;
42 /// truncate and do not suppress exceptions
43 #[stable(feature = "simd_x86", since = "1.27.0")]
44 pub const _MM_FROUND_TRUNC: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO;
45 /// use MXCSR.RC and do not suppress exceptions; see
46 /// `vendor::_MM_SET_ROUNDING_MODE`
47 #[stable(feature = "simd_x86", since = "1.27.0")]
48 pub const _MM_FROUND_RINT: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION;
49 /// use MXCSR.RC and suppress exceptions; see `vendor::_MM_SET_ROUNDING_MODE`
50 #[stable(feature = "simd_x86", since = "1.27.0")]
51 pub const _MM_FROUND_NEARBYINT: i32 = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION;
52 
/// Blend packed 8-bit integers from `a` and `b` using `mask`
///
/// The high bit of each corresponding mask byte determines the selection.
/// If the high bit is set the element of `b` is selected. The element
/// of `a` is selected otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pblendvb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i {
    transmute(pblendvb(a.as_i8x16(), b.as_i8x16(), mask.as_i8x16()))
}
67 
/// Blend packed 16-bit integers from `a` and `b` using the mask `IMM8`.
///
/// The mask bits determine the selection. A clear bit selects the
/// corresponding element of `a`, and a set bit the corresponding
/// element of `b`: bit `i` of `IMM8` controls lane `i` of the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
// Note: LLVM7 prefers the single-precision floating-point domain when possible
// see https://bugs.llvm.org/show_bug.cgi?id=38195
// #[cfg_attr(test, assert_instr(pblendw, IMM8 = 0xF0))]
#[cfg_attr(test, assert_instr(blendps, IMM8 = 0xF0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_epi16<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
    // Eight 16-bit lanes, so the whole 8-bit immediate is meaningful.
    static_assert_imm8!(IMM8);
    transmute(pblendw(a.as_i16x8(), b.as_i16x8(), IMM8 as u8))
}
87 
/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using `mask`
///
/// Per Intel's documentation, each result lane takes the element of `b`
/// when the sign (most significant) bit of the corresponding mask element
/// is set, and the element of `a` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
    blendvpd(a, b, mask)
}

/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using `mask`
///
/// Per Intel's documentation, each result lane takes the element of `b`
/// when the sign (most significant) bit of the corresponding mask element
/// is set, and the element of `a` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
    blendvps(a, b, mask)
}
111 
/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using control mask `IMM2`
///
/// Bit `i` of `IMM2` selects lane `i` from `b` when set, from `a` when clear.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
// Note: LLVM7 prefers the single-precision floating-point domain when possible
// see https://bugs.llvm.org/show_bug.cgi?id=38195
// #[cfg_attr(test, assert_instr(blendpd, IMM2 = 0b10))]
#[cfg_attr(test, assert_instr(blendps, IMM2 = 0b10))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_pd<const IMM2: i32>(a: __m128d, b: __m128d) -> __m128d {
    // Two 64-bit lanes, so only the low 2 immediate bits are valid.
    static_assert_imm2!(IMM2);
    blendpd(a, b, IMM2 as u8)
}

/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using mask `IMM4`
///
/// Bit `i` of `IMM4` selects lane `i` from `b` when set, from `a` when clear.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendps, IMM4 = 0b0101))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_ps<const IMM4: i32>(a: __m128, b: __m128) -> __m128 {
    // Four 32-bit lanes, so only the low 4 immediate bits are valid.
    static_assert_imm4!(IMM4);
    blendps(a, b, IMM4 as u8)
}
142 
/// Extracts a single-precision (32-bit) floating-point element from `a`,
/// selected with `IMM8`. The returned `i32` stores the float's bit-pattern,
/// and may be converted back to a floating point number via casting.
///
/// `IMM8` selects one of the four lanes and must be in `0..=3`.
///
/// # Example
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       unsafe fn worker() {
/// let mut float_store = vec![1.0, 1.0, 2.0, 3.0];
/// unsafe {
///     let simd_floats = _mm_set_ps(2.5, 5.0, 7.5, 10.0);
///     let x: i32 = _mm_extract_ps::<2>(simd_floats);
///     float_store.push(f32::from_bits(x as u32));
/// }
/// assert_eq!(float_store, vec![1.0, 1.0, 2.0, 3.0, 5.0]);
/// #       }
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(extractps, IMM8 = 0)
)]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_ps<const IMM8: i32>(a: __m128) -> i32 {
    static_assert_imm2!(IMM8);
    // Reinterpret the extracted f32's bits as an i32 (no numeric conversion).
    transmute(simd_extract::<_, f32>(a, IMM8 as u32))
}
182 
/// Extracts an 8-bit integer from `a`, selected with `IMM8`. Returns a 32-bit
/// integer containing the zero-extended integer data.
///
/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pextrb, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_epi8<const IMM8: i32>(a: __m128i) -> i32 {
    // 16 byte lanes, so the immediate must fit in 4 bits.
    static_assert_imm4!(IMM8);
    // Extract as u8 so the widening cast to i32 zero-extends.
    simd_extract::<_, u8>(a.as_u8x16(), IMM8 as u32) as i32
}

/// Extracts a 32-bit integer from `a` selected with `IMM8`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(extractps, IMM8 = 1)
)]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_epi32<const IMM8: i32>(a: __m128i) -> i32 {
    // Four 32-bit lanes, so the immediate must fit in 2 bits.
    static_assert_imm2!(IMM8);
    simd_extract::<_, i32>(a.as_i32x4(), IMM8 as u32)
}
214 
/// Select a single value in `b` to store at some position in `a`,
/// then zero elements according to `IMM8`.
///
/// `IMM8` specifies which bits from operand `b` will be copied, which bits in
/// the result they will be copied to, and which bits in the result will be
/// cleared. The following assignments are made:
///
/// * Bits `[7:6]` specify the bits to copy from operand `b`:
///     - `00`: Selects bits `[31:0]` from operand `b`.
///     - `01`: Selects bits `[63:32]` from operand `b`.
///     - `10`: Selects bits `[95:64]` from operand `b`.
///     - `11`: Selects bits `[127:96]` from operand `b`.
///
/// * Bits `[5:4]` specify the bits in the result to which the selected bits
/// from operand `b` are copied:
///     - `00`: Copies the selected bits from `b` to result bits `[31:0]`.
///     - `01`: Copies the selected bits from `b` to result bits `[63:32]`.
///     - `10`: Copies the selected bits from `b` to result bits `[95:64]`.
///     - `11`: Copies the selected bits from `b` to result bits `[127:96]`.
///
/// * Bits `[3:0]`: If any of these bits are set, the corresponding result
/// element is cleared.
///
/// The remaining result elements are taken from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(insertps, IMM8 = 0b1010))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_imm8!(IMM8);
    insertps(a, b, IMM8 as u8)
}
248 
/// Returns a copy of `a` with the 8-bit integer from `i` inserted at a
/// location specified by `IMM8`.
///
/// Only the low 8 bits of `i` are used.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pinsrb, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_epi8<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
    // 16 byte lanes, so the immediate must fit in 4 bits.
    static_assert_imm4!(IMM8);
    transmute(simd_insert(a.as_i8x16(), IMM8 as u32, i as i8))
}

/// Returns a copy of `a` with the 32-bit integer from `i` inserted at a
/// location specified by `IMM8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pinsrd, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_epi32<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
    // Four 32-bit lanes, so the immediate must fit in 2 bits.
    static_assert_imm2!(IMM8);
    transmute(simd_insert(a.as_i32x4(), IMM8 as u32, i))
}
276 
/// Compares packed signed 8-bit integers in `a` and `b` and returns packed
/// maximum values in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmaxsb(a.as_i8x16(), b.as_i8x16()))
}

/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
/// maximum.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmaxuw(a.as_u16x8(), b.as_u16x8()))
}

/// Compares packed signed 32-bit integers in `a` and `b`, and returns packed
/// maximum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmaxsd(a.as_i32x4(), b.as_i32x4()))
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
/// maximum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmaxud(a.as_u32x4(), b.as_u32x4()))
}
324 
/// Compares packed signed 8-bit integers in `a` and `b` and returns packed
/// minimum values in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i {
    transmute(pminsb(a.as_i8x16(), b.as_i8x16()))
}

/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
/// minimum.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i {
    transmute(pminuw(a.as_u16x8(), b.as_u16x8()))
}

/// Compares packed signed 32-bit integers in `a` and `b`, and returns packed
/// minimum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pminsd(a.as_i32x4(), b.as_i32x4()))
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
/// minimum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pminud(a.as_u32x4(), b.as_u32x4()))
}
372 
/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using unsigned saturation (values are clamped to the `u16` range
/// `[0, 65535]` before narrowing).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_packus_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(packusdw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(packusdw(a.as_i32x4(), b.as_i32x4()))
}
384 
/// Compares packed 64-bit integers in `a` and `b` for equality
///
/// Each 64-bit lane of the result is all ones when the corresponding lanes
/// are equal, and all zeros otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pcmpeqq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_eq::<_, i64x2>(a.as_i64x2(), b.as_i64x2()))
}
395 
/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed
/// 16-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    // Keep the low 8 lanes, then sign-extend each to 16 bits.
    let a: i8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute(simd_cast::<_, i16x8>(a))
}

/// Sign extend packed 8-bit integers in the low 4 bytes of `a` to packed
/// 32-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    // Keep the low 4 lanes, then sign-extend each to 32 bits.
    let a: i8x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Sign extend packed 8-bit integers in the low 2 bytes of `a` to packed
/// 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    // Keep the low 2 lanes, then sign-extend each to 64 bits.
    let a: i8x2 = simd_shuffle2!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Sign extend packed 16-bit integers in the low 4 elements of `a` to packed
/// 32-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i {
    let a = a.as_i16x8();
    // Keep the low 4 lanes, then sign-extend each to 32 bits.
    let a: i16x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Sign extend packed 16-bit integers in the low 2 elements of `a` to packed
/// 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i {
    let a = a.as_i16x8();
    // Keep the low 2 lanes, then sign-extend each to 64 bits.
    let a: i16x2 = simd_shuffle2!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Sign extend packed 32-bit integers in the low 2 elements of `a` to packed
/// 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i {
    let a = a.as_i32x4();
    // Keep the low 2 lanes, then sign-extend each to 64 bits.
    let a: i32x2 = simd_shuffle2!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}
474 
/// Zero-extends packed unsigned 8-bit integers in the low 8 bytes of `a` to
/// packed 16-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
    // Keep the low 8 lanes; casting from the unsigned type zero-extends.
    let a: u8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute(simd_cast::<_, i16x8>(a))
}

/// Zero-extends packed unsigned 8-bit integers in the low 4 bytes of `a` to
/// packed 32-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
    // Keep the low 4 lanes; casting from the unsigned type zero-extends.
    let a: u8x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Zero-extends packed unsigned 8-bit integers in the low 2 bytes of `a` to
/// packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
    // Keep the low 2 lanes; casting from the unsigned type zero-extends.
    let a: u8x2 = simd_shuffle2!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Zero-extends packed unsigned 16-bit integers in the low 4 elements of `a`
/// to packed 32-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i {
    let a = a.as_u16x8();
    // Keep the low 4 lanes; casting from the unsigned type zero-extends.
    let a: u16x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Zero-extends packed unsigned 16-bit integers in the low 2 elements of `a`
/// to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i {
    let a = a.as_u16x8();
    // Keep the low 2 lanes; casting from the unsigned type zero-extends.
    let a: u16x2 = simd_shuffle2!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Zero-extends packed unsigned 32-bit integers in the low 2 elements of `a`
/// to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu32_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i {
    let a = a.as_u32x4();
    // Keep the low 2 lanes; casting from the unsigned type zero-extends.
    let a: u32x2 = simd_shuffle2!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}
555 
/// Returns the dot product of two __m128d vectors.
///
/// `IMM8[1:0]` is the broadcast mask, and `IMM8[5:4]` is the condition mask.
/// If a condition mask bit is zero, the corresponding multiplication is
/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
/// the dot product will be stored in the return value component. Otherwise if
/// the broadcast mask bit is zero then the return component will be zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dppd, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_dp_pd<const IMM8: i32>(a: __m128d, b: __m128d) -> __m128d {
    // Both masks live in the single 8-bit immediate; validate its width.
    static_assert_imm8!(IMM8);
    dppd(a, b, IMM8 as u8)
}

/// Returns the dot product of two __m128 vectors.
///
/// `IMM8[3:0]` is the broadcast mask, and `IMM8[7:4]` is the condition mask.
/// If a condition mask bit is zero, the corresponding multiplication is
/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
/// the dot product will be stored in the return value component. Otherwise if
/// the broadcast mask bit is zero then the return component will be zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dpps, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_dp_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
    // Both masks live in the single 8-bit immediate; validate its width.
    static_assert_imm8!(IMM8);
    dpps(a, b, IMM8 as u8)
}
593 
/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_pd(a: __m128d) -> __m128d {
    // Generic vector floor; lowered to `roundpd` (checked by assert_instr).
    simd_floor(a)
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_ps(a: __m128) -> __m128 {
    // Generic vector floor; lowered to `roundps` (checked by assert_instr).
    simd_floor(a)
}
619 
/// Round the lower double-precision (64-bit) floating-point element in `b`
/// down to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element of the intrinsic
/// result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d {
    // _MM_FROUND_FLOOR = round toward negative infinity, exceptions raised.
    roundsd(a, b, _MM_FROUND_FLOOR)
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// down to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 {
    // _MM_FROUND_FLOOR = round toward negative infinity, exceptions raised.
    roundss(a, b, _MM_FROUND_FLOOR)
}
649 
/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_pd(a: __m128d) -> __m128d {
    // Generic vector ceil; lowered to `roundpd` (checked by assert_instr).
    simd_ceil(a)
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_ps(a: __m128) -> __m128 {
    // Generic vector ceil; lowered to `roundps` (checked by assert_instr).
    simd_ceil(a)
}
675 
/// Round the lower double-precision (64-bit) floating-point element in `b`
/// up to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element
/// of the intrinsic result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d {
    roundsd(a, b, _MM_FROUND_CEIL)
}
690 
/// Round the lower single-precision (32-bit) floating-point element in `b`
/// up to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 {
    // Round `b`'s low lane up; the upper three lanes are taken from `a`.
    roundss(a, b, _MM_FROUND_CEIL)
}
705 
/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// using the `ROUNDING` parameter, and stores the results as packed
/// double-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd, ROUNDING = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_pd<const ROUNDING: i32>(a: __m128d) -> __m128d {
    // The rounding-control immediate must fit in 4 bits.
    static_assert_imm4!(ROUNDING);
    roundpd(a, ROUNDING)
}
746 
/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// using the `ROUNDING` parameter, and stores the results as packed
/// single-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps, ROUNDING = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_ps<const ROUNDING: i32>(a: __m128) -> __m128 {
    // The rounding-control immediate must fit in 4 bits.
    static_assert_imm4!(ROUNDING);
    roundps(a, ROUNDING)
}
787 
/// Round the lower double-precision (64-bit) floating-point element in `b`
/// using the `ROUNDING` parameter, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element of the intrinsic
/// result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd, ROUNDING = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d) -> __m128d {
    // The rounding-control immediate must fit in 4 bits.
    static_assert_imm4!(ROUNDING);
    roundsd(a, b, ROUNDING)
}
830 
/// Round the lower single-precision (32-bit) floating-point element in `b`
/// using the `ROUNDING` parameter, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss, ROUNDING = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_ss<const ROUNDING: i32>(a: __m128, b: __m128) -> __m128 {
    // The rounding-control immediate must fit in 4 bits.
    static_assert_imm4!(ROUNDING);
    roundss(a, b, ROUNDING)
}
873 
/// Finds the minimum unsigned 16-bit element in the 128-bit __m128i vector,
/// returning a vector containing its value in its first position, and its
/// index in its second position; all other elements are set to zero.
///
/// This intrinsic corresponds to the `VPHMINPOSUW` / `PHMINPOSUW`
/// instruction.
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
///
/// Returns:
///
/// A 128-bit value where:
///
/// * bits `[15:0]` - contain the minimum value found in parameter `a`,
/// * bits `[18:16]` - contain the index of the minimum value
/// * remaining bits are set to `0`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(phminposuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i {
    transmute(phminposuw(a.as_u16x8()))
}
902 
/// Multiplies the low 32-bit integers from each packed 64-bit
/// element in `a` and `b`, and returns the signed 64-bit result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmuldq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i {
    // `pmuldq` operates on the vectors viewed as i32x4 and yields i64x2.
    transmute(pmuldq(a.as_i32x4(), b.as_i32x4()))
}
914 
/// Multiplies the packed 32-bit integers in `a` and `b`, producing intermediate
/// 64-bit integers, and returns the lowest 32 bits, whatever they might be,
/// reinterpreted as a signed integer. While `pmulld __m128i::splat(2),
/// __m128i::splat(2)` returns the obvious `__m128i::splat(4)`, due to wrapping
/// arithmetic `pmulld __m128i::splat(i32::MAX), __m128i::splat(2)` would
/// return a negative number.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mullo_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmulld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i {
    // Generic wrapping multiply; codegen to `pmulld` is checked above.
    transmute(simd_mul(a.as_i32x4(), b.as_i32x4()))
}
930 
/// Subtracts 8-bit unsigned integer values and computes the absolute
/// values of the differences to the corresponding bits in the destination.
/// Then sums of the absolute differences are returned according to the bit
/// fields in the immediate operand.
///
/// The following algorithm is performed:
///
/// ```ignore
/// i = IMM8[2] * 4
/// j = IMM8[1:0] * 4
/// for k := 0 to 7
///     d0 = abs(a[i + k + 0] - b[j + 0])
///     d1 = abs(a[i + k + 1] - b[j + 1])
///     d2 = abs(a[i + k + 2] - b[j + 2])
///     d3 = abs(a[i + k + 3] - b[j + 3])
///     r[k] = d0 + d1 + d2 + d3
/// ```
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
/// * `b` - A 128-bit vector of type `__m128i`.
/// * `IMM8` - An 8-bit immediate operand specifying how the absolute
///   differences are to be calculated
///     * Bit `[2]` specify the offset for operand `a`
///     * Bits `[1:0]` specify the offset for operand `b`
///
/// Returns:
///
/// * A `__m128i` vector containing the sums of the sets of absolute
///   differences between both operands.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mpsadbw_epu8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(mpsadbw, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mpsadbw_epu8<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
    // Only the low 3 bits of the immediate are meaningful (see the
    // algorithm above), hence the imm3 assertion.
    static_assert_imm3!(IMM8);
    transmute(mpsadbw(a.as_u8x16(), b.as_u8x16(), IMM8 as u8))
}
973 
/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
    // Returns 1 if all bits of `a` selected by `mask` are zero, else 0.
    ptestz(a.as_i64x2(), mask.as_i64x2())
}
996 
/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 {
    // Returns 1 if all bits of `a` selected by `mask` are one, else 0.
    ptestc(a.as_i64x2(), mask.as_i64x2())
}
1019 
/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 {
    // Returns 1 if the selected bits of `a` are a mix of ones and zeros.
    ptestnzc(a.as_i64x2(), mask.as_i64x2())
}
1042 
/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 {
    // Alias for `_mm_testz_si128`, kept for API parity with Intel.
    _mm_testz_si128(a, mask)
}
1065 
/// Tests whether the specified bits in `a` 128-bit integer vector are all
/// ones.
///
/// Argument:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
///
/// Returns:
///
/// * `1` - if the bits specified in the operand are all set to 1,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pcmpeqd))]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_all_ones(a: __m128i) -> i32 {
    // `_mm_cmpeq_epi32(a, a)` yields an all-ones mask, so this checks
    // every bit of `a` via `_mm_testc_si128`.
    _mm_testc_si128(a, _mm_cmpeq_epi32(a, a))
}
1087 
/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_mix_ones_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
    // Alias for `_mm_testnzc_si128`, kept for API parity with Intel.
    _mm_testnzc_si128(a, mask)
}
1110 
// LLVM intrinsic declarations backing the SSE4.1 vendor functions above.
#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.sse41.pblendvb"]
    fn pblendvb(a: i8x16, b: i8x16, mask: i8x16) -> i8x16;
    #[link_name = "llvm.x86.sse41.blendvpd"]
    fn blendvpd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d;
    #[link_name = "llvm.x86.sse41.blendvps"]
    fn blendvps(a: __m128, b: __m128, mask: __m128) -> __m128;
    #[link_name = "llvm.x86.sse41.blendpd"]
    fn blendpd(a: __m128d, b: __m128d, imm2: u8) -> __m128d;
    #[link_name = "llvm.x86.sse41.blendps"]
    fn blendps(a: __m128, b: __m128, imm4: u8) -> __m128;
    #[link_name = "llvm.x86.sse41.pblendw"]
    fn pblendw(a: i16x8, b: i16x8, imm8: u8) -> i16x8;
    #[link_name = "llvm.x86.sse41.insertps"]
    fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128;
    #[link_name = "llvm.x86.sse41.pmaxsb"]
    fn pmaxsb(a: i8x16, b: i8x16) -> i8x16;
    #[link_name = "llvm.x86.sse41.pmaxuw"]
    fn pmaxuw(a: u16x8, b: u16x8) -> u16x8;
    #[link_name = "llvm.x86.sse41.pmaxsd"]
    fn pmaxsd(a: i32x4, b: i32x4) -> i32x4;
    #[link_name = "llvm.x86.sse41.pmaxud"]
    fn pmaxud(a: u32x4, b: u32x4) -> u32x4;
    #[link_name = "llvm.x86.sse41.pminsb"]
    fn pminsb(a: i8x16, b: i8x16) -> i8x16;
    #[link_name = "llvm.x86.sse41.pminuw"]
    fn pminuw(a: u16x8, b: u16x8) -> u16x8;
    #[link_name = "llvm.x86.sse41.pminsd"]
    fn pminsd(a: i32x4, b: i32x4) -> i32x4;
    #[link_name = "llvm.x86.sse41.pminud"]
    fn pminud(a: u32x4, b: u32x4) -> u32x4;
    #[link_name = "llvm.x86.sse41.packusdw"]
    fn packusdw(a: i32x4, b: i32x4) -> u16x8;
    #[link_name = "llvm.x86.sse41.dppd"]
    fn dppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d;
    #[link_name = "llvm.x86.sse41.dpps"]
    fn dpps(a: __m128, b: __m128, imm8: u8) -> __m128;
    #[link_name = "llvm.x86.sse41.round.pd"]
    fn roundpd(a: __m128d, rounding: i32) -> __m128d;
    #[link_name = "llvm.x86.sse41.round.ps"]
    fn roundps(a: __m128, rounding: i32) -> __m128;
    #[link_name = "llvm.x86.sse41.round.sd"]
    fn roundsd(a: __m128d, b: __m128d, rounding: i32) -> __m128d;
    #[link_name = "llvm.x86.sse41.round.ss"]
    fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128;
    #[link_name = "llvm.x86.sse41.phminposuw"]
    fn phminposuw(a: u16x8) -> u16x8;
    #[link_name = "llvm.x86.sse41.pmuldq"]
    fn pmuldq(a: i32x4, b: i32x4) -> i64x2;
    #[link_name = "llvm.x86.sse41.mpsadbw"]
    fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8;
    #[link_name = "llvm.x86.sse41.ptestz"]
    fn ptestz(a: i64x2, mask: i64x2) -> i32;
    #[link_name = "llvm.x86.sse41.ptestc"]
    fn ptestc(a: i64x2, mask: i64x2) -> i32;
    #[link_name = "llvm.x86.sse41.ptestnzc"]
    fn ptestnzc(a: i64x2, mask: i64x2) -> i32;
}
1170 
1171 #[cfg(test)]
1172 mod tests {
1173     use crate::core_arch::x86::*;
1174     use std::mem;
1175     use stdarch_test::simd_test;
1176 
1177     #[simd_test(enable = "sse4.1")]
test_mm_blendv_epi8()1178     unsafe fn test_mm_blendv_epi8() {
1179         #[rustfmt::skip]
1180         let a = _mm_setr_epi8(
1181             0, 1, 2, 3, 4, 5, 6, 7,
1182             8, 9, 10, 11, 12, 13, 14, 15,
1183         );
1184         #[rustfmt::skip]
1185         let b = _mm_setr_epi8(
1186             16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
1187         );
1188         #[rustfmt::skip]
1189         let mask = _mm_setr_epi8(
1190             0, -1, 0, -1, 0, -1, 0, -1,
1191             0, -1, 0, -1, 0, -1, 0, -1,
1192         );
1193         #[rustfmt::skip]
1194         let e = _mm_setr_epi8(
1195             0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31,
1196         );
1197         assert_eq_m128i(_mm_blendv_epi8(a, b, mask), e);
1198     }
1199 
1200     #[simd_test(enable = "sse4.1")]
test_mm_blendv_pd()1201     unsafe fn test_mm_blendv_pd() {
1202         let a = _mm_set1_pd(0.0);
1203         let b = _mm_set1_pd(1.0);
1204         let mask = transmute(_mm_setr_epi64x(0, -1));
1205         let r = _mm_blendv_pd(a, b, mask);
1206         let e = _mm_setr_pd(0.0, 1.0);
1207         assert_eq_m128d(r, e);
1208     }
1209 
1210     #[simd_test(enable = "sse4.1")]
test_mm_blendv_ps()1211     unsafe fn test_mm_blendv_ps() {
1212         let a = _mm_set1_ps(0.0);
1213         let b = _mm_set1_ps(1.0);
1214         let mask = transmute(_mm_setr_epi32(0, -1, 0, -1));
1215         let r = _mm_blendv_ps(a, b, mask);
1216         let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
1217         assert_eq_m128(r, e);
1218     }
1219 
1220     #[simd_test(enable = "sse4.1")]
test_mm_blend_pd()1221     unsafe fn test_mm_blend_pd() {
1222         let a = _mm_set1_pd(0.0);
1223         let b = _mm_set1_pd(1.0);
1224         let r = _mm_blend_pd::<0b10>(a, b);
1225         let e = _mm_setr_pd(0.0, 1.0);
1226         assert_eq_m128d(r, e);
1227     }
1228 
1229     #[simd_test(enable = "sse4.1")]
test_mm_blend_ps()1230     unsafe fn test_mm_blend_ps() {
1231         let a = _mm_set1_ps(0.0);
1232         let b = _mm_set1_ps(1.0);
1233         let r = _mm_blend_ps::<0b1010>(a, b);
1234         let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
1235         assert_eq_m128(r, e);
1236     }
1237 
1238     #[simd_test(enable = "sse4.1")]
test_mm_blend_epi16()1239     unsafe fn test_mm_blend_epi16() {
1240         let a = _mm_set1_epi16(0);
1241         let b = _mm_set1_epi16(1);
1242         let r = _mm_blend_epi16::<0b1010_1100>(a, b);
1243         let e = _mm_setr_epi16(0, 0, 1, 1, 0, 1, 0, 1);
1244         assert_eq_m128i(r, e);
1245     }
1246 
1247     #[simd_test(enable = "sse4.1")]
test_mm_extract_ps()1248     unsafe fn test_mm_extract_ps() {
1249         let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0);
1250         let r: f32 = transmute(_mm_extract_ps::<1>(a));
1251         assert_eq!(r, 1.0);
1252         let r: f32 = transmute(_mm_extract_ps::<3>(a));
1253         assert_eq!(r, 3.0);
1254     }
1255 
1256     #[simd_test(enable = "sse4.1")]
test_mm_extract_epi8()1257     unsafe fn test_mm_extract_epi8() {
1258         #[rustfmt::skip]
1259         let a = _mm_setr_epi8(
1260             -1, 1, 2, 3, 4, 5, 6, 7,
1261             8, 9, 10, 11, 12, 13, 14, 15
1262         );
1263         let r1 = _mm_extract_epi8::<0>(a);
1264         let r2 = _mm_extract_epi8::<3>(a);
1265         assert_eq!(r1, 0xFF);
1266         assert_eq!(r2, 3);
1267     }
1268 
1269     #[simd_test(enable = "sse4.1")]
test_mm_extract_epi32()1270     unsafe fn test_mm_extract_epi32() {
1271         let a = _mm_setr_epi32(0, 1, 2, 3);
1272         let r = _mm_extract_epi32::<1>(a);
1273         assert_eq!(r, 1);
1274         let r = _mm_extract_epi32::<3>(a);
1275         assert_eq!(r, 3);
1276     }
1277 
1278     #[simd_test(enable = "sse4.1")]
test_mm_insert_ps()1279     unsafe fn test_mm_insert_ps() {
1280         let a = _mm_set1_ps(1.0);
1281         let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
1282         let r = _mm_insert_ps::<0b11_00_1100>(a, b);
1283         let e = _mm_setr_ps(4.0, 1.0, 0.0, 0.0);
1284         assert_eq_m128(r, e);
1285     }
1286 
1287     #[simd_test(enable = "sse4.1")]
test_mm_insert_epi8()1288     unsafe fn test_mm_insert_epi8() {
1289         let a = _mm_set1_epi8(0);
1290         let e = _mm_setr_epi8(0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
1291         let r = _mm_insert_epi8::<1>(a, 32);
1292         assert_eq_m128i(r, e);
1293         let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0);
1294         let r = _mm_insert_epi8::<14>(a, 32);
1295         assert_eq_m128i(r, e);
1296     }
1297 
1298     #[simd_test(enable = "sse4.1")]
test_mm_insert_epi32()1299     unsafe fn test_mm_insert_epi32() {
1300         let a = _mm_set1_epi32(0);
1301         let e = _mm_setr_epi32(0, 32, 0, 0);
1302         let r = _mm_insert_epi32::<1>(a, 32);
1303         assert_eq_m128i(r, e);
1304         let e = _mm_setr_epi32(0, 0, 0, 32);
1305         let r = _mm_insert_epi32::<3>(a, 32);
1306         assert_eq_m128i(r, e);
1307     }
1308 
1309     #[simd_test(enable = "sse4.1")]
test_mm_max_epi8()1310     unsafe fn test_mm_max_epi8() {
1311         #[rustfmt::skip]
1312         let a = _mm_setr_epi8(
1313             1, 4, 5, 8, 9, 12, 13, 16,
1314             17, 20, 21, 24, 25, 28, 29, 32,
1315         );
1316         #[rustfmt::skip]
1317         let b = _mm_setr_epi8(
1318             2, 3, 6, 7, 10, 11, 14, 15,
1319             18, 19, 22, 23, 26, 27, 30, 31,
1320         );
1321         let r = _mm_max_epi8(a, b);
1322         #[rustfmt::skip]
1323         let e = _mm_setr_epi8(
1324             2, 4, 6, 8, 10, 12, 14, 16,
1325             18, 20, 22, 24, 26, 28, 30, 32,
1326         );
1327         assert_eq_m128i(r, e);
1328     }
1329 
1330     #[simd_test(enable = "sse4.1")]
test_mm_max_epu16()1331     unsafe fn test_mm_max_epu16() {
1332         let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
1333         let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
1334         let r = _mm_max_epu16(a, b);
1335         let e = _mm_setr_epi16(2, 4, 6, 8, 10, 12, 14, 16);
1336         assert_eq_m128i(r, e);
1337     }
1338 
1339     #[simd_test(enable = "sse4.1")]
test_mm_max_epi32()1340     unsafe fn test_mm_max_epi32() {
1341         let a = _mm_setr_epi32(1, 4, 5, 8);
1342         let b = _mm_setr_epi32(2, 3, 6, 7);
1343         let r = _mm_max_epi32(a, b);
1344         let e = _mm_setr_epi32(2, 4, 6, 8);
1345         assert_eq_m128i(r, e);
1346     }
1347 
1348     #[simd_test(enable = "sse4.1")]
test_mm_max_epu32()1349     unsafe fn test_mm_max_epu32() {
1350         let a = _mm_setr_epi32(1, 4, 5, 8);
1351         let b = _mm_setr_epi32(2, 3, 6, 7);
1352         let r = _mm_max_epu32(a, b);
1353         let e = _mm_setr_epi32(2, 4, 6, 8);
1354         assert_eq_m128i(r, e);
1355     }
1356 
1357     #[simd_test(enable = "sse4.1")]
test_mm_min_epi8_1()1358     unsafe fn test_mm_min_epi8_1() {
1359         #[rustfmt::skip]
1360         let a = _mm_setr_epi8(
1361             1, 4, 5, 8, 9, 12, 13, 16,
1362             17, 20, 21, 24, 25, 28, 29, 32,
1363         );
1364         #[rustfmt::skip]
1365         let b = _mm_setr_epi8(
1366             2, 3, 6, 7, 10, 11, 14, 15,
1367             18, 19, 22, 23, 26, 27, 30, 31,
1368         );
1369         let r = _mm_min_epi8(a, b);
1370         #[rustfmt::skip]
1371         let e = _mm_setr_epi8(
1372             1, 3, 5, 7, 9, 11, 13, 15,
1373             17, 19, 21, 23, 25, 27, 29, 31,
1374         );
1375         assert_eq_m128i(r, e);
1376     }
1377 
1378     #[simd_test(enable = "sse4.1")]
test_mm_min_epi8_2()1379     unsafe fn test_mm_min_epi8_2() {
1380         #[rustfmt::skip]
1381         let a = _mm_setr_epi8(
1382             1, -4, -5, 8, -9, -12, 13, -16,
1383             17, 20, 21, 24, 25, 28, 29, 32,
1384         );
1385         #[rustfmt::skip]
1386         let b = _mm_setr_epi8(
1387             2, -3, -6, 7, -10, -11, 14, -15,
1388             18, 19, 22, 23, 26, 27, 30, 31,
1389         );
1390         let r = _mm_min_epi8(a, b);
1391         #[rustfmt::skip]
1392         let e = _mm_setr_epi8(
1393             1, -4, -6, 7, -10, -12, 13, -16,
1394             17, 19, 21, 23, 25, 27, 29, 31,
1395         );
1396         assert_eq_m128i(r, e);
1397     }
1398 
1399     #[simd_test(enable = "sse4.1")]
test_mm_min_epu16()1400     unsafe fn test_mm_min_epu16() {
1401         let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
1402         let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
1403         let r = _mm_min_epu16(a, b);
1404         let e = _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15);
1405         assert_eq_m128i(r, e);
1406     }
1407 
1408     #[simd_test(enable = "sse4.1")]
test_mm_min_epi32_1()1409     unsafe fn test_mm_min_epi32_1() {
1410         let a = _mm_setr_epi32(1, 4, 5, 8);
1411         let b = _mm_setr_epi32(2, 3, 6, 7);
1412         let r = _mm_min_epi32(a, b);
1413         let e = _mm_setr_epi32(1, 3, 5, 7);
1414         assert_eq_m128i(r, e);
1415     }
1416 
1417     #[simd_test(enable = "sse4.1")]
test_mm_min_epi32_2()1418     unsafe fn test_mm_min_epi32_2() {
1419         let a = _mm_setr_epi32(-1, 4, 5, -7);
1420         let b = _mm_setr_epi32(-2, 3, -6, 8);
1421         let r = _mm_min_epi32(a, b);
1422         let e = _mm_setr_epi32(-2, 3, -6, -7);
1423         assert_eq_m128i(r, e);
1424     }
1425 
1426     #[simd_test(enable = "sse4.1")]
test_mm_min_epu32()1427     unsafe fn test_mm_min_epu32() {
1428         let a = _mm_setr_epi32(1, 4, 5, 8);
1429         let b = _mm_setr_epi32(2, 3, 6, 7);
1430         let r = _mm_min_epu32(a, b);
1431         let e = _mm_setr_epi32(1, 3, 5, 7);
1432         assert_eq_m128i(r, e);
1433     }
1434 
1435     #[simd_test(enable = "sse4.1")]
test_mm_packus_epi32()1436     unsafe fn test_mm_packus_epi32() {
1437         let a = _mm_setr_epi32(1, 2, 3, 4);
1438         let b = _mm_setr_epi32(-1, -2, -3, -4);
1439         let r = _mm_packus_epi32(a, b);
1440         let e = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
1441         assert_eq_m128i(r, e);
1442     }
1443 
1444     #[simd_test(enable = "sse4.1")]
test_mm_cmpeq_epi64()1445     unsafe fn test_mm_cmpeq_epi64() {
1446         let a = _mm_setr_epi64x(0, 1);
1447         let b = _mm_setr_epi64x(0, 0);
1448         let r = _mm_cmpeq_epi64(a, b);
1449         let e = _mm_setr_epi64x(-1, 0);
1450         assert_eq_m128i(r, e);
1451     }
1452 
1453     #[simd_test(enable = "sse4.1")]
test_mm_cvtepi8_epi16()1454     unsafe fn test_mm_cvtepi8_epi16() {
1455         let a = _mm_set1_epi8(10);
1456         let r = _mm_cvtepi8_epi16(a);
1457         let e = _mm_set1_epi16(10);
1458         assert_eq_m128i(r, e);
1459         let a = _mm_set1_epi8(-10);
1460         let r = _mm_cvtepi8_epi16(a);
1461         let e = _mm_set1_epi16(-10);
1462         assert_eq_m128i(r, e);
1463     }
1464 
1465     #[simd_test(enable = "sse4.1")]
test_mm_cvtepi8_epi32()1466     unsafe fn test_mm_cvtepi8_epi32() {
1467         let a = _mm_set1_epi8(10);
1468         let r = _mm_cvtepi8_epi32(a);
1469         let e = _mm_set1_epi32(10);
1470         assert_eq_m128i(r, e);
1471         let a = _mm_set1_epi8(-10);
1472         let r = _mm_cvtepi8_epi32(a);
1473         let e = _mm_set1_epi32(-10);
1474         assert_eq_m128i(r, e);
1475     }
1476 
1477     #[simd_test(enable = "sse4.1")]
test_mm_cvtepi8_epi64()1478     unsafe fn test_mm_cvtepi8_epi64() {
1479         let a = _mm_set1_epi8(10);
1480         let r = _mm_cvtepi8_epi64(a);
1481         let e = _mm_set1_epi64x(10);
1482         assert_eq_m128i(r, e);
1483         let a = _mm_set1_epi8(-10);
1484         let r = _mm_cvtepi8_epi64(a);
1485         let e = _mm_set1_epi64x(-10);
1486         assert_eq_m128i(r, e);
1487     }
1488 
1489     #[simd_test(enable = "sse4.1")]
test_mm_cvtepi16_epi32()1490     unsafe fn test_mm_cvtepi16_epi32() {
1491         let a = _mm_set1_epi16(10);
1492         let r = _mm_cvtepi16_epi32(a);
1493         let e = _mm_set1_epi32(10);
1494         assert_eq_m128i(r, e);
1495         let a = _mm_set1_epi16(-10);
1496         let r = _mm_cvtepi16_epi32(a);
1497         let e = _mm_set1_epi32(-10);
1498         assert_eq_m128i(r, e);
1499     }
1500 
1501     #[simd_test(enable = "sse4.1")]
test_mm_cvtepi16_epi64()1502     unsafe fn test_mm_cvtepi16_epi64() {
1503         let a = _mm_set1_epi16(10);
1504         let r = _mm_cvtepi16_epi64(a);
1505         let e = _mm_set1_epi64x(10);
1506         assert_eq_m128i(r, e);
1507         let a = _mm_set1_epi16(-10);
1508         let r = _mm_cvtepi16_epi64(a);
1509         let e = _mm_set1_epi64x(-10);
1510         assert_eq_m128i(r, e);
1511     }
1512 
1513     #[simd_test(enable = "sse4.1")]
test_mm_cvtepi32_epi64()1514     unsafe fn test_mm_cvtepi32_epi64() {
1515         let a = _mm_set1_epi32(10);
1516         let r = _mm_cvtepi32_epi64(a);
1517         let e = _mm_set1_epi64x(10);
1518         assert_eq_m128i(r, e);
1519         let a = _mm_set1_epi32(-10);
1520         let r = _mm_cvtepi32_epi64(a);
1521         let e = _mm_set1_epi64x(-10);
1522         assert_eq_m128i(r, e);
1523     }
1524 
1525     #[simd_test(enable = "sse4.1")]
test_mm_cvtepu8_epi16()1526     unsafe fn test_mm_cvtepu8_epi16() {
1527         let a = _mm_set1_epi8(10);
1528         let r = _mm_cvtepu8_epi16(a);
1529         let e = _mm_set1_epi16(10);
1530         assert_eq_m128i(r, e);
1531     }
1532 
1533     #[simd_test(enable = "sse4.1")]
test_mm_cvtepu8_epi32()1534     unsafe fn test_mm_cvtepu8_epi32() {
1535         let a = _mm_set1_epi8(10);
1536         let r = _mm_cvtepu8_epi32(a);
1537         let e = _mm_set1_epi32(10);
1538         assert_eq_m128i(r, e);
1539     }
1540 
1541     #[simd_test(enable = "sse4.1")]
test_mm_cvtepu8_epi64()1542     unsafe fn test_mm_cvtepu8_epi64() {
1543         let a = _mm_set1_epi8(10);
1544         let r = _mm_cvtepu8_epi64(a);
1545         let e = _mm_set1_epi64x(10);
1546         assert_eq_m128i(r, e);
1547     }
1548 
1549     #[simd_test(enable = "sse4.1")]
test_mm_cvtepu16_epi32()1550     unsafe fn test_mm_cvtepu16_epi32() {
1551         let a = _mm_set1_epi16(10);
1552         let r = _mm_cvtepu16_epi32(a);
1553         let e = _mm_set1_epi32(10);
1554         assert_eq_m128i(r, e);
1555     }
1556 
1557     #[simd_test(enable = "sse4.1")]
test_mm_cvtepu16_epi64()1558     unsafe fn test_mm_cvtepu16_epi64() {
1559         let a = _mm_set1_epi16(10);
1560         let r = _mm_cvtepu16_epi64(a);
1561         let e = _mm_set1_epi64x(10);
1562         assert_eq_m128i(r, e);
1563     }
1564 
1565     #[simd_test(enable = "sse4.1")]
test_mm_cvtepu32_epi64()1566     unsafe fn test_mm_cvtepu32_epi64() {
1567         let a = _mm_set1_epi32(10);
1568         let r = _mm_cvtepu32_epi64(a);
1569         let e = _mm_set1_epi64x(10);
1570         assert_eq_m128i(r, e);
1571     }
1572 
1573     #[simd_test(enable = "sse4.1")]
test_mm_dp_pd()1574     unsafe fn test_mm_dp_pd() {
1575         let a = _mm_setr_pd(2.0, 3.0);
1576         let b = _mm_setr_pd(1.0, 4.0);
1577         let e = _mm_setr_pd(14.0, 0.0);
1578         assert_eq_m128d(_mm_dp_pd::<0b00110001>(a, b), e);
1579     }
1580 
1581     #[simd_test(enable = "sse4.1")]
test_mm_dp_ps()1582     unsafe fn test_mm_dp_ps() {
1583         let a = _mm_setr_ps(2.0, 3.0, 1.0, 10.0);
1584         let b = _mm_setr_ps(1.0, 4.0, 0.5, 10.0);
1585         let e = _mm_setr_ps(14.5, 0.0, 14.5, 0.0);
1586         assert_eq_m128(_mm_dp_ps::<0b01110101>(a, b), e);
1587     }
1588 
1589     #[simd_test(enable = "sse4.1")]
test_mm_floor_pd()1590     unsafe fn test_mm_floor_pd() {
1591         let a = _mm_setr_pd(2.5, 4.5);
1592         let r = _mm_floor_pd(a);
1593         let e = _mm_setr_pd(2.0, 4.0);
1594         assert_eq_m128d(r, e);
1595     }
1596 
1597     #[simd_test(enable = "sse4.1")]
test_mm_floor_ps()1598     unsafe fn test_mm_floor_ps() {
1599         let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
1600         let r = _mm_floor_ps(a);
1601         let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
1602         assert_eq_m128(r, e);
1603     }
1604 
1605     #[simd_test(enable = "sse4.1")]
test_mm_floor_sd()1606     unsafe fn test_mm_floor_sd() {
1607         let a = _mm_setr_pd(2.5, 4.5);
1608         let b = _mm_setr_pd(-1.5, -3.5);
1609         let r = _mm_floor_sd(a, b);
1610         let e = _mm_setr_pd(-2.0, 4.5);
1611         assert_eq_m128d(r, e);
1612     }
1613 
1614     #[simd_test(enable = "sse4.1")]
test_mm_floor_ss()1615     unsafe fn test_mm_floor_ss() {
1616         let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
1617         let b = _mm_setr_ps(-1.5, -3.5, -7.5, -15.5);
1618         let r = _mm_floor_ss(a, b);
1619         let e = _mm_setr_ps(-2.0, 4.5, 8.5, 16.5);
1620         assert_eq_m128(r, e);
1621     }
1622 
1623     #[simd_test(enable = "sse4.1")]
test_mm_ceil_pd()1624     unsafe fn test_mm_ceil_pd() {
1625         let a = _mm_setr_pd(1.5, 3.5);
1626         let r = _mm_ceil_pd(a);
1627         let e = _mm_setr_pd(2.0, 4.0);
1628         assert_eq_m128d(r, e);
1629     }
1630 
1631     #[simd_test(enable = "sse4.1")]
test_mm_ceil_ps()1632     unsafe fn test_mm_ceil_ps() {
1633         let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
1634         let r = _mm_ceil_ps(a);
1635         let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
1636         assert_eq_m128(r, e);
1637     }
1638 
1639     #[simd_test(enable = "sse4.1")]
test_mm_ceil_sd()1640     unsafe fn test_mm_ceil_sd() {
1641         let a = _mm_setr_pd(1.5, 3.5);
1642         let b = _mm_setr_pd(-2.5, -4.5);
1643         let r = _mm_ceil_sd(a, b);
1644         let e = _mm_setr_pd(-2.0, 3.5);
1645         assert_eq_m128d(r, e);
1646     }
1647 
1648     #[simd_test(enable = "sse4.1")]
test_mm_ceil_ss()1649     unsafe fn test_mm_ceil_ss() {
1650         let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
1651         let b = _mm_setr_ps(-2.5, -4.5, -8.5, -16.5);
1652         let r = _mm_ceil_ss(a, b);
1653         let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
1654         assert_eq_m128(r, e);
1655     }
1656 
1657     #[simd_test(enable = "sse4.1")]
test_mm_round_pd()1658     unsafe fn test_mm_round_pd() {
1659         let a = _mm_setr_pd(1.25, 3.75);
1660         let r = _mm_round_pd::<_MM_FROUND_TO_NEAREST_INT>(a);
1661         let e = _mm_setr_pd(1.0, 4.0);
1662         assert_eq_m128d(r, e);
1663     }
1664 
1665     #[simd_test(enable = "sse4.1")]
test_mm_round_ps()1666     unsafe fn test_mm_round_ps() {
1667         let a = _mm_setr_ps(2.25, 4.75, -1.75, -4.25);
1668         let r = _mm_round_ps::<_MM_FROUND_TO_ZERO>(a);
1669         let e = _mm_setr_ps(2.0, 4.0, -1.0, -4.0);
1670         assert_eq_m128(r, e);
1671     }
1672 
1673     #[simd_test(enable = "sse4.1")]
test_mm_round_sd()1674     unsafe fn test_mm_round_sd() {
1675         let a = _mm_setr_pd(1.5, 3.5);
1676         let b = _mm_setr_pd(-2.5, -4.5);
1677         let old_mode = _MM_GET_ROUNDING_MODE();
1678         _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
1679         let r = _mm_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, b);
1680         _MM_SET_ROUNDING_MODE(old_mode);
1681         let e = _mm_setr_pd(-2.0, 3.5);
1682         assert_eq_m128d(r, e);
1683     }
1684 
1685     #[simd_test(enable = "sse4.1")]
test_mm_round_ss()1686     unsafe fn test_mm_round_ss() {
1687         let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
1688         let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
1689         let old_mode = _MM_GET_ROUNDING_MODE();
1690         _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
1691         let r = _mm_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, b);
1692         _MM_SET_ROUNDING_MODE(old_mode);
1693         let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
1694         assert_eq_m128(r, e);
1695     }
1696 
1697     #[simd_test(enable = "sse4.1")]
test_mm_minpos_epu16_1()1698     unsafe fn test_mm_minpos_epu16_1() {
1699         let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66);
1700         let r = _mm_minpos_epu16(a);
1701         let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0);
1702         assert_eq_m128i(r, e);
1703     }
1704 
1705     #[simd_test(enable = "sse4.1")]
test_mm_minpos_epu16_2()1706     unsafe fn test_mm_minpos_epu16_2() {
1707         let a = _mm_setr_epi16(0, 18, 44, 97, 50, 13, 67, 66);
1708         let r = _mm_minpos_epu16(a);
1709         let e = _mm_setr_epi16(0, 0, 0, 0, 0, 0, 0, 0);
1710         assert_eq_m128i(r, e);
1711     }
1712 
1713     #[simd_test(enable = "sse4.1")]
test_mm_mul_epi32()1714     unsafe fn test_mm_mul_epi32() {
1715         {
1716             let a = _mm_setr_epi32(1, 1, 1, 1);
1717             let b = _mm_setr_epi32(1, 2, 3, 4);
1718             let r = _mm_mul_epi32(a, b);
1719             let e = _mm_setr_epi64x(1, 3);
1720             assert_eq_m128i(r, e);
1721         }
1722         {
1723             let a = _mm_setr_epi32(15, 2 /* ignored */, 1234567, 4 /* ignored */);
1724             let b = _mm_setr_epi32(
1725                 -20, -256, /* ignored */
1726                 666666, 666666, /* ignored */
1727             );
1728             let r = _mm_mul_epi32(a, b);
1729             let e = _mm_setr_epi64x(-300, 823043843622);
1730             assert_eq_m128i(r, e);
1731         }
1732     }
1733 
1734     #[simd_test(enable = "sse4.1")]
test_mm_mullo_epi32()1735     unsafe fn test_mm_mullo_epi32() {
1736         {
1737             let a = _mm_setr_epi32(1, 1, 1, 1);
1738             let b = _mm_setr_epi32(1, 2, 3, 4);
1739             let r = _mm_mullo_epi32(a, b);
1740             let e = _mm_setr_epi32(1, 2, 3, 4);
1741             assert_eq_m128i(r, e);
1742         }
1743         {
1744             let a = _mm_setr_epi32(15, -2, 1234567, 99999);
1745             let b = _mm_setr_epi32(-20, -256, 666666, -99999);
1746             let r = _mm_mullo_epi32(a, b);
1747             // Attention, most significant bit in r[2] is treated
1748             // as a sign bit:
1749             // 1234567 * 666666 = -1589877210
1750             let e = _mm_setr_epi32(-300, 512, -1589877210, -1409865409);
1751             assert_eq_m128i(r, e);
1752         }
1753     }
1754 
1755     #[simd_test(enable = "sse4.1")]
test_mm_minpos_epu16()1756     unsafe fn test_mm_minpos_epu16() {
1757         let a = _mm_setr_epi16(8, 7, 6, 5, 4, 1, 2, 3);
1758         let r = _mm_minpos_epu16(a);
1759         let e = _mm_setr_epi16(1, 5, 0, 0, 0, 0, 0, 0);
1760         assert_eq_m128i(r, e);
1761     }
1762 
1763     #[simd_test(enable = "sse4.1")]
test_mm_mpsadbw_epu8()1764     unsafe fn test_mm_mpsadbw_epu8() {
1765         #[rustfmt::skip]
1766         let a = _mm_setr_epi8(
1767             0, 1, 2, 3, 4, 5, 6, 7,
1768             8, 9, 10, 11, 12, 13, 14, 15,
1769         );
1770 
1771         let r = _mm_mpsadbw_epu8::<0b000>(a, a);
1772         let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
1773         assert_eq_m128i(r, e);
1774 
1775         let r = _mm_mpsadbw_epu8::<0b001>(a, a);
1776         let e = _mm_setr_epi16(16, 12, 8, 4, 0, 4, 8, 12);
1777         assert_eq_m128i(r, e);
1778 
1779         let r = _mm_mpsadbw_epu8::<0b100>(a, a);
1780         let e = _mm_setr_epi16(16, 20, 24, 28, 32, 36, 40, 44);
1781         assert_eq_m128i(r, e);
1782 
1783         let r = _mm_mpsadbw_epu8::<0b101>(a, a);
1784         let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
1785         assert_eq_m128i(r, e);
1786 
1787         let r = _mm_mpsadbw_epu8::<0b111>(a, a);
1788         let e = _mm_setr_epi16(32, 28, 24, 20, 16, 12, 8, 4);
1789         assert_eq_m128i(r, e);
1790     }
1791 
1792     #[simd_test(enable = "sse4.1")]
test_mm_testz_si128()1793     unsafe fn test_mm_testz_si128() {
1794         let a = _mm_set1_epi8(1);
1795         let mask = _mm_set1_epi8(0);
1796         let r = _mm_testz_si128(a, mask);
1797         assert_eq!(r, 1);
1798         let a = _mm_set1_epi8(0b101);
1799         let mask = _mm_set1_epi8(0b110);
1800         let r = _mm_testz_si128(a, mask);
1801         assert_eq!(r, 0);
1802         let a = _mm_set1_epi8(0b011);
1803         let mask = _mm_set1_epi8(0b100);
1804         let r = _mm_testz_si128(a, mask);
1805         assert_eq!(r, 1);
1806     }
1807 
1808     #[simd_test(enable = "sse4.1")]
test_mm_testc_si128()1809     unsafe fn test_mm_testc_si128() {
1810         let a = _mm_set1_epi8(-1);
1811         let mask = _mm_set1_epi8(0);
1812         let r = _mm_testc_si128(a, mask);
1813         assert_eq!(r, 1);
1814         let a = _mm_set1_epi8(0b101);
1815         let mask = _mm_set1_epi8(0b110);
1816         let r = _mm_testc_si128(a, mask);
1817         assert_eq!(r, 0);
1818         let a = _mm_set1_epi8(0b101);
1819         let mask = _mm_set1_epi8(0b100);
1820         let r = _mm_testc_si128(a, mask);
1821         assert_eq!(r, 1);
1822     }
1823 
1824     #[simd_test(enable = "sse4.1")]
test_mm_testnzc_si128()1825     unsafe fn test_mm_testnzc_si128() {
1826         let a = _mm_set1_epi8(0);
1827         let mask = _mm_set1_epi8(1);
1828         let r = _mm_testnzc_si128(a, mask);
1829         assert_eq!(r, 0);
1830         let a = _mm_set1_epi8(-1);
1831         let mask = _mm_set1_epi8(0);
1832         let r = _mm_testnzc_si128(a, mask);
1833         assert_eq!(r, 0);
1834         let a = _mm_set1_epi8(0b101);
1835         let mask = _mm_set1_epi8(0b110);
1836         let r = _mm_testnzc_si128(a, mask);
1837         assert_eq!(r, 1);
1838         let a = _mm_set1_epi8(0b101);
1839         let mask = _mm_set1_epi8(0b101);
1840         let r = _mm_testnzc_si128(a, mask);
1841         assert_eq!(r, 0);
1842     }
1843 
1844     #[simd_test(enable = "sse4.1")]
test_mm_test_all_zeros()1845     unsafe fn test_mm_test_all_zeros() {
1846         let a = _mm_set1_epi8(1);
1847         let mask = _mm_set1_epi8(0);
1848         let r = _mm_test_all_zeros(a, mask);
1849         assert_eq!(r, 1);
1850         let a = _mm_set1_epi8(0b101);
1851         let mask = _mm_set1_epi8(0b110);
1852         let r = _mm_test_all_zeros(a, mask);
1853         assert_eq!(r, 0);
1854         let a = _mm_set1_epi8(0b011);
1855         let mask = _mm_set1_epi8(0b100);
1856         let r = _mm_test_all_zeros(a, mask);
1857         assert_eq!(r, 1);
1858     }
1859 
1860     #[simd_test(enable = "sse4.1")]
test_mm_test_all_ones()1861     unsafe fn test_mm_test_all_ones() {
1862         let a = _mm_set1_epi8(-1);
1863         let r = _mm_test_all_ones(a);
1864         assert_eq!(r, 1);
1865         let a = _mm_set1_epi8(0b101);
1866         let r = _mm_test_all_ones(a);
1867         assert_eq!(r, 0);
1868     }
1869 
1870     #[simd_test(enable = "sse4.1")]
test_mm_test_mix_ones_zeros()1871     unsafe fn test_mm_test_mix_ones_zeros() {
1872         let a = _mm_set1_epi8(0);
1873         let mask = _mm_set1_epi8(1);
1874         let r = _mm_test_mix_ones_zeros(a, mask);
1875         assert_eq!(r, 0);
1876         let a = _mm_set1_epi8(-1);
1877         let mask = _mm_set1_epi8(0);
1878         let r = _mm_test_mix_ones_zeros(a, mask);
1879         assert_eq!(r, 0);
1880         let a = _mm_set1_epi8(0b101);
1881         let mask = _mm_set1_epi8(0b110);
1882         let r = _mm_test_mix_ones_zeros(a, mask);
1883         assert_eq!(r, 1);
1884         let a = _mm_set1_epi8(0b101);
1885         let mask = _mm_set1_epi8(0b101);
1886         let r = _mm_test_mix_ones_zeros(a, mask);
1887         assert_eq!(r, 0);
1888     }
1889 }
1890