1 /*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10 #ifndef __XMMINTRIN_H
11 #define __XMMINTRIN_H
12
13 #include <mmintrin.h>
14
/* Internal vector types used to implement the intrinsics. */
typedef int __v4si __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
/* Public 128-bit vector of [4 x float]; 16-byte aligned. */
typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));

/* Unaligned variant of __m128, used by unaligned load/store intrinsics. */
typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));

/* Unsigned types */
typedef unsigned int __v4su __attribute__((__vector_size__(16)));

/* This header should only be included in a hosted environment as it depends on
 * a standard library to provide allocation routines. */
#if __STDC_HOSTED__
#include <mm_malloc.h>
#endif

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"), __min_vector_width__(64)))
33
34 /// Adds the 32-bit float values in the low-order bits of the operands.
35 ///
36 /// \headerfile <x86intrin.h>
37 ///
38 /// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
39 ///
40 /// \param __a
41 /// A 128-bit vector of [4 x float] containing one of the source operands.
42 /// The lower 32 bits of this operand are used in the calculation.
43 /// \param __b
44 /// A 128-bit vector of [4 x float] containing one of the source operands.
45 /// The lower 32 bits of this operand are used in the calculation.
46 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
47 /// of the lower 32 bits of both operands. The upper 96 bits are copied from
48 /// the upper 96 bits of the first source operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_add_ss(__m128 __a, __m128 __b)
{
  /* Scalar add on element 0 only; elements 1-3 pass through from __a. */
  __a[0] += __b[0];
  return __a;
}
55
56 /// Adds two 128-bit vectors of [4 x float], and returns the results of
57 /// the addition.
58 ///
59 /// \headerfile <x86intrin.h>
60 ///
61 /// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
62 ///
63 /// \param __a
64 /// A 128-bit vector of [4 x float] containing one of the source operands.
65 /// \param __b
66 /// A 128-bit vector of [4 x float] containing one of the source operands.
67 /// \returns A 128-bit vector of [4 x float] containing the sums of both
68 /// operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_add_ps(__m128 __a, __m128 __b)
{
  /* Elementwise add via vector extension; lowers to ADDPS. */
  return (__m128)((__v4sf)__a + (__v4sf)__b);
}
74
75 /// Subtracts the 32-bit float value in the low-order bits of the second
76 /// operand from the corresponding value in the first operand.
77 ///
78 /// \headerfile <x86intrin.h>
79 ///
80 /// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
81 ///
82 /// \param __a
83 /// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
84 /// of this operand are used in the calculation.
85 /// \param __b
86 /// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
87 /// bits of this operand are used in the calculation.
88 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
89 /// difference of the lower 32 bits of both operands. The upper 96 bits are
90 /// copied from the upper 96 bits of the first source operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_sub_ss(__m128 __a, __m128 __b)
{
  /* Scalar subtract on element 0 only; elements 1-3 pass through from __a. */
  __a[0] -= __b[0];
  return __a;
}
97
98 /// Subtracts each of the values of the second operand from the first
99 /// operand, both of which are 128-bit vectors of [4 x float] and returns
100 /// the results of the subtraction.
101 ///
102 /// \headerfile <x86intrin.h>
103 ///
104 /// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
105 ///
106 /// \param __a
107 /// A 128-bit vector of [4 x float] containing the minuend.
108 /// \param __b
109 /// A 128-bit vector of [4 x float] containing the subtrahend.
110 /// \returns A 128-bit vector of [4 x float] containing the differences between
111 /// both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_sub_ps(__m128 __a, __m128 __b)
{
  /* Elementwise subtract via vector extension; lowers to SUBPS. */
  return (__m128)((__v4sf)__a - (__v4sf)__b);
}
117
118 /// Multiplies two 32-bit float values in the low-order bits of the
119 /// operands.
120 ///
121 /// \headerfile <x86intrin.h>
122 ///
123 /// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
124 ///
125 /// \param __a
126 /// A 128-bit vector of [4 x float] containing one of the source operands.
127 /// The lower 32 bits of this operand are used in the calculation.
128 /// \param __b
129 /// A 128-bit vector of [4 x float] containing one of the source operands.
130 /// The lower 32 bits of this operand are used in the calculation.
131 /// \returns A 128-bit vector of [4 x float] containing the product of the lower
132 /// 32 bits of both operands. The upper 96 bits are copied from the upper 96
133 /// bits of the first source operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mul_ss(__m128 __a, __m128 __b)
{
  /* Scalar multiply on element 0 only; elements 1-3 pass through from __a. */
  __a[0] *= __b[0];
  return __a;
}
140
141 /// Multiplies two 128-bit vectors of [4 x float] and returns the
142 /// results of the multiplication.
143 ///
144 /// \headerfile <x86intrin.h>
145 ///
146 /// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
147 ///
148 /// \param __a
149 /// A 128-bit vector of [4 x float] containing one of the source operands.
150 /// \param __b
151 /// A 128-bit vector of [4 x float] containing one of the source operands.
152 /// \returns A 128-bit vector of [4 x float] containing the products of both
153 /// operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mul_ps(__m128 __a, __m128 __b)
{
  /* Elementwise multiply via vector extension; lowers to MULPS. */
  return (__m128)((__v4sf)__a * (__v4sf)__b);
}
159
160 /// Divides the value in the low-order 32 bits of the first operand by
161 /// the corresponding value in the second operand.
162 ///
163 /// \headerfile <x86intrin.h>
164 ///
165 /// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
166 ///
167 /// \param __a
168 /// A 128-bit vector of [4 x float] containing the dividend. The lower 32
169 /// bits of this operand are used in the calculation.
170 /// \param __b
171 /// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
172 /// of this operand are used in the calculation.
173 /// \returns A 128-bit vector of [4 x float] containing the quotients of the
174 /// lower 32 bits of both operands. The upper 96 bits are copied from the
175 /// upper 96 bits of the first source operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_div_ss(__m128 __a, __m128 __b)
{
  /* Scalar divide on element 0 only; elements 1-3 pass through from __a. */
  __a[0] /= __b[0];
  return __a;
}
182
183 /// Divides two 128-bit vectors of [4 x float].
184 ///
185 /// \headerfile <x86intrin.h>
186 ///
187 /// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
188 ///
189 /// \param __a
190 /// A 128-bit vector of [4 x float] containing the dividend.
191 /// \param __b
192 /// A 128-bit vector of [4 x float] containing the divisor.
193 /// \returns A 128-bit vector of [4 x float] containing the quotients of both
194 /// operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_div_ps(__m128 __a, __m128 __b)
{
  /* Elementwise divide via vector extension; lowers to DIVPS. */
  return (__m128)((__v4sf)__a / (__v4sf)__b);
}
200
201 /// Calculates the square root of the value stored in the low-order bits
202 /// of a 128-bit vector of [4 x float].
203 ///
204 /// \headerfile <x86intrin.h>
205 ///
206 /// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
207 ///
208 /// \param __a
209 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
210 /// used in the calculation.
211 /// \returns A 128-bit vector of [4 x float] containing the square root of the
212 /// value in the low-order bits of the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_sqrt_ss(__m128 __a)
{
  /* SQRTSS builtin: square root of element 0; upper elements carried
     through per the instruction's semantics. */
  return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
}
218
219 /// Calculates the square roots of the values stored in a 128-bit vector
220 /// of [4 x float].
221 ///
222 /// \headerfile <x86intrin.h>
223 ///
224 /// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
225 ///
226 /// \param __a
227 /// A 128-bit vector of [4 x float].
228 /// \returns A 128-bit vector of [4 x float] containing the square roots of the
229 /// values in the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_sqrt_ps(__m128 __a)
{
  /* SQRTPS builtin: square root of all four elements. */
  return __builtin_ia32_sqrtps((__v4sf)__a);
}
235
236 /// Calculates the approximate reciprocal of the value stored in the
237 /// low-order bits of a 128-bit vector of [4 x float].
238 ///
239 /// \headerfile <x86intrin.h>
240 ///
241 /// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
242 ///
243 /// \param __a
244 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
245 /// used in the calculation.
246 /// \returns A 128-bit vector of [4 x float] containing the approximate
247 /// reciprocal of the value in the low-order bits of the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_rcp_ss(__m128 __a)
{
  /* RCPSS builtin: approximate reciprocal of element 0 (not IEEE-exact). */
  return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
}
253
254 /// Calculates the approximate reciprocals of the values stored in a
255 /// 128-bit vector of [4 x float].
256 ///
257 /// \headerfile <x86intrin.h>
258 ///
259 /// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
260 ///
261 /// \param __a
262 /// A 128-bit vector of [4 x float].
263 /// \returns A 128-bit vector of [4 x float] containing the approximate
264 /// reciprocals of the values in the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_rcp_ps(__m128 __a)
{
  /* RCPPS builtin: approximate reciprocal of all four elements. */
  return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
}
270
271 /// Calculates the approximate reciprocal of the square root of the value
272 /// stored in the low-order bits of a 128-bit vector of [4 x float].
273 ///
274 /// \headerfile <x86intrin.h>
275 ///
276 /// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
277 ///
278 /// \param __a
279 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
280 /// used in the calculation.
281 /// \returns A 128-bit vector of [4 x float] containing the approximate
282 /// reciprocal of the square root of the value in the low-order bits of the
283 /// operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_rsqrt_ss(__m128 __a)
{
  /* RSQRTSS builtin: approximate 1/sqrt of element 0 (not IEEE-exact). */
  return __builtin_ia32_rsqrtss((__v4sf)__a);
}
289
290 /// Calculates the approximate reciprocals of the square roots of the
291 /// values stored in a 128-bit vector of [4 x float].
292 ///
293 /// \headerfile <x86intrin.h>
294 ///
295 /// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
296 ///
297 /// \param __a
298 /// A 128-bit vector of [4 x float].
299 /// \returns A 128-bit vector of [4 x float] containing the approximate
300 /// reciprocals of the square roots of the values in the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_rsqrt_ps(__m128 __a)
{
  /* RSQRTPS builtin: approximate 1/sqrt of all four elements. */
  return __builtin_ia32_rsqrtps((__v4sf)__a);
}
306
307 /// Compares two 32-bit float values in the low-order bits of both
308 /// operands and returns the lesser value in the low-order bits of the
309 /// vector of [4 x float].
310 ///
311 /// \headerfile <x86intrin.h>
312 ///
313 /// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
314 ///
315 /// \param __a
316 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
317 /// 32 bits of this operand are used in the comparison.
318 /// \param __b
319 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
320 /// 32 bits of this operand are used in the comparison.
321 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
322 /// minimum value between both operands. The upper 96 bits are copied from
323 /// the upper 96 bits of the first source operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_min_ss(__m128 __a, __m128 __b)
{
  /* MINSS builtin: min of the low elements; operand order matters for
     NaN and +/-0.0 handling, so this is not expressible as plain C. */
  return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
}
329
330 /// Compares two 128-bit vectors of [4 x float] and returns the lesser
331 /// of each pair of values.
332 ///
333 /// \headerfile <x86intrin.h>
334 ///
335 /// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
336 ///
337 /// \param __a
338 /// A 128-bit vector of [4 x float] containing one of the operands.
339 /// \param __b
340 /// A 128-bit vector of [4 x float] containing one of the operands.
341 /// \returns A 128-bit vector of [4 x float] containing the minimum values
342 /// between both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_min_ps(__m128 __a, __m128 __b)
{
  /* MINPS builtin: elementwise minimum (instruction semantics for NaN). */
  return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
}
348
349 /// Compares two 32-bit float values in the low-order bits of both
350 /// operands and returns the greater value in the low-order bits of a 128-bit
351 /// vector of [4 x float].
352 ///
353 /// \headerfile <x86intrin.h>
354 ///
355 /// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
356 ///
357 /// \param __a
358 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
359 /// 32 bits of this operand are used in the comparison.
360 /// \param __b
361 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
362 /// 32 bits of this operand are used in the comparison.
363 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
364 /// maximum value between both operands. The upper 96 bits are copied from
365 /// the upper 96 bits of the first source operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_max_ss(__m128 __a, __m128 __b)
{
  /* MAXSS builtin: max of the low elements; operand order matters for
     NaN and +/-0.0 handling, so this is not expressible as plain C. */
  return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
}
371
372 /// Compares two 128-bit vectors of [4 x float] and returns the greater
373 /// of each pair of values.
374 ///
375 /// \headerfile <x86intrin.h>
376 ///
377 /// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
378 ///
379 /// \param __a
380 /// A 128-bit vector of [4 x float] containing one of the operands.
381 /// \param __b
382 /// A 128-bit vector of [4 x float] containing one of the operands.
383 /// \returns A 128-bit vector of [4 x float] containing the maximum values
384 /// between both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_max_ps(__m128 __a, __m128 __b)
{
  /* MAXPS builtin: elementwise maximum (instruction semantics for NaN). */
  return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
}
390
391 /// Performs a bitwise AND of two 128-bit vectors of [4 x float].
392 ///
393 /// \headerfile <x86intrin.h>
394 ///
395 /// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
396 ///
397 /// \param __a
398 /// A 128-bit vector containing one of the source operands.
399 /// \param __b
400 /// A 128-bit vector containing one of the source operands.
401 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
402 /// values between both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_and_ps(__m128 __a, __m128 __b)
{
  /* Bitwise AND done on an unsigned-int vector view of the floats. */
  return (__m128)((__v4su)__a & (__v4su)__b);
}
408
409 /// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
410 /// the one's complement of the values contained in the first source
411 /// operand.
412 ///
413 /// \headerfile <x86intrin.h>
414 ///
415 /// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
416 ///
417 /// \param __a
418 /// A 128-bit vector of [4 x float] containing the first source operand. The
419 /// one's complement of this value is used in the bitwise AND.
420 /// \param __b
421 /// A 128-bit vector of [4 x float] containing the second source operand.
422 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
423 /// one's complement of the first operand and the values in the second
424 /// operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_andnot_ps(__m128 __a, __m128 __b)
{
  /* ANDN semantics: (~__a) & __b, on an unsigned-int view of the bits. */
  return (__m128)(~(__v4su)__a & (__v4su)__b);
}
430
431 /// Performs a bitwise OR of two 128-bit vectors of [4 x float].
432 ///
433 /// \headerfile <x86intrin.h>
434 ///
435 /// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
436 ///
437 /// \param __a
438 /// A 128-bit vector of [4 x float] containing one of the source operands.
439 /// \param __b
440 /// A 128-bit vector of [4 x float] containing one of the source operands.
441 /// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
442 /// values between both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_or_ps(__m128 __a, __m128 __b)
{
  /* Bitwise OR done on an unsigned-int vector view of the floats. */
  return (__m128)((__v4su)__a | (__v4su)__b);
}
448
449 /// Performs a bitwise exclusive OR of two 128-bit vectors of
450 /// [4 x float].
451 ///
452 /// \headerfile <x86intrin.h>
453 ///
454 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
455 ///
456 /// \param __a
457 /// A 128-bit vector of [4 x float] containing one of the source operands.
458 /// \param __b
459 /// A 128-bit vector of [4 x float] containing one of the source operands.
460 /// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
461 /// of the values between both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_xor_ps(__m128 __a, __m128 __b)
{
  /* Bitwise XOR done on an unsigned-int vector view of the floats. */
  return (__m128)((__v4su)__a ^ (__v4su)__b);
}
467
468 /// Compares two 32-bit float values in the low-order bits of both
469 /// operands for equality and returns the result of the comparison in the
470 /// low-order bits of a vector [4 x float].
471 ///
472 /// \headerfile <x86intrin.h>
473 ///
474 /// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
475 ///
476 /// \param __a
477 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
478 /// 32 bits of this operand are used in the comparison.
479 /// \param __b
480 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
481 /// 32 bits of this operand are used in the comparison.
482 /// \returns A 128-bit vector of [4 x float] containing the comparison results
483 /// in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpeq_ss(__m128 __a, __m128 __b)
{
  /* CMPEQSS builtin: element 0 becomes an all-ones/all-zeros mask. */
  return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
}
489
490 /// Compares each of the corresponding 32-bit float values of the
491 /// 128-bit vectors of [4 x float] for equality.
492 ///
493 /// \headerfile <x86intrin.h>
494 ///
495 /// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
496 ///
497 /// \param __a
498 /// A 128-bit vector of [4 x float].
499 /// \param __b
500 /// A 128-bit vector of [4 x float].
501 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpeq_ps(__m128 __a, __m128 __b)
{
  /* CMPEQPS builtin: each element becomes an all-ones/all-zeros mask. */
  return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
}
507
508 /// Compares two 32-bit float values in the low-order bits of both
509 /// operands to determine if the value in the first operand is less than the
510 /// corresponding value in the second operand and returns the result of the
511 /// comparison in the low-order bits of a vector of [4 x float].
512 ///
513 /// \headerfile <x86intrin.h>
514 ///
515 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
516 ///
517 /// \param __a
518 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
519 /// 32 bits of this operand are used in the comparison.
520 /// \param __b
521 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
522 /// 32 bits of this operand are used in the comparison.
523 /// \returns A 128-bit vector of [4 x float] containing the comparison results
524 /// in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmplt_ss(__m128 __a, __m128 __b)
{
  /* CMPLTSS builtin: element 0 becomes an all-ones/all-zeros mask. */
  return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
}
530
531 /// Compares each of the corresponding 32-bit float values of the
532 /// 128-bit vectors of [4 x float] to determine if the values in the first
533 /// operand are less than those in the second operand.
534 ///
535 /// \headerfile <x86intrin.h>
536 ///
537 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
538 ///
539 /// \param __a
540 /// A 128-bit vector of [4 x float].
541 /// \param __b
542 /// A 128-bit vector of [4 x float].
543 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmplt_ps(__m128 __a, __m128 __b)
{
  /* CMPLTPS builtin: each element becomes an all-ones/all-zeros mask. */
  return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
}
549
550 /// Compares two 32-bit float values in the low-order bits of both
551 /// operands to determine if the value in the first operand is less than or
552 /// equal to the corresponding value in the second operand and returns the
553 /// result of the comparison in the low-order bits of a vector of
554 /// [4 x float].
555 ///
556 /// \headerfile <x86intrin.h>
557 ///
558 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
559 ///
560 /// \param __a
561 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
562 /// 32 bits of this operand are used in the comparison.
563 /// \param __b
564 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
565 /// 32 bits of this operand are used in the comparison.
566 /// \returns A 128-bit vector of [4 x float] containing the comparison results
567 /// in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmple_ss(__m128 __a, __m128 __b)
{
  /* CMPLESS builtin: element 0 becomes an all-ones/all-zeros mask. */
  return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
}
573
574 /// Compares each of the corresponding 32-bit float values of the
575 /// 128-bit vectors of [4 x float] to determine if the values in the first
576 /// operand are less than or equal to those in the second operand.
577 ///
578 /// \headerfile <x86intrin.h>
579 ///
580 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
581 ///
582 /// \param __a
583 /// A 128-bit vector of [4 x float].
584 /// \param __b
585 /// A 128-bit vector of [4 x float].
586 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmple_ps(__m128 __a, __m128 __b)
{
  /* CMPLEPS builtin: each element becomes an all-ones/all-zeros mask. */
  return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
}
592
593 /// Compares two 32-bit float values in the low-order bits of both
594 /// operands to determine if the value in the first operand is greater than
595 /// the corresponding value in the second operand and returns the result of
596 /// the comparison in the low-order bits of a vector of [4 x float].
597 ///
598 /// \headerfile <x86intrin.h>
599 ///
600 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
601 ///
602 /// \param __a
603 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
604 /// 32 bits of this operand are used in the comparison.
605 /// \param __b
606 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
607 /// 32 bits of this operand are used in the comparison.
608 /// \returns A 128-bit vector of [4 x float] containing the comparison results
609 /// in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpgt_ss(__m128 __a, __m128 __b)
{
  /* a > b is computed as b < a (CMPLTSS with swapped operands). The
     shuffle indices (4, 1, 2, 3) take the compare result's low element
     (index 4 = element 0 of the second vector) and keep elements 1-3
     of __a, matching the scalar-op convention of passing the upper 96
     bits through from the first operand. */
  return (__m128)__builtin_shufflevector((__v4sf)__a,
                                         (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
                                         4, 1, 2, 3);
}
617
618 /// Compares each of the corresponding 32-bit float values of the
619 /// 128-bit vectors of [4 x float] to determine if the values in the first
620 /// operand are greater than those in the second operand.
621 ///
622 /// \headerfile <x86intrin.h>
623 ///
624 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
625 ///
626 /// \param __a
627 /// A 128-bit vector of [4 x float].
628 /// \param __b
629 /// A 128-bit vector of [4 x float].
630 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpgt_ps(__m128 __a, __m128 __b)
{
  /* a > b computed as b < a (CMPLTPS with swapped operands). */
  return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
}
636
637 /// Compares two 32-bit float values in the low-order bits of both
638 /// operands to determine if the value in the first operand is greater than
639 /// or equal to the corresponding value in the second operand and returns
640 /// the result of the comparison in the low-order bits of a vector of
641 /// [4 x float].
642 ///
643 /// \headerfile <x86intrin.h>
644 ///
645 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
646 ///
647 /// \param __a
648 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
649 /// 32 bits of this operand are used in the comparison.
650 /// \param __b
651 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
652 /// 32 bits of this operand are used in the comparison.
653 /// \returns A 128-bit vector of [4 x float] containing the comparison results
654 /// in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpge_ss(__m128 __a, __m128 __b)
{
  /* a >= b is computed as b <= a (CMPLESS with swapped operands). The
     shuffle indices (4, 1, 2, 3) insert the compare result's low
     element into element 0 while keeping elements 1-3 of __a. */
  return (__m128)__builtin_shufflevector((__v4sf)__a,
                                         (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
                                         4, 1, 2, 3);
}
662
663 /// Compares each of the corresponding 32-bit float values of the
664 /// 128-bit vectors of [4 x float] to determine if the values in the first
665 /// operand are greater than or equal to those in the second operand.
666 ///
667 /// \headerfile <x86intrin.h>
668 ///
669 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
670 ///
671 /// \param __a
672 /// A 128-bit vector of [4 x float].
673 /// \param __b
674 /// A 128-bit vector of [4 x float].
675 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpge_ps(__m128 __a, __m128 __b)
{
  /* a >= b computed as b <= a (CMPLEPS with swapped operands). */
  return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
}
681
682 /// Compares two 32-bit float values in the low-order bits of both
683 /// operands for inequality and returns the result of the comparison in the
684 /// low-order bits of a vector of [4 x float].
685 ///
686 /// \headerfile <x86intrin.h>
687 ///
688 /// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
689 /// instructions.
690 ///
691 /// \param __a
692 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
693 /// 32 bits of this operand are used in the comparison.
694 /// \param __b
695 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
696 /// 32 bits of this operand are used in the comparison.
697 /// \returns A 128-bit vector of [4 x float] containing the comparison results
698 /// in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpneq_ss(__m128 __a, __m128 __b)
{
  /* CMPNEQSS builtin: element 0 becomes an all-ones/all-zeros mask. */
  return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
}
704
705 /// Compares each of the corresponding 32-bit float values of the
706 /// 128-bit vectors of [4 x float] for inequality.
707 ///
708 /// \headerfile <x86intrin.h>
709 ///
710 /// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
711 /// instructions.
712 ///
713 /// \param __a
714 /// A 128-bit vector of [4 x float].
715 /// \param __b
716 /// A 128-bit vector of [4 x float].
717 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpneq_ps(__m128 __a, __m128 __b)
{
  /* CMPNEQPS builtin: each element becomes an all-ones/all-zeros mask. */
  return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
}
723
724 /// Compares two 32-bit float values in the low-order bits of both
725 /// operands to determine if the value in the first operand is not less than
726 /// the corresponding value in the second operand and returns the result of
727 /// the comparison in the low-order bits of a vector of [4 x float].
728 ///
729 /// \headerfile <x86intrin.h>
730 ///
731 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
732 /// instructions.
733 ///
734 /// \param __a
735 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
736 /// 32 bits of this operand are used in the comparison.
737 /// \param __b
738 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
739 /// 32 bits of this operand are used in the comparison.
740 /// \returns A 128-bit vector of [4 x float] containing the comparison results
741 /// in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpnlt_ss(__m128 __a, __m128 __b)
{
  /* CMPNLTSS builtin: "not less than" — distinct from >= when NaNs are
     involved (an unordered compare is "not less than" but not ">="). */
  return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
}
747
748 /// Compares each of the corresponding 32-bit float values of the
749 /// 128-bit vectors of [4 x float] to determine if the values in the first
750 /// operand are not less than those in the second operand.
751 ///
752 /// \headerfile <x86intrin.h>
753 ///
754 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
755 /// instructions.
756 ///
757 /// \param __a
758 /// A 128-bit vector of [4 x float].
759 /// \param __b
760 /// A 128-bit vector of [4 x float].
761 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpnlt_ps(__m128 __a, __m128 __b)
{
  /* CMPNLTPS builtin: elementwise "not less than" mask. */
  return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
}
767
/// Compares two 32-bit float values in the low-order bits of both
///    operands to determine if the value in the first operand is not less than
///    or equal to the corresponding value in the second operand and returns
///    the result of the comparison in the low-order bits of a vector of
///    [4 x float].
///
/// "Not less than or equal" is an unordered predicate: if either of the two
/// lower 32-bit values is NaN, the result is true (all 1s).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
///    instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \returns A 128-bit vector of [4 x float] containing the comparison results
///    in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpnle_ss(__m128 __a, __m128 __b)
{
  /* The upper 96 bits of the result are taken from __a. */
  return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
}
792
/// Compares each of the corresponding 32-bit float values of the
///    128-bit vectors of [4 x float] to determine if the values in the first
///    operand are not less than or equal to those in the second operand.
///
/// "Not less than or equal" is an unordered predicate: a comparison involving
/// a NaN element yields true (all 1s in that element).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
///    instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \param __b
///    A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpnle_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
}
812
/// Compares two 32-bit float values in the low-order bits of both
///    operands to determine if the value in the first operand is not greater
///    than the corresponding value in the second operand and returns the
///    result of the comparison in the low-order bits of a vector of
///    [4 x float].
///
/// "Not greater than" is an unordered predicate: if either of the two lower
/// 32-bit values is NaN, the result is true (all 1s).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
///    instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \returns A 128-bit vector of [4 x float] containing the comparison results
///    in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpngt_ss(__m128 __a, __m128 __b)
{
  /* There is no NGT scalar compare; compute !(__b < __a) with the operands
     swapped, then shuffle so the low element (index 4 = element 0 of the
     compare result) merges with the upper three elements of __a.  A plain
     swapped CMPNLTSS would merge the wrong vector's upper bits. */
  return (__m128)__builtin_shufflevector((__v4sf)__a,
                                         (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
                                         4, 1, 2, 3);
}
839
/// Compares each of the corresponding 32-bit float values of the
///    128-bit vectors of [4 x float] to determine if the values in the first
///    operand are not greater than those in the second operand.
///
/// "Not greater than" is an unordered predicate: a comparison involving a NaN
/// element yields true (all 1s in that element).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
///    instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \param __b
///    A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpngt_ps(__m128 __a, __m128 __b)
{
  /* !(a > b) is computed as !(b < a): NLT with the operands swapped. */
  return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
}
859
/// Compares two 32-bit float values in the low-order bits of both
///    operands to determine if the value in the first operand is not greater
///    than or equal to the corresponding value in the second operand and
///    returns the result of the comparison in the low-order bits of a vector
///    of [4 x float].
///
/// "Not greater than or equal" is an unordered predicate: if either of the
/// two lower 32-bit values is NaN, the result is true (all 1s).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
///    instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \returns A 128-bit vector of [4 x float] containing the comparison results
///    in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpnge_ss(__m128 __a, __m128 __b)
{
  /* There is no NGE scalar compare; compute !(__b <= __a) with the operands
     swapped, then shuffle so the low compare result (index 4) merges with
     the upper three elements of __a. */
  return (__m128)__builtin_shufflevector((__v4sf)__a,
                                         (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
                                         4, 1, 2, 3);
}
886
/// Compares each of the corresponding 32-bit float values of the
///    128-bit vectors of [4 x float] to determine if the values in the first
///    operand are not greater than or equal to those in the second operand.
///
/// "Not greater than or equal" is an unordered predicate: a comparison
/// involving a NaN element yields true (all 1s in that element).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
///    instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \param __b
///    A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpnge_ps(__m128 __a, __m128 __b)
{
  /* !(a >= b) is computed as !(b <= a): NLE with the operands swapped. */
  return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
}
906
/// Compares two 32-bit float values in the low-order bits of both
///    operands to determine if the value in the first operand is ordered with
///    respect to the corresponding value in the second operand and returns the
///    result of the comparison in the low-order bits of a vector of
///    [4 x float].
///
/// The result is true (all 1s) only when neither of the two lower 32-bit
/// values is NaN.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
///    instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \returns A 128-bit vector of [4 x float] containing the comparison results
///    in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpord_ss(__m128 __a, __m128 __b)
{
  /* The upper 96 bits of the result are taken from __a. */
  return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
}
931
/// Compares each of the corresponding 32-bit float values of the
///    128-bit vectors of [4 x float] to determine if the values in the first
///    operand are ordered with respect to those in the second operand.
///
/// An element of the result is true (all 1s) only when neither corresponding
/// source element is NaN.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
///    instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \param __b
///    A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpord_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
}
951
/// Compares two 32-bit float values in the low-order bits of both
///    operands to determine if the value in the first operand is unordered
///    with respect to the corresponding value in the second operand and
///    returns the result of the comparison in the low-order bits of a vector
///    of [4 x float].
///
/// The result is true (all 1s) when either of the two lower 32-bit values is
/// NaN.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
///    instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \returns A 128-bit vector of [4 x float] containing the comparison results
///    in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpunord_ss(__m128 __a, __m128 __b)
{
  /* The upper 96 bits of the result are taken from __a. */
  return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
}
976
/// Compares each of the corresponding 32-bit float values of the
///    128-bit vectors of [4 x float] to determine if the values in the first
///    operand are unordered with respect to those in the second operand.
///
/// An element of the result is true (all 1s) when either corresponding source
/// element is NaN.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
///    instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \param __b
///    A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpunord_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
}
996
/// Compares two 32-bit float values in the low-order bits of both
///    operands for equality and returns the result of the comparison.
///
/// If either of the two lower 32-bit values is NaN, 0 is returned.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
///    instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \returns An integer containing the comparison results (0 or 1). If either
///    of the two lower 32-bit values is NaN, 0 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comieq_ss(__m128 __a, __m128 __b)
{
  /* COMISS signals an invalid-operation exception for QNaN operands too,
     unlike the quiet UCOMISS variants below. */
  return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
}
1020
/// Compares two 32-bit float values in the low-order bits of both
///    operands to determine if the first operand is less than the second
///    operand and returns the result of the comparison.
///
/// If either of the two lower 32-bit values is NaN, 0 is returned.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
///    instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \returns An integer containing the comparison results (0 or 1). If either
///    of the two lower 32-bit values is NaN, 0 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comilt_ss(__m128 __a, __m128 __b)
{
  /* COMISS signals an invalid-operation exception for QNaN operands too. */
  return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
}
1045
/// Compares two 32-bit float values in the low-order bits of both
///    operands to determine if the first operand is less than or equal to the
///    second operand and returns the result of the comparison.
///
/// If either of the two lower 32-bit values is NaN, 0 is returned.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \returns An integer containing the comparison results (0 or 1). If either
///    of the two lower 32-bit values is NaN, 0 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comile_ss(__m128 __a, __m128 __b)
{
  /* COMISS signals an invalid-operation exception for QNaN operands too. */
  return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
}
1069
/// Compares two 32-bit float values in the low-order bits of both
///    operands to determine if the first operand is greater than the second
///    operand and returns the result of the comparison.
///
/// If either of the two lower 32-bit values is NaN, 0 is returned.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \returns An integer containing the comparison results (0 or 1). If either
///    of the two lower 32-bit values is NaN, 0 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comigt_ss(__m128 __a, __m128 __b)
{
  /* COMISS signals an invalid-operation exception for QNaN operands too. */
  return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
}
1093
/// Compares two 32-bit float values in the low-order bits of both
///    operands to determine if the first operand is greater than or equal to
///    the second operand and returns the result of the comparison.
///
/// If either of the two lower 32-bit values is NaN, 0 is returned.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \returns An integer containing the comparison results (0 or 1). If either
///    of the two lower 32-bit values is NaN, 0 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comige_ss(__m128 __a, __m128 __b)
{
  /* COMISS signals an invalid-operation exception for QNaN operands too. */
  return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
}
1117
/// Compares two 32-bit float values in the low-order bits of both
///    operands to determine if the first operand is not equal to the second
///    operand and returns the result of the comparison.
///
/// If either of the two lower 32-bit values is NaN, 1 is returned (not-equal
/// is an unordered predicate).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \returns An integer containing the comparison results (0 or 1). If either
///    of the two lower 32-bit values is NaN, 1 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comineq_ss(__m128 __a, __m128 __b)
{
  /* COMISS signals an invalid-operation exception for QNaN operands too. */
  return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
}
1141
/// Performs an unordered comparison of two 32-bit float values using
///    the low-order bits of both operands to determine equality and returns
///    the result of the comparison.
///
/// If either of the two lower 32-bit values is NaN, 0 is returned.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \returns An integer containing the comparison results (0 or 1). If either
///    of the two lower 32-bit values is NaN, 0 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomieq_ss(__m128 __a, __m128 __b)
{
  /* UCOMISS is quiet on QNaN operands; it signals invalid only for SNaN,
     unlike the COMISS-based _mm_comi* intrinsics above. */
  return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
}
1165
/// Performs an unordered comparison of two 32-bit float values using
///    the low-order bits of both operands to determine if the first operand is
///    less than the second operand and returns the result of the comparison.
///
/// If either of the two lower 32-bit values is NaN, 0 is returned.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \returns An integer containing the comparison results (0 or 1). If either
///    of the two lower 32-bit values is NaN, 0 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomilt_ss(__m128 __a, __m128 __b)
{
  /* UCOMISS is quiet on QNaN operands; it signals invalid only for SNaN. */
  return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
}
1189
/// Performs an unordered comparison of two 32-bit float values using
///    the low-order bits of both operands to determine if the first operand is
///    less than or equal to the second operand and returns the result of the
///    comparison.
///
/// If either of the two lower 32-bit values is NaN, 0 is returned.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \returns An integer containing the comparison results (0 or 1). If either
///    of the two lower 32-bit values is NaN, 0 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomile_ss(__m128 __a, __m128 __b)
{
  /* UCOMISS is quiet on QNaN operands; it signals invalid only for SNaN. */
  return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
}
1214
/// Performs an unordered comparison of two 32-bit float values using
///    the low-order bits of both operands to determine if the first operand is
///    greater than the second operand and returns the result of the
///    comparison.
///
/// If either of the two lower 32-bit values is NaN, 0 is returned.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \returns An integer containing the comparison results (0 or 1). If either
///    of the two lower 32-bit values is NaN, 0 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomigt_ss(__m128 __a, __m128 __b)
{
  /* UCOMISS is quiet on QNaN operands; it signals invalid only for SNaN. */
  return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
}
1239
/// Performs an unordered comparison of two 32-bit float values using
///    the low-order bits of both operands to determine if the first operand is
///    greater than or equal to the second operand and returns the result of
///    the comparison.
///
/// If either of the two lower 32-bit values is NaN, 0 is returned.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \returns An integer containing the comparison results (0 or 1). If either
///    of the two lower 32-bit values is NaN, 0 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomige_ss(__m128 __a, __m128 __b)
{
  /* UCOMISS is quiet on QNaN operands; it signals invalid only for SNaN. */
  return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
}
1264
/// Performs an unordered comparison of two 32-bit float values using
///    the low-order bits of both operands to determine inequality and returns
///    the result of the comparison.
///
/// If either of the two lower 32-bit values is NaN, 1 is returned (not-equal
/// is an unordered predicate).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \returns An integer containing the comparison results (0 or 1). If either
///    of the two lower 32-bit values is NaN, 1 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomineq_ss(__m128 __a, __m128 __b)
{
  /* UCOMISS is quiet on QNaN operands; it signals invalid only for SNaN. */
  return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
}
1288
/// Converts a float value contained in the lower 32 bits of a vector of
///    [4 x float] into a 32-bit integer.
///
/// The conversion rounds according to the current rounding mode in MXCSR
/// (round-to-nearest-even by default); compare _mm_cvttss_si32, which always
/// truncates.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
///    instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the conversion.
/// \returns A 32-bit integer containing the converted value.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvtss_si32(__m128 __a)
{
  return __builtin_ia32_cvtss2si((__v4sf)__a);
}
1306
/// Converts a float value contained in the lower 32 bits of a vector of
///    [4 x float] into a 32-bit integer.
///
/// This is the legacy spelling of _mm_cvtss_si32 and behaves identically.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
///    instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the conversion.
/// \returns A 32-bit integer containing the converted value.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvt_ss2si(__m128 __a)
{
  return _mm_cvtss_si32(__a);
}
1324
1325 #ifdef __x86_64__
1326
/// Converts a float value contained in the lower 32 bits of a vector of
///    [4 x float] into a 64-bit integer.
///
/// The conversion rounds according to the current rounding mode in MXCSR;
/// compare _mm_cvttss_si64, which always truncates. Only available in 64-bit
/// mode.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
///    instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the conversion.
/// \returns A 64-bit integer containing the converted value.
static __inline__ long long __DEFAULT_FN_ATTRS
_mm_cvtss_si64(__m128 __a)
{
  return __builtin_ia32_cvtss2si64((__v4sf)__a);
}
1344
1345 #endif
1346
/// Converts two low-order float values in a 128-bit vector of
///    [4 x float] into a 64-bit vector of [2 x i32].
///
/// The conversion rounds according to the current rounding mode in MXCSR;
/// compare _mm_cvttps_pi32, which always truncates.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \returns A 64-bit integer vector containing the converted values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_cvtps_pi32(__m128 __a)
{
  return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
}
1362
/// Converts two low-order float values in a 128-bit vector of
///    [4 x float] into a 64-bit vector of [2 x i32].
///
/// This is the legacy spelling of _mm_cvtps_pi32 and behaves identically.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \returns A 64-bit integer vector containing the converted values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_cvt_ps2pi(__m128 __a)
{
  return _mm_cvtps_pi32(__a);
}
1378
/// Converts a float value contained in the lower 32 bits of a vector of
///    [4 x float] into a 32-bit integer, truncating the result when it is
///    inexact.
///
/// Truncation (round toward zero) is applied regardless of the rounding mode
/// set in MXCSR.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
///    instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the conversion.
/// \returns A 32-bit integer containing the converted value.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvttss_si32(__m128 __a)
{
  return __builtin_ia32_cvttss2si((__v4sf)__a);
}
1397
/// Converts a float value contained in the lower 32 bits of a vector of
///    [4 x float] into a 32-bit integer, truncating the result when it is
///    inexact.
///
/// This is the legacy spelling of _mm_cvttss_si32 and behaves identically.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
///    instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the conversion.
/// \returns A 32-bit integer containing the converted value.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvtt_ss2si(__m128 __a)
{
  return _mm_cvttss_si32(__a);
}
1416
1417 #ifdef __x86_64__
/// Converts a float value contained in the lower 32 bits of a vector of
///    [4 x float] into a 64-bit integer, truncating the result when it is
///    inexact.
///
/// Truncation (round toward zero) is applied regardless of the rounding mode
/// set in MXCSR. Only available in 64-bit mode.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
///    instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the conversion.
/// \returns A 64-bit integer containing the converted value.
static __inline__ long long __DEFAULT_FN_ATTRS
_mm_cvttss_si64(__m128 __a)
{
  return __builtin_ia32_cvttss2si64((__v4sf)__a);
}
1436 #endif
1437
/// Converts two low-order float values in a 128-bit vector of
///    [4 x float] into a 64-bit vector of [2 x i32], truncating the result
///    when it is inexact.
///
/// Truncation (round toward zero) is applied regardless of the rounding mode
/// set in MXCSR.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CVTTPS2PI / VTTPS2PI </c>
///    instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \returns A 64-bit integer vector containing the converted values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_cvttps_pi32(__m128 __a)
{
  return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
}
1455
/// Converts two low-order float values in a 128-bit vector of [4 x
///    float] into a 64-bit vector of [2 x i32], truncating the result when it
///    is inexact.
///
/// This is the legacy spelling of _mm_cvttps_pi32 and behaves identically.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \returns A 64-bit integer vector containing the converted values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_cvtt_ps2pi(__m128 __a)
{
  return _mm_cvttps_pi32(__a);
}
1472
1473 /// Converts a 32-bit signed integer value into a floating point value
1474 /// and writes it to the lower 32 bits of the destination. The remaining
1475 /// higher order elements of the destination vector are copied from the
1476 /// corresponding elements in the first operand.
1477 ///
1478 /// \headerfile <x86intrin.h>
1479 ///
1480 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1481 ///
1482 /// \param __a
1483 /// A 128-bit vector of [4 x float].
1484 /// \param __b
1485 /// A 32-bit signed integer operand containing the value to be converted.
1486 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1487 /// converted value of the second operand. The upper 96 bits are copied from
1488 /// the upper 96 bits of the first operand.
1489 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtsi32_ss(__m128 __a,int __b)1490 _mm_cvtsi32_ss(__m128 __a, int __b)
1491 {
1492 __a[0] = __b;
1493 return __a;
1494 }
1495
/// Converts a 32-bit signed integer value into a floating point value
///    and writes it to the lower 32 bits of the destination. The remaining
///    higher order elements of the destination are copied from the
///    corresponding elements in the first operand.
///
/// This is the legacy spelling of _mm_cvtsi32_ss and behaves identically.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \param __b
///    A 32-bit signed integer operand containing the value to be converted.
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
///    converted value of the second operand. The upper 96 bits are copied from
///    the upper 96 bits of the first operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvt_si2ss(__m128 __a, int __b)
{
  return _mm_cvtsi32_ss(__a, __b);
}
1517
1518 #ifdef __x86_64__
1519
1520 /// Converts a 64-bit signed integer value into a floating point value
1521 /// and writes it to the lower 32 bits of the destination. The remaining
1522 /// higher order elements of the destination are copied from the
1523 /// corresponding elements in the first operand.
1524 ///
1525 /// \headerfile <x86intrin.h>
1526 ///
1527 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1528 ///
1529 /// \param __a
1530 /// A 128-bit vector of [4 x float].
1531 /// \param __b
1532 /// A 64-bit signed integer operand containing the value to be converted.
1533 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1534 /// converted value of the second operand. The upper 96 bits are copied from
1535 /// the upper 96 bits of the first operand.
1536 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtsi64_ss(__m128 __a,long long __b)1537 _mm_cvtsi64_ss(__m128 __a, long long __b)
1538 {
1539 __a[0] = __b;
1540 return __a;
1541 }
1542
1543 #endif
1544
1545 /// Converts two elements of a 64-bit vector of [2 x i32] into two
1546 /// floating point values and writes them to the lower 64-bits of the
1547 /// destination. The remaining higher order elements of the destination are
1548 /// copied from the corresponding elements in the first operand.
1549 ///
1550 /// \headerfile <x86intrin.h>
1551 ///
1552 /// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1553 ///
1554 /// \param __a
1555 /// A 128-bit vector of [4 x float].
1556 /// \param __b
1557 /// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1558 /// and written to the corresponding low-order elements in the destination.
1559 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1560 /// converted value of the second operand. The upper 64 bits are copied from
1561 /// the upper 64 bits of the first operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
_mm_cvtpi32_ps(__m128 __a, __m64 __b)
{
  /* CVTPI2PS: convert both i32 lanes of __b into the low two float lanes. */
  return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
}
1567
1568 /// Converts two elements of a 64-bit vector of [2 x i32] into two
1569 /// floating point values and writes them to the lower 64-bits of the
1570 /// destination. The remaining higher order elements of the destination are
1571 /// copied from the corresponding elements in the first operand.
1572 ///
1573 /// \headerfile <x86intrin.h>
1574 ///
1575 /// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1576 ///
1577 /// \param __a
1578 /// A 128-bit vector of [4 x float].
1579 /// \param __b
1580 /// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1581 /// and written to the corresponding low-order elements in the destination.
1582 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1583 /// converted value from the second operand. The upper 64 bits are copied
1584 /// from the upper 64 bits of the first operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
_mm_cvt_pi2ps(__m128 __a, __m64 __b)
{
  /* Legacy spelling: defer to _mm_cvtpi32_ps. */
  return _mm_cvtpi32_ps(__a, __b);
}
1590
1591 /// Extracts a float value contained in the lower 32 bits of a vector of
1592 /// [4 x float].
1593 ///
1594 /// \headerfile <x86intrin.h>
1595 ///
1596 /// This intrinsic has no corresponding instruction.
1597 ///
1598 /// \param __a
1599 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1600 /// used in the extraction.
1601 /// \returns A 32-bit float containing the extracted value.
static __inline__ float __DEFAULT_FN_ATTRS
_mm_cvtss_f32(__m128 __a)
{
  /* Plain element extraction; no instruction is required. */
  return __a[0];
}
1607
/// Loads two packed float values from the address \a __p into the
///    high-order bits of a 128-bit vector of [4 x float]. The low-order bits
///    are copied from the low-order bits of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVHPS / MOVHPS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
///    of the destination.
/// \param __p
///    A pointer to two packed float values. Bits [63:0] are written to bits
///    [127:64] of the destination.
/// \returns A 128-bit vector of [4 x float] containing the moved values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadh_pi(__m128 __a, const __m64 *__p)
{
  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
  /* Packed, may_alias wrapper permits an unaligned, type-punned 64-bit load. */
  struct __mm_loadh_pi_struct {
    __mm_loadh_pi_v2f32 __u;
  } __attribute__((__packed__, __may_alias__));
  __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
  return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
}
1634
/// Loads two packed float values from the address \a __p into the
///    low-order bits of a 128-bit vector of [4 x float]. The high-order bits
///    are copied from the high-order bits of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
///    [127:64] of the destination.
/// \param __p
///    A pointer to two packed float values. Bits [63:0] are written to bits
///    [63:0] of the destination.
/// \returns A 128-bit vector of [4 x float] containing the moved values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadl_pi(__m128 __a, const __m64 *__p)
{
  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
  /* Packed, may_alias wrapper permits an unaligned, type-punned 64-bit load. */
  struct __mm_loadl_pi_struct {
    __mm_loadl_pi_v2f32 __u;
  } __attribute__((__packed__, __may_alias__));
  __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
  return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
}
1661
1662 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1663 /// 32 bits of the vector are initialized with the single-precision
1664 /// floating-point value loaded from a specified memory location. The upper
1665 /// 96 bits are set to zero.
1666 ///
1667 /// \headerfile <x86intrin.h>
1668 ///
1669 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1670 ///
1671 /// \param __p
1672 /// A pointer to a 32-bit memory location containing a single-precision
1673 /// floating-point value.
1674 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1675 /// lower 32 bits contain the value loaded from the memory location. The
1676 /// upper 96 bits are set to zero.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_load_ss(const float *__p)
{
  /* Packed, may_alias wrapper permits an unaligned, type-punned 32-bit load. */
  struct __mm_load_ss_struct {
    float __u;
  } __attribute__((__packed__, __may_alias__));
  float __u = ((const struct __mm_load_ss_struct*)__p)->__u;
  return __extension__ (__m128){ __u, 0, 0, 0 };
}
1686
1687 /// Loads a 32-bit float value and duplicates it to all four vector
1688 /// elements of a 128-bit vector of [4 x float].
1689 ///
1690 /// \headerfile <x86intrin.h>
1691 ///
1692 /// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
1693 /// instruction.
1694 ///
1695 /// \param __p
1696 /// A pointer to a float value to be loaded and duplicated.
1697 /// \returns A 128-bit vector of [4 x float] containing the loaded and
1698 /// duplicated values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_load1_ps(const float *__p)
{
  /* Packed, may_alias wrapper permits an unaligned, type-punned 32-bit load. */
  struct __mm_load1_ps_struct {
    float __u;
  } __attribute__((__packed__, __may_alias__));
  float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;
  return __extension__ (__m128){ __u, __u, __u, __u };
}
1708
1709 #define _mm_load_ps1(p) _mm_load1_ps(p)
1710
1711 /// Loads a 128-bit floating-point vector of [4 x float] from an aligned
1712 /// memory location.
1713 ///
1714 /// \headerfile <x86intrin.h>
1715 ///
1716 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
1717 ///
1718 /// \param __p
1719 /// A pointer to a 128-bit memory location. The address of the memory
1720 /// location has to be 128-bit aligned.
1721 /// \returns A 128-bit vector of [4 x float] containing the loaded values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_load_ps(const float *__p)
{
  /* Direct aligned load; __m128 carries 16-byte alignment. */
  return *(const __m128*)__p;
}
1727
1728 /// Loads a 128-bit floating-point vector of [4 x float] from an
1729 /// unaligned memory location.
1730 ///
1731 /// \headerfile <x86intrin.h>
1732 ///
1733 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1734 ///
1735 /// \param __p
1736 /// A pointer to a 128-bit memory location. The address of the memory
1737 /// location does not have to be aligned.
1738 /// \returns A 128-bit vector of [4 x float] containing the loaded values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadu_ps(const float *__p)
{
  /* __m128_u (1-byte alignment) inside a packed, may_alias struct makes the
     unaligned access legal. */
  struct __loadu_ps {
    __m128_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ps*)__p)->__v;
}
1747
1748 /// Loads four packed float values, in reverse order, from an aligned
1749 /// memory location to 32-bit elements in a 128-bit vector of [4 x float].
1750 ///
1751 /// \headerfile <x86intrin.h>
1752 ///
1753 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
1754 /// instruction.
1755 ///
1756 /// \param __p
1757 /// A pointer to a 128-bit memory location. The address of the memory
1758 /// location has to be 128-bit aligned.
1759 /// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
1760 /// in reverse order.
1761 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadr_ps(const float * __p)1762 _mm_loadr_ps(const float *__p)
1763 {
1764 __m128 __a = _mm_load_ps(__p);
1765 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
1766 }
1767
1768 /// Create a 128-bit vector of [4 x float] with undefined values.
1769 ///
1770 /// \headerfile <x86intrin.h>
1771 ///
1772 /// This intrinsic has no corresponding instruction.
1773 ///
1774 /// \returns A 128-bit vector of [4 x float] containing undefined values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_undefined_ps(void)
{
  /* The builtin produces an undef value without reading uninitialized
     storage, so no instruction is emitted. */
  return (__m128)__builtin_ia32_undef128();
}
1780
1781 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1782 /// 32 bits of the vector are initialized with the specified single-precision
1783 /// floating-point value. The upper 96 bits are set to zero.
1784 ///
1785 /// \headerfile <x86intrin.h>
1786 ///
1787 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1788 ///
1789 /// \param __w
1790 /// A single-precision floating-point value used to initialize the lower 32
1791 /// bits of the result.
1792 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1793 /// lower 32 bits contain the value provided in the source operand. The
1794 /// upper 96 bits are set to zero.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_set_ss(float __w)
{
  /* Scalar in lane 0, upper three lanes zeroed. */
  return __extension__ (__m128){ __w, 0, 0, 0 };
}
1800
1801 /// Constructs a 128-bit floating-point vector of [4 x float], with each
1802 /// of the four single-precision floating-point vector elements set to the
1803 /// specified single-precision floating-point value.
1804 ///
1805 /// \headerfile <x86intrin.h>
1806 ///
1807 /// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1808 ///
1809 /// \param __w
1810 /// A single-precision floating-point value used to initialize each vector
1811 /// element of the result.
1812 /// \returns An initialized 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_set1_ps(float __w)
{
  /* Broadcast __w to all four lanes. */
  return __extension__ (__m128){ __w, __w, __w, __w };
}
1818
1819 /* Microsoft specific. */
1820 /// Constructs a 128-bit floating-point vector of [4 x float], with each
1821 /// of the four single-precision floating-point vector elements set to the
1822 /// specified single-precision floating-point value.
1823 ///
1824 /// \headerfile <x86intrin.h>
1825 ///
1826 /// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1827 ///
1828 /// \param __w
1829 /// A single-precision floating-point value used to initialize each vector
1830 /// element of the result.
1831 /// \returns An initialized 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_set_ps1(float __w)
{
  /* Microsoft-compatible alias for _mm_set1_ps. */
  return _mm_set1_ps(__w);
}
1837
1838 /// Constructs a 128-bit floating-point vector of [4 x float]
1839 /// initialized with the specified single-precision floating-point values.
1840 ///
1841 /// \headerfile <x86intrin.h>
1842 ///
1843 /// This intrinsic is a utility function and does not correspond to a specific
1844 /// instruction.
1845 ///
1846 /// \param __z
1847 /// A single-precision floating-point value used to initialize bits [127:96]
1848 /// of the result.
1849 /// \param __y
1850 /// A single-precision floating-point value used to initialize bits [95:64]
1851 /// of the result.
1852 /// \param __x
1853 /// A single-precision floating-point value used to initialize bits [63:32]
1854 /// of the result.
1855 /// \param __w
1856 /// A single-precision floating-point value used to initialize bits [31:0]
1857 /// of the result.
1858 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1859 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_set_ps(float __z,float __y,float __x,float __w)1860 _mm_set_ps(float __z, float __y, float __x, float __w)
1861 {
1862 return __extension__ (__m128){ __w, __x, __y, __z };
1863 }
1864
1865 /// Constructs a 128-bit floating-point vector of [4 x float],
1866 /// initialized in reverse order with the specified 32-bit single-precision
1867 /// float-point values.
1868 ///
1869 /// \headerfile <x86intrin.h>
1870 ///
1871 /// This intrinsic is a utility function and does not correspond to a specific
1872 /// instruction.
1873 ///
1874 /// \param __z
1875 /// A single-precision floating-point value used to initialize bits [31:0]
1876 /// of the result.
1877 /// \param __y
1878 /// A single-precision floating-point value used to initialize bits [63:32]
1879 /// of the result.
1880 /// \param __x
1881 /// A single-precision floating-point value used to initialize bits [95:64]
1882 /// of the result.
1883 /// \param __w
1884 /// A single-precision floating-point value used to initialize bits [127:96]
1885 /// of the result.
1886 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1887 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_setr_ps(float __z,float __y,float __x,float __w)1888 _mm_setr_ps(float __z, float __y, float __x, float __w)
1889 {
1890 return __extension__ (__m128){ __z, __y, __x, __w };
1891 }
1892
1893 /// Constructs a 128-bit floating-point vector of [4 x float] initialized
1894 /// to zero.
1895 ///
1896 /// \headerfile <x86intrin.h>
1897 ///
1898 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1899 ///
1900 /// \returns An initialized 128-bit floating-point vector of [4 x float] with
1901 /// all elements set to zero.
1902 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_setzero_ps(void)1903 _mm_setzero_ps(void)
1904 {
1905 return __extension__ (__m128){ 0, 0, 0, 0 };
1906 }
1907
/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
///    memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVHPS / MOVHPS </c> instruction.
///
/// \param __p
///    A pointer to a 64-bit memory location.
/// \param __a
///    A 128-bit vector of [4 x float] containing the values to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeh_pi(__m64 *__p, __m128 __a)
{
  typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
  /* Packed, may_alias wrapper permits an unaligned, type-punned 64-bit store. */
  struct __mm_storeh_pi_struct {
    __mm_storeh_pi_v2f32 __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
}
1928
1929 /// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
1930 /// memory location.
1931 ///
1932 /// \headerfile <x86intrin.h>
1933 ///
1934 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
1935 ///
1936 /// \param __p
1937 /// A pointer to a memory location that will receive the float values.
1938 /// \param __a
1939 /// A 128-bit vector of [4 x float] containing the values to be stored.
1940 static __inline__ void __DEFAULT_FN_ATTRS
_mm_storel_pi(__m64 * __p,__m128 __a)1941 _mm_storel_pi(__m64 *__p, __m128 __a)
1942 {
1943 typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
1944 struct __mm_storeh_pi_struct {
1945 __mm_storeh_pi_v2f32 __u;
1946 } __attribute__((__packed__, __may_alias__));
1947 ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
1948 }
1949
1950 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
1951 /// memory location.
1952 ///
1953 /// \headerfile <x86intrin.h>
1954 ///
1955 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1956 ///
1957 /// \param __p
1958 /// A pointer to a 32-bit memory location.
1959 /// \param __a
1960 /// A 128-bit vector of [4 x float] containing the value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_ss(float *__p, __m128 __a)
{
  /* Packed, may_alias wrapper permits an unaligned, type-punned 32-bit store. */
  struct __mm_store_ss_struct {
    float __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
}
1969
1970 /// Stores a 128-bit vector of [4 x float] to an unaligned memory
1971 /// location.
1972 ///
1973 /// \headerfile <x86intrin.h>
1974 ///
1975 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1976 ///
1977 /// \param __p
1978 /// A pointer to a 128-bit memory location. The address of the memory
1979 /// location does not have to be aligned.
1980 /// \param __a
1981 /// A 128-bit vector of [4 x float] containing the values to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_ps(float *__p, __m128 __a)
{
  /* __m128_u (1-byte alignment) inside a packed, may_alias struct makes the
     unaligned store legal. */
  struct __storeu_ps {
    __m128_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ps*)__p)->__v = __a;
}
1990
1991 /// Stores a 128-bit vector of [4 x float] into an aligned memory
1992 /// location.
1993 ///
1994 /// \headerfile <x86intrin.h>
1995 ///
1996 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
1997 ///
1998 /// \param __p
1999 /// A pointer to a 128-bit memory location. The address of the memory
2000 /// location has to be 16-byte aligned.
2001 /// \param __a
2002 /// A 128-bit vector of [4 x float] containing the values to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_ps(float *__p, __m128 __a)
{
  /* Direct aligned store; __m128 carries 16-byte alignment. */
  *(__m128*)__p = __a;
}
2008
2009 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2010 /// four contiguous elements in an aligned memory location.
2011 ///
2012 /// \headerfile <x86intrin.h>
2013 ///
2014 /// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2015 /// instruction.
2016 ///
2017 /// \param __p
2018 /// A pointer to a 128-bit memory location.
2019 /// \param __a
2020 /// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2021 /// of the four contiguous elements pointed by \a __p.
2022 static __inline__ void __DEFAULT_FN_ATTRS
_mm_store1_ps(float * __p,__m128 __a)2023 _mm_store1_ps(float *__p, __m128 __a)
2024 {
2025 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
2026 _mm_store_ps(__p, __a);
2027 }
2028
2029 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2030 /// four contiguous elements in an aligned memory location.
2031 ///
2032 /// \headerfile <x86intrin.h>
2033 ///
2034 /// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2035 /// instruction.
2036 ///
2037 /// \param __p
2038 /// A pointer to a 128-bit memory location.
2039 /// \param __a
2040 /// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2041 /// of the four contiguous elements pointed by \a __p.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_ps1(float *__p, __m128 __a)
{
  /* Microsoft-compatible alias for _mm_store1_ps. */
  _mm_store1_ps(__p, __a);
}
2047
2048 /// Stores float values from a 128-bit vector of [4 x float] to an
2049 /// aligned memory location in reverse order.
2050 ///
2051 /// \headerfile <x86intrin.h>
2052 ///
2053 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
2054 /// instruction.
2055 ///
2056 /// \param __p
2057 /// A pointer to a 128-bit memory location. The address of the memory
2058 /// location has to be 128-bit aligned.
2059 /// \param __a
2060 /// A 128-bit vector of [4 x float] containing the values to be stored.
2061 static __inline__ void __DEFAULT_FN_ATTRS
_mm_storer_ps(float * __p,__m128 __a)2062 _mm_storer_ps(float *__p, __m128 __a)
2063 {
2064 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
2065 _mm_store_ps(__p, __a);
2066 }
2067
/* Locality hints for _mm_prefetch. As decoded by the _mm_prefetch macro
   below, bit 2 selects the prefetch-for-write (ET) form and bits [1:0]
   select the locality level. */
#define _MM_HINT_ET0 7
#define _MM_HINT_ET1 6
#define _MM_HINT_T0  3
#define _MM_HINT_T1  2
#define _MM_HINT_T2  1
#define _MM_HINT_NTA 0
2074
2075 #ifndef _MSC_VER
2076 /* FIXME: We have to #define this because "sel" must be a constant integer, and
2077 Sema doesn't do any form of constant propagation yet. */
2078
2079 /// Loads one cache line of data from the specified address to a location
2080 /// closer to the processor.
2081 ///
2082 /// \headerfile <x86intrin.h>
2083 ///
2084 /// \code
2085 /// void _mm_prefetch(const void * a, const int sel);
2086 /// \endcode
2087 ///
2088 /// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
2089 ///
2090 /// \param a
2091 /// A pointer to a memory location containing a cache line of data.
2092 /// \param sel
2093 /// A predefined integer constant specifying the type of prefetch
2094 /// operation: \n
2095 /// _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
2096 /// PREFETCHNTA instruction will be generated. \n
2097 /// _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
2098 /// be generated. \n
2099 /// _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
2100 /// be generated. \n
2101 /// _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
2102 /// be generated.
/* ((sel) >> 2) & 1 is the read/write hint and (sel) & 0x3 the locality
   level expected by __builtin_prefetch. */
#define _mm_prefetch(a, sel) (__builtin_prefetch((const void *)(a), \
                                                 ((sel) >> 2) & 1, (sel) & 0x3))
2105 #endif
2106
2107 /// Stores a 64-bit integer in the specified aligned memory location. To
2108 /// minimize caching, the data is flagged as non-temporal (unlikely to be
2109 /// used again soon).
2110 ///
2111 /// \headerfile <x86intrin.h>
2112 ///
2113 /// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
2114 ///
2115 /// \param __p
2116 /// A pointer to an aligned memory location used to store the register value.
2117 /// \param __a
2118 /// A 64-bit integer containing the value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS_MMX
_mm_stream_pi(__m64 *__p, __m64 __a)
{
  /* MOVNTQ: non-temporal 64-bit store. */
  __builtin_ia32_movntq(__p, __a);
}
2124
2125 /// Moves packed float values from a 128-bit vector of [4 x float] to a
2126 /// 128-bit aligned memory location. To minimize caching, the data is flagged
2127 /// as non-temporal (unlikely to be used again soon).
2128 ///
2129 /// \headerfile <x86intrin.h>
2130 ///
2131 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
2132 ///
2133 /// \param __p
2134 /// A pointer to a 128-bit aligned memory location that will receive the
2135 /// single-precision floating-point values.
2136 /// \param __a
2137 /// A 128-bit vector of [4 x float] containing the values to be moved.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_ps(float *__p, __m128 __a)
{
  /* Lowered to MOVNTPS; the (__v4sf*) cast requires 16-byte alignment. */
  __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
}
2143
2144 #if defined(__cplusplus)
2145 extern "C" {
2146 #endif
2147
2148 /// Forces strong memory ordering (serialization) between store
2149 /// instructions preceding this instruction and store instructions following
2150 /// this instruction, ensuring the system completes all previous stores
2151 /// before executing subsequent stores.
2152 ///
2153 /// \headerfile <x86intrin.h>
2154 ///
2155 /// This intrinsic corresponds to the <c> SFENCE </c> instruction.
2156 ///
2157 void _mm_sfence(void);
2158
2159 #if defined(__cplusplus)
2160 } // extern "C"
2161 #endif
2162
2163 /// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
2164 /// returns it, as specified by the immediate integer operand.
2165 ///
2166 /// \headerfile <x86intrin.h>
2167 ///
2168 /// \code
2169 /// int _mm_extract_pi16(__m64 a, int n);
2170 /// \endcode
2171 ///
2172 /// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
2173 ///
2174 /// \param a
2175 /// A 64-bit vector of [4 x i16].
2176 /// \param n
2177 /// An immediate integer operand that determines which bits are extracted: \n
2178 /// 0: Bits [15:0] are copied to the destination. \n
2179 /// 1: Bits [31:16] are copied to the destination. \n
2180 /// 2: Bits [47:32] are copied to the destination. \n
2181 /// 3: Bits [63:48] are copied to the destination.
2182 /// \returns A 16-bit integer containing the extracted 16 bits of packed data.
/* Arguments and the whole expansion are parenthesized so that expression
   arguments (e.g. `v + w`) and surrounding operators bind correctly. */
#define _mm_extract_pi16(a, n) \
  ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)(__m64)(a), (int)(n)))
2185
2186 /// Copies data from the 64-bit vector of [4 x i16] to the destination,
2187 /// and inserts the lower 16-bits of an integer operand at the 16-bit offset
2188 /// specified by the immediate operand \a n.
2189 ///
2190 /// \headerfile <x86intrin.h>
2191 ///
2192 /// \code
2193 /// __m64 _mm_insert_pi16(__m64 a, int d, int n);
2194 /// \endcode
2195 ///
2196 /// This intrinsic corresponds to the <c> PINSRW </c> instruction.
2197 ///
2198 /// \param a
2199 /// A 64-bit vector of [4 x i16].
2200 /// \param d
2201 /// An integer. The lower 16-bit value from this operand is written to the
2202 /// destination at the offset specified by operand \a n.
2203 /// \param n
2204 /// An immediate integer operant that determines which the bits to be used
2205 /// in the destination. \n
2206 /// 0: Bits [15:0] are copied to the destination. \n
2207 /// 1: Bits [31:16] are copied to the destination. \n
2208 /// 2: Bits [47:32] are copied to the destination. \n
2209 /// 3: Bits [63:48] are copied to the destination. \n
2210 /// The remaining bits in the destination are copied from the corresponding
2211 /// bits in operand \a a.
2212 /// \returns A 64-bit integer vector containing the copied packed data from the
2213 /// operands.
/* Arguments and the whole expansion are parenthesized so that expression
   arguments (e.g. `v + w`) and surrounding operators bind correctly. */
#define _mm_insert_pi16(a, d, n) \
  ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)(__m64)(a), (int)(d), (int)(n)))
2216
2217 /// Compares each of the corresponding packed 16-bit integer values of
2218 /// the 64-bit integer vectors, and writes the greater value to the
2219 /// corresponding bits in the destination.
2220 ///
2221 /// \headerfile <x86intrin.h>
2222 ///
2223 /// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
2224 ///
2225 /// \param __a
2226 /// A 64-bit integer vector containing one of the source operands.
2227 /// \param __b
2228 /// A 64-bit integer vector containing one of the source operands.
2229 /// \returns A 64-bit integer vector containing the comparison results.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_max_pi16(__m64 __a, __m64 __b)
{
  /* PMAXSW: lane-wise signed 16-bit maximum. */
  return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
}
2235
2236 /// Compares each of the corresponding packed 8-bit unsigned integer
2237 /// values of the 64-bit integer vectors, and writes the greater value to the
2238 /// corresponding bits in the destination.
2239 ///
2240 /// \headerfile <x86intrin.h>
2241 ///
2242 /// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
2243 ///
2244 /// \param __a
2245 /// A 64-bit integer vector containing one of the source operands.
2246 /// \param __b
2247 /// A 64-bit integer vector containing one of the source operands.
2248 /// \returns A 64-bit integer vector containing the comparison results.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_max_pu8(__m64 __a, __m64 __b)
{
  /* PMAXUB: lane-wise unsigned 8-bit maximum. */
  return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
}
2254
2255 /// Compares each of the corresponding packed 16-bit integer values of
2256 /// the 64-bit integer vectors, and writes the lesser value to the
2257 /// corresponding bits in the destination.
2258 ///
2259 /// \headerfile <x86intrin.h>
2260 ///
2261 /// This intrinsic corresponds to the <c> PMINSW </c> instruction.
2262 ///
2263 /// \param __a
2264 /// A 64-bit integer vector containing one of the source operands.
2265 /// \param __b
2266 /// A 64-bit integer vector containing one of the source operands.
2267 /// \returns A 64-bit integer vector containing the comparison results.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_min_pi16(__m64 __a, __m64 __b)
{
  /* PMINSW: lane-wise signed 16-bit minimum. */
  return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
}
2273
2274 /// Compares each of the corresponding packed 8-bit unsigned integer
2275 /// values of the 64-bit integer vectors, and writes the lesser value to the
2276 /// corresponding bits in the destination.
2277 ///
2278 /// \headerfile <x86intrin.h>
2279 ///
2280 /// This intrinsic corresponds to the <c> PMINUB </c> instruction.
2281 ///
2282 /// \param __a
2283 /// A 64-bit integer vector containing one of the source operands.
2284 /// \param __b
2285 /// A 64-bit integer vector containing one of the source operands.
2286 /// \returns A 64-bit integer vector containing the comparison results.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_min_pu8(__m64 __a, __m64 __b)
{
  /* PMINUB: lane-wise unsigned 8-bit minimum. */
  return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
}
2292
2293 /// Takes the most significant bit from each 8-bit element in a 64-bit
2294 /// integer vector to create an 8-bit mask value. Zero-extends the value to
2295 /// 32-bit integer and writes it to the destination.
2296 ///
2297 /// \headerfile <x86intrin.h>
2298 ///
2299 /// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
2300 ///
2301 /// \param __a
2302 /// A 64-bit integer vector containing the values with bits to be extracted.
2303 /// \returns The most significant bit from each 8-bit element in \a __a,
2304 /// written to bits [7:0].
static __inline__ int __DEFAULT_FN_ATTRS_MMX
_mm_movemask_pi8(__m64 __a)
{
  /* PMOVMSKB: gather the sign bit of each byte into bits [7:0]. */
  return __builtin_ia32_pmovmskb((__v8qi)__a);
}
2310
2311 /// Multiplies packed 16-bit unsigned integer values and writes the
2312 /// high-order 16 bits of each 32-bit product to the corresponding bits in
2313 /// the destination.
2314 ///
2315 /// \headerfile <x86intrin.h>
2316 ///
2317 /// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
2318 ///
2319 /// \param __a
2320 /// A 64-bit integer vector containing one of the source operands.
2321 /// \param __b
2322 /// A 64-bit integer vector containing one of the source operands.
2323 /// \returns A 64-bit integer vector containing the products of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_mulhi_pu16(__m64 __a, __m64 __b)
{
  /* PMULHUW: high 16 bits of each unsigned 16x16 -> 32 product. */
  return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
}
2329
/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
///    destination, as specified by the immediate value operand.
///
///    This is implemented as a macro (not a function) because the selector
///    \a n must be an immediate (compile-time) constant.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
///
/// \param a
///    A 64-bit integer vector containing the values to be shuffled.
/// \param n
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a. The destinations within the 64-bit destination are
///    assigned values as follows: \n
///    Bits [1:0] are used to assign values to bits [15:0] in the
///    destination. \n
///    Bits [3:2] are used to assign values to bits [31:16] in the
///    destination. \n
///    Bits [5:4] are used to assign values to bits [47:32] in the
///    destination. \n
///    Bits [7:6] are used to assign values to bits [63:48] in the
///    destination. \n
///    Bit value assignments: \n
///    00: assigned from bits [15:0] of \a a. \n
///    01: assigned from bits [31:16] of \a a. \n
///    10: assigned from bits [47:32] of \a a. \n
///    11: assigned from bits [63:48] of \a a.
/// \returns A 64-bit integer vector containing the shuffled values.
#define _mm_shuffle_pi16(a, n) \
  (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n))
2363
/// Conditionally copies the values from each 8-bit element in the first
///    64-bit integer vector operand to the specified memory location, as
///    specified by the most significant bit in the corresponding element in the
///    second 64-bit integer vector operand.
///
///    To minimize caching, the data is flagged as non-temporal
///    (unlikely to be used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
///
/// \param __d
///    A 64-bit integer vector containing the values with elements to be copied.
/// \param __n
///    A 64-bit integer vector operand. The most significant bit from each 8-bit
///    element determines whether the corresponding element in operand \a __d
///    is copied. If the most significant bit of a given element is 1, the
///    corresponding element in operand \a __d is copied.
/// \param __p
///    A pointer to a 64-bit memory location that will receive the conditionally
///    copied integer values. The address of the memory location does not have
///    to be aligned.
static __inline__ void __DEFAULT_FN_ATTRS_MMX
_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
{
  /* MASKMOVQ performs a non-temporal, byte-masked store of __d to *__p. */
  __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
}
2392
/// Computes the rounded averages of the packed unsigned 8-bit integer
///    values and writes the averages to the corresponding bits in the
///    destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PAVGB </c> instruction.
///
/// \param __a
///    A 64-bit integer vector containing one of the source operands.
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the rounded averages of the
///    corresponding elements of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_avg_pu8(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
}
2411
/// Computes the rounded averages of the packed unsigned 16-bit integer
///    values and writes the averages to the corresponding bits in the
///    destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PAVGW </c> instruction.
///
/// \param __a
///    A 64-bit integer vector containing one of the source operands.
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the rounded averages of the
///    corresponding elements of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_avg_pu16(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
}
2430
/// Subtracts the corresponding 8-bit unsigned integer values of the two
///    64-bit vector operands and computes the absolute value for each
///    difference. Then the sum of the 8 absolute differences is written to
///    bits [15:0] of the destination; the remaining bits [63:16] are cleared.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PSADBW </c> instruction.
///
/// \param __a
///    A 64-bit integer vector containing one of the source operands.
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector whose lower 16 bits contain the sum of the
///    absolute differences between both operands. The upper bits are
///    cleared.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_sad_pu8(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
}
2452
2453 #if defined(__cplusplus)
2454 extern "C" {
2455 #endif
2456
/// Returns the contents of the MXCSR register as a 32-bit unsigned
///    integer value.
///
///    There are several groups of macros associated with this
///    intrinsic, including:
///    <ul>
///    <li>
///      For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
///      _MM_GET_EXCEPTION_STATE().
///    </li>
///    <li>
///      For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
///      There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
///    </li>
///    <li>
///      For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
///      _MM_GET_ROUNDING_MODE().
///    </li>
///    <li>
///      For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
///      There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
///    </li>
///    <li>
///      For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
///      _MM_GET_DENORMALS_ZERO_MODE().
///    </li>
///    </ul>
///
///    For example, the following expression checks if an overflow exception has
///    occurred:
///    \code
///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
///    \endcode
///
///    The following expression gets the current rounding mode:
///    \code
///      _MM_GET_ROUNDING_MODE()
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
///
/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
///    register.
/// \see _mm_setcsr
unsigned int _mm_getcsr(void);
2508
/// Sets the MXCSR register with the 32-bit unsigned integer value.
///
///    There are several groups of macros associated with this intrinsic,
///    including:
///    <ul>
///    <li>
///      For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
///    </li>
///    <li>
///      For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
///      There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
///      of these macros.
///    </li>
///    <li>
///      For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
///    </li>
///    <li>
///      For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
///      There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
///      one of these macros.
///    </li>
///    <li>
///      For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
///    </li>
///    </ul>
///
///    For example, the following expression causes subsequent floating-point
///    operations to round up:
///    \code
///      _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP)
///    \endcode
///
///    The following example sets the DAZ and FTZ flags:
///    \code
///    void setFlags() {
///      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
///      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
///    }
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
///
/// \param __i
///    A 32-bit unsigned integer value to be written to the MXCSR register.
/// \see _mm_getcsr
void _mm_setcsr(unsigned int __i);
2562
2563 #if defined(__cplusplus)
2564 } // extern "C"
2565 #endif
2566
/// Selects 4 float values from the 128-bit operands of [4 x float], as
///    specified by the immediate value operand.
///
///    This is implemented as a macro (not a function) because the selector
///    \a mask must be an immediate (compile-time) constant.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b. \n
///    Bits [3:0] specify the values copied from operand \a a. \n
///    Bits [7:4] specify the values copied from operand \a b. \n
///    The destinations within the 128-bit destination are assigned values as
///    follows: \n
///    Bits [1:0] are used to assign values to bits [31:0] in the
///    destination. \n
///    Bits [3:2] are used to assign values to bits [63:32] in the
///    destination. \n
///    Bits [5:4] are used to assign values to bits [95:64] in the
///    destination. \n
///    Bits [7:6] are used to assign values to bits [127:96] in the
///    destination. \n
///    Bit value assignments: \n
///    00: Bits [31:0] copied from the specified operand. \n
///    01: Bits [63:32] copied from the specified operand. \n
///    10: Bits [95:64] copied from the specified operand. \n
///    11: Bits [127:96] copied from the specified operand.
/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
#define _mm_shuffle_ps(a, b, mask) \
  (__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
                                (int)(mask))
2606
/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float]. \n
///    Bits [95:64] are written to bits [31:0] of the destination. \n
///    Bits [127:96] are written to bits [95:64] of the destination.
/// \param __b
///    A 128-bit vector of [4 x float]. \n
///    Bits [95:64] are written to bits [63:32] of the destination. \n
///    Bits [127:96] are written to bits [127:96] of the destination.
/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_unpackhi_ps(__m128 __a, __m128 __b)
{
  /* Result lanes: __a[2], __b[2], __a[3], __b[3]. */
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
}
2628
/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float]. \n
///    Bits [31:0] are written to bits [31:0] of the destination. \n
///    Bits [63:32] are written to bits [95:64] of the destination.
/// \param __b
///    A 128-bit vector of [4 x float]. \n
///    Bits [31:0] are written to bits [63:32] of the destination. \n
///    Bits [63:32] are written to bits [127:96] of the destination.
/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_unpacklo_ps(__m128 __a, __m128 __b)
{
  /* Result lanes: __a[0], __b[0], __a[1], __b[1]. */
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
}
2650
2651 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2652 /// 32 bits are set to the lower 32 bits of the second parameter. The upper
2653 /// 96 bits are set to the upper 96 bits of the first parameter.
2654 ///
2655 /// \headerfile <x86intrin.h>
2656 ///
2657 /// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
2658 /// instruction.
2659 ///
2660 /// \param __a
2661 /// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
2662 /// written to the upper 96 bits of the result.
2663 /// \param __b
2664 /// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
2665 /// written to the lower 32 bits of the result.
2666 /// \returns A 128-bit floating-point vector of [4 x float].
2667 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_move_ss(__m128 __a,__m128 __b)2668 _mm_move_ss(__m128 __a, __m128 __b)
2669 {
2670 __a[0] = __b[0];
2671 return __a;
2672 }
2673
/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
///    64 bits are set to the upper 64 bits of the second parameter. The upper
///    64 bits are set to the upper 64 bits of the first parameter.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
///
/// \param __a
///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
///    written to the upper 64 bits of the result.
/// \param __b
///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
///    written to the lower 64 bits of the result.
/// \returns A 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_movehl_ps(__m128 __a, __m128 __b)
{
  /* Result lanes: __b[2], __b[3], __a[2], __a[3] (indices 4-7 select __b). */
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
}
2694
/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
///    64 bits are set to the lower 64 bits of the first parameter. The upper
///    64 bits are set to the lower 64 bits of the second parameter.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
///
/// \param __a
///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
///    written to the lower 64 bits of the result.
/// \param __b
///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
///    written to the upper 64 bits of the result.
/// \returns A 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_movelh_ps(__m128 __a, __m128 __b)
{
  /* Result lanes: __a[0], __a[1], __b[0], __b[1] (indices 4-7 select __b). */
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
}
2715
2716 /// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
2717 /// float].
2718 ///
2719 /// \headerfile <x86intrin.h>
2720 ///
2721 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2722 ///
2723 /// \param __a
2724 /// A 64-bit vector of [4 x i16]. The elements of the destination are copied
2725 /// from the corresponding elements in this operand.
2726 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2727 /// values from the operand.
2728 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
_mm_cvtpi16_ps(__m64 __a)2729 _mm_cvtpi16_ps(__m64 __a)
2730 {
2731 __m64 __b, __c;
2732 __m128 __r;
2733
2734 __b = _mm_setzero_si64();
2735 __b = _mm_cmpgt_pi16(__b, __a);
2736 __c = _mm_unpackhi_pi16(__a, __b);
2737 __r = _mm_setzero_ps();
2738 __r = _mm_cvtpi32_ps(__r, __c);
2739 __r = _mm_movelh_ps(__r, __r);
2740 __c = _mm_unpacklo_pi16(__a, __b);
2741 __r = _mm_cvtpi32_ps(__r, __c);
2742
2743 return __r;
2744 }
2745
2746 /// Converts a 64-bit vector of 16-bit unsigned integer values into a
2747 /// 128-bit vector of [4 x float].
2748 ///
2749 /// \headerfile <x86intrin.h>
2750 ///
2751 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2752 ///
2753 /// \param __a
2754 /// A 64-bit vector of 16-bit unsigned integer values. The elements of the
2755 /// destination are copied from the corresponding elements in this operand.
2756 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2757 /// values from the operand.
2758 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
_mm_cvtpu16_ps(__m64 __a)2759 _mm_cvtpu16_ps(__m64 __a)
2760 {
2761 __m64 __b, __c;
2762 __m128 __r;
2763
2764 __b = _mm_setzero_si64();
2765 __c = _mm_unpackhi_pi16(__a, __b);
2766 __r = _mm_setzero_ps();
2767 __r = _mm_cvtpi32_ps(__r, __c);
2768 __r = _mm_movelh_ps(__r, __r);
2769 __c = _mm_unpacklo_pi16(__a, __b);
2770 __r = _mm_cvtpi32_ps(__r, __c);
2771
2772 return __r;
2773 }
2774
2775 /// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
2776 /// into a 128-bit vector of [4 x float].
2777 ///
2778 /// \headerfile <x86intrin.h>
2779 ///
2780 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2781 ///
2782 /// \param __a
2783 /// A 64-bit vector of [8 x i8]. The elements of the destination are copied
2784 /// from the corresponding lower 4 elements in this operand.
2785 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2786 /// values from the operand.
2787 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
_mm_cvtpi8_ps(__m64 __a)2788 _mm_cvtpi8_ps(__m64 __a)
2789 {
2790 __m64 __b;
2791
2792 __b = _mm_setzero_si64();
2793 __b = _mm_cmpgt_pi8(__b, __a);
2794 __b = _mm_unpacklo_pi8(__a, __b);
2795
2796 return _mm_cvtpi16_ps(__b);
2797 }
2798
2799 /// Converts the lower four unsigned 8-bit integer values from a 64-bit
2800 /// vector of [8 x u8] into a 128-bit vector of [4 x float].
2801 ///
2802 /// \headerfile <x86intrin.h>
2803 ///
2804 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2805 ///
2806 /// \param __a
2807 /// A 64-bit vector of unsigned 8-bit integer values. The elements of the
2808 /// destination are copied from the corresponding lower 4 elements in this
2809 /// operand.
2810 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2811 /// values from the source operand.
2812 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
_mm_cvtpu8_ps(__m64 __a)2813 _mm_cvtpu8_ps(__m64 __a)
2814 {
2815 __m64 __b;
2816
2817 __b = _mm_setzero_si64();
2818 __b = _mm_unpacklo_pi8(__a, __b);
2819
2820 return _mm_cvtpi16_ps(__b);
2821 }
2822
2823 /// Converts the two 32-bit signed integer values from each 64-bit vector
2824 /// operand of [2 x i32] into a 128-bit vector of [4 x float].
2825 ///
2826 /// \headerfile <x86intrin.h>
2827 ///
2828 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2829 ///
2830 /// \param __a
2831 /// A 64-bit vector of [2 x i32]. The lower elements of the destination are
2832 /// copied from the elements in this operand.
2833 /// \param __b
2834 /// A 64-bit vector of [2 x i32]. The upper elements of the destination are
2835 /// copied from the elements in this operand.
2836 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
2837 /// copied and converted values from the first operand. The upper 64 bits
2838 /// contain the copied and converted values from the second operand.
2839 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
_mm_cvtpi32x2_ps(__m64 __a,__m64 __b)2840 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
2841 {
2842 __m128 __c;
2843
2844 __c = _mm_setzero_ps();
2845 __c = _mm_cvtpi32_ps(__c, __b);
2846 __c = _mm_movelh_ps(__c, __c);
2847
2848 return _mm_cvtpi32_ps(__c, __a);
2849 }
2850
2851 /// Converts each single-precision floating-point element of a 128-bit
2852 /// floating-point vector of [4 x float] into a 16-bit signed integer, and
2853 /// packs the results into a 64-bit integer vector of [4 x i16].
2854 ///
2855 /// If the floating-point element is NaN or infinity, or if the
2856 /// floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
2857 /// it is converted to 0x8000. Otherwise if the floating-point element is
2858 /// greater than 0x7FFF, it is converted to 0x7FFF.
2859 ///
2860 /// \headerfile <x86intrin.h>
2861 ///
2862 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2863 ///
2864 /// \param __a
2865 /// A 128-bit floating-point vector of [4 x float].
2866 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
2867 /// values.
2868 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_cvtps_pi16(__m128 __a)2869 _mm_cvtps_pi16(__m128 __a)
2870 {
2871 __m64 __b, __c;
2872
2873 __b = _mm_cvtps_pi32(__a);
2874 __a = _mm_movehl_ps(__a, __a);
2875 __c = _mm_cvtps_pi32(__a);
2876
2877 return _mm_packs_pi32(__b, __c);
2878 }
2879
2880 /// Converts each single-precision floating-point element of a 128-bit
2881 /// floating-point vector of [4 x float] into an 8-bit signed integer, and
2882 /// packs the results into the lower 32 bits of a 64-bit integer vector of
2883 /// [8 x i8]. The upper 32 bits of the vector are set to 0.
2884 ///
2885 /// If the floating-point element is NaN or infinity, or if the
2886 /// floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
2887 /// is converted to 0x80. Otherwise if the floating-point element is greater
2888 /// than 0x7F, it is converted to 0x7F.
2889 ///
2890 /// \headerfile <x86intrin.h>
2891 ///
2892 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2893 ///
2894 /// \param __a
2895 /// 128-bit floating-point vector of [4 x float].
2896 /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
2897 /// converted values and the uppper 32 bits are set to zero.
2898 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_cvtps_pi8(__m128 __a)2899 _mm_cvtps_pi8(__m128 __a)
2900 {
2901 __m64 __b, __c;
2902
2903 __b = _mm_cvtps_pi16(__a);
2904 __c = _mm_setzero_si64();
2905
2906 return _mm_packs_pi16(__b, __c);
2907 }
2908
/// Extracts the sign bits from each single-precision floating-point
///    element of a 128-bit floating-point vector of [4 x float] and returns the
///    sign bits in bits [3:0] of the result. Bits [31:4] of the result are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
///
/// \param __a
///    A 128-bit floating-point vector of [4 x float].
/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
///    single-precision floating-point element of the parameter. Bits [31:4] are
///    set to zero.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_movemask_ps(__m128 __a)
{
  return __builtin_ia32_movmskps((__v4sf)__a);
}
2928
2929
#define _MM_ALIGN16 __attribute__((aligned(16)))

/* Builds an 8-bit shuffle selector for _mm_shuffle_ps/_mm_shuffle_pi16 from
   four 2-bit element indices (z = highest destination lane, w = lowest). */
#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))

/* MXCSR exception-state (sticky flag) bits. */
#define _MM_EXCEPT_INVALID (0x0001U)
#define _MM_EXCEPT_DENORM (0x0002U)
#define _MM_EXCEPT_DIV_ZERO (0x0004U)
#define _MM_EXCEPT_OVERFLOW (0x0008U)
#define _MM_EXCEPT_UNDERFLOW (0x0010U)
#define _MM_EXCEPT_INEXACT (0x0020U)
#define _MM_EXCEPT_MASK (0x003fU)

/* MXCSR exception-mask bits (a set bit suppresses the corresponding trap). */
#define _MM_MASK_INVALID (0x0080U)
#define _MM_MASK_DENORM (0x0100U)
#define _MM_MASK_DIV_ZERO (0x0200U)
#define _MM_MASK_OVERFLOW (0x0400U)
#define _MM_MASK_UNDERFLOW (0x0800U)
#define _MM_MASK_INEXACT (0x1000U)
#define _MM_MASK_MASK (0x1f80U)

/* MXCSR rounding-control field. */
#define _MM_ROUND_NEAREST (0x0000U)
#define _MM_ROUND_DOWN (0x2000U)
#define _MM_ROUND_UP (0x4000U)
#define _MM_ROUND_TOWARD_ZERO (0x6000U)
#define _MM_ROUND_MASK (0x6000U)

/* MXCSR flush-to-zero control bit. */
#define _MM_FLUSH_ZERO_MASK (0x8000U)
#define _MM_FLUSH_ZERO_ON (0x8000U)
#define _MM_FLUSH_ZERO_OFF (0x0000U)

/* Convenience wrappers that read a single MXCSR field via _mm_getcsr(). */
#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)

/* Convenience wrappers that rewrite a single MXCSR field via _mm_setcsr(). */
#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
2969
/* Transposes, in place, the 4x4 matrix whose rows are the [4 x float]
   vectors row0..row3. Implemented with two unpack passes followed by
   movelh/movehl recombination; each macro argument is evaluated twice. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 tmp3, tmp2, tmp1, tmp0; \
  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(tmp0, tmp2); \
  (row1) = _mm_movehl_ps(tmp2, tmp0); \
  (row2) = _mm_movelh_ps(tmp1, tmp3); \
  (row3) = _mm_movehl_ps(tmp3, tmp1); \
} while (0)
2982
/* Aliases for compatibility with the older Intel `_m_*` intrinsic names.
   Note: the duplicated `#define _m_ _mm_` has been collapsed to a single
   definition; the repeat was harmless (identical redefinition) but redundant. */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
#define _m_ _mm_
2999
3000 #undef __DEFAULT_FN_ATTRS
3001 #undef __DEFAULT_FN_ATTRS_MMX
3002
3003 /* Ugly hack for backwards-compatibility (compatible with gcc) */
3004 #if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
3005 #include <emmintrin.h>
3006 #endif
3007
3008 #endif /* __XMMINTRIN_H */
3009