/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
9 
10 #ifndef __EMMINTRIN_H
11 #define __EMMINTRIN_H
12 
13 #if !defined(__i386__) && !defined(__x86_64__)
14 #error "This header is only meant to be used on x86 and x64 architecture"
15 #endif
16 
17 #include <xmmintrin.h>
18 
/* 128-bit vector types exposed by this header; both are 16-byte aligned. */
typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));

/* Counterparts of the types above that are declared with 1-byte alignment. */
typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef long long __m128i_u __attribute__((__vector_size__(16), __aligned__(1)));

/* Type defines: 16-byte vectors named after their element type and count. */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Unsigned types */
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));
39 
/* Define the default attributes for the functions in this file: always
 * inlined, no debug info, compiled for the "sse2" target feature, with a
 * minimum vector width of 128 bits. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2"), __min_vector_width__(128)))
/* Same as above, but naming the "mmx" and "sse2" target features and a
 * minimum vector width of 64 bits. */
#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"), __min_vector_width__(64)))
43 
44 /// Adds lower double-precision values in both operands and returns the
45 ///    sum in the lower 64 bits of the result. The upper 64 bits of the result
46 ///    are copied from the upper double-precision value of the first operand.
47 ///
48 /// \headerfile <x86intrin.h>
49 ///
50 /// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
51 ///
52 /// \param __a
53 ///    A 128-bit vector of [2 x double] containing one of the source operands.
54 /// \param __b
55 ///    A 128-bit vector of [2 x double] containing one of the source operands.
56 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
57 ///    sum of the lower 64 bits of both operands. The upper 64 bits are copied
58 ///    from the upper 64 bits of the first source operand.
59 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_add_sd(__m128d __a,__m128d __b)60 _mm_add_sd(__m128d __a, __m128d __b)
61 {
62   __a[0] += __b[0];
63   return __a;
64 }
65 
66 /// Adds two 128-bit vectors of [2 x double].
67 ///
68 /// \headerfile <x86intrin.h>
69 ///
70 /// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
71 ///
72 /// \param __a
73 ///    A 128-bit vector of [2 x double] containing one of the source operands.
74 /// \param __b
75 ///    A 128-bit vector of [2 x double] containing one of the source operands.
76 /// \returns A 128-bit vector of [2 x double] containing the sums of both
77 ///    operands.
78 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_add_pd(__m128d __a,__m128d __b)79 _mm_add_pd(__m128d __a, __m128d __b)
80 {
81   return (__m128d)((__v2df)__a + (__v2df)__b);
82 }
83 
84 /// Subtracts the lower double-precision value of the second operand
85 ///    from the lower double-precision value of the first operand and returns
86 ///    the difference in the lower 64 bits of the result. The upper 64 bits of
87 ///    the result are copied from the upper double-precision value of the first
88 ///    operand.
89 ///
90 /// \headerfile <x86intrin.h>
91 ///
92 /// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
93 ///
94 /// \param __a
95 ///    A 128-bit vector of [2 x double] containing the minuend.
96 /// \param __b
97 ///    A 128-bit vector of [2 x double] containing the subtrahend.
98 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
99 ///    difference of the lower 64 bits of both operands. The upper 64 bits are
100 ///    copied from the upper 64 bits of the first source operand.
101 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sub_sd(__m128d __a,__m128d __b)102 _mm_sub_sd(__m128d __a, __m128d __b)
103 {
104   __a[0] -= __b[0];
105   return __a;
106 }
107 
108 /// Subtracts two 128-bit vectors of [2 x double].
109 ///
110 /// \headerfile <x86intrin.h>
111 ///
112 /// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
113 ///
114 /// \param __a
115 ///    A 128-bit vector of [2 x double] containing the minuend.
116 /// \param __b
117 ///    A 128-bit vector of [2 x double] containing the subtrahend.
118 /// \returns A 128-bit vector of [2 x double] containing the differences between
119 ///    both operands.
120 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sub_pd(__m128d __a,__m128d __b)121 _mm_sub_pd(__m128d __a, __m128d __b)
122 {
123   return (__m128d)((__v2df)__a - (__v2df)__b);
124 }
125 
126 /// Multiplies lower double-precision values in both operands and returns
127 ///    the product in the lower 64 bits of the result. The upper 64 bits of the
128 ///    result are copied from the upper double-precision value of the first
129 ///    operand.
130 ///
131 /// \headerfile <x86intrin.h>
132 ///
133 /// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
134 ///
135 /// \param __a
136 ///    A 128-bit vector of [2 x double] containing one of the source operands.
137 /// \param __b
138 ///    A 128-bit vector of [2 x double] containing one of the source operands.
139 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
140 ///    product of the lower 64 bits of both operands. The upper 64 bits are
141 ///    copied from the upper 64 bits of the first source operand.
142 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mul_sd(__m128d __a,__m128d __b)143 _mm_mul_sd(__m128d __a, __m128d __b)
144 {
145   __a[0] *= __b[0];
146   return __a;
147 }
148 
149 /// Multiplies two 128-bit vectors of [2 x double].
150 ///
151 /// \headerfile <x86intrin.h>
152 ///
153 /// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
154 ///
155 /// \param __a
156 ///    A 128-bit vector of [2 x double] containing one of the operands.
157 /// \param __b
158 ///    A 128-bit vector of [2 x double] containing one of the operands.
159 /// \returns A 128-bit vector of [2 x double] containing the products of both
160 ///    operands.
161 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mul_pd(__m128d __a,__m128d __b)162 _mm_mul_pd(__m128d __a, __m128d __b)
163 {
164   return (__m128d)((__v2df)__a * (__v2df)__b);
165 }
166 
167 /// Divides the lower double-precision value of the first operand by the
168 ///    lower double-precision value of the second operand and returns the
169 ///    quotient in the lower 64 bits of the result. The upper 64 bits of the
170 ///    result are copied from the upper double-precision value of the first
171 ///    operand.
172 ///
173 /// \headerfile <x86intrin.h>
174 ///
175 /// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
176 ///
177 /// \param __a
178 ///    A 128-bit vector of [2 x double] containing the dividend.
179 /// \param __b
180 ///    A 128-bit vector of [2 x double] containing divisor.
181 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
182 ///    quotient of the lower 64 bits of both operands. The upper 64 bits are
183 ///    copied from the upper 64 bits of the first source operand.
184 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_div_sd(__m128d __a,__m128d __b)185 _mm_div_sd(__m128d __a, __m128d __b)
186 {
187   __a[0] /= __b[0];
188   return __a;
189 }
190 
191 /// Performs an element-by-element division of two 128-bit vectors of
192 ///    [2 x double].
193 ///
194 /// \headerfile <x86intrin.h>
195 ///
196 /// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
197 ///
198 /// \param __a
199 ///    A 128-bit vector of [2 x double] containing the dividend.
200 /// \param __b
201 ///    A 128-bit vector of [2 x double] containing the divisor.
202 /// \returns A 128-bit vector of [2 x double] containing the quotients of both
203 ///    operands.
204 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_div_pd(__m128d __a,__m128d __b)205 _mm_div_pd(__m128d __a, __m128d __b)
206 {
207   return (__m128d)((__v2df)__a / (__v2df)__b);
208 }
209 
210 /// Calculates the square root of the lower double-precision value of
211 ///    the second operand and returns it in the lower 64 bits of the result.
212 ///    The upper 64 bits of the result are copied from the upper
213 ///    double-precision value of the first operand.
214 ///
215 /// \headerfile <x86intrin.h>
216 ///
217 /// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
218 ///
219 /// \param __a
220 ///    A 128-bit vector of [2 x double] containing one of the operands. The
221 ///    upper 64 bits of this operand are copied to the upper 64 bits of the
222 ///    result.
223 /// \param __b
224 ///    A 128-bit vector of [2 x double] containing one of the operands. The
225 ///    square root is calculated using the lower 64 bits of this operand.
226 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
227 ///    square root of the lower 64 bits of operand \a __b, and whose upper 64
228 ///    bits are copied from the upper 64 bits of operand \a __a.
229 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sqrt_sd(__m128d __a,__m128d __b)230 _mm_sqrt_sd(__m128d __a, __m128d __b)
231 {
232   __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
233   return __extension__ (__m128d) { __c[0], __a[1] };
234 }
235 
236 /// Calculates the square root of the each of two values stored in a
237 ///    128-bit vector of [2 x double].
238 ///
239 /// \headerfile <x86intrin.h>
240 ///
241 /// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
242 ///
243 /// \param __a
244 ///    A 128-bit vector of [2 x double].
245 /// \returns A 128-bit vector of [2 x double] containing the square roots of the
246 ///    values in the operand.
247 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sqrt_pd(__m128d __a)248 _mm_sqrt_pd(__m128d __a)
249 {
250   return __builtin_ia32_sqrtpd((__v2df)__a);
251 }
252 
253 /// Compares lower 64-bit double-precision values of both operands, and
254 ///    returns the lesser of the pair of values in the lower 64-bits of the
255 ///    result. The upper 64 bits of the result are copied from the upper
256 ///    double-precision value of the first operand.
257 ///
258 /// \headerfile <x86intrin.h>
259 ///
260 /// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
261 ///
262 /// \param __a
263 ///    A 128-bit vector of [2 x double] containing one of the operands. The
264 ///    lower 64 bits of this operand are used in the comparison.
265 /// \param __b
266 ///    A 128-bit vector of [2 x double] containing one of the operands. The
267 ///    lower 64 bits of this operand are used in the comparison.
268 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
269 ///    minimum value between both operands. The upper 64 bits are copied from
270 ///    the upper 64 bits of the first source operand.
271 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_min_sd(__m128d __a,__m128d __b)272 _mm_min_sd(__m128d __a, __m128d __b)
273 {
274   return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
275 }
276 
277 /// Performs element-by-element comparison of the two 128-bit vectors of
278 ///    [2 x double] and returns the vector containing the lesser of each pair of
279 ///    values.
280 ///
281 /// \headerfile <x86intrin.h>
282 ///
283 /// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
284 ///
285 /// \param __a
286 ///    A 128-bit vector of [2 x double] containing one of the operands.
287 /// \param __b
288 ///    A 128-bit vector of [2 x double] containing one of the operands.
289 /// \returns A 128-bit vector of [2 x double] containing the minimum values
290 ///    between both operands.
291 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_min_pd(__m128d __a,__m128d __b)292 _mm_min_pd(__m128d __a, __m128d __b)
293 {
294   return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
295 }
296 
297 /// Compares lower 64-bit double-precision values of both operands, and
298 ///    returns the greater of the pair of values in the lower 64-bits of the
299 ///    result. The upper 64 bits of the result are copied from the upper
300 ///    double-precision value of the first operand.
301 ///
302 /// \headerfile <x86intrin.h>
303 ///
304 /// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
305 ///
306 /// \param __a
307 ///    A 128-bit vector of [2 x double] containing one of the operands. The
308 ///    lower 64 bits of this operand are used in the comparison.
309 /// \param __b
310 ///    A 128-bit vector of [2 x double] containing one of the operands. The
311 ///    lower 64 bits of this operand are used in the comparison.
312 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
313 ///    maximum value between both operands. The upper 64 bits are copied from
314 ///    the upper 64 bits of the first source operand.
315 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_max_sd(__m128d __a,__m128d __b)316 _mm_max_sd(__m128d __a, __m128d __b)
317 {
318   return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
319 }
320 
321 /// Performs element-by-element comparison of the two 128-bit vectors of
322 ///    [2 x double] and returns the vector containing the greater of each pair
323 ///    of values.
324 ///
325 /// \headerfile <x86intrin.h>
326 ///
327 /// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
328 ///
329 /// \param __a
330 ///    A 128-bit vector of [2 x double] containing one of the operands.
331 /// \param __b
332 ///    A 128-bit vector of [2 x double] containing one of the operands.
333 /// \returns A 128-bit vector of [2 x double] containing the maximum values
334 ///    between both operands.
335 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_max_pd(__m128d __a,__m128d __b)336 _mm_max_pd(__m128d __a, __m128d __b)
337 {
338   return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
339 }
340 
341 /// Performs a bitwise AND of two 128-bit vectors of [2 x double].
342 ///
343 /// \headerfile <x86intrin.h>
344 ///
345 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
346 ///
347 /// \param __a
348 ///    A 128-bit vector of [2 x double] containing one of the source operands.
349 /// \param __b
350 ///    A 128-bit vector of [2 x double] containing one of the source operands.
351 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
352 ///    values between both operands.
353 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_and_pd(__m128d __a,__m128d __b)354 _mm_and_pd(__m128d __a, __m128d __b)
355 {
356   return (__m128d)((__v2du)__a & (__v2du)__b);
357 }
358 
359 /// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
360 ///    the one's complement of the values contained in the first source operand.
361 ///
362 /// \headerfile <x86intrin.h>
363 ///
364 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
365 ///
366 /// \param __a
367 ///    A 128-bit vector of [2 x double] containing the left source operand. The
368 ///    one's complement of this value is used in the bitwise AND.
369 /// \param __b
370 ///    A 128-bit vector of [2 x double] containing the right source operand.
371 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
372 ///    values in the second operand and the one's complement of the first
373 ///    operand.
374 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_andnot_pd(__m128d __a,__m128d __b)375 _mm_andnot_pd(__m128d __a, __m128d __b)
376 {
377   return (__m128d)(~(__v2du)__a & (__v2du)__b);
378 }
379 
380 /// Performs a bitwise OR of two 128-bit vectors of [2 x double].
381 ///
382 /// \headerfile <x86intrin.h>
383 ///
384 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
385 ///
386 /// \param __a
387 ///    A 128-bit vector of [2 x double] containing one of the source operands.
388 /// \param __b
389 ///    A 128-bit vector of [2 x double] containing one of the source operands.
390 /// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
391 ///    values between both operands.
392 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_or_pd(__m128d __a,__m128d __b)393 _mm_or_pd(__m128d __a, __m128d __b)
394 {
395   return (__m128d)((__v2du)__a | (__v2du)__b);
396 }
397 
398 /// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
399 ///
400 /// \headerfile <x86intrin.h>
401 ///
402 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
403 ///
404 /// \param __a
405 ///    A 128-bit vector of [2 x double] containing one of the source operands.
406 /// \param __b
407 ///    A 128-bit vector of [2 x double] containing one of the source operands.
408 /// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
409 ///    values between both operands.
410 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_xor_pd(__m128d __a,__m128d __b)411 _mm_xor_pd(__m128d __a, __m128d __b)
412 {
413   return (__m128d)((__v2du)__a ^ (__v2du)__b);
414 }
415 
416 /// Compares each of the corresponding double-precision values of the
417 ///    128-bit vectors of [2 x double] for equality. Each comparison yields 0x0
418 ///    for false, 0xFFFFFFFFFFFFFFFF for true.
419 ///
420 /// \headerfile <x86intrin.h>
421 ///
422 /// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
423 ///
424 /// \param __a
425 ///    A 128-bit vector of [2 x double].
426 /// \param __b
427 ///    A 128-bit vector of [2 x double].
428 /// \returns A 128-bit vector containing the comparison results.
429 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpeq_pd(__m128d __a,__m128d __b)430 _mm_cmpeq_pd(__m128d __a, __m128d __b)
431 {
432   return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
433 }
434 
435 /// Compares each of the corresponding double-precision values of the
436 ///    128-bit vectors of [2 x double] to determine if the values in the first
437 ///    operand are less than those in the second operand. Each comparison
438 ///    yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
439 ///
440 /// \headerfile <x86intrin.h>
441 ///
442 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
443 ///
444 /// \param __a
445 ///    A 128-bit vector of [2 x double].
446 /// \param __b
447 ///    A 128-bit vector of [2 x double].
448 /// \returns A 128-bit vector containing the comparison results.
449 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmplt_pd(__m128d __a,__m128d __b)450 _mm_cmplt_pd(__m128d __a, __m128d __b)
451 {
452   return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
453 }
454 
455 /// Compares each of the corresponding double-precision values of the
456 ///    128-bit vectors of [2 x double] to determine if the values in the first
457 ///    operand are less than or equal to those in the second operand.
458 ///
459 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
460 ///
461 /// \headerfile <x86intrin.h>
462 ///
463 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
464 ///
465 /// \param __a
466 ///    A 128-bit vector of [2 x double].
467 /// \param __b
468 ///    A 128-bit vector of [2 x double].
469 /// \returns A 128-bit vector containing the comparison results.
470 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmple_pd(__m128d __a,__m128d __b)471 _mm_cmple_pd(__m128d __a, __m128d __b)
472 {
473   return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
474 }
475 
476 /// Compares each of the corresponding double-precision values of the
477 ///    128-bit vectors of [2 x double] to determine if the values in the first
478 ///    operand are greater than those in the second operand.
479 ///
480 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
481 ///
482 /// \headerfile <x86intrin.h>
483 ///
484 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
485 ///
486 /// \param __a
487 ///    A 128-bit vector of [2 x double].
488 /// \param __b
489 ///    A 128-bit vector of [2 x double].
490 /// \returns A 128-bit vector containing the comparison results.
491 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpgt_pd(__m128d __a,__m128d __b)492 _mm_cmpgt_pd(__m128d __a, __m128d __b)
493 {
494   return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
495 }
496 
497 /// Compares each of the corresponding double-precision values of the
498 ///    128-bit vectors of [2 x double] to determine if the values in the first
499 ///    operand are greater than or equal to those in the second operand.
500 ///
501 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
502 ///
503 /// \headerfile <x86intrin.h>
504 ///
505 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
506 ///
507 /// \param __a
508 ///    A 128-bit vector of [2 x double].
509 /// \param __b
510 ///    A 128-bit vector of [2 x double].
511 /// \returns A 128-bit vector containing the comparison results.
512 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpge_pd(__m128d __a,__m128d __b)513 _mm_cmpge_pd(__m128d __a, __m128d __b)
514 {
515   return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
516 }
517 
518 /// Compares each of the corresponding double-precision values of the
519 ///    128-bit vectors of [2 x double] to determine if the values in the first
520 ///    operand are ordered with respect to those in the second operand.
521 ///
522 ///    A pair of double-precision values are "ordered" with respect to each
523 ///    other if neither value is a NaN. Each comparison yields 0x0 for false,
524 ///    0xFFFFFFFFFFFFFFFF for true.
525 ///
526 /// \headerfile <x86intrin.h>
527 ///
528 /// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
529 ///
530 /// \param __a
531 ///    A 128-bit vector of [2 x double].
532 /// \param __b
533 ///    A 128-bit vector of [2 x double].
534 /// \returns A 128-bit vector containing the comparison results.
535 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpord_pd(__m128d __a,__m128d __b)536 _mm_cmpord_pd(__m128d __a, __m128d __b)
537 {
538   return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
539 }
540 
541 /// Compares each of the corresponding double-precision values of the
542 ///    128-bit vectors of [2 x double] to determine if the values in the first
543 ///    operand are unordered with respect to those in the second operand.
544 ///
545 ///    A pair of double-precision values are "unordered" with respect to each
546 ///    other if one or both values are NaN. Each comparison yields 0x0 for
547 ///    false, 0xFFFFFFFFFFFFFFFF for true.
548 ///
549 /// \headerfile <x86intrin.h>
550 ///
551 /// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
552 ///   instruction.
553 ///
554 /// \param __a
555 ///    A 128-bit vector of [2 x double].
556 /// \param __b
557 ///    A 128-bit vector of [2 x double].
558 /// \returns A 128-bit vector containing the comparison results.
559 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpunord_pd(__m128d __a,__m128d __b)560 _mm_cmpunord_pd(__m128d __a, __m128d __b)
561 {
562   return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
563 }
564 
565 /// Compares each of the corresponding double-precision values of the
566 ///    128-bit vectors of [2 x double] to determine if the values in the first
567 ///    operand are unequal to those in the second operand.
568 ///
569 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
570 ///
571 /// \headerfile <x86intrin.h>
572 ///
573 /// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
574 ///
575 /// \param __a
576 ///    A 128-bit vector of [2 x double].
577 /// \param __b
578 ///    A 128-bit vector of [2 x double].
579 /// \returns A 128-bit vector containing the comparison results.
580 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpneq_pd(__m128d __a,__m128d __b)581 _mm_cmpneq_pd(__m128d __a, __m128d __b)
582 {
583   return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
584 }
585 
586 /// Compares each of the corresponding double-precision values of the
587 ///    128-bit vectors of [2 x double] to determine if the values in the first
588 ///    operand are not less than those in the second operand.
589 ///
590 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
591 ///
592 /// \headerfile <x86intrin.h>
593 ///
594 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
595 ///
596 /// \param __a
597 ///    A 128-bit vector of [2 x double].
598 /// \param __b
599 ///    A 128-bit vector of [2 x double].
600 /// \returns A 128-bit vector containing the comparison results.
601 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnlt_pd(__m128d __a,__m128d __b)602 _mm_cmpnlt_pd(__m128d __a, __m128d __b)
603 {
604   return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
605 }
606 
607 /// Compares each of the corresponding double-precision values of the
608 ///    128-bit vectors of [2 x double] to determine if the values in the first
609 ///    operand are not less than or equal to those in the second operand.
610 ///
611 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
612 ///
613 /// \headerfile <x86intrin.h>
614 ///
615 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
616 ///
617 /// \param __a
618 ///    A 128-bit vector of [2 x double].
619 /// \param __b
620 ///    A 128-bit vector of [2 x double].
621 /// \returns A 128-bit vector containing the comparison results.
622 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnle_pd(__m128d __a,__m128d __b)623 _mm_cmpnle_pd(__m128d __a, __m128d __b)
624 {
625   return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
626 }
627 
628 /// Compares each of the corresponding double-precision values of the
629 ///    128-bit vectors of [2 x double] to determine if the values in the first
630 ///    operand are not greater than those in the second operand.
631 ///
632 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
633 ///
634 /// \headerfile <x86intrin.h>
635 ///
636 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
637 ///
638 /// \param __a
639 ///    A 128-bit vector of [2 x double].
640 /// \param __b
641 ///    A 128-bit vector of [2 x double].
642 /// \returns A 128-bit vector containing the comparison results.
643 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpngt_pd(__m128d __a,__m128d __b)644 _mm_cmpngt_pd(__m128d __a, __m128d __b)
645 {
646   return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
647 }
648 
649 /// Compares each of the corresponding double-precision values of the
650 ///    128-bit vectors of [2 x double] to determine if the values in the first
651 ///    operand are not greater than or equal to those in the second operand.
652 ///
653 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
654 ///
655 /// \headerfile <x86intrin.h>
656 ///
657 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
658 ///
659 /// \param __a
660 ///    A 128-bit vector of [2 x double].
661 /// \param __b
662 ///    A 128-bit vector of [2 x double].
663 /// \returns A 128-bit vector containing the comparison results.
664 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnge_pd(__m128d __a,__m128d __b)665 _mm_cmpnge_pd(__m128d __a, __m128d __b)
666 {
667   return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
668 }
669 
670 /// Compares the lower double-precision floating-point values in each of
671 ///    the two 128-bit floating-point vectors of [2 x double] for equality.
672 ///
673 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
674 ///
675 /// \headerfile <x86intrin.h>
676 ///
677 /// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
678 ///
679 /// \param __a
680 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
681 ///    compared to the lower double-precision value of \a __b.
682 /// \param __b
683 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
684 ///    compared to the lower double-precision value of \a __a.
685 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
686 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
687 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpeq_sd(__m128d __a,__m128d __b)688 _mm_cmpeq_sd(__m128d __a, __m128d __b)
689 {
690   return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
691 }
692 
693 /// Compares the lower double-precision floating-point values in each of
694 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
695 ///    the value in the first parameter is less than the corresponding value in
696 ///    the second parameter.
697 ///
698 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
699 ///
700 /// \headerfile <x86intrin.h>
701 ///
702 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
703 ///
704 /// \param __a
705 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
706 ///    compared to the lower double-precision value of \a __b.
707 /// \param __b
708 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
709 ///    compared to the lower double-precision value of \a __a.
710 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
711 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
712 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmplt_sd(__m128d __a,__m128d __b)713 _mm_cmplt_sd(__m128d __a, __m128d __b)
714 {
715   return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
716 }
717 
718 /// Compares the lower double-precision floating-point values in each of
719 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
720 ///    the value in the first parameter is less than or equal to the
721 ///    corresponding value in the second parameter.
722 ///
723 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
724 ///
725 /// \headerfile <x86intrin.h>
726 ///
727 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
728 ///
729 /// \param __a
730 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
731 ///    compared to the lower double-precision value of \a __b.
732 /// \param __b
733 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
734 ///    compared to the lower double-precision value of \a __a.
735 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
736 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
737 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmple_sd(__m128d __a,__m128d __b)738 _mm_cmple_sd(__m128d __a, __m128d __b)
739 {
740   return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
741 }
742 
743 /// Compares the lower double-precision floating-point values in each of
744 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
745 ///    the value in the first parameter is greater than the corresponding value
746 ///    in the second parameter.
747 ///
748 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
749 ///
750 /// \headerfile <x86intrin.h>
751 ///
752 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
753 ///
754 /// \param __a
755 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
756 ///     compared to the lower double-precision value of \a __b.
757 /// \param __b
758 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
759 ///     compared to the lower double-precision value of \a __a.
760 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
761 ///     results. The upper 64 bits are copied from the upper 64 bits of \a __a.
762 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpgt_sd(__m128d __a,__m128d __b)763 _mm_cmpgt_sd(__m128d __a, __m128d __b)
764 {
765   __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
766   return __extension__ (__m128d) { __c[0], __a[1] };
767 }
768 
769 /// Compares the lower double-precision floating-point values in each of
770 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
771 ///    the value in the first parameter is greater than or equal to the
772 ///    corresponding value in the second parameter.
773 ///
774 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
775 ///
776 /// \headerfile <x86intrin.h>
777 ///
778 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
779 ///
780 /// \param __a
781 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
782 ///    compared to the lower double-precision value of \a __b.
783 /// \param __b
784 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
785 ///    compared to the lower double-precision value of \a __a.
786 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
787 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
788 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpge_sd(__m128d __a,__m128d __b)789 _mm_cmpge_sd(__m128d __a, __m128d __b)
790 {
791   __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
792   return __extension__ (__m128d) { __c[0], __a[1] };
793 }
794 
795 /// Compares the lower double-precision floating-point values in each of
796 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
797 ///    the value in the first parameter is "ordered" with respect to the
798 ///    corresponding value in the second parameter.
799 ///
800 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
801 ///    of double-precision values are "ordered" with respect to each other if
802 ///    neither value is a NaN.
803 ///
804 /// \headerfile <x86intrin.h>
805 ///
806 /// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
807 ///
808 /// \param __a
809 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
810 ///    compared to the lower double-precision value of \a __b.
811 /// \param __b
812 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
813 ///    compared to the lower double-precision value of \a __a.
814 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
815 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
816 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpord_sd(__m128d __a,__m128d __b)817 _mm_cmpord_sd(__m128d __a, __m128d __b)
818 {
819   return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
820 }
821 
822 /// Compares the lower double-precision floating-point values in each of
823 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
824 ///    the value in the first parameter is "unordered" with respect to the
825 ///    corresponding value in the second parameter.
826 ///
827 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
828 ///    of double-precision values are "unordered" with respect to each other if
829 ///    one or both values are NaN.
830 ///
831 /// \headerfile <x86intrin.h>
832 ///
833 /// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
834 ///   instruction.
835 ///
836 /// \param __a
837 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
838 ///    compared to the lower double-precision value of \a __b.
839 /// \param __b
840 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
841 ///    compared to the lower double-precision value of \a __a.
842 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
843 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
844 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpunord_sd(__m128d __a,__m128d __b)845 _mm_cmpunord_sd(__m128d __a, __m128d __b)
846 {
847   return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
848 }
849 
850 /// Compares the lower double-precision floating-point values in each of
851 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
852 ///    the value in the first parameter is unequal to the corresponding value in
853 ///    the second parameter.
854 ///
855 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
856 ///
857 /// \headerfile <x86intrin.h>
858 ///
859 /// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
860 ///
861 /// \param __a
862 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
863 ///    compared to the lower double-precision value of \a __b.
864 /// \param __b
865 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
866 ///    compared to the lower double-precision value of \a __a.
867 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
868 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
869 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpneq_sd(__m128d __a,__m128d __b)870 _mm_cmpneq_sd(__m128d __a, __m128d __b)
871 {
872   return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
873 }
874 
875 /// Compares the lower double-precision floating-point values in each of
876 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
877 ///    the value in the first parameter is not less than the corresponding
878 ///    value in the second parameter.
879 ///
880 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
881 ///
882 /// \headerfile <x86intrin.h>
883 ///
884 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
885 ///
886 /// \param __a
887 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
888 ///    compared to the lower double-precision value of \a __b.
889 /// \param __b
890 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
891 ///    compared to the lower double-precision value of \a __a.
892 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
893 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
894 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnlt_sd(__m128d __a,__m128d __b)895 _mm_cmpnlt_sd(__m128d __a, __m128d __b)
896 {
897   return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
898 }
899 
900 /// Compares the lower double-precision floating-point values in each of
901 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
902 ///    the value in the first parameter is not less than or equal to the
903 ///    corresponding value in the second parameter.
904 ///
905 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
906 ///
907 /// \headerfile <x86intrin.h>
908 ///
909 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
910 ///
911 /// \param __a
912 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
913 ///    compared to the lower double-precision value of \a __b.
914 /// \param __b
915 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
916 ///    compared to the lower double-precision value of \a __a.
917 /// \returns  A 128-bit vector. The lower 64 bits contains the comparison
918 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
919 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnle_sd(__m128d __a,__m128d __b)920 _mm_cmpnle_sd(__m128d __a, __m128d __b)
921 {
922   return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
923 }
924 
925 /// Compares the lower double-precision floating-point values in each of
926 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
927 ///    the value in the first parameter is not greater than the corresponding
928 ///    value in the second parameter.
929 ///
930 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
931 ///
932 /// \headerfile <x86intrin.h>
933 ///
934 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
935 ///
936 /// \param __a
937 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
938 ///    compared to the lower double-precision value of \a __b.
939 /// \param __b
940 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
941 ///    compared to the lower double-precision value of \a __a.
942 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
943 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
944 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpngt_sd(__m128d __a,__m128d __b)945 _mm_cmpngt_sd(__m128d __a, __m128d __b)
946 {
947   __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
948   return __extension__ (__m128d) { __c[0], __a[1] };
949 }
950 
951 /// Compares the lower double-precision floating-point values in each of
952 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
953 ///    the value in the first parameter is not greater than or equal to the
954 ///    corresponding value in the second parameter.
955 ///
956 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
957 ///
958 /// \headerfile <x86intrin.h>
959 ///
960 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
961 ///
962 /// \param __a
963 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
964 ///    compared to the lower double-precision value of \a __b.
965 /// \param __b
966 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
967 ///    compared to the lower double-precision value of \a __a.
968 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
969 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
970 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnge_sd(__m128d __a,__m128d __b)971 _mm_cmpnge_sd(__m128d __a, __m128d __b)
972 {
973   __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
974   return __extension__ (__m128d) { __c[0], __a[1] };
975 }
976 
977 /// Compares the lower double-precision floating-point values in each of
978 ///    the two 128-bit floating-point vectors of [2 x double] for equality.
979 ///
980 ///    The comparison yields 0 for false, 1 for true. If either of the two
981 ///    lower double-precision values is NaN, 0 is returned.
982 ///
983 /// \headerfile <x86intrin.h>
984 ///
985 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
986 ///
987 /// \param __a
988 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
989 ///    compared to the lower double-precision value of \a __b.
990 /// \param __b
991 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
992 ///    compared to the lower double-precision value of \a __a.
993 /// \returns An integer containing the comparison results. If either of the two
994 ///    lower double-precision values is NaN, 0 is returned.
995 static __inline__ int __DEFAULT_FN_ATTRS
_mm_comieq_sd(__m128d __a,__m128d __b)996 _mm_comieq_sd(__m128d __a, __m128d __b)
997 {
998   return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
999 }
1000 
1001 /// Compares the lower double-precision floating-point values in each of
1002 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1003 ///    the value in the first parameter is less than the corresponding value in
1004 ///    the second parameter.
1005 ///
1006 ///    The comparison yields 0 for false, 1 for true. If either of the two
1007 ///    lower double-precision values is NaN, 0 is returned.
1008 ///
1009 /// \headerfile <x86intrin.h>
1010 ///
1011 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1012 ///
1013 /// \param __a
1014 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1015 ///    compared to the lower double-precision value of \a __b.
1016 /// \param __b
1017 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1018 ///    compared to the lower double-precision value of \a __a.
1019 /// \returns An integer containing the comparison results. If either of the two
1020 ///     lower double-precision values is NaN, 0 is returned.
1021 static __inline__ int __DEFAULT_FN_ATTRS
_mm_comilt_sd(__m128d __a,__m128d __b)1022 _mm_comilt_sd(__m128d __a, __m128d __b)
1023 {
1024   return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
1025 }
1026 
1027 /// Compares the lower double-precision floating-point values in each of
1028 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1029 ///    the value in the first parameter is less than or equal to the
1030 ///    corresponding value in the second parameter.
1031 ///
1032 ///    The comparison yields 0 for false, 1 for true. If either of the two
1033 ///    lower double-precision values is NaN, 0 is returned.
1034 ///
1035 /// \headerfile <x86intrin.h>
1036 ///
1037 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1038 ///
1039 /// \param __a
1040 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1041 ///    compared to the lower double-precision value of \a __b.
1042 /// \param __b
1043 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
1044 ///     compared to the lower double-precision value of \a __a.
1045 /// \returns An integer containing the comparison results. If either of the two
1046 ///     lower double-precision values is NaN, 0 is returned.
1047 static __inline__ int __DEFAULT_FN_ATTRS
_mm_comile_sd(__m128d __a,__m128d __b)1048 _mm_comile_sd(__m128d __a, __m128d __b)
1049 {
1050   return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
1051 }
1052 
1053 /// Compares the lower double-precision floating-point values in each of
1054 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1055 ///    the value in the first parameter is greater than the corresponding value
1056 ///    in the second parameter.
1057 ///
1058 ///    The comparison yields 0 for false, 1 for true. If either of the two
1059 ///    lower double-precision values is NaN, 0 is returned.
1060 ///
1061 /// \headerfile <x86intrin.h>
1062 ///
1063 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1064 ///
1065 /// \param __a
1066 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1067 ///    compared to the lower double-precision value of \a __b.
1068 /// \param __b
1069 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1070 ///    compared to the lower double-precision value of \a __a.
1071 /// \returns An integer containing the comparison results. If either of the two
1072 ///     lower double-precision values is NaN, 0 is returned.
1073 static __inline__ int __DEFAULT_FN_ATTRS
_mm_comigt_sd(__m128d __a,__m128d __b)1074 _mm_comigt_sd(__m128d __a, __m128d __b)
1075 {
1076   return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
1077 }
1078 
1079 /// Compares the lower double-precision floating-point values in each of
1080 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1081 ///    the value in the first parameter is greater than or equal to the
1082 ///    corresponding value in the second parameter.
1083 ///
1084 ///    The comparison yields 0 for false, 1 for true. If either of the two
1085 ///    lower double-precision values is NaN, 0 is returned.
1086 ///
1087 /// \headerfile <x86intrin.h>
1088 ///
1089 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1090 ///
1091 /// \param __a
1092 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1093 ///    compared to the lower double-precision value of \a __b.
1094 /// \param __b
1095 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1096 ///    compared to the lower double-precision value of \a __a.
1097 /// \returns An integer containing the comparison results. If either of the two
1098 ///    lower double-precision values is NaN, 0 is returned.
1099 static __inline__ int __DEFAULT_FN_ATTRS
_mm_comige_sd(__m128d __a,__m128d __b)1100 _mm_comige_sd(__m128d __a, __m128d __b)
1101 {
1102   return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
1103 }
1104 
1105 /// Compares the lower double-precision floating-point values in each of
1106 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1107 ///    the value in the first parameter is unequal to the corresponding value in
1108 ///    the second parameter.
1109 ///
1110 ///    The comparison yields 0 for false, 1 for true. If either of the two
1111 ///    lower double-precision values is NaN, 1 is returned.
1112 ///
1113 /// \headerfile <x86intrin.h>
1114 ///
1115 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1116 ///
1117 /// \param __a
1118 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1119 ///    compared to the lower double-precision value of \a __b.
1120 /// \param __b
1121 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1122 ///    compared to the lower double-precision value of \a __a.
1123 /// \returns An integer containing the comparison results. If either of the two
1124 ///     lower double-precision values is NaN, 1 is returned.
1125 static __inline__ int __DEFAULT_FN_ATTRS
_mm_comineq_sd(__m128d __a,__m128d __b)1126 _mm_comineq_sd(__m128d __a, __m128d __b)
1127 {
1128   return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
1129 }
1130 
1131 /// Compares the lower double-precision floating-point values in each of
1132 ///    the two 128-bit floating-point vectors of [2 x double] for equality. The
1133 ///    comparison yields 0 for false, 1 for true.
1134 ///
1135 ///    If either of the two lower double-precision values is NaN, 0 is returned.
1136 ///
1137 /// \headerfile <x86intrin.h>
1138 ///
1139 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1140 ///
1141 /// \param __a
1142 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1143 ///    compared to the lower double-precision value of \a __b.
1144 /// \param __b
1145 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1146 ///    compared to the lower double-precision value of \a __a.
1147 /// \returns An integer containing the comparison results. If either of the two
1148 ///    lower double-precision values is NaN, 0 is returned.
1149 static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomieq_sd(__m128d __a,__m128d __b)1150 _mm_ucomieq_sd(__m128d __a, __m128d __b)
1151 {
1152   return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
1153 }
1154 
1155 /// Compares the lower double-precision floating-point values in each of
1156 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1157 ///    the value in the first parameter is less than the corresponding value in
1158 ///    the second parameter.
1159 ///
1160 ///    The comparison yields 0 for false, 1 for true. If either of the two lower
1161 ///    double-precision values is NaN, 0 is returned.
1162 ///
1163 /// \headerfile <x86intrin.h>
1164 ///
1165 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1166 ///
1167 /// \param __a
1168 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1169 ///    compared to the lower double-precision value of \a __b.
1170 /// \param __b
1171 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1172 ///    compared to the lower double-precision value of \a __a.
1173 /// \returns An integer containing the comparison results. If either of the two
1174 ///    lower double-precision values is NaN, 0 is returned.
1175 static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomilt_sd(__m128d __a,__m128d __b)1176 _mm_ucomilt_sd(__m128d __a, __m128d __b)
1177 {
1178   return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
1179 }
1180 
1181 /// Compares the lower double-precision floating-point values in each of
1182 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1183 ///    the value in the first parameter is less than or equal to the
1184 ///    corresponding value in the second parameter.
1185 ///
1186 ///    The comparison yields 0 for false, 1 for true. If either of the two lower
1187 ///    double-precision values is NaN, 0 is returned.
1188 ///
1189 /// \headerfile <x86intrin.h>
1190 ///
1191 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1192 ///
1193 /// \param __a
1194 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1195 ///    compared to the lower double-precision value of \a __b.
1196 /// \param __b
1197 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
1198 ///     compared to the lower double-precision value of \a __a.
1199 /// \returns An integer containing the comparison results. If either of the two
1200 ///     lower double-precision values is NaN, 0 is returned.
1201 static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomile_sd(__m128d __a,__m128d __b)1202 _mm_ucomile_sd(__m128d __a, __m128d __b)
1203 {
1204   return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
1205 }
1206 
1207 /// Compares the lower double-precision floating-point values in each of
1208 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1209 ///    the value in the first parameter is greater than the corresponding value
1210 ///    in the second parameter.
1211 ///
1212 ///    The comparison yields 0 for false, 1 for true. If either of the two lower
1213 ///    double-precision values is NaN, 0 is returned.
1214 ///
1215 /// \headerfile <x86intrin.h>
1216 ///
1217 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1218 ///
1219 /// \param __a
1220 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1221 ///    compared to the lower double-precision value of \a __b.
1222 /// \param __b
1223 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
1224 ///     compared to the lower double-precision value of \a __a.
1225 /// \returns An integer containing the comparison results. If either of the two
1226 ///     lower double-precision values is NaN, 0 is returned.
1227 static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomigt_sd(__m128d __a,__m128d __b)1228 _mm_ucomigt_sd(__m128d __a, __m128d __b)
1229 {
1230   return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
1231 }
1232 
1233 /// Compares the lower double-precision floating-point values in each of
1234 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1235 ///    the value in the first parameter is greater than or equal to the
1236 ///    corresponding value in the second parameter.
1237 ///
1238 ///    The comparison yields 0 for false, 1 for true.  If either of the two
1239 ///    lower double-precision values is NaN, 0 is returned.
1240 ///
1241 /// \headerfile <x86intrin.h>
1242 ///
1243 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1244 ///
1245 /// \param __a
1246 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1247 ///    compared to the lower double-precision value of \a __b.
1248 /// \param __b
1249 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1250 ///    compared to the lower double-precision value of \a __a.
1251 /// \returns An integer containing the comparison results. If either of the two
1252 ///    lower double-precision values is NaN, 0 is returned.
1253 static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomige_sd(__m128d __a,__m128d __b)1254 _mm_ucomige_sd(__m128d __a, __m128d __b)
1255 {
1256   return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
1257 }
1258 
1259 /// Compares the lower double-precision floating-point values in each of
1260 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1261 ///    the value in the first parameter is unequal to the corresponding value in
1262 ///    the second parameter.
1263 ///
1264 ///    The comparison yields 0 for false, 1 for true. If either of the two lower
1265 ///    double-precision values is NaN, 1 is returned.
1266 ///
1267 /// \headerfile <x86intrin.h>
1268 ///
1269 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1270 ///
1271 /// \param __a
1272 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1273 ///    compared to the lower double-precision value of \a __b.
1274 /// \param __b
1275 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1276 ///    compared to the lower double-precision value of \a __a.
1277 /// \returns An integer containing the comparison result. If either of the two
1278 ///    lower double-precision values is NaN, 1 is returned.
1279 static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomineq_sd(__m128d __a,__m128d __b)1280 _mm_ucomineq_sd(__m128d __a, __m128d __b)
1281 {
1282   return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
1283 }
1284 
1285 /// Converts the two double-precision floating-point elements of a
1286 ///    128-bit vector of [2 x double] into two single-precision floating-point
1287 ///    values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
1288 ///    The upper 64 bits of the result vector are set to zero.
1289 ///
1290 /// \headerfile <x86intrin.h>
1291 ///
1292 /// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
1293 ///
1294 /// \param __a
1295 ///    A 128-bit vector of [2 x double].
1296 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1297 ///    converted values. The upper 64 bits are set to zero.
1298 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtpd_ps(__m128d __a)1299 _mm_cvtpd_ps(__m128d __a)
1300 {
1301   return __builtin_ia32_cvtpd2ps((__v2df)__a);
1302 }
1303 
1304 /// Converts the lower two single-precision floating-point elements of a
1305 ///    128-bit vector of [4 x float] into two double-precision floating-point
1306 ///    values, returned in a 128-bit vector of [2 x double]. The upper two
1307 ///    elements of the input vector are unused.
1308 ///
1309 /// \headerfile <x86intrin.h>
1310 ///
1311 /// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
1312 ///
1313 /// \param __a
1314 ///    A 128-bit vector of [4 x float]. The lower two single-precision
1315 ///    floating-point elements are converted to double-precision values. The
1316 ///    upper two elements are unused.
1317 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1318 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtps_pd(__m128 __a)1319 _mm_cvtps_pd(__m128 __a)
1320 {
1321   return (__m128d) __builtin_convertvector(
1322       __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
1323 }
1324 
1325 /// Converts the lower two integer elements of a 128-bit vector of
1326 ///    [4 x i32] into two double-precision floating-point values, returned in a
1327 ///    128-bit vector of [2 x double].
1328 ///
1329 ///    The upper two elements of the input vector are unused.
1330 ///
1331 /// \headerfile <x86intrin.h>
1332 ///
1333 /// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
1334 ///
1335 /// \param __a
1336 ///    A 128-bit integer vector of [4 x i32]. The lower two integer elements are
1337 ///    converted to double-precision values.
1338 ///
1339 ///    The upper two elements are unused.
1340 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1341 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtepi32_pd(__m128i __a)1342 _mm_cvtepi32_pd(__m128i __a)
1343 {
1344   return (__m128d) __builtin_convertvector(
1345       __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
1346 }
1347 
1348 /// Converts the two double-precision floating-point elements of a
1349 ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1350 ///    returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
1351 ///    64 bits of the result vector are set to zero.
1352 ///
1353 /// \headerfile <x86intrin.h>
1354 ///
1355 /// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
1356 ///
1357 /// \param __a
1358 ///    A 128-bit vector of [2 x double].
1359 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1360 ///    converted values. The upper 64 bits are set to zero.
1361 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtpd_epi32(__m128d __a)1362 _mm_cvtpd_epi32(__m128d __a)
1363 {
1364   return __builtin_ia32_cvtpd2dq((__v2df)__a);
1365 }
1366 
1367 /// Converts the low-order element of a 128-bit vector of [2 x double]
1368 ///    into a 32-bit signed integer value.
1369 ///
1370 /// \headerfile <x86intrin.h>
1371 ///
1372 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
1373 ///
1374 /// \param __a
1375 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1376 ///    conversion.
1377 /// \returns A 32-bit signed integer containing the converted value.
1378 static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvtsd_si32(__m128d __a)1379 _mm_cvtsd_si32(__m128d __a)
1380 {
1381   return __builtin_ia32_cvtsd2si((__v2df)__a);
1382 }
1383 
1384 /// Converts the lower double-precision floating-point element of a
1385 ///    128-bit vector of [2 x double], in the second parameter, into a
1386 ///    single-precision floating-point value, returned in the lower 32 bits of a
1387 ///    128-bit vector of [4 x float]. The upper 96 bits of the result vector are
1388 ///    copied from the upper 96 bits of the first parameter.
1389 ///
1390 /// \headerfile <x86intrin.h>
1391 ///
1392 /// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
1393 ///
1394 /// \param __a
1395 ///    A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
1396 ///    copied to the upper 96 bits of the result.
1397 /// \param __b
1398 ///    A 128-bit vector of [2 x double]. The lower double-precision
1399 ///    floating-point element is used in the conversion.
1400 /// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
1401 ///    converted value from the second parameter. The upper 96 bits are copied
1402 ///    from the upper 96 bits of the first parameter.
1403 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtsd_ss(__m128 __a,__m128d __b)1404 _mm_cvtsd_ss(__m128 __a, __m128d __b)
1405 {
1406   return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
1407 }
1408 
1409 /// Converts a 32-bit signed integer value, in the second parameter, into
1410 ///    a double-precision floating-point value, returned in the lower 64 bits of
1411 ///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1412 ///    are copied from the upper 64 bits of the first parameter.
1413 ///
1414 /// \headerfile <x86intrin.h>
1415 ///
1416 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
1417 ///
1418 /// \param __a
1419 ///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1420 ///    copied to the upper 64 bits of the result.
1421 /// \param __b
1422 ///    A 32-bit signed integer containing the value to be converted.
1423 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1424 ///    converted value from the second parameter. The upper 64 bits are copied
1425 ///    from the upper 64 bits of the first parameter.
1426 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtsi32_sd(__m128d __a,int __b)1427 _mm_cvtsi32_sd(__m128d __a, int __b)
1428 {
1429   __a[0] = __b;
1430   return __a;
1431 }
1432 
1433 /// Converts the lower single-precision floating-point element of a
1434 ///    128-bit vector of [4 x float], in the second parameter, into a
1435 ///    double-precision floating-point value, returned in the lower 64 bits of
1436 ///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1437 ///    are copied from the upper 64 bits of the first parameter.
1438 ///
1439 /// \headerfile <x86intrin.h>
1440 ///
1441 /// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
1442 ///
1443 /// \param __a
1444 ///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1445 ///    copied to the upper 64 bits of the result.
1446 /// \param __b
1447 ///    A 128-bit vector of [4 x float]. The lower single-precision
1448 ///    floating-point element is used in the conversion.
1449 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1450 ///    converted value from the second parameter. The upper 64 bits are copied
1451 ///    from the upper 64 bits of the first parameter.
1452 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtss_sd(__m128d __a,__m128 __b)1453 _mm_cvtss_sd(__m128d __a, __m128 __b)
1454 {
1455   __a[0] = __b[0];
1456   return __a;
1457 }
1458 
1459 /// Converts the two double-precision floating-point elements of a
1460 ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1461 ///    returned in the lower 64 bits of a 128-bit vector of [4 x i32].
1462 ///
1463 ///    If the result of either conversion is inexact, the result is truncated
1464 ///    (rounded towards zero) regardless of the current MXCSR setting. The upper
1465 ///    64 bits of the result vector are set to zero.
1466 ///
1467 /// \headerfile <x86intrin.h>
1468 ///
1469 /// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
1470 ///   instruction.
1471 ///
1472 /// \param __a
1473 ///    A 128-bit vector of [2 x double].
1474 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1475 ///    converted values. The upper 64 bits are set to zero.
1476 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvttpd_epi32(__m128d __a)1477 _mm_cvttpd_epi32(__m128d __a)
1478 {
1479   return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
1480 }
1481 
1482 /// Converts the low-order element of a [2 x double] vector into a 32-bit
1483 ///    signed integer value, truncating the result when it is inexact.
1484 ///
1485 /// \headerfile <x86intrin.h>
1486 ///
1487 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
1488 ///   instruction.
1489 ///
1490 /// \param __a
1491 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1492 ///    conversion.
1493 /// \returns A 32-bit signed integer containing the converted value.
1494 static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvttsd_si32(__m128d __a)1495 _mm_cvttsd_si32(__m128d __a)
1496 {
1497   return __builtin_ia32_cvttsd2si((__v2df)__a);
1498 }
1499 
1500 /// Converts the two double-precision floating-point elements of a
1501 ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1502 ///    returned in a 64-bit vector of [2 x i32].
1503 ///
1504 /// \headerfile <x86intrin.h>
1505 ///
1506 /// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
1507 ///
1508 /// \param __a
1509 ///    A 128-bit vector of [2 x double].
1510 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
1511 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_cvtpd_pi32(__m128d __a)1512 _mm_cvtpd_pi32(__m128d __a)
1513 {
1514   return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
1515 }
1516 
1517 /// Converts the two double-precision floating-point elements of a
1518 ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1519 ///    returned in a 64-bit vector of [2 x i32].
1520 ///
1521 ///    If the result of either conversion is inexact, the result is truncated
1522 ///    (rounded towards zero) regardless of the current MXCSR setting.
1523 ///
1524 /// \headerfile <x86intrin.h>
1525 ///
1526 /// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
1527 ///
1528 /// \param __a
1529 ///    A 128-bit vector of [2 x double].
1530 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
1531 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_cvttpd_pi32(__m128d __a)1532 _mm_cvttpd_pi32(__m128d __a)
1533 {
1534   return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
1535 }
1536 
1537 /// Converts the two signed 32-bit integer elements of a 64-bit vector of
1538 ///    [2 x i32] into two double-precision floating-point values, returned in a
1539 ///    128-bit vector of [2 x double].
1540 ///
1541 /// \headerfile <x86intrin.h>
1542 ///
1543 /// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
1544 ///
1545 /// \param __a
1546 ///    A 64-bit vector of [2 x i32].
1547 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1548 static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX
_mm_cvtpi32_pd(__m64 __a)1549 _mm_cvtpi32_pd(__m64 __a)
1550 {
1551   return __builtin_ia32_cvtpi2pd((__v2si)__a);
1552 }
1553 
1554 /// Returns the low-order element of a 128-bit vector of [2 x double] as
1555 ///    a double-precision floating-point value.
1556 ///
1557 /// \headerfile <x86intrin.h>
1558 ///
1559 /// This intrinsic has no corresponding instruction.
1560 ///
1561 /// \param __a
1562 ///    A 128-bit vector of [2 x double]. The lower 64 bits are returned.
1563 /// \returns A double-precision floating-point value copied from the lower 64
1564 ///    bits of \a __a.
1565 static __inline__ double __DEFAULT_FN_ATTRS
_mm_cvtsd_f64(__m128d __a)1566 _mm_cvtsd_f64(__m128d __a)
1567 {
1568   return __a[0];
1569 }
1570 
1571 /// Loads a 128-bit floating-point vector of [2 x double] from an aligned
1572 ///    memory location.
1573 ///
1574 /// \headerfile <x86intrin.h>
1575 ///
1576 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1577 ///
1578 /// \param __dp
1579 ///    A pointer to a 128-bit memory location. The address of the memory
1580 ///    location has to be 16-byte aligned.
1581 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
1582 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_load_pd(double const * __dp)1583 _mm_load_pd(double const *__dp)
1584 {
1585   return *(const __m128d*)__dp;
1586 }
1587 
1588 /// Loads a double-precision floating-point value from a specified memory
1589 ///    location and duplicates it to both vector elements of a 128-bit vector of
1590 ///    [2 x double].
1591 ///
1592 /// \headerfile <x86intrin.h>
1593 ///
1594 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
1595 ///
1596 /// \param __dp
1597 ///    A pointer to a memory location containing a double-precision value.
1598 /// \returns A 128-bit vector of [2 x double] containing the loaded and
1599 ///    duplicated values.
1600 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_load1_pd(double const * __dp)1601 _mm_load1_pd(double const *__dp)
1602 {
1603   struct __mm_load1_pd_struct {
1604     double __u;
1605   } __attribute__((__packed__, __may_alias__));
1606   double __u = ((const struct __mm_load1_pd_struct*)__dp)->__u;
1607   return __extension__ (__m128d){ __u, __u };
1608 }
1609 
/* Alternate spelling of _mm_load1_pd. */
#define _mm_load_pd1(dp) _mm_load1_pd(dp)
1611 
1612 /// Loads two double-precision values, in reverse order, from an aligned
1613 ///    memory location into a 128-bit vector of [2 x double].
1614 ///
1615 /// \headerfile <x86intrin.h>
1616 ///
1617 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
1618 /// needed shuffling instructions. In AVX mode, the shuffling may be combined
1619 /// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
1620 ///
1621 /// \param __dp
1622 ///    A 16-byte aligned pointer to an array of double-precision values to be
1623 ///    loaded in reverse order.
1624 /// \returns A 128-bit vector of [2 x double] containing the reversed loaded
1625 ///    values.
1626 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadr_pd(double const * __dp)1627 _mm_loadr_pd(double const *__dp)
1628 {
1629   __m128d __u = *(const __m128d*)__dp;
1630   return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
1631 }
1632 
1633 /// Loads a 128-bit floating-point vector of [2 x double] from an
1634 ///    unaligned memory location.
1635 ///
1636 /// \headerfile <x86intrin.h>
1637 ///
1638 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1639 ///
1640 /// \param __dp
1641 ///    A pointer to a 128-bit memory location. The address of the memory
1642 ///    location does not have to be aligned.
1643 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
1644 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadu_pd(double const * __dp)1645 _mm_loadu_pd(double const *__dp)
1646 {
1647   struct __loadu_pd {
1648     __m128d_u __v;
1649   } __attribute__((__packed__, __may_alias__));
1650   return ((const struct __loadu_pd*)__dp)->__v;
1651 }
1652 
1653 /// Loads a 64-bit integer value to the low element of a 128-bit integer
1654 ///    vector and clears the upper element.
1655 ///
1656 /// \headerfile <x86intrin.h>
1657 ///
1658 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1659 ///
1660 /// \param __a
1661 ///    A pointer to a 64-bit memory location. The address of the memory
1662 ///    location does not have to be aligned.
1663 /// \returns A 128-bit vector of [2 x i64] containing the loaded value.
1664 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadu_si64(void const * __a)1665 _mm_loadu_si64(void const *__a)
1666 {
1667   struct __loadu_si64 {
1668     long long __v;
1669   } __attribute__((__packed__, __may_alias__));
1670   long long __u = ((const struct __loadu_si64*)__a)->__v;
1671   return __extension__ (__m128i)(__v2di){__u, 0LL};
1672 }
1673 
/// Loads a 32-bit integer value to the low element of a 128-bit integer
///    vector and clears the three upper elements.
1676 ///
1677 /// \headerfile <x86intrin.h>
1678 ///
1679 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
1680 ///
1681 /// \param __a
1682 ///    A pointer to a 32-bit memory location. The address of the memory
1683 ///    location does not have to be aligned.
1684 /// \returns A 128-bit vector of [4 x i32] containing the loaded value.
1685 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadu_si32(void const * __a)1686 _mm_loadu_si32(void const *__a)
1687 {
1688   struct __loadu_si32 {
1689     int __v;
1690   } __attribute__((__packed__, __may_alias__));
1691   int __u = ((const struct __loadu_si32*)__a)->__v;
1692   return __extension__ (__m128i)(__v4si){__u, 0, 0, 0};
1693 }
1694 
/// Loads a 16-bit integer value to the low element of a 128-bit integer
///    vector and clears the seven upper elements.
1697 ///
1698 /// \headerfile <x86intrin.h>
1699 ///
1700 /// This intrinsic does not correspond to a specific instruction.
1701 ///
1702 /// \param __a
1703 ///    A pointer to a 16-bit memory location. The address of the memory
1704 ///    location does not have to be aligned.
1705 /// \returns A 128-bit vector of [8 x i16] containing the loaded value.
1706 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadu_si16(void const * __a)1707 _mm_loadu_si16(void const *__a)
1708 {
1709   struct __loadu_si16 {
1710     short __v;
1711   } __attribute__((__packed__, __may_alias__));
1712   short __u = ((const struct __loadu_si16*)__a)->__v;
1713   return __extension__ (__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
1714 }
1715 
/// Loads a 64-bit double-precision value to the low element of a
///    128-bit vector of [2 x double] and clears the upper element.
1718 ///
1719 /// \headerfile <x86intrin.h>
1720 ///
1721 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1722 ///
1723 /// \param __dp
1724 ///    A pointer to a memory location containing a double-precision value.
1725 ///    The address of the memory location does not have to be aligned.
1726 /// \returns A 128-bit vector of [2 x double] containing the loaded value.
1727 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_load_sd(double const * __dp)1728 _mm_load_sd(double const *__dp)
1729 {
1730   struct __mm_load_sd_struct {
1731     double __u;
1732   } __attribute__((__packed__, __may_alias__));
1733   double __u = ((const struct __mm_load_sd_struct*)__dp)->__u;
1734   return __extension__ (__m128d){ __u, 0 };
1735 }
1736 
1737 /// Loads a double-precision value into the high-order bits of a 128-bit
1738 ///    vector of [2 x double]. The low-order bits are copied from the low-order
1739 ///    bits of the first operand.
1740 ///
1741 /// \headerfile <x86intrin.h>
1742 ///
1743 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1744 ///
1745 /// \param __a
1746 ///    A 128-bit vector of [2 x double]. \n
1747 ///    Bits [63:0] are written to bits [63:0] of the result.
1748 /// \param __dp
1749 ///    A pointer to a 64-bit memory location containing a double-precision
1750 ///    floating-point value that is loaded. The loaded value is written to bits
1751 ///    [127:64] of the result. The address of the memory location does not have
1752 ///    to be aligned.
1753 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1754 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadh_pd(__m128d __a,double const * __dp)1755 _mm_loadh_pd(__m128d __a, double const *__dp)
1756 {
1757   struct __mm_loadh_pd_struct {
1758     double __u;
1759   } __attribute__((__packed__, __may_alias__));
1760   double __u = ((const struct __mm_loadh_pd_struct*)__dp)->__u;
1761   return __extension__ (__m128d){ __a[0], __u };
1762 }
1763 
1764 /// Loads a double-precision value into the low-order bits of a 128-bit
1765 ///    vector of [2 x double]. The high-order bits are copied from the
1766 ///    high-order bits of the first operand.
1767 ///
1768 /// \headerfile <x86intrin.h>
1769 ///
1770 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1771 ///
1772 /// \param __a
1773 ///    A 128-bit vector of [2 x double]. \n
1774 ///    Bits [127:64] are written to bits [127:64] of the result.
1775 /// \param __dp
1776 ///    A pointer to a 64-bit memory location containing a double-precision
1777 ///    floating-point value that is loaded. The loaded value is written to bits
1778 ///    [63:0] of the result. The address of the memory location does not have to
1779 ///    be aligned.
1780 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1781 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadl_pd(__m128d __a,double const * __dp)1782 _mm_loadl_pd(__m128d __a, double const *__dp)
1783 {
1784   struct __mm_loadl_pd_struct {
1785     double __u;
1786   } __attribute__((__packed__, __may_alias__));
1787   double __u = ((const struct __mm_loadl_pd_struct*)__dp)->__u;
1788   return __extension__ (__m128d){ __u, __a[1] };
1789 }
1790 
1791 /// Constructs a 128-bit floating-point vector of [2 x double] with
1792 ///    unspecified content. This could be used as an argument to another
1793 ///    intrinsic function where the argument is required but the value is not
1794 ///    actually used.
1795 ///
1796 /// \headerfile <x86intrin.h>
1797 ///
1798 /// This intrinsic has no corresponding instruction.
1799 ///
1800 /// \returns A 128-bit floating-point vector of [2 x double] with unspecified
1801 ///    content.
1802 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_undefined_pd(void)1803 _mm_undefined_pd(void)
1804 {
1805   return (__m128d)__builtin_ia32_undef128();
1806 }
1807 
1808 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1809 ///    64 bits of the vector are initialized with the specified double-precision
1810 ///    floating-point value. The upper 64 bits are set to zero.
1811 ///
1812 /// \headerfile <x86intrin.h>
1813 ///
1814 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1815 ///
1816 /// \param __w
1817 ///    A double-precision floating-point value used to initialize the lower 64
1818 ///    bits of the result.
1819 /// \returns An initialized 128-bit floating-point vector of [2 x double]. The
1820 ///    lower 64 bits contain the value of the parameter. The upper 64 bits are
1821 ///    set to zero.
1822 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set_sd(double __w)1823 _mm_set_sd(double __w)
1824 {
1825   return __extension__ (__m128d){ __w, 0 };
1826 }
1827 
1828 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1829 ///    of the two double-precision floating-point vector elements set to the
1830 ///    specified double-precision floating-point value.
1831 ///
1832 /// \headerfile <x86intrin.h>
1833 ///
1834 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1835 ///
1836 /// \param __w
1837 ///    A double-precision floating-point value used to initialize each vector
1838 ///    element of the result.
1839 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1840 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set1_pd(double __w)1841 _mm_set1_pd(double __w)
1842 {
1843   return __extension__ (__m128d){ __w, __w };
1844 }
1845 
1846 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1847 ///    of the two double-precision floating-point vector elements set to the
1848 ///    specified double-precision floating-point value.
1849 ///
1850 /// \headerfile <x86intrin.h>
1851 ///
1852 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1853 ///
1854 /// \param __w
1855 ///    A double-precision floating-point value used to initialize each vector
1856 ///    element of the result.
1857 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1858 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set_pd1(double __w)1859 _mm_set_pd1(double __w)
1860 {
1861   return _mm_set1_pd(__w);
1862 }
1863 
1864 /// Constructs a 128-bit floating-point vector of [2 x double]
1865 ///    initialized with the specified double-precision floating-point values.
1866 ///
1867 /// \headerfile <x86intrin.h>
1868 ///
1869 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1870 ///
1871 /// \param __w
1872 ///    A double-precision floating-point value used to initialize the upper 64
1873 ///    bits of the result.
1874 /// \param __x
1875 ///    A double-precision floating-point value used to initialize the lower 64
1876 ///    bits of the result.
1877 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1878 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set_pd(double __w,double __x)1879 _mm_set_pd(double __w, double __x)
1880 {
1881   return __extension__ (__m128d){ __x, __w };
1882 }
1883 
1884 /// Constructs a 128-bit floating-point vector of [2 x double],
1885 ///    initialized in reverse order with the specified double-precision
1886 ///    floating-point values.
1887 ///
1888 /// \headerfile <x86intrin.h>
1889 ///
1890 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1891 ///
1892 /// \param __w
1893 ///    A double-precision floating-point value used to initialize the lower 64
1894 ///    bits of the result.
1895 /// \param __x
1896 ///    A double-precision floating-point value used to initialize the upper 64
1897 ///    bits of the result.
1898 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1899 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_setr_pd(double __w,double __x)1900 _mm_setr_pd(double __w, double __x)
1901 {
1902   return __extension__ (__m128d){ __w, __x };
1903 }
1904 
1905 /// Constructs a 128-bit floating-point vector of [2 x double]
1906 ///    initialized to zero.
1907 ///
1908 /// \headerfile <x86intrin.h>
1909 ///
1910 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1911 ///
1912 /// \returns An initialized 128-bit floating-point vector of [2 x double] with
1913 ///    all elements set to zero.
1914 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_setzero_pd(void)1915 _mm_setzero_pd(void)
1916 {
1917   return __extension__ (__m128d){ 0, 0 };
1918 }
1919 
1920 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1921 ///    64 bits are set to the lower 64 bits of the second parameter. The upper
1922 ///    64 bits are set to the upper 64 bits of the first parameter.
1923 ///
1924 /// \headerfile <x86intrin.h>
1925 ///
1926 /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
1927 ///
1928 /// \param __a
1929 ///    A 128-bit vector of [2 x double]. The upper 64 bits are written to the
1930 ///    upper 64 bits of the result.
1931 /// \param __b
1932 ///    A 128-bit vector of [2 x double]. The lower 64 bits are written to the
1933 ///    lower 64 bits of the result.
1934 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1935 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_move_sd(__m128d __a,__m128d __b)1936 _mm_move_sd(__m128d __a, __m128d __b)
1937 {
1938   __a[0] = __b[0];
1939   return __a;
1940 }
1941 
1942 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1943 ///    memory location.
1944 ///
1945 /// \headerfile <x86intrin.h>
1946 ///
1947 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1948 ///
1949 /// \param __dp
1950 ///    A pointer to a 64-bit memory location.
1951 /// \param __a
1952 ///    A 128-bit vector of [2 x double] containing the value to be stored.
1953 static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_sd(double * __dp,__m128d __a)1954 _mm_store_sd(double *__dp, __m128d __a)
1955 {
1956   struct __mm_store_sd_struct {
1957     double __u;
1958   } __attribute__((__packed__, __may_alias__));
1959   ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
1960 }
1961 
1962 /// Moves packed double-precision values from a 128-bit vector of
1963 ///    [2 x double] to a memory location.
1964 ///
1965 /// \headerfile <x86intrin.h>
1966 ///
1967 /// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
1968 ///
1969 /// \param __dp
1970 ///    A pointer to an aligned memory location that can store two
1971 ///    double-precision values.
1972 /// \param __a
1973 ///    A packed 128-bit vector of [2 x double] containing the values to be
1974 ///    moved.
1975 static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_pd(double * __dp,__m128d __a)1976 _mm_store_pd(double *__dp, __m128d __a)
1977 {
1978   *(__m128d*)__dp = __a;
1979 }
1980 
1981 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1982 ///    the upper and lower 64 bits of a memory location.
1983 ///
1984 /// \headerfile <x86intrin.h>
1985 ///
1986 /// This intrinsic corresponds to the
1987 ///   <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1988 ///
1989 /// \param __dp
///    A pointer to a 16-byte aligned memory location that can store two
///    double-precision values.
1992 /// \param __a
1993 ///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1994 ///    of the values in \a __dp.
1995 static __inline__ void __DEFAULT_FN_ATTRS
_mm_store1_pd(double * __dp,__m128d __a)1996 _mm_store1_pd(double *__dp, __m128d __a)
1997 {
1998   __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
1999   _mm_store_pd(__dp, __a);
2000 }
2001 
2002 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
2003 ///    the upper and lower 64 bits of a memory location.
2004 ///
2005 /// \headerfile <x86intrin.h>
2006 ///
2007 /// This intrinsic corresponds to the
2008 ///   <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
2009 ///
2010 /// \param __dp
///    A pointer to a 16-byte aligned memory location that can store two
///    double-precision values.
2013 /// \param __a
2014 ///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
2015 ///    of the values in \a __dp.
2016 static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_pd1(double * __dp,__m128d __a)2017 _mm_store_pd1(double *__dp, __m128d __a)
2018 {
2019   _mm_store1_pd(__dp, __a);
2020 }
2021 
2022 /// Stores a 128-bit vector of [2 x double] into an unaligned memory
2023 ///    location.
2024 ///
2025 /// \headerfile <x86intrin.h>
2026 ///
2027 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
2028 ///
2029 /// \param __dp
2030 ///    A pointer to a 128-bit memory location. The address of the memory
2031 ///    location does not have to be aligned.
2032 /// \param __a
2033 ///    A 128-bit vector of [2 x double] containing the values to be stored.
2034 static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_pd(double * __dp,__m128d __a)2035 _mm_storeu_pd(double *__dp, __m128d __a)
2036 {
2037   struct __storeu_pd {
2038     __m128d_u __v;
2039   } __attribute__((__packed__, __may_alias__));
2040   ((struct __storeu_pd*)__dp)->__v = __a;
2041 }
2042 
2043 /// Stores two double-precision values, in reverse order, from a 128-bit
2044 ///    vector of [2 x double] to a 16-byte aligned memory location.
2045 ///
2046 /// \headerfile <x86intrin.h>
2047 ///
2048 /// This intrinsic corresponds to a shuffling instruction followed by a
2049 /// <c> VMOVAPD / MOVAPD </c> instruction.
2050 ///
2051 /// \param __dp
2052 ///    A pointer to a 16-byte aligned memory location that can store two
2053 ///    double-precision values.
2054 /// \param __a
2055 ///    A 128-bit vector of [2 x double] containing the values to be reversed and
2056 ///    stored.
2057 static __inline__ void __DEFAULT_FN_ATTRS
_mm_storer_pd(double * __dp,__m128d __a)2058 _mm_storer_pd(double *__dp, __m128d __a)
2059 {
2060   __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
2061   *(__m128d *)__dp = __a;
2062 }
2063 
2064 /// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
2065 ///    memory location.
2066 ///
2067 /// \headerfile <x86intrin.h>
2068 ///
2069 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
2070 ///
2071 /// \param __dp
2072 ///    A pointer to a 64-bit memory location.
2073 /// \param __a
2074 ///    A 128-bit vector of [2 x double] containing the value to be stored.
2075 static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeh_pd(double * __dp,__m128d __a)2076 _mm_storeh_pd(double *__dp, __m128d __a)
2077 {
2078   struct __mm_storeh_pd_struct {
2079     double __u;
2080   } __attribute__((__packed__, __may_alias__));
2081   ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
2082 }
2083 
2084 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
2085 ///    memory location.
2086 ///
2087 /// \headerfile <x86intrin.h>
2088 ///
2089 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
2090 ///
2091 /// \param __dp
2092 ///    A pointer to a 64-bit memory location.
2093 /// \param __a
2094 ///    A 128-bit vector of [2 x double] containing the value to be stored.
2095 static __inline__ void __DEFAULT_FN_ATTRS
_mm_storel_pd(double * __dp,__m128d __a)2096 _mm_storel_pd(double *__dp, __m128d __a)
2097 {
2098   struct __mm_storeh_pd_struct {
2099     double __u;
2100   } __attribute__((__packed__, __may_alias__));
2101   ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
2102 }
2103 
2104 /// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
2105 ///    saving the lower 8 bits of each sum in the corresponding element of a
2106 ///    128-bit result vector of [16 x i8].
2107 ///
2108 ///    The integer elements of both parameters can be either signed or unsigned.
2109 ///
2110 /// \headerfile <x86intrin.h>
2111 ///
2112 /// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
2113 ///
2114 /// \param __a
2115 ///    A 128-bit vector of [16 x i8].
2116 /// \param __b
2117 ///    A 128-bit vector of [16 x i8].
2118 /// \returns A 128-bit vector of [16 x i8] containing the sums of both
2119 ///    parameters.
2120 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi8(__m128i __a,__m128i __b)2121 _mm_add_epi8(__m128i __a, __m128i __b)
2122 {
2123   return (__m128i)((__v16qu)__a + (__v16qu)__b);
2124 }
2125 
2126 /// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
2127 ///    saving the lower 16 bits of each sum in the corresponding element of a
2128 ///    128-bit result vector of [8 x i16].
2129 ///
2130 ///    The integer elements of both parameters can be either signed or unsigned.
2131 ///
2132 /// \headerfile <x86intrin.h>
2133 ///
2134 /// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
2135 ///
2136 /// \param __a
2137 ///    A 128-bit vector of [8 x i16].
2138 /// \param __b
2139 ///    A 128-bit vector of [8 x i16].
2140 /// \returns A 128-bit vector of [8 x i16] containing the sums of both
2141 ///    parameters.
2142 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi16(__m128i __a,__m128i __b)2143 _mm_add_epi16(__m128i __a, __m128i __b)
2144 {
2145   return (__m128i)((__v8hu)__a + (__v8hu)__b);
2146 }
2147 
2148 /// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
2149 ///    saving the lower 32 bits of each sum in the corresponding element of a
2150 ///    128-bit result vector of [4 x i32].
2151 ///
2152 ///    The integer elements of both parameters can be either signed or unsigned.
2153 ///
2154 /// \headerfile <x86intrin.h>
2155 ///
2156 /// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
2157 ///
2158 /// \param __a
2159 ///    A 128-bit vector of [4 x i32].
2160 /// \param __b
2161 ///    A 128-bit vector of [4 x i32].
2162 /// \returns A 128-bit vector of [4 x i32] containing the sums of both
2163 ///    parameters.
2164 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi32(__m128i __a,__m128i __b)2165 _mm_add_epi32(__m128i __a, __m128i __b)
2166 {
2167   return (__m128i)((__v4su)__a + (__v4su)__b);
2168 }
2169 
2170 /// Adds two signed or unsigned 64-bit integer values, returning the
2171 ///    lower 64 bits of the sum.
2172 ///
2173 /// \headerfile <x86intrin.h>
2174 ///
2175 /// This intrinsic corresponds to the <c> PADDQ </c> instruction.
2176 ///
2177 /// \param __a
2178 ///    A 64-bit integer.
2179 /// \param __b
2180 ///    A 64-bit integer.
2181 /// \returns A 64-bit integer containing the sum of both parameters.
2182 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_add_si64(__m64 __a,__m64 __b)2183 _mm_add_si64(__m64 __a, __m64 __b)
2184 {
2185   return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
2186 }
2187 
2188 /// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
2189 ///    saving the lower 64 bits of each sum in the corresponding element of a
2190 ///    128-bit result vector of [2 x i64].
2191 ///
2192 ///    The integer elements of both parameters can be either signed or unsigned.
2193 ///
2194 /// \headerfile <x86intrin.h>
2195 ///
2196 /// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
2197 ///
2198 /// \param __a
2199 ///    A 128-bit vector of [2 x i64].
2200 /// \param __b
2201 ///    A 128-bit vector of [2 x i64].
2202 /// \returns A 128-bit vector of [2 x i64] containing the sums of both
2203 ///    parameters.
2204 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi64(__m128i __a,__m128i __b)2205 _mm_add_epi64(__m128i __a, __m128i __b)
2206 {
2207   return (__m128i)((__v2du)__a + (__v2du)__b);
2208 }
2209 
2210 /// Adds, with saturation, the corresponding elements of two 128-bit
2211 ///    signed [16 x i8] vectors, saving each sum in the corresponding element of
2212 ///    a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are
2213 ///    saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80.
2214 ///
2215 /// \headerfile <x86intrin.h>
2216 ///
2217 /// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
2218 ///
2219 /// \param __a
2220 ///    A 128-bit signed [16 x i8] vector.
2221 /// \param __b
2222 ///    A 128-bit signed [16 x i8] vector.
2223 /// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
2224 ///    both parameters.
2225 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epi8(__m128i __a,__m128i __b)2226 _mm_adds_epi8(__m128i __a, __m128i __b)
2227 {
2228   return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
2229 }
2230 
2231 /// Adds, with saturation, the corresponding elements of two 128-bit
2232 ///    signed [8 x i16] vectors, saving each sum in the corresponding element of
2233 ///    a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF
2234 ///    are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
2235 ///    0x8000.
2236 ///
2237 /// \headerfile <x86intrin.h>
2238 ///
2239 /// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
2240 ///
2241 /// \param __a
2242 ///    A 128-bit signed [8 x i16] vector.
2243 /// \param __b
2244 ///    A 128-bit signed [8 x i16] vector.
2245 /// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
2246 ///    both parameters.
2247 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epi16(__m128i __a,__m128i __b)2248 _mm_adds_epi16(__m128i __a, __m128i __b)
2249 {
2250   return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
2251 }
2252 
2253 /// Adds, with saturation, the corresponding elements of two 128-bit
2254 ///    unsigned [16 x i8] vectors, saving each sum in the corresponding element
///    of a 128-bit result vector of [16 x i8]. Sums greater than 0xFF are
///    saturated to 0xFF. Because both operands are unsigned, a sum is never
///    negative, so no saturation to 0x00 occurs.
2257 ///
2258 /// \headerfile <x86intrin.h>
2259 ///
2260 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2261 ///
2262 /// \param __a
2263 ///    A 128-bit unsigned [16 x i8] vector.
2264 /// \param __b
2265 ///    A 128-bit unsigned [16 x i8] vector.
2266 /// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
2267 ///    of both parameters.
2268 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epu8(__m128i __a,__m128i __b)2269 _mm_adds_epu8(__m128i __a, __m128i __b)
2270 {
2271   return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
2272 }
2273 
2274 /// Adds, with saturation, the corresponding elements of two 128-bit
2275 ///    unsigned [8 x i16] vectors, saving each sum in the corresponding element
///    of a 128-bit result vector of [8 x i16]. Sums greater than 0xFFFF are
///    saturated to 0xFFFF. Because both operands are unsigned, a sum is never
///    negative, so no saturation to 0x0000 occurs.
2278 ///
2279 /// \headerfile <x86intrin.h>
2280 ///
/// This intrinsic corresponds to the <c> VPADDUSW / PADDUSW </c> instruction.
2282 ///
2283 /// \param __a
2284 ///    A 128-bit unsigned [8 x i16] vector.
2285 /// \param __b
2286 ///    A 128-bit unsigned [8 x i16] vector.
2287 /// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
2288 ///    of both parameters.
2289 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epu16(__m128i __a,__m128i __b)2290 _mm_adds_epu16(__m128i __a, __m128i __b)
2291 {
2292   return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
2293 }
2294 
2295 /// Computes the rounded averages of corresponding elements of two
2296 ///    128-bit unsigned [16 x i8] vectors, saving each result in the
2297 ///    corresponding element of a 128-bit result vector of [16 x i8].
2298 ///
2299 /// \headerfile <x86intrin.h>
2300 ///
2301 /// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
2302 ///
2303 /// \param __a
2304 ///    A 128-bit unsigned [16 x i8] vector.
2305 /// \param __b
2306 ///    A 128-bit unsigned [16 x i8] vector.
2307 /// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
2308 ///    averages of both parameters.
2309 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_avg_epu8(__m128i __a,__m128i __b)2310 _mm_avg_epu8(__m128i __a, __m128i __b)
2311 {
2312   return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
2313 }
2314 
2315 /// Computes the rounded averages of corresponding elements of two
2316 ///    128-bit unsigned [8 x i16] vectors, saving each result in the
2317 ///    corresponding element of a 128-bit result vector of [8 x i16].
2318 ///
2319 /// \headerfile <x86intrin.h>
2320 ///
2321 /// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
2322 ///
2323 /// \param __a
2324 ///    A 128-bit unsigned [8 x i16] vector.
2325 /// \param __b
2326 ///    A 128-bit unsigned [8 x i16] vector.
2327 /// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
2328 ///    averages of both parameters.
2329 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_avg_epu16(__m128i __a,__m128i __b)2330 _mm_avg_epu16(__m128i __a, __m128i __b)
2331 {
2332   return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
2333 }
2334 
2335 /// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
2336 ///    vectors, producing eight intermediate 32-bit signed integer products, and
2337 ///    adds the consecutive pairs of 32-bit products to form a 128-bit signed
2338 ///    [4 x i32] vector.
2339 ///
2340 ///    For example, bits [15:0] of both parameters are multiplied producing a
2341 ///    32-bit product, bits [31:16] of both parameters are multiplied producing
2342 ///    a 32-bit product, and the sum of those two products becomes bits [31:0]
2343 ///    of the result.
2344 ///
2345 /// \headerfile <x86intrin.h>
2346 ///
2347 /// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
2348 ///
2349 /// \param __a
2350 ///    A 128-bit signed [8 x i16] vector.
2351 /// \param __b
2352 ///    A 128-bit signed [8 x i16] vector.
2353 /// \returns A 128-bit signed [4 x i32] vector containing the sums of products
2354 ///    of both parameters.
2355 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_madd_epi16(__m128i __a,__m128i __b)2356 _mm_madd_epi16(__m128i __a, __m128i __b)
2357 {
2358   return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
2359 }
2360 
2361 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2362 ///    vectors, saving the greater value from each comparison in the
2363 ///    corresponding element of a 128-bit result vector of [8 x i16].
2364 ///
2365 /// \headerfile <x86intrin.h>
2366 ///
2367 /// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
2368 ///
2369 /// \param __a
2370 ///    A 128-bit signed [8 x i16] vector.
2371 /// \param __b
2372 ///    A 128-bit signed [8 x i16] vector.
2373 /// \returns A 128-bit signed [8 x i16] vector containing the greater value of
2374 ///    each comparison.
2375 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epi16(__m128i __a,__m128i __b)2376 _mm_max_epi16(__m128i __a, __m128i __b)
2377 {
2378   return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
2379 }
2380 
2381 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2382 ///    vectors, saving the greater value from each comparison in the
2383 ///    corresponding element of a 128-bit result vector of [16 x i8].
2384 ///
2385 /// \headerfile <x86intrin.h>
2386 ///
2387 /// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
2388 ///
2389 /// \param __a
2390 ///    A 128-bit unsigned [16 x i8] vector.
2391 /// \param __b
2392 ///    A 128-bit unsigned [16 x i8] vector.
2393 /// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
2394 ///    each comparison.
2395 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epu8(__m128i __a,__m128i __b)2396 _mm_max_epu8(__m128i __a, __m128i __b)
2397 {
2398   return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
2399 }
2400 
2401 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2402 ///    vectors, saving the smaller value from each comparison in the
2403 ///    corresponding element of a 128-bit result vector of [8 x i16].
2404 ///
2405 /// \headerfile <x86intrin.h>
2406 ///
2407 /// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
2408 ///
2409 /// \param __a
2410 ///    A 128-bit signed [8 x i16] vector.
2411 /// \param __b
2412 ///    A 128-bit signed [8 x i16] vector.
2413 /// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
2414 ///    each comparison.
2415 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epi16(__m128i __a,__m128i __b)2416 _mm_min_epi16(__m128i __a, __m128i __b)
2417 {
2418   return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
2419 }
2420 
2421 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2422 ///    vectors, saving the smaller value from each comparison in the
2423 ///    corresponding element of a 128-bit result vector of [16 x i8].
2424 ///
2425 /// \headerfile <x86intrin.h>
2426 ///
2427 /// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
2428 ///
2429 /// \param __a
2430 ///    A 128-bit unsigned [16 x i8] vector.
2431 /// \param __b
2432 ///    A 128-bit unsigned [16 x i8] vector.
2433 /// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
2434 ///    each comparison.
2435 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epu8(__m128i __a,__m128i __b)2436 _mm_min_epu8(__m128i __a, __m128i __b)
2437 {
2438   return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
2439 }
2440 
2441 /// Multiplies the corresponding elements of two signed [8 x i16]
2442 ///    vectors, saving the upper 16 bits of each 32-bit product in the
2443 ///    corresponding element of a 128-bit signed [8 x i16] result vector.
2444 ///
2445 /// \headerfile <x86intrin.h>
2446 ///
2447 /// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
2448 ///
2449 /// \param __a
2450 ///    A 128-bit signed [8 x i16] vector.
2451 /// \param __b
2452 ///    A 128-bit signed [8 x i16] vector.
2453 /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
2454 ///    each of the eight 32-bit products.
2455 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mulhi_epi16(__m128i __a,__m128i __b)2456 _mm_mulhi_epi16(__m128i __a, __m128i __b)
2457 {
2458   return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
2459 }
2460 
2461 /// Multiplies the corresponding elements of two unsigned [8 x i16]
2462 ///    vectors, saving the upper 16 bits of each 32-bit product in the
2463 ///    corresponding element of a 128-bit unsigned [8 x i16] result vector.
2464 ///
2465 /// \headerfile <x86intrin.h>
2466 ///
2467 /// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
2468 ///
2469 /// \param __a
2470 ///    A 128-bit unsigned [8 x i16] vector.
2471 /// \param __b
2472 ///    A 128-bit unsigned [8 x i16] vector.
2473 /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
2474 ///    of each of the eight 32-bit products.
2475 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mulhi_epu16(__m128i __a,__m128i __b)2476 _mm_mulhi_epu16(__m128i __a, __m128i __b)
2477 {
2478   return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
2479 }
2480 
2481 /// Multiplies the corresponding elements of two signed [8 x i16]
2482 ///    vectors, saving the lower 16 bits of each 32-bit product in the
2483 ///    corresponding element of a 128-bit signed [8 x i16] result vector.
2484 ///
2485 /// \headerfile <x86intrin.h>
2486 ///
2487 /// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
2488 ///
2489 /// \param __a
2490 ///    A 128-bit signed [8 x i16] vector.
2491 /// \param __b
2492 ///    A 128-bit signed [8 x i16] vector.
2493 /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
2494 ///    each of the eight 32-bit products.
2495 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mullo_epi16(__m128i __a,__m128i __b)2496 _mm_mullo_epi16(__m128i __a, __m128i __b)
2497 {
2498   return (__m128i)((__v8hu)__a * (__v8hu)__b);
2499 }
2500 
2501 /// Multiplies 32-bit unsigned integer values contained in the lower bits
2502 ///    of the two 64-bit integer vectors and returns the 64-bit unsigned
2503 ///    product.
2504 ///
2505 /// \headerfile <x86intrin.h>
2506 ///
2507 /// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
2508 ///
2509 /// \param __a
2510 ///    A 64-bit integer containing one of the source operands.
2511 /// \param __b
2512 ///    A 64-bit integer containing one of the source operands.
2513 /// \returns A 64-bit integer vector containing the product of both operands.
2514 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_mul_su32(__m64 __a,__m64 __b)2515 _mm_mul_su32(__m64 __a, __m64 __b)
2516 {
2517   return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
2518 }
2519 
2520 /// Multiplies 32-bit unsigned integer values contained in the lower
2521 ///    bits of the corresponding elements of two [2 x i64] vectors, and returns
2522 ///    the 64-bit products in the corresponding elements of a [2 x i64] vector.
2523 ///
2524 /// \headerfile <x86intrin.h>
2525 ///
2526 /// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
2527 ///
2528 /// \param __a
2529 ///    A [2 x i64] vector containing one of the source operands.
2530 /// \param __b
2531 ///    A [2 x i64] vector containing one of the source operands.
2532 /// \returns A [2 x i64] vector containing the product of both operands.
2533 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mul_epu32(__m128i __a,__m128i __b)2534 _mm_mul_epu32(__m128i __a, __m128i __b)
2535 {
2536   return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
2537 }
2538 
2539 /// Computes the absolute differences of corresponding 8-bit integer
2540 ///    values in two 128-bit vectors. Sums the first 8 absolute differences, and
2541 ///    separately sums the second 8 absolute differences. Packs these two
2542 ///    unsigned 16-bit integer sums into the upper and lower elements of a
2543 ///    [2 x i64] vector.
2544 ///
2545 /// \headerfile <x86intrin.h>
2546 ///
2547 /// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
2548 ///
2549 /// \param __a
2550 ///    A 128-bit integer vector containing one of the source operands.
2551 /// \param __b
2552 ///    A 128-bit integer vector containing one of the source operands.
2553 /// \returns A [2 x i64] vector containing the sums of the sets of absolute
2554 ///    differences between both operands.
2555 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sad_epu8(__m128i __a,__m128i __b)2556 _mm_sad_epu8(__m128i __a, __m128i __b)
2557 {
2558   return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
2559 }
2560 
2561 /// Subtracts the corresponding 8-bit integer values in the operands.
2562 ///
2563 /// \headerfile <x86intrin.h>
2564 ///
2565 /// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
2566 ///
2567 /// \param __a
2568 ///    A 128-bit integer vector containing the minuends.
2569 /// \param __b
2570 ///    A 128-bit integer vector containing the subtrahends.
2571 /// \returns A 128-bit integer vector containing the differences of the values
2572 ///    in the operands.
2573 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sub_epi8(__m128i __a,__m128i __b)2574 _mm_sub_epi8(__m128i __a, __m128i __b)
2575 {
2576   return (__m128i)((__v16qu)__a - (__v16qu)__b);
2577 }
2578 
2579 /// Subtracts the corresponding 16-bit integer values in the operands.
2580 ///
2581 /// \headerfile <x86intrin.h>
2582 ///
2583 /// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
2584 ///
2585 /// \param __a
2586 ///    A 128-bit integer vector containing the minuends.
2587 /// \param __b
2588 ///    A 128-bit integer vector containing the subtrahends.
2589 /// \returns A 128-bit integer vector containing the differences of the values
2590 ///    in the operands.
2591 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sub_epi16(__m128i __a,__m128i __b)2592 _mm_sub_epi16(__m128i __a, __m128i __b)
2593 {
2594   return (__m128i)((__v8hu)__a - (__v8hu)__b);
2595 }
2596 
2597 /// Subtracts the corresponding 32-bit integer values in the operands.
2598 ///
2599 /// \headerfile <x86intrin.h>
2600 ///
2601 /// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
2602 ///
2603 /// \param __a
2604 ///    A 128-bit integer vector containing the minuends.
2605 /// \param __b
2606 ///    A 128-bit integer vector containing the subtrahends.
2607 /// \returns A 128-bit integer vector containing the differences of the values
2608 ///    in the operands.
2609 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sub_epi32(__m128i __a,__m128i __b)2610 _mm_sub_epi32(__m128i __a, __m128i __b)
2611 {
2612   return (__m128i)((__v4su)__a - (__v4su)__b);
2613 }
2614 
2615 /// Subtracts signed or unsigned 64-bit integer values and writes the
2616 ///    difference to the corresponding bits in the destination.
2617 ///
2618 /// \headerfile <x86intrin.h>
2619 ///
2620 /// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
2621 ///
2622 /// \param __a
2623 ///    A 64-bit integer vector containing the minuend.
2624 /// \param __b
2625 ///    A 64-bit integer vector containing the subtrahend.
2626 /// \returns A 64-bit integer vector containing the difference of the values in
2627 ///    the operands.
2628 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_sub_si64(__m64 __a,__m64 __b)2629 _mm_sub_si64(__m64 __a, __m64 __b)
2630 {
2631   return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
2632 }
2633 
2634 /// Subtracts the corresponding elements of two [2 x i64] vectors.
2635 ///
2636 /// \headerfile <x86intrin.h>
2637 ///
2638 /// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
2639 ///
2640 /// \param __a
2641 ///    A 128-bit integer vector containing the minuends.
2642 /// \param __b
2643 ///    A 128-bit integer vector containing the subtrahends.
2644 /// \returns A 128-bit integer vector containing the differences of the values
2645 ///    in the operands.
2646 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sub_epi64(__m128i __a,__m128i __b)2647 _mm_sub_epi64(__m128i __a, __m128i __b)
2648 {
2649   return (__m128i)((__v2du)__a - (__v2du)__b);
2650 }
2651 
2652 /// Subtracts corresponding 8-bit signed integer values in the input and
2653 ///    returns the differences in the corresponding bytes in the destination.
2654 ///    Differences greater than 0x7F are saturated to 0x7F, and differences less
2655 ///    than 0x80 are saturated to 0x80.
2656 ///
2657 /// \headerfile <x86intrin.h>
2658 ///
2659 /// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
2660 ///
2661 /// \param __a
2662 ///    A 128-bit integer vector containing the minuends.
2663 /// \param __b
2664 ///    A 128-bit integer vector containing the subtrahends.
2665 /// \returns A 128-bit integer vector containing the differences of the values
2666 ///    in the operands.
2667 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_subs_epi8(__m128i __a,__m128i __b)2668 _mm_subs_epi8(__m128i __a, __m128i __b)
2669 {
2670   return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
2671 }
2672 
/// Subtracts corresponding 16-bit signed integer values in the input and
///    returns the differences in the corresponding elements in the
///    destination. Differences greater than 0x7FFF are saturated to 0x7FFF,
///    and differences less than 0x8000 are saturated to 0x8000.
2677 ///
2678 /// \headerfile <x86intrin.h>
2679 ///
2680 /// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
2681 ///
2682 /// \param __a
2683 ///    A 128-bit integer vector containing the minuends.
2684 /// \param __b
2685 ///    A 128-bit integer vector containing the subtrahends.
2686 /// \returns A 128-bit integer vector containing the differences of the values
2687 ///    in the operands.
2688 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_subs_epi16(__m128i __a,__m128i __b)2689 _mm_subs_epi16(__m128i __a, __m128i __b)
2690 {
2691   return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
2692 }
2693 
2694 /// Subtracts corresponding 8-bit unsigned integer values in the input
2695 ///    and returns the differences in the corresponding bytes in the
2696 ///    destination. Differences less than 0x00 are saturated to 0x00.
2697 ///
2698 /// \headerfile <x86intrin.h>
2699 ///
2700 /// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
2701 ///
2702 /// \param __a
2703 ///    A 128-bit integer vector containing the minuends.
2704 /// \param __b
2705 ///    A 128-bit integer vector containing the subtrahends.
2706 /// \returns A 128-bit integer vector containing the unsigned integer
2707 ///    differences of the values in the operands.
2708 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_subs_epu8(__m128i __a,__m128i __b)2709 _mm_subs_epu8(__m128i __a, __m128i __b)
2710 {
2711   return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
2712 }
2713 
/// Subtracts corresponding 16-bit unsigned integer values in the input
///    and returns the differences in the corresponding elements in the
///    destination. Differences less than 0x0000 are saturated to 0x0000.
2717 ///
2718 /// \headerfile <x86intrin.h>
2719 ///
2720 /// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
2721 ///
2722 /// \param __a
2723 ///    A 128-bit integer vector containing the minuends.
2724 /// \param __b
2725 ///    A 128-bit integer vector containing the subtrahends.
2726 /// \returns A 128-bit integer vector containing the unsigned integer
2727 ///    differences of the values in the operands.
2728 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_subs_epu16(__m128i __a,__m128i __b)2729 _mm_subs_epu16(__m128i __a, __m128i __b)
2730 {
2731   return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
2732 }
2733 
2734 /// Performs a bitwise AND of two 128-bit integer vectors.
2735 ///
2736 /// \headerfile <x86intrin.h>
2737 ///
2738 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
2739 ///
2740 /// \param __a
2741 ///    A 128-bit integer vector containing one of the source operands.
2742 /// \param __b
2743 ///    A 128-bit integer vector containing one of the source operands.
2744 /// \returns A 128-bit integer vector containing the bitwise AND of the values
2745 ///    in both operands.
2746 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_and_si128(__m128i __a,__m128i __b)2747 _mm_and_si128(__m128i __a, __m128i __b)
2748 {
2749   return (__m128i)((__v2du)__a & (__v2du)__b);
2750 }
2751 
2752 /// Performs a bitwise AND of two 128-bit integer vectors, using the
2753 ///    one's complement of the values contained in the first source operand.
2754 ///
2755 /// \headerfile <x86intrin.h>
2756 ///
2757 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
2758 ///
2759 /// \param __a
2760 ///    A 128-bit vector containing the left source operand. The one's complement
2761 ///    of this value is used in the bitwise AND.
2762 /// \param __b
2763 ///    A 128-bit vector containing the right source operand.
2764 /// \returns A 128-bit integer vector containing the bitwise AND of the one's
2765 ///    complement of the first operand and the values in the second operand.
2766 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_andnot_si128(__m128i __a,__m128i __b)2767 _mm_andnot_si128(__m128i __a, __m128i __b)
2768 {
2769   return (__m128i)(~(__v2du)__a & (__v2du)__b);
2770 }
2771 /// Performs a bitwise OR of two 128-bit integer vectors.
2772 ///
2773 /// \headerfile <x86intrin.h>
2774 ///
2775 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
2776 ///
2777 /// \param __a
2778 ///    A 128-bit integer vector containing one of the source operands.
2779 /// \param __b
2780 ///    A 128-bit integer vector containing one of the source operands.
2781 /// \returns A 128-bit integer vector containing the bitwise OR of the values
2782 ///    in both operands.
2783 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_or_si128(__m128i __a,__m128i __b)2784 _mm_or_si128(__m128i __a, __m128i __b)
2785 {
2786   return (__m128i)((__v2du)__a | (__v2du)__b);
2787 }
2788 
2789 /// Performs a bitwise exclusive OR of two 128-bit integer vectors.
2790 ///
2791 /// \headerfile <x86intrin.h>
2792 ///
2793 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
2794 ///
2795 /// \param __a
2796 ///    A 128-bit integer vector containing one of the source operands.
2797 /// \param __b
2798 ///    A 128-bit integer vector containing one of the source operands.
2799 /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
2800 ///    values in both operands.
2801 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_xor_si128(__m128i __a,__m128i __b)2802 _mm_xor_si128(__m128i __a, __m128i __b)
2803 {
2804   return (__m128i)((__v2du)__a ^ (__v2du)__b);
2805 }
2806 
/// Shifts the whole 128-bit integer vector operand to the left by the
///    given number of bytes. The vacated low-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_slli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector holding the value to shift.
/// \param imm
///    An immediate value giving the number of bytes by which operand \a a is
///    shifted to the left.
/// \returns A 128-bit integer vector holding the left-shifted value.
#define _mm_slli_si128(a, imm)                                                 \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a),          \
                                                (int)(imm)))
2826 
/// Byte-wise left shift of a 128-bit integer vector. This macro expands to
///    the same builtin call, with the same argument casts, as
///    \c _mm_slli_si128.
///
/// \param a
///    A 128-bit integer vector holding the value to shift.
/// \param imm
///    An immediate value giving the number of bytes by which operand \a a is
///    shifted to the left.
/// \returns A 128-bit integer vector holding the left-shifted value.
#define _mm_bslli_si128(a, imm)                                                \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a),          \
                                                (int)(imm)))
2829 
2830 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2831 ///    by the specified number of bits. Low-order bits are cleared.
2832 ///
2833 /// \headerfile <x86intrin.h>
2834 ///
2835 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2836 ///
2837 /// \param __a
2838 ///    A 128-bit integer vector containing the source operand.
2839 /// \param __count
2840 ///    An integer value specifying the number of bits to left-shift each value
2841 ///    in operand \a __a.
2842 /// \returns A 128-bit integer vector containing the left-shifted values.
2843 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_slli_epi16(__m128i __a,int __count)2844 _mm_slli_epi16(__m128i __a, int __count)
2845 {
2846   return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
2847 }
2848 
2849 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2850 ///    by the specified number of bits. Low-order bits are cleared.
2851 ///
2852 /// \headerfile <x86intrin.h>
2853 ///
2854 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2855 ///
2856 /// \param __a
2857 ///    A 128-bit integer vector containing the source operand.
2858 /// \param __count
2859 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2860 ///    to left-shift each value in operand \a __a.
2861 /// \returns A 128-bit integer vector containing the left-shifted values.
2862 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sll_epi16(__m128i __a,__m128i __count)2863 _mm_sll_epi16(__m128i __a, __m128i __count)
2864 {
2865   return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
2866 }
2867 
2868 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2869 ///    by the specified number of bits. Low-order bits are cleared.
2870 ///
2871 /// \headerfile <x86intrin.h>
2872 ///
2873 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2874 ///
2875 /// \param __a
2876 ///    A 128-bit integer vector containing the source operand.
2877 /// \param __count
2878 ///    An integer value specifying the number of bits to left-shift each value
2879 ///    in operand \a __a.
2880 /// \returns A 128-bit integer vector containing the left-shifted values.
2881 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_slli_epi32(__m128i __a,int __count)2882 _mm_slli_epi32(__m128i __a, int __count)
2883 {
2884   return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
2885 }
2886 
2887 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2888 ///    by the specified number of bits. Low-order bits are cleared.
2889 ///
2890 /// \headerfile <x86intrin.h>
2891 ///
2892 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2893 ///
2894 /// \param __a
2895 ///    A 128-bit integer vector containing the source operand.
2896 /// \param __count
2897 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2898 ///    to left-shift each value in operand \a __a.
2899 /// \returns A 128-bit integer vector containing the left-shifted values.
2900 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sll_epi32(__m128i __a,__m128i __count)2901 _mm_sll_epi32(__m128i __a, __m128i __count)
2902 {
2903   return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
2904 }
2905 
2906 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2907 ///    by the specified number of bits. Low-order bits are cleared.
2908 ///
2909 /// \headerfile <x86intrin.h>
2910 ///
2911 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2912 ///
2913 /// \param __a
2914 ///    A 128-bit integer vector containing the source operand.
2915 /// \param __count
2916 ///    An integer value specifying the number of bits to left-shift each value
2917 ///    in operand \a __a.
2918 /// \returns A 128-bit integer vector containing the left-shifted values.
2919 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_slli_epi64(__m128i __a,int __count)2920 _mm_slli_epi64(__m128i __a, int __count)
2921 {
2922   return __builtin_ia32_psllqi128((__v2di)__a, __count);
2923 }
2924 
2925 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2926 ///    by the specified number of bits. Low-order bits are cleared.
2927 ///
2928 /// \headerfile <x86intrin.h>
2929 ///
2930 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2931 ///
2932 /// \param __a
2933 ///    A 128-bit integer vector containing the source operand.
2934 /// \param __count
2935 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2936 ///    to left-shift each value in operand \a __a.
2937 /// \returns A 128-bit integer vector containing the left-shifted values.
2938 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sll_epi64(__m128i __a,__m128i __count)2939 _mm_sll_epi64(__m128i __a, __m128i __count)
2940 {
2941   return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
2942 }
2943 
2944 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2945 ///    by the specified number of bits. High-order bits are filled with the sign
2946 ///    bit of the initial value.
2947 ///
2948 /// \headerfile <x86intrin.h>
2949 ///
2950 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2951 ///
2952 /// \param __a
2953 ///    A 128-bit integer vector containing the source operand.
2954 /// \param __count
2955 ///    An integer value specifying the number of bits to right-shift each value
2956 ///    in operand \a __a.
2957 /// \returns A 128-bit integer vector containing the right-shifted values.
2958 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srai_epi16(__m128i __a,int __count)2959 _mm_srai_epi16(__m128i __a, int __count)
2960 {
2961   return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
2962 }
2963 
2964 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2965 ///    by the specified number of bits. High-order bits are filled with the sign
2966 ///    bit of the initial value.
2967 ///
2968 /// \headerfile <x86intrin.h>
2969 ///
2970 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2971 ///
2972 /// \param __a
2973 ///    A 128-bit integer vector containing the source operand.
2974 /// \param __count
2975 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2976 ///    to right-shift each value in operand \a __a.
2977 /// \returns A 128-bit integer vector containing the right-shifted values.
2978 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sra_epi16(__m128i __a,__m128i __count)2979 _mm_sra_epi16(__m128i __a, __m128i __count)
2980 {
2981   return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
2982 }
2983 
2984 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
2985 ///    by the specified number of bits. High-order bits are filled with the sign
2986 ///    bit of the initial value.
2987 ///
2988 /// \headerfile <x86intrin.h>
2989 ///
2990 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2991 ///
2992 /// \param __a
2993 ///    A 128-bit integer vector containing the source operand.
2994 /// \param __count
2995 ///    An integer value specifying the number of bits to right-shift each value
2996 ///    in operand \a __a.
2997 /// \returns A 128-bit integer vector containing the right-shifted values.
2998 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srai_epi32(__m128i __a,int __count)2999 _mm_srai_epi32(__m128i __a, int __count)
3000 {
3001   return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
3002 }
3003 
3004 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
3005 ///    by the specified number of bits. High-order bits are filled with the sign
3006 ///    bit of the initial value.
3007 ///
3008 /// \headerfile <x86intrin.h>
3009 ///
3010 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
3011 ///
3012 /// \param __a
3013 ///    A 128-bit integer vector containing the source operand.
3014 /// \param __count
3015 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
3016 ///    to right-shift each value in operand \a __a.
3017 /// \returns A 128-bit integer vector containing the right-shifted values.
3018 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sra_epi32(__m128i __a,__m128i __count)3019 _mm_sra_epi32(__m128i __a, __m128i __count)
3020 {
3021   return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
3022 }
3023 
/// Shifts the whole 128-bit integer vector operand to the right by a
///    number of bytes. Vacated high-order bits are set to zero.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_srli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector holding the value to be shifted.
/// \param imm
///    An immediate value giving the number of bytes by which operand \a a is
///    shifted right.
/// \returns A 128-bit integer vector containing the right-shifted value.
#define _mm_srli_si128(a, imm) \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift( \
      (__v2di)(__m128i)(a), (int)(imm)))
3043 
/// Shifts the whole 128-bit integer vector operand to the right by a
///    number of bytes. Vacated high-order bits are set to zero. The expansion
///    is the same builtin call, with the same argument casts, as
///    \c _mm_srli_si128.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_bsrli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector holding the value to be shifted.
/// \param imm
///    An immediate value giving the number of bytes by which operand \a a is
///    shifted right.
/// \returns A 128-bit integer vector containing the right-shifted value.
#define _mm_bsrli_si128(a, imm) \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift( \
      (__v2di)(__m128i)(a), (int)(imm)))
3046 
3047 /// Right-shifts each of 16-bit values in the 128-bit integer vector
3048 ///    operand by the specified number of bits. High-order bits are cleared.
3049 ///
3050 /// \headerfile <x86intrin.h>
3051 ///
3052 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
3053 ///
3054 /// \param __a
3055 ///    A 128-bit integer vector containing the source operand.
3056 /// \param __count
3057 ///    An integer value specifying the number of bits to right-shift each value
3058 ///    in operand \a __a.
3059 /// \returns A 128-bit integer vector containing the right-shifted values.
3060 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srli_epi16(__m128i __a,int __count)3061 _mm_srli_epi16(__m128i __a, int __count)
3062 {
3063   return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
3064 }
3065 
3066 /// Right-shifts each of 16-bit values in the 128-bit integer vector
3067 ///    operand by the specified number of bits. High-order bits are cleared.
3068 ///
3069 /// \headerfile <x86intrin.h>
3070 ///
3071 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
3072 ///
3073 /// \param __a
3074 ///    A 128-bit integer vector containing the source operand.
3075 /// \param __count
3076 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
3077 ///    to right-shift each value in operand \a __a.
3078 /// \returns A 128-bit integer vector containing the right-shifted values.
3079 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srl_epi16(__m128i __a,__m128i __count)3080 _mm_srl_epi16(__m128i __a, __m128i __count)
3081 {
3082   return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
3083 }
3084 
3085 /// Right-shifts each of 32-bit values in the 128-bit integer vector
3086 ///    operand by the specified number of bits. High-order bits are cleared.
3087 ///
3088 /// \headerfile <x86intrin.h>
3089 ///
3090 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
3091 ///
3092 /// \param __a
3093 ///    A 128-bit integer vector containing the source operand.
3094 /// \param __count
3095 ///    An integer value specifying the number of bits to right-shift each value
3096 ///    in operand \a __a.
3097 /// \returns A 128-bit integer vector containing the right-shifted values.
3098 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srli_epi32(__m128i __a,int __count)3099 _mm_srli_epi32(__m128i __a, int __count)
3100 {
3101   return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
3102 }
3103 
3104 /// Right-shifts each of 32-bit values in the 128-bit integer vector
3105 ///    operand by the specified number of bits. High-order bits are cleared.
3106 ///
3107 /// \headerfile <x86intrin.h>
3108 ///
3109 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
3110 ///
3111 /// \param __a
3112 ///    A 128-bit integer vector containing the source operand.
3113 /// \param __count
3114 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
3115 ///    to right-shift each value in operand \a __a.
3116 /// \returns A 128-bit integer vector containing the right-shifted values.
3117 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srl_epi32(__m128i __a,__m128i __count)3118 _mm_srl_epi32(__m128i __a, __m128i __count)
3119 {
3120   return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
3121 }
3122 
3123 /// Right-shifts each of 64-bit values in the 128-bit integer vector
3124 ///    operand by the specified number of bits. High-order bits are cleared.
3125 ///
3126 /// \headerfile <x86intrin.h>
3127 ///
3128 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3129 ///
3130 /// \param __a
3131 ///    A 128-bit integer vector containing the source operand.
3132 /// \param __count
3133 ///    An integer value specifying the number of bits to right-shift each value
3134 ///    in operand \a __a.
3135 /// \returns A 128-bit integer vector containing the right-shifted values.
3136 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srli_epi64(__m128i __a,int __count)3137 _mm_srli_epi64(__m128i __a, int __count)
3138 {
3139   return __builtin_ia32_psrlqi128((__v2di)__a, __count);
3140 }
3141 
3142 /// Right-shifts each of 64-bit values in the 128-bit integer vector
3143 ///    operand by the specified number of bits. High-order bits are cleared.
3144 ///
3145 /// \headerfile <x86intrin.h>
3146 ///
3147 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3148 ///
3149 /// \param __a
3150 ///    A 128-bit integer vector containing the source operand.
3151 /// \param __count
3152 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
3153 ///    to right-shift each value in operand \a __a.
3154 /// \returns A 128-bit integer vector containing the right-shifted values.
3155 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srl_epi64(__m128i __a,__m128i __count)3156 _mm_srl_epi64(__m128i __a, __m128i __count)
3157 {
3158   return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
3159 }
3160 
3161 /// Compares each of the corresponding 8-bit values of the 128-bit
3162 ///    integer vectors for equality. Each comparison yields 0x0 for false, 0xFF
3163 ///    for true.
3164 ///
3165 /// \headerfile <x86intrin.h>
3166 ///
3167 /// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
3168 ///
3169 /// \param __a
3170 ///    A 128-bit integer vector.
3171 /// \param __b
3172 ///    A 128-bit integer vector.
3173 /// \returns A 128-bit integer vector containing the comparison results.
3174 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpeq_epi8(__m128i __a,__m128i __b)3175 _mm_cmpeq_epi8(__m128i __a, __m128i __b)
3176 {
3177   return (__m128i)((__v16qi)__a == (__v16qi)__b);
3178 }
3179 
3180 /// Compares each of the corresponding 16-bit values of the 128-bit
3181 ///    integer vectors for equality. Each comparison yields 0x0 for false,
3182 ///    0xFFFF for true.
3183 ///
3184 /// \headerfile <x86intrin.h>
3185 ///
3186 /// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
3187 ///
3188 /// \param __a
3189 ///    A 128-bit integer vector.
3190 /// \param __b
3191 ///    A 128-bit integer vector.
3192 /// \returns A 128-bit integer vector containing the comparison results.
3193 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpeq_epi16(__m128i __a,__m128i __b)3194 _mm_cmpeq_epi16(__m128i __a, __m128i __b)
3195 {
3196   return (__m128i)((__v8hi)__a == (__v8hi)__b);
3197 }
3198 
3199 /// Compares each of the corresponding 32-bit values of the 128-bit
3200 ///    integer vectors for equality. Each comparison yields 0x0 for false,
3201 ///    0xFFFFFFFF for true.
3202 ///
3203 /// \headerfile <x86intrin.h>
3204 ///
3205 /// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
3206 ///
3207 /// \param __a
3208 ///    A 128-bit integer vector.
3209 /// \param __b
3210 ///    A 128-bit integer vector.
3211 /// \returns A 128-bit integer vector containing the comparison results.
3212 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpeq_epi32(__m128i __a,__m128i __b)3213 _mm_cmpeq_epi32(__m128i __a, __m128i __b)
3214 {
3215   return (__m128i)((__v4si)__a == (__v4si)__b);
3216 }
3217 
3218 /// Compares each of the corresponding signed 8-bit values of the 128-bit
3219 ///    integer vectors to determine if the values in the first operand are
3220 ///    greater than those in the second operand. Each comparison yields 0x0 for
3221 ///    false, 0xFF for true.
3222 ///
3223 /// \headerfile <x86intrin.h>
3224 ///
3225 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3226 ///
3227 /// \param __a
3228 ///    A 128-bit integer vector.
3229 /// \param __b
3230 ///    A 128-bit integer vector.
3231 /// \returns A 128-bit integer vector containing the comparison results.
3232 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpgt_epi8(__m128i __a,__m128i __b)3233 _mm_cmpgt_epi8(__m128i __a, __m128i __b)
3234 {
3235   /* This function always performs a signed comparison, but __v16qi is a char
3236      which may be signed or unsigned, so use __v16qs. */
3237   return (__m128i)((__v16qs)__a > (__v16qs)__b);
3238 }
3239 
3240 /// Compares each of the corresponding signed 16-bit values of the
3241 ///    128-bit integer vectors to determine if the values in the first operand
3242 ///    are greater than those in the second operand.
3243 ///
3244 ///    Each comparison yields 0x0 for false, 0xFFFF for true.
3245 ///
3246 /// \headerfile <x86intrin.h>
3247 ///
3248 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3249 ///
3250 /// \param __a
3251 ///    A 128-bit integer vector.
3252 /// \param __b
3253 ///    A 128-bit integer vector.
3254 /// \returns A 128-bit integer vector containing the comparison results.
3255 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpgt_epi16(__m128i __a,__m128i __b)3256 _mm_cmpgt_epi16(__m128i __a, __m128i __b)
3257 {
3258   return (__m128i)((__v8hi)__a > (__v8hi)__b);
3259 }
3260 
3261 /// Compares each of the corresponding signed 32-bit values of the
3262 ///    128-bit integer vectors to determine if the values in the first operand
3263 ///    are greater than those in the second operand.
3264 ///
3265 ///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3266 ///
3267 /// \headerfile <x86intrin.h>
3268 ///
3269 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3270 ///
3271 /// \param __a
3272 ///    A 128-bit integer vector.
3273 /// \param __b
3274 ///    A 128-bit integer vector.
3275 /// \returns A 128-bit integer vector containing the comparison results.
3276 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpgt_epi32(__m128i __a,__m128i __b)3277 _mm_cmpgt_epi32(__m128i __a, __m128i __b)
3278 {
3279   return (__m128i)((__v4si)__a > (__v4si)__b);
3280 }
3281 
3282 /// Compares each of the corresponding signed 8-bit values of the 128-bit
3283 ///    integer vectors to determine if the values in the first operand are less
3284 ///    than those in the second operand.
3285 ///
3286 ///    Each comparison yields 0x0 for false, 0xFF for true.
3287 ///
3288 /// \headerfile <x86intrin.h>
3289 ///
3290 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3291 ///
3292 /// \param __a
3293 ///    A 128-bit integer vector.
3294 /// \param __b
3295 ///    A 128-bit integer vector.
3296 /// \returns A 128-bit integer vector containing the comparison results.
3297 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmplt_epi8(__m128i __a,__m128i __b)3298 _mm_cmplt_epi8(__m128i __a, __m128i __b)
3299 {
3300   return _mm_cmpgt_epi8(__b, __a);
3301 }
3302 
3303 /// Compares each of the corresponding signed 16-bit values of the
3304 ///    128-bit integer vectors to determine if the values in the first operand
3305 ///    are less than those in the second operand.
3306 ///
3307 ///    Each comparison yields 0x0 for false, 0xFFFF for true.
3308 ///
3309 /// \headerfile <x86intrin.h>
3310 ///
3311 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3312 ///
3313 /// \param __a
3314 ///    A 128-bit integer vector.
3315 /// \param __b
3316 ///    A 128-bit integer vector.
3317 /// \returns A 128-bit integer vector containing the comparison results.
3318 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmplt_epi16(__m128i __a,__m128i __b)3319 _mm_cmplt_epi16(__m128i __a, __m128i __b)
3320 {
3321   return _mm_cmpgt_epi16(__b, __a);
3322 }
3323 
3324 /// Compares each of the corresponding signed 32-bit values of the
3325 ///    128-bit integer vectors to determine if the values in the first operand
3326 ///    are less than those in the second operand.
3327 ///
3328 ///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3329 ///
3330 /// \headerfile <x86intrin.h>
3331 ///
3332 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3333 ///
3334 /// \param __a
3335 ///    A 128-bit integer vector.
3336 /// \param __b
3337 ///    A 128-bit integer vector.
3338 /// \returns A 128-bit integer vector containing the comparison results.
3339 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmplt_epi32(__m128i __a,__m128i __b)3340 _mm_cmplt_epi32(__m128i __a, __m128i __b)
3341 {
3342   return _mm_cmpgt_epi32(__b, __a);
3343 }
3344 
3345 #ifdef __x86_64__
3346 /// Converts a 64-bit signed integer value from the second operand into a
3347 ///    double-precision value and returns it in the lower element of a [2 x
3348 ///    double] vector; the upper element of the returned vector is copied from
3349 ///    the upper element of the first operand.
3350 ///
3351 /// \headerfile <x86intrin.h>
3352 ///
3353 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
3354 ///
3355 /// \param __a
3356 ///    A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
3357 ///    copied to the upper 64 bits of the destination.
3358 /// \param __b
3359 ///    A 64-bit signed integer operand containing the value to be converted.
3360 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
3361 ///    converted value of the second operand. The upper 64 bits are copied from
3362 ///    the upper 64 bits of the first operand.
3363 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtsi64_sd(__m128d __a,long long __b)3364 _mm_cvtsi64_sd(__m128d __a, long long __b)
3365 {
3366   __a[0] = __b;
3367   return __a;
3368 }
3369 
3370 /// Converts the first (lower) element of a vector of [2 x double] into a
3371 ///    64-bit signed integer value, according to the current rounding mode.
3372 ///
3373 /// \headerfile <x86intrin.h>
3374 ///
3375 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
3376 ///
3377 /// \param __a
3378 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3379 ///    conversion.
3380 /// \returns A 64-bit signed integer containing the converted value.
3381 static __inline__ long long __DEFAULT_FN_ATTRS
_mm_cvtsd_si64(__m128d __a)3382 _mm_cvtsd_si64(__m128d __a)
3383 {
3384   return __builtin_ia32_cvtsd2si64((__v2df)__a);
3385 }
3386 
3387 /// Converts the first (lower) element of a vector of [2 x double] into a
3388 ///    64-bit signed integer value, truncating the result when it is inexact.
3389 ///
3390 /// \headerfile <x86intrin.h>
3391 ///
3392 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
3393 ///   instruction.
3394 ///
3395 /// \param __a
3396 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3397 ///    conversion.
3398 /// \returns A 64-bit signed integer containing the converted value.
3399 static __inline__ long long __DEFAULT_FN_ATTRS
_mm_cvttsd_si64(__m128d __a)3400 _mm_cvttsd_si64(__m128d __a)
3401 {
3402   return __builtin_ia32_cvttsd2si64((__v2df)__a);
3403 }
3404 #endif
3405 
3406 /// Converts a vector of [4 x i32] into a vector of [4 x float].
3407 ///
3408 /// \headerfile <x86intrin.h>
3409 ///
3410 /// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
3411 ///
3412 /// \param __a
3413 ///    A 128-bit integer vector.
3414 /// \returns A 128-bit vector of [4 x float] containing the converted values.
3415 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtepi32_ps(__m128i __a)3416 _mm_cvtepi32_ps(__m128i __a)
3417 {
3418   return (__m128)__builtin_convertvector((__v4si)__a, __v4sf);
3419 }
3420 
3421 /// Converts a vector of [4 x float] into a vector of [4 x i32].
3422 ///
3423 /// \headerfile <x86intrin.h>
3424 ///
3425 /// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
3426 ///
3427 /// \param __a
3428 ///    A 128-bit vector of [4 x float].
3429 /// \returns A 128-bit integer vector of [4 x i32] containing the converted
3430 ///    values.
3431 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtps_epi32(__m128 __a)3432 _mm_cvtps_epi32(__m128 __a)
3433 {
3434   return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
3435 }
3436 
3437 /// Converts a vector of [4 x float] into a vector of [4 x i32],
3438 ///    truncating the result when it is inexact.
3439 ///
3440 /// \headerfile <x86intrin.h>
3441 ///
3442 /// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
3443 ///   instruction.
3444 ///
3445 /// \param __a
3446 ///    A 128-bit vector of [4 x float].
3447 /// \returns A 128-bit vector of [4 x i32] containing the converted values.
3448 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvttps_epi32(__m128 __a)3449 _mm_cvttps_epi32(__m128 __a)
3450 {
3451   return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
3452 }
3453 
3454 /// Returns a vector of [4 x i32] where the lowest element is the input
3455 ///    operand and the remaining elements are zero.
3456 ///
3457 /// \headerfile <x86intrin.h>
3458 ///
3459 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3460 ///
3461 /// \param __a
3462 ///    A 32-bit signed integer operand.
3463 /// \returns A 128-bit vector of [4 x i32].
3464 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtsi32_si128(int __a)3465 _mm_cvtsi32_si128(int __a)
3466 {
3467   return __extension__ (__m128i)(__v4si){ __a, 0, 0, 0 };
3468 }
3469 
3470 #ifdef __x86_64__
3471 /// Returns a vector of [2 x i64] where the lower element is the input
3472 ///    operand and the upper element is zero.
3473 ///
3474 /// \headerfile <x86intrin.h>
3475 ///
3476 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3477 ///
3478 /// \param __a
3479 ///    A 64-bit signed integer operand containing the value to be converted.
3480 /// \returns A 128-bit vector of [2 x i64] containing the converted value.
3481 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtsi64_si128(long long __a)3482 _mm_cvtsi64_si128(long long __a)
3483 {
3484   return __extension__ (__m128i)(__v2di){ __a, 0 };
3485 }
3486 #endif
3487 
3488 /// Moves the least significant 32 bits of a vector of [4 x i32] to a
3489 ///    32-bit signed integer value.
3490 ///
3491 /// \headerfile <x86intrin.h>
3492 ///
3493 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3494 ///
3495 /// \param __a
3496 ///    A vector of [4 x i32]. The least significant 32 bits are moved to the
3497 ///    destination.
3498 /// \returns A 32-bit signed integer containing the moved value.
3499 static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvtsi128_si32(__m128i __a)3500 _mm_cvtsi128_si32(__m128i __a)
3501 {
3502   __v4si __b = (__v4si)__a;
3503   return __b[0];
3504 }
3505 
3506 #ifdef __x86_64__
3507 /// Moves the least significant 64 bits of a vector of [2 x i64] to a
3508 ///    64-bit signed integer value.
3509 ///
3510 /// \headerfile <x86intrin.h>
3511 ///
3512 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3513 ///
3514 /// \param __a
3515 ///    A vector of [2 x i64]. The least significant 64 bits are moved to the
3516 ///    destination.
3517 /// \returns A 64-bit signed integer containing the moved value.
3518 static __inline__ long long __DEFAULT_FN_ATTRS
_mm_cvtsi128_si64(__m128i __a)3519 _mm_cvtsi128_si64(__m128i __a)
3520 {
3521   return __a[0];
3522 }
3523 #endif
3524 
3525 /// Moves packed integer values from an aligned 128-bit memory location
3526 ///    to elements in a 128-bit integer vector.
3527 ///
3528 /// \headerfile <x86intrin.h>
3529 ///
3530 /// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
3531 ///
3532 /// \param __p
3533 ///    An aligned pointer to a memory location containing integer values.
3534 /// \returns A 128-bit integer vector containing the moved values.
3535 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_load_si128(__m128i const * __p)3536 _mm_load_si128(__m128i const *__p)
3537 {
3538   return *__p;
3539 }
3540 
3541 /// Moves packed integer values from an unaligned 128-bit memory location
3542 ///    to elements in a 128-bit integer vector.
3543 ///
3544 /// \headerfile <x86intrin.h>
3545 ///
3546 /// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
3547 ///
3548 /// \param __p
3549 ///    A pointer to a memory location containing integer values.
3550 /// \returns A 128-bit integer vector containing the moved values.
3551 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadu_si128(__m128i_u const * __p)3552 _mm_loadu_si128(__m128i_u const *__p)
3553 {
3554   struct __loadu_si128 {
3555     __m128i_u __v;
3556   } __attribute__((__packed__, __may_alias__));
3557   return ((const struct __loadu_si128*)__p)->__v;
3558 }
3559 
3560 /// Returns a vector of [2 x i64] where the lower element is taken from
3561 ///    the lower element of the operand, and the upper element is zero.
3562 ///
3563 /// \headerfile <x86intrin.h>
3564 ///
3565 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3566 ///
3567 /// \param __p
3568 ///    A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
3569 ///    the destination.
3570 /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
3571 ///    moved value. The higher order bits are cleared.
3572 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadl_epi64(__m128i_u const * __p)3573 _mm_loadl_epi64(__m128i_u const *__p)
3574 {
3575   struct __mm_loadl_epi64_struct {
3576     long long __u;
3577   } __attribute__((__packed__, __may_alias__));
3578   return __extension__ (__m128i) { ((const struct __mm_loadl_epi64_struct*)__p)->__u, 0};
3579 }
3580 
3581 /// Generates a 128-bit vector of [4 x i32] with unspecified content.
3582 ///    This could be used as an argument to another intrinsic function where the
3583 ///    argument is required but the value is not actually used.
3584 ///
3585 /// \headerfile <x86intrin.h>
3586 ///
3587 /// This intrinsic has no corresponding instruction.
3588 ///
3589 /// \returns A 128-bit vector of [4 x i32] with unspecified content.
3590 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_undefined_si128(void)3591 _mm_undefined_si128(void)
3592 {
3593   return (__m128i)__builtin_ia32_undef128();
3594 }
3595 
3596 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3597 ///    the specified 64-bit integer values.
3598 ///
3599 /// \headerfile <x86intrin.h>
3600 ///
3601 /// This intrinsic is a utility function and does not correspond to a specific
3602 ///    instruction.
3603 ///
3604 /// \param __q1
3605 ///    A 64-bit integer value used to initialize the upper 64 bits of the
3606 ///    destination vector of [2 x i64].
3607 /// \param __q0
3608 ///    A 64-bit integer value used to initialize the lower 64 bits of the
3609 ///    destination vector of [2 x i64].
3610 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3611 ///    provided in the operands.
3612 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi64x(long long __q1,long long __q0)3613 _mm_set_epi64x(long long __q1, long long __q0)
3614 {
3615   return __extension__ (__m128i)(__v2di){ __q0, __q1 };
3616 }
3617 
3618 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3619 ///    the specified 64-bit integer values.
3620 ///
3621 /// \headerfile <x86intrin.h>
3622 ///
3623 /// This intrinsic is a utility function and does not correspond to a specific
3624 ///    instruction.
3625 ///
3626 /// \param __q1
3627 ///    A 64-bit integer value used to initialize the upper 64 bits of the
3628 ///    destination vector of [2 x i64].
3629 /// \param __q0
3630 ///    A 64-bit integer value used to initialize the lower 64 bits of the
3631 ///    destination vector of [2 x i64].
3632 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3633 ///    provided in the operands.
3634 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi64(__m64 __q1,__m64 __q0)3635 _mm_set_epi64(__m64 __q1, __m64 __q0)
3636 {
3637   return _mm_set_epi64x((long long)__q1, (long long)__q0);
3638 }
3639 
3640 /// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
3641 ///    the specified 32-bit integer values.
3642 ///
3643 /// \headerfile <x86intrin.h>
3644 ///
3645 /// This intrinsic is a utility function and does not correspond to a specific
3646 ///    instruction.
3647 ///
3648 /// \param __i3
3649 ///    A 32-bit integer value used to initialize bits [127:96] of the
3650 ///    destination vector.
3651 /// \param __i2
3652 ///    A 32-bit integer value used to initialize bits [95:64] of the destination
3653 ///    vector.
3654 /// \param __i1
3655 ///    A 32-bit integer value used to initialize bits [63:32] of the destination
3656 ///    vector.
3657 /// \param __i0
3658 ///    A 32-bit integer value used to initialize bits [31:0] of the destination
3659 ///    vector.
3660 /// \returns An initialized 128-bit vector of [4 x i32] containing the values
3661 ///    provided in the operands.
3662 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi32(int __i3,int __i2,int __i1,int __i0)3663 _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
3664 {
3665   return __extension__ (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
3666 }
3667 
3668 /// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
3669 ///    the specified 16-bit integer values.
3670 ///
3671 /// \headerfile <x86intrin.h>
3672 ///
3673 /// This intrinsic is a utility function and does not correspond to a specific
3674 ///    instruction.
3675 ///
3676 /// \param __w7
3677 ///    A 16-bit integer value used to initialize bits [127:112] of the
3678 ///    destination vector.
3679 /// \param __w6
3680 ///    A 16-bit integer value used to initialize bits [111:96] of the
3681 ///    destination vector.
3682 /// \param __w5
3683 ///    A 16-bit integer value used to initialize bits [95:80] of the destination
3684 ///    vector.
3685 /// \param __w4
3686 ///    A 16-bit integer value used to initialize bits [79:64] of the destination
3687 ///    vector.
3688 /// \param __w3
3689 ///    A 16-bit integer value used to initialize bits [63:48] of the destination
3690 ///    vector.
3691 /// \param __w2
3692 ///    A 16-bit integer value used to initialize bits [47:32] of the destination
3693 ///    vector.
3694 /// \param __w1
3695 ///    A 16-bit integer value used to initialize bits [31:16] of the destination
3696 ///    vector.
3697 /// \param __w0
3698 ///    A 16-bit integer value used to initialize bits [15:0] of the destination
3699 ///    vector.
3700 /// \returns An initialized 128-bit vector of [8 x i16] containing the values
3701 ///    provided in the operands.
3702 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi16(short __w7,short __w6,short __w5,short __w4,short __w3,short __w2,short __w1,short __w0)3703 _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
3704 {
3705   return __extension__ (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
3706 }
3707 
3708 /// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
3709 ///    the specified 8-bit integer values.
3710 ///
3711 /// \headerfile <x86intrin.h>
3712 ///
3713 /// This intrinsic is a utility function and does not correspond to a specific
3714 ///    instruction.
3715 ///
3716 /// \param __b15
3717 ///    Initializes bits [127:120] of the destination vector.
3718 /// \param __b14
3719 ///    Initializes bits [119:112] of the destination vector.
3720 /// \param __b13
3721 ///    Initializes bits [111:104] of the destination vector.
3722 /// \param __b12
3723 ///    Initializes bits [103:96] of the destination vector.
3724 /// \param __b11
3725 ///    Initializes bits [95:88] of the destination vector.
3726 /// \param __b10
3727 ///    Initializes bits [87:80] of the destination vector.
3728 /// \param __b9
3729 ///    Initializes bits [79:72] of the destination vector.
3730 /// \param __b8
3731 ///    Initializes bits [71:64] of the destination vector.
3732 /// \param __b7
3733 ///    Initializes bits [63:56] of the destination vector.
3734 /// \param __b6
3735 ///    Initializes bits [55:48] of the destination vector.
3736 /// \param __b5
3737 ///    Initializes bits [47:40] of the destination vector.
3738 /// \param __b4
3739 ///    Initializes bits [39:32] of the destination vector.
3740 /// \param __b3
3741 ///    Initializes bits [31:24] of the destination vector.
3742 /// \param __b2
3743 ///    Initializes bits [23:16] of the destination vector.
3744 /// \param __b1
3745 ///    Initializes bits [15:8] of the destination vector.
3746 /// \param __b0
3747 ///    Initializes bits [7:0] of the destination vector.
3748 /// \returns An initialized 128-bit vector of [16 x i8] containing the values
3749 ///    provided in the operands.
3750 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi8(char __b15,char __b14,char __b13,char __b12,char __b11,char __b10,char __b9,char __b8,char __b7,char __b6,char __b5,char __b4,char __b3,char __b2,char __b1,char __b0)3751 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
3752 {
3753   return __extension__ (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
3754 }
3755 
3756 /// Initializes both values in a 128-bit integer vector with the
3757 ///    specified 64-bit integer value.
3758 ///
3759 /// \headerfile <x86intrin.h>
3760 ///
3761 /// This intrinsic is a utility function and does not correspond to a specific
3762 ///    instruction.
3763 ///
3764 /// \param __q
3765 ///    Integer value used to initialize the elements of the destination integer
3766 ///    vector.
3767 /// \returns An initialized 128-bit integer vector of [2 x i64] with both
3768 ///    elements containing the value provided in the operand.
3769 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi64x(long long __q)3770 _mm_set1_epi64x(long long __q)
3771 {
3772   return _mm_set_epi64x(__q, __q);
3773 }
3774 
3775 /// Initializes both values in a 128-bit vector of [2 x i64] with the
3776 ///    specified 64-bit value.
3777 ///
3778 /// \headerfile <x86intrin.h>
3779 ///
3780 /// This intrinsic is a utility function and does not correspond to a specific
3781 ///    instruction.
3782 ///
3783 /// \param __q
3784 ///    A 64-bit value used to initialize the elements of the destination integer
3785 ///    vector.
3786 /// \returns An initialized 128-bit vector of [2 x i64] with all elements
3787 ///    containing the value provided in the operand.
3788 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi64(__m64 __q)3789 _mm_set1_epi64(__m64 __q)
3790 {
3791   return _mm_set_epi64(__q, __q);
3792 }
3793 
3794 /// Initializes all values in a 128-bit vector of [4 x i32] with the
3795 ///    specified 32-bit value.
3796 ///
3797 /// \headerfile <x86intrin.h>
3798 ///
3799 /// This intrinsic is a utility function and does not correspond to a specific
3800 ///    instruction.
3801 ///
3802 /// \param __i
3803 ///    A 32-bit value used to initialize the elements of the destination integer
3804 ///    vector.
3805 /// \returns An initialized 128-bit vector of [4 x i32] with all elements
3806 ///    containing the value provided in the operand.
3807 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi32(int __i)3808 _mm_set1_epi32(int __i)
3809 {
3810   return _mm_set_epi32(__i, __i, __i, __i);
3811 }
3812 
3813 /// Initializes all values in a 128-bit vector of [8 x i16] with the
3814 ///    specified 16-bit value.
3815 ///
3816 /// \headerfile <x86intrin.h>
3817 ///
3818 /// This intrinsic is a utility function and does not correspond to a specific
3819 ///    instruction.
3820 ///
3821 /// \param __w
3822 ///    A 16-bit value used to initialize the elements of the destination integer
3823 ///    vector.
3824 /// \returns An initialized 128-bit vector of [8 x i16] with all elements
3825 ///    containing the value provided in the operand.
3826 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi16(short __w)3827 _mm_set1_epi16(short __w)
3828 {
3829   return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
3830 }
3831 
3832 /// Initializes all values in a 128-bit vector of [16 x i8] with the
3833 ///    specified 8-bit value.
3834 ///
3835 /// \headerfile <x86intrin.h>
3836 ///
3837 /// This intrinsic is a utility function and does not correspond to a specific
3838 ///    instruction.
3839 ///
3840 /// \param __b
3841 ///    An 8-bit value used to initialize the elements of the destination integer
3842 ///    vector.
3843 /// \returns An initialized 128-bit vector of [16 x i8] with all elements
3844 ///    containing the value provided in the operand.
3845 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi8(char __b)3846 _mm_set1_epi8(char __b)
3847 {
3848   return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b);
3849 }
3850 
3851 /// Constructs a 128-bit integer vector, initialized in reverse order
3852 ///     with the specified 64-bit integral values.
3853 ///
3854 /// \headerfile <x86intrin.h>
3855 ///
3856 /// This intrinsic does not correspond to a specific instruction.
3857 ///
3858 /// \param __q0
3859 ///    A 64-bit integral value used to initialize the lower 64 bits of the
3860 ///    result.
3861 /// \param __q1
3862 ///    A 64-bit integral value used to initialize the upper 64 bits of the
3863 ///    result.
3864 /// \returns An initialized 128-bit integer vector.
3865 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi64(__m64 __q0,__m64 __q1)3866 _mm_setr_epi64(__m64 __q0, __m64 __q1)
3867 {
3868   return _mm_set_epi64(__q1, __q0);
3869 }
3870 
3871 /// Constructs a 128-bit integer vector, initialized in reverse order
3872 ///     with the specified 32-bit integral values.
3873 ///
3874 /// \headerfile <x86intrin.h>
3875 ///
3876 /// This intrinsic is a utility function and does not correspond to a specific
3877 ///    instruction.
3878 ///
3879 /// \param __i0
3880 ///    A 32-bit integral value used to initialize bits [31:0] of the result.
3881 /// \param __i1
3882 ///    A 32-bit integral value used to initialize bits [63:32] of the result.
3883 /// \param __i2
3884 ///    A 32-bit integral value used to initialize bits [95:64] of the result.
3885 /// \param __i3
3886 ///    A 32-bit integral value used to initialize bits [127:96] of the result.
3887 /// \returns An initialized 128-bit integer vector.
3888 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi32(int __i0,int __i1,int __i2,int __i3)3889 _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
3890 {
3891   return _mm_set_epi32(__i3, __i2, __i1, __i0);
3892 }
3893 
3894 /// Constructs a 128-bit integer vector, initialized in reverse order
3895 ///     with the specified 16-bit integral values.
3896 ///
3897 /// \headerfile <x86intrin.h>
3898 ///
3899 /// This intrinsic is a utility function and does not correspond to a specific
3900 ///    instruction.
3901 ///
3902 /// \param __w0
3903 ///    A 16-bit integral value used to initialize bits [15:0] of the result.
3904 /// \param __w1
3905 ///    A 16-bit integral value used to initialize bits [31:16] of the result.
3906 /// \param __w2
3907 ///    A 16-bit integral value used to initialize bits [47:32] of the result.
3908 /// \param __w3
3909 ///    A 16-bit integral value used to initialize bits [63:48] of the result.
3910 /// \param __w4
3911 ///    A 16-bit integral value used to initialize bits [79:64] of the result.
3912 /// \param __w5
3913 ///    A 16-bit integral value used to initialize bits [95:80] of the result.
3914 /// \param __w6
3915 ///    A 16-bit integral value used to initialize bits [111:96] of the result.
3916 /// \param __w7
3917 ///    A 16-bit integral value used to initialize bits [127:112] of the result.
3918 /// \returns An initialized 128-bit integer vector.
3919 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi16(short __w0,short __w1,short __w2,short __w3,short __w4,short __w5,short __w6,short __w7)3920 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
3921 {
3922   return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
3923 }
3924 
3925 /// Constructs a 128-bit integer vector, initialized in reverse order
3926 ///     with the specified 8-bit integral values.
3927 ///
3928 /// \headerfile <x86intrin.h>
3929 ///
3930 /// This intrinsic is a utility function and does not correspond to a specific
3931 ///    instruction.
3932 ///
3933 /// \param __b0
3934 ///    An 8-bit integral value used to initialize bits [7:0] of the result.
3935 /// \param __b1
3936 ///    An 8-bit integral value used to initialize bits [15:8] of the result.
3937 /// \param __b2
3938 ///    An 8-bit integral value used to initialize bits [23:16] of the result.
3939 /// \param __b3
3940 ///    An 8-bit integral value used to initialize bits [31:24] of the result.
3941 /// \param __b4
3942 ///    An 8-bit integral value used to initialize bits [39:32] of the result.
3943 /// \param __b5
3944 ///    An 8-bit integral value used to initialize bits [47:40] of the result.
3945 /// \param __b6
3946 ///    An 8-bit integral value used to initialize bits [55:48] of the result.
3947 /// \param __b7
3948 ///    An 8-bit integral value used to initialize bits [63:56] of the result.
3949 /// \param __b8
3950 ///    An 8-bit integral value used to initialize bits [71:64] of the result.
3951 /// \param __b9
3952 ///    An 8-bit integral value used to initialize bits [79:72] of the result.
3953 /// \param __b10
3954 ///    An 8-bit integral value used to initialize bits [87:80] of the result.
3955 /// \param __b11
3956 ///    An 8-bit integral value used to initialize bits [95:88] of the result.
3957 /// \param __b12
3958 ///    An 8-bit integral value used to initialize bits [103:96] of the result.
3959 /// \param __b13
3960 ///    An 8-bit integral value used to initialize bits [111:104] of the result.
3961 /// \param __b14
3962 ///    An 8-bit integral value used to initialize bits [119:112] of the result.
3963 /// \param __b15
3964 ///    An 8-bit integral value used to initialize bits [127:120] of the result.
3965 /// \returns An initialized 128-bit integer vector.
3966 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi8(char __b0,char __b1,char __b2,char __b3,char __b4,char __b5,char __b6,char __b7,char __b8,char __b9,char __b10,char __b11,char __b12,char __b13,char __b14,char __b15)3967 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
3968 {
3969   return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8, __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
3970 }
3971 
3972 /// Creates a 128-bit integer vector initialized to zero.
3973 ///
3974 /// \headerfile <x86intrin.h>
3975 ///
3976 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
3977 ///
3978 /// \returns An initialized 128-bit integer vector with all elements set to
3979 ///    zero.
3980 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setzero_si128(void)3981 _mm_setzero_si128(void)
3982 {
3983   return __extension__ (__m128i)(__v2di){ 0LL, 0LL };
3984 }
3985 
3986 /// Stores a 128-bit integer vector to a memory location aligned on a
3987 ///    128-bit boundary.
3988 ///
3989 /// \headerfile <x86intrin.h>
3990 ///
3991 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
3992 ///
3993 /// \param __p
3994 ///    A pointer to an aligned memory location that will receive the integer
3995 ///    values.
3996 /// \param __b
3997 ///    A 128-bit integer vector containing the values to be moved.
3998 static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_si128(__m128i * __p,__m128i __b)3999 _mm_store_si128(__m128i *__p, __m128i __b)
4000 {
4001   *__p = __b;
4002 }
4003 
4004 /// Stores a 128-bit integer vector to an unaligned memory location.
4005 ///
4006 /// \headerfile <x86intrin.h>
4007 ///
4008 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
4009 ///
4010 /// \param __p
4011 ///    A pointer to a memory location that will receive the integer values.
4012 /// \param __b
4013 ///    A 128-bit integer vector containing the values to be moved.
4014 static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_si128(__m128i_u * __p,__m128i __b)4015 _mm_storeu_si128(__m128i_u *__p, __m128i __b)
4016 {
4017   struct __storeu_si128 {
4018     __m128i_u __v;
4019   } __attribute__((__packed__, __may_alias__));
4020   ((struct __storeu_si128*)__p)->__v = __b;
4021 }
4022 
4023 /// Stores a 64-bit integer value from the low element of a 128-bit integer
4024 ///    vector.
4025 ///
4026 /// \headerfile <x86intrin.h>
4027 ///
4028 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4029 ///
4030 /// \param __p
4031 ///    A pointer to a 64-bit memory location. The address of the memory
4032 ///    location does not have to be aligned.
4033 /// \param __b
4034 ///    A 128-bit integer vector containing the value to be stored.
4035 static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_si64(void * __p,__m128i __b)4036 _mm_storeu_si64(void *__p, __m128i __b)
4037 {
4038   struct __storeu_si64 {
4039     long long __v;
4040   } __attribute__((__packed__, __may_alias__));
4041   ((struct __storeu_si64*)__p)->__v = ((__v2di)__b)[0];
4042 }
4043 
4044 /// Stores a 32-bit integer value from the low element of a 128-bit integer
4045 ///    vector.
4046 ///
4047 /// \headerfile <x86intrin.h>
4048 ///
4049 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
4050 ///
4051 /// \param __p
4052 ///    A pointer to a 32-bit memory location. The address of the memory
4053 ///    location does not have to be aligned.
4054 /// \param __b
4055 ///    A 128-bit integer vector containing the value to be stored.
4056 static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_si32(void * __p,__m128i __b)4057 _mm_storeu_si32(void *__p, __m128i __b)
4058 {
4059   struct __storeu_si32 {
4060     int __v;
4061   } __attribute__((__packed__, __may_alias__));
4062   ((struct __storeu_si32*)__p)->__v = ((__v4si)__b)[0];
4063 }
4064 
4065 /// Stores a 16-bit integer value from the low element of a 128-bit integer
4066 ///    vector.
4067 ///
4068 /// \headerfile <x86intrin.h>
4069 ///
4070 /// This intrinsic does not correspond to a specific instruction.
4071 ///
4072 /// \param __p
4073 ///    A pointer to a 16-bit memory location. The address of the memory
4074 ///    location does not have to be aligned.
4075 /// \param __b
4076 ///    A 128-bit integer vector containing the value to be stored.
4077 static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_si16(void * __p,__m128i __b)4078 _mm_storeu_si16(void *__p, __m128i __b)
4079 {
4080   struct __storeu_si16 {
4081     short __v;
4082   } __attribute__((__packed__, __may_alias__));
4083   ((struct __storeu_si16*)__p)->__v = ((__v8hi)__b)[0];
4084 }
4085 
4086 /// Moves bytes selected by the mask from the first operand to the
4087 ///    specified unaligned memory location. When a mask bit is 1, the
4088 ///    corresponding byte is written, otherwise it is not written.
4089 ///
4090 ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
4091 ///    used again soon). Exception and trap behavior for elements not selected
4092 ///    for storage to memory are implementation dependent.
4093 ///
4094 /// \headerfile <x86intrin.h>
4095 ///
4096 /// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
4097 ///   instruction.
4098 ///
4099 /// \param __d
4100 ///    A 128-bit integer vector containing the values to be moved.
4101 /// \param __n
///    A 128-bit integer vector containing the mask. The most significant bit of
///    each byte is the mask bit for the corresponding byte of \a __d.
4104 /// \param __p
4105 ///    A pointer to an unaligned 128-bit memory location where the specified
4106 ///    values are moved.
4107 static __inline__ void __DEFAULT_FN_ATTRS
_mm_maskmoveu_si128(__m128i __d,__m128i __n,char * __p)4108 _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
4109 {
4110   __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
4111 }
4112 
4113 /// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
4114 ///    a memory location.
4115 ///
4116 /// \headerfile <x86intrin.h>
4117 ///
4118 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
4119 ///
4120 /// \param __p
4121 ///    A pointer to a 64-bit memory location that will receive the lower 64 bits
4122 ///    of the integer vector parameter.
4123 /// \param __a
4124 ///    A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
4125 ///    value to be stored.
4126 static __inline__ void __DEFAULT_FN_ATTRS
_mm_storel_epi64(__m128i_u * __p,__m128i __a)4127 _mm_storel_epi64(__m128i_u *__p, __m128i __a)
4128 {
4129   struct __mm_storel_epi64_struct {
4130     long long __u;
4131   } __attribute__((__packed__, __may_alias__));
4132   ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
4133 }
4134 
4135 /// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
4136 ///    aligned memory location.
4137 ///
4138 ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
4139 ///    used again soon).
4140 ///
4141 /// \headerfile <x86intrin.h>
4142 ///
4143 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
4144 ///
4145 /// \param __p
4146 ///    A pointer to the 128-bit aligned memory location used to store the value.
4147 /// \param __a
4148 ///    A vector of [2 x double] containing the 64-bit values to be stored.
4149 static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_pd(double * __p,__m128d __a)4150 _mm_stream_pd(double *__p, __m128d __a)
4151 {
4152   __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p);
4153 }
4154 
4155 /// Stores a 128-bit integer vector to a 128-bit aligned memory location.
4156 ///
4157 ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
4158 ///    used again soon).
4159 ///
4160 /// \headerfile <x86intrin.h>
4161 ///
4162 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
4163 ///
4164 /// \param __p
4165 ///    A pointer to the 128-bit aligned memory location used to store the value.
4166 /// \param __a
4167 ///    A 128-bit integer vector containing the values to be stored.
4168 static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_si128(__m128i * __p,__m128i __a)4169 _mm_stream_si128(__m128i *__p, __m128i __a)
4170 {
4171   __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
4172 }
4173 
4174 /// Stores a 32-bit integer value in the specified memory location.
4175 ///
4176 ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
4177 ///    used again soon).
4178 ///
4179 /// \headerfile <x86intrin.h>
4180 ///
4181 /// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
4182 ///
4183 /// \param __p
4184 ///    A pointer to the 32-bit memory location used to store the value.
4185 /// \param __a
4186 ///    A 32-bit integer containing the value to be stored.
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
_mm_stream_si32(int *__p, int __a)
{
  /* Scalar store: the attributes are spelled out here instead of using
     __DEFAULT_FN_ATTRS, so no __min_vector_width__ is attached. */
  __builtin_ia32_movnti(__p, __a);
}
4192 
#ifdef __x86_64__
/// Stores a 64-bit integer value in the specified memory location.
///
///    To minimize caching, the data is flagged as non-temporal (unlikely to be
///    used again soon).
///
///    This intrinsic is only declared when compiling for x86-64.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
///
/// \param __p
///    A pointer to the 64-bit memory location used to store the value.
/// \param __a
///    A 64-bit integer containing the value to be stored.
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
_mm_stream_si64(long long *__p, long long __a)
{
  /* Scalar store: the attributes are spelled out here instead of using
     __DEFAULT_FN_ATTRS, so no __min_vector_width__ is attached. */
  __builtin_ia32_movnti64(__p, __a);
}
#endif
4213 
#if defined(__cplusplus)
extern "C" {
#endif

/// Flushes the cache line that contains \a __p and invalidates it in every
///    cache of the coherency domain.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
///
/// \param __p
///    A pointer to the memory location that identifies the cache line to be
///    flushed.
void _mm_clflush(const void *__p);

/// Forces strong memory ordering (serialization) between the load
///    instructions that precede this instruction and the load instructions
///    that follow it, so that all earlier loads complete before any later
///    load executes.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
///
void _mm_lfence(void);

/// Forces strong memory ordering (serialization) between the load and store
///    instructions that precede this instruction and the load and store
///    instructions that follow it, so that all earlier memory accesses
///    complete before any later memory access executes.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
///
void _mm_mfence(void);

#if defined(__cplusplus)
} // extern "C"
#endif
4255 
4256 /// Converts 16-bit signed integers from both 128-bit integer vector
4257 ///    operands into 8-bit signed integers, and packs the results into the
4258 ///    destination. Positive values greater than 0x7F are saturated to 0x7F.
4259 ///    Negative values less than 0x80 are saturated to 0x80.
4260 ///
4261 /// \headerfile <x86intrin.h>
4262 ///
4263 /// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
4264 ///
4265 /// \param __a
4266 ///   A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4267 ///   a signed integer and is converted to a 8-bit signed integer with
4268 ///   saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4269 ///   than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4270 ///   written to the lower 64 bits of the result.
4271 /// \param __b
4272 ///   A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4273 ///   a signed integer and is converted to a 8-bit signed integer with
4274 ///   saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4275 ///   than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4276 ///   written to the higher 64 bits of the result.
4277 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4278 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packs_epi16(__m128i __a,__m128i __b)4279 _mm_packs_epi16(__m128i __a, __m128i __b)
4280 {
4281   return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
4282 }
4283 
4284 /// Converts 32-bit signed integers from both 128-bit integer vector
4285 ///    operands into 16-bit signed integers, and packs the results into the
4286 ///    destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
4287 ///    Negative values less than 0x8000 are saturated to 0x8000.
4288 ///
4289 /// \headerfile <x86intrin.h>
4290 ///
4291 /// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
4292 ///
4293 /// \param __a
4294 ///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4295 ///    a signed integer and is converted to a 16-bit signed integer with
4296 ///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4297 ///    less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4298 ///    are written to the lower 64 bits of the result.
4299 /// \param __b
4300 ///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4301 ///    a signed integer and is converted to a 16-bit signed integer with
4302 ///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4303 ///    less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4304 ///    are written to the higher 64 bits of the result.
4305 /// \returns A 128-bit vector of [8 x i16] containing the converted values.
4306 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packs_epi32(__m128i __a,__m128i __b)4307 _mm_packs_epi32(__m128i __a, __m128i __b)
4308 {
4309   return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
4310 }
4311 
4312 /// Converts 16-bit signed integers from both 128-bit integer vector
4313 ///    operands into 8-bit unsigned integers, and packs the results into the
4314 ///    destination. Values greater than 0xFF are saturated to 0xFF. Values less
4315 ///    than 0x00 are saturated to 0x00.
4316 ///
4317 /// \headerfile <x86intrin.h>
4318 ///
4319 /// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
4320 ///
4321 /// \param __a
4322 ///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4323 ///    a signed integer and is converted to an 8-bit unsigned integer with
4324 ///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4325 ///    than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4326 ///    written to the lower 64 bits of the result.
4327 /// \param __b
4328 ///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4329 ///    a signed integer and is converted to an 8-bit unsigned integer with
4330 ///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4331 ///    than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4332 ///    written to the higher 64 bits of the result.
4333 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4334 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packus_epi16(__m128i __a,__m128i __b)4335 _mm_packus_epi16(__m128i __a, __m128i __b)
4336 {
4337   return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
4338 }
4339 
/// Extracts one 16-bit element from a 128-bit integer vector of [8 x i16],
///    chosen by the immediate-value parameter, and zero-extends it to an
///    integer.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 128-bit integer vector.
/// \param imm
///    An immediate value. Bits [2:0] select the value from \a a that is
///    assigned to bits [15:0] of the result. \n
///    000: assign values from bits [15:0] of \a a. \n
///    001: assign values from bits [31:16] of \a a. \n
///    010: assign values from bits [47:32] of \a a. \n
///    011: assign values from bits [63:48] of \a a. \n
///    100: assign values from bits [79:64] of \a a. \n
///    101: assign values from bits [95:80] of \a a. \n
///    110: assign values from bits [111:96] of \a a. \n
///    111: assign values from bits [127:112] of \a a.
/// \returns An integer, whose lower 16 bits are selected from the 128-bit
///    integer vector parameter and the remaining bits are assigned zeros.
#define _mm_extract_epi16(a, imm)                                              \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi(                           \
      (__v8hi)(__m128i)(a), (int)(imm)))
4365 
/// Constructs a 128-bit integer vector by copying the 128-bit integer vector
///    parameter and then replacing one of its 16-bit elements, chosen by the
///    immediate-value parameter, with the lower 16 bits of an integer
///    parameter.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. This vector is copied to the
///    result and then one of the eight elements in the result is replaced by
///    the lower 16 bits of \a b.
/// \param b
///    An integer. The lower 16 bits of this parameter are written to the
///    element of the result selected by \a imm.
/// \param imm
///    An immediate value. Bits [2:0] select which of the eight 16-bit
///    elements of the result receives the lower 16 bits of \a b; element N
///    occupies bits [16*N+15:16*N].
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi16(a, b, imm)                                            \
  ((__m128i)__builtin_ia32_vec_set_v8hi(                                       \
      (__v8hi)(__m128i)(a), (int)(b), (int)(imm)))
4389 
4390 /// Copies the values of the most significant bits from each 8-bit
4391 ///    element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
4392 ///    value, zero-extends the value, and writes it to the destination.
4393 ///
4394 /// \headerfile <x86intrin.h>
4395 ///
4396 /// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
4397 ///
4398 /// \param __a
4399 ///    A 128-bit integer vector containing the values with bits to be extracted.
4400 /// \returns The most significant bits from each 8-bit element in \a __a,
4401 ///    written to bits [15:0]. The other bits are assigned zeros.
4402 static __inline__ int __DEFAULT_FN_ATTRS
_mm_movemask_epi8(__m128i __a)4403 _mm_movemask_epi8(__m128i __a)
4404 {
4405   return __builtin_ia32_pmovmskb128((__v16qi)__a);
4406 }
4407 
/// Constructs a 128-bit integer vector by shuffling the four 32-bit elements
///    of a 128-bit integer vector parameter, with the immediate-value
///    parameter specifying the source of each result element.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the values to be copied.
/// \param imm
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a. The destinations within the 128-bit destination are
///    assigned values as follows: \n
///    Bits [1:0] are used to assign values to bits [31:0] of the result. \n
///    Bits [3:2] are used to assign values to bits [63:32] of the result. \n
///    Bits [5:4] are used to assign values to bits [95:64] of the result. \n
///    Bits [7:6] are used to assign values to bits [127:96] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [31:0] of \a a. \n
///    01: assign values from bits [63:32] of \a a. \n
///    10: assign values from bits [95:64] of \a a. \n
///    11: assign values from bits [127:96] of \a a.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shuffle_epi32(a, imm)                                              \
  ((__m128i)__builtin_ia32_pshufd(                                             \
      (__v4si)(__m128i)(a), (int)(imm)))
4438 
/// Constructs a 128-bit integer vector by shuffling four lower 16-bit
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
///    value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
///    [127:64] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits [1:0] are used to assign values to bits [15:0] of the result. \n
///    Bits [3:2] are used to assign values to bits [31:16] of the result. \n
///    Bits [5:4] are used to assign values to bits [47:32] of the result. \n
///    Bits [7:6] are used to assign values to bits [63:48] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [15:0] of \a a. \n
///    01: assign values from bits [31:16] of \a a. \n
///    10: assign values from bits [47:32] of \a a. \n
///    11: assign values from bits [63:48] of \a a.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflelo_epi16(a, imm) \
  ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
4468 
/// Constructs a 128-bit integer vector by shuffling four upper 16-bit
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
///    value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
///    [63:0] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits [1:0] are used to assign values to bits [79:64] of the result. \n
///    Bits [3:2] are used to assign values to bits [95:80] of the result. \n
///    Bits [5:4] are used to assign values to bits [111:96] of the result. \n
///    Bits [7:6] are used to assign values to bits [127:112] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [79:64] of \a a. \n
///    01: assign values from bits [95:80] of \a a. \n
///    10: assign values from bits [111:96] of \a a. \n
///    11: assign values from bits [127:112] of \a a.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflehi_epi16(a, imm) \
  ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
4498 
4499 /// Unpacks the high-order (index 8-15) values from two 128-bit vectors
4500 ///    of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4501 ///
4502 /// \headerfile <x86intrin.h>
4503 ///
4504 /// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
4505 ///   instruction.
4506 ///
4507 /// \param __a
4508 ///    A 128-bit vector of [16 x i8].
4509 ///    Bits [71:64] are written to bits [7:0] of the result. \n
4510 ///    Bits [79:72] are written to bits [23:16] of the result. \n
4511 ///    Bits [87:80] are written to bits [39:32] of the result. \n
4512 ///    Bits [95:88] are written to bits [55:48] of the result. \n
4513 ///    Bits [103:96] are written to bits [71:64] of the result. \n
4514 ///    Bits [111:104] are written to bits [87:80] of the result. \n
4515 ///    Bits [119:112] are written to bits [103:96] of the result. \n
4516 ///    Bits [127:120] are written to bits [119:112] of the result.
4517 /// \param __b
4518 ///    A 128-bit vector of [16 x i8]. \n
4519 ///    Bits [71:64] are written to bits [15:8] of the result. \n
4520 ///    Bits [79:72] are written to bits [31:24] of the result. \n
4521 ///    Bits [87:80] are written to bits [47:40] of the result. \n
4522 ///    Bits [95:88] are written to bits [63:56] of the result. \n
4523 ///    Bits [103:96] are written to bits [79:72] of the result. \n
4524 ///    Bits [111:104] are written to bits [95:88] of the result. \n
4525 ///    Bits [119:112] are written to bits [111:104] of the result. \n
4526 ///    Bits [127:120] are written to bits [127:120] of the result.
4527 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4528 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi8(__m128i __a,__m128i __b)4529 _mm_unpackhi_epi8(__m128i __a, __m128i __b)
4530 {
4531   return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
4532 }
4533 
4534 /// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
4535 ///    [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
4536 ///
4537 /// \headerfile <x86intrin.h>
4538 ///
4539 /// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
4540 ///   instruction.
4541 ///
4542 /// \param __a
4543 ///    A 128-bit vector of [8 x i16].
4544 ///    Bits [79:64] are written to bits [15:0] of the result. \n
4545 ///    Bits [95:80] are written to bits [47:32] of the result. \n
4546 ///    Bits [111:96] are written to bits [79:64] of the result. \n
4547 ///    Bits [127:112] are written to bits [111:96] of the result.
4548 /// \param __b
4549 ///    A 128-bit vector of [8 x i16].
4550 ///    Bits [79:64] are written to bits [31:16] of the result. \n
4551 ///    Bits [95:80] are written to bits [63:48] of the result. \n
4552 ///    Bits [111:96] are written to bits [95:80] of the result. \n
4553 ///    Bits [127:112] are written to bits [127:112] of the result.
4554 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4555 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi16(__m128i __a,__m128i __b)4556 _mm_unpackhi_epi16(__m128i __a, __m128i __b)
4557 {
4558   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
4559 }
4560 
4561 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
4562 ///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4563 ///
4564 /// \headerfile <x86intrin.h>
4565 ///
4566 /// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
4567 ///   instruction.
4568 ///
4569 /// \param __a
4570 ///    A 128-bit vector of [4 x i32]. \n
4571 ///    Bits [95:64] are written to bits [31:0] of the destination. \n
4572 ///    Bits [127:96] are written to bits [95:64] of the destination.
4573 /// \param __b
4574 ///    A 128-bit vector of [4 x i32]. \n
4575 ///    Bits [95:64] are written to bits [64:32] of the destination. \n
4576 ///    Bits [127:96] are written to bits [127:96] of the destination.
4577 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4578 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi32(__m128i __a,__m128i __b)4579 _mm_unpackhi_epi32(__m128i __a, __m128i __b)
4580 {
4581   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
4582 }
4583 
4584 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4585 ///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4586 ///
4587 /// \headerfile <x86intrin.h>
4588 ///
4589 /// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
4590 ///   instruction.
4591 ///
4592 /// \param __a
4593 ///    A 128-bit vector of [2 x i64]. \n
4594 ///    Bits [127:64] are written to bits [63:0] of the destination.
4595 /// \param __b
4596 ///    A 128-bit vector of [2 x i64]. \n
4597 ///    Bits [127:64] are written to bits [127:64] of the destination.
4598 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4599 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi64(__m128i __a,__m128i __b)4600 _mm_unpackhi_epi64(__m128i __a, __m128i __b)
4601 {
4602   return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1);
4603 }
4604 
4605 /// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
4606 ///    [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4607 ///
4608 /// \headerfile <x86intrin.h>
4609 ///
4610 /// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
4611 ///   instruction.
4612 ///
4613 /// \param __a
4614 ///    A 128-bit vector of [16 x i8]. \n
4615 ///    Bits [7:0] are written to bits [7:0] of the result. \n
4616 ///    Bits [15:8] are written to bits [23:16] of the result. \n
4617 ///    Bits [23:16] are written to bits [39:32] of the result. \n
4618 ///    Bits [31:24] are written to bits [55:48] of the result. \n
4619 ///    Bits [39:32] are written to bits [71:64] of the result. \n
4620 ///    Bits [47:40] are written to bits [87:80] of the result. \n
4621 ///    Bits [55:48] are written to bits [103:96] of the result. \n
4622 ///    Bits [63:56] are written to bits [119:112] of the result.
4623 /// \param __b
4624 ///    A 128-bit vector of [16 x i8].
4625 ///    Bits [7:0] are written to bits [15:8] of the result. \n
4626 ///    Bits [15:8] are written to bits [31:24] of the result. \n
4627 ///    Bits [23:16] are written to bits [47:40] of the result. \n
4628 ///    Bits [31:24] are written to bits [63:56] of the result. \n
4629 ///    Bits [39:32] are written to bits [79:72] of the result. \n
4630 ///    Bits [47:40] are written to bits [95:88] of the result. \n
4631 ///    Bits [55:48] are written to bits [111:104] of the result. \n
4632 ///    Bits [63:56] are written to bits [127:120] of the result.
4633 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4634 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi8(__m128i __a,__m128i __b)4635 _mm_unpacklo_epi8(__m128i __a, __m128i __b)
4636 {
4637   return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
4638 }
4639 
4640 /// Unpacks the low-order (index 0-3) values from each of the two 128-bit
4641 ///    vectors of [8 x i16] and interleaves them into a 128-bit vector of
4642 ///    [8 x i16].
4643 ///
4644 /// \headerfile <x86intrin.h>
4645 ///
4646 /// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
4647 ///   instruction.
4648 ///
4649 /// \param __a
4650 ///    A 128-bit vector of [8 x i16].
4651 ///    Bits [15:0] are written to bits [15:0] of the result. \n
4652 ///    Bits [31:16] are written to bits [47:32] of the result. \n
4653 ///    Bits [47:32] are written to bits [79:64] of the result. \n
4654 ///    Bits [63:48] are written to bits [111:96] of the result.
4655 /// \param __b
4656 ///    A 128-bit vector of [8 x i16].
4657 ///    Bits [15:0] are written to bits [31:16] of the result. \n
4658 ///    Bits [31:16] are written to bits [63:48] of the result. \n
4659 ///    Bits [47:32] are written to bits [95:80] of the result. \n
4660 ///    Bits [63:48] are written to bits [127:112] of the result.
4661 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4662 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi16(__m128i __a,__m128i __b)4663 _mm_unpacklo_epi16(__m128i __a, __m128i __b)
4664 {
4665   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
4666 }
4667 
4668 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
4669 ///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4670 ///
4671 /// \headerfile <x86intrin.h>
4672 ///
4673 /// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
4674 ///   instruction.
4675 ///
4676 /// \param __a
4677 ///    A 128-bit vector of [4 x i32]. \n
4678 ///    Bits [31:0] are written to bits [31:0] of the destination. \n
4679 ///    Bits [63:32] are written to bits [95:64] of the destination.
4680 /// \param __b
4681 ///    A 128-bit vector of [4 x i32]. \n
4682 ///    Bits [31:0] are written to bits [64:32] of the destination. \n
4683 ///    Bits [63:32] are written to bits [127:96] of the destination.
4684 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4685 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi32(__m128i __a,__m128i __b)4686 _mm_unpacklo_epi32(__m128i __a, __m128i __b)
4687 {
4688   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
4689 }
4690 
4691 /// Unpacks the low-order 64-bit elements from two 128-bit vectors of
4692 ///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4693 ///
4694 /// \headerfile <x86intrin.h>
4695 ///
4696 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
4697 ///   instruction.
4698 ///
4699 /// \param __a
4700 ///    A 128-bit vector of [2 x i64]. \n
4701 ///    Bits [63:0] are written to bits [63:0] of the destination. \n
4702 /// \param __b
4703 ///    A 128-bit vector of [2 x i64]. \n
4704 ///    Bits [63:0] are written to bits [127:64] of the destination. \n
4705 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4706 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi64(__m128i __a,__m128i __b)4707 _mm_unpacklo_epi64(__m128i __a, __m128i __b)
4708 {
4709   return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0);
4710 }
4711 
4712 /// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
4713 ///    integer.
4714 ///
4715 /// \headerfile <x86intrin.h>
4716 ///
4717 /// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
4718 ///
4719 /// \param __a
4720 ///    A 128-bit integer vector operand. The lower 64 bits are moved to the
4721 ///    destination.
4722 /// \returns A 64-bit integer containing the lower 64 bits of the parameter.
4723 static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_movepi64_pi64(__m128i __a)4724 _mm_movepi64_pi64(__m128i __a)
4725 {
4726   return (__m64)__a[0];
4727 }
4728 
4729 /// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
4730 ///    upper bits.
4731 ///
4732 /// \headerfile <x86intrin.h>
4733 ///
4734 /// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
4735 ///
4736 /// \param __a
4737 ///    A 64-bit value.
4738 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4739 ///    the operand. The upper 64 bits are assigned zeros.
4740 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_movpi64_epi64(__m64 __a)4741 _mm_movpi64_epi64(__m64 __a)
4742 {
4743   return __extension__ (__m128i)(__v2di){ (long long)__a, 0 };
4744 }
4745 
4746 /// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
4747 ///    integer vector, zeroing the upper bits.
4748 ///
4749 /// \headerfile <x86intrin.h>
4750 ///
4751 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4752 ///
4753 /// \param __a
4754 ///    A 128-bit integer vector operand. The lower 64 bits are moved to the
4755 ///    destination.
4756 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4757 ///    the operand. The upper 64 bits are assigned zeros.
4758 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_move_epi64(__m128i __a)4759 _mm_move_epi64(__m128i __a)
4760 {
4761   return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
4762 }
4763 
4764 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4765 ///    [2 x double] and interleaves them into a 128-bit vector of [2 x
4766 ///    double].
4767 ///
4768 /// \headerfile <x86intrin.h>
4769 ///
4770 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
4771 ///
4772 /// \param __a
4773 ///    A 128-bit vector of [2 x double]. \n
4774 ///    Bits [127:64] are written to bits [63:0] of the destination.
4775 /// \param __b
4776 ///    A 128-bit vector of [2 x double]. \n
4777 ///    Bits [127:64] are written to bits [127:64] of the destination.
4778 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4779 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_unpackhi_pd(__m128d __a,__m128d __b)4780 _mm_unpackhi_pd(__m128d __a, __m128d __b)
4781 {
4782   return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
4783 }
4784 
4785 /// Unpacks the low-order 64-bit elements from two 128-bit vectors
4786 ///    of [2 x double] and interleaves them into a 128-bit vector of [2 x
4787 ///    double].
4788 ///
4789 /// \headerfile <x86intrin.h>
4790 ///
4791 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
4792 ///
4793 /// \param __a
4794 ///    A 128-bit vector of [2 x double]. \n
4795 ///    Bits [63:0] are written to bits [63:0] of the destination.
4796 /// \param __b
4797 ///    A 128-bit vector of [2 x double]. \n
4798 ///    Bits [63:0] are written to bits [127:64] of the destination.
4799 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4800 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_unpacklo_pd(__m128d __a,__m128d __b)4801 _mm_unpacklo_pd(__m128d __a, __m128d __b)
4802 {
4803   return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0);
4804 }
4805 
4806 /// Extracts the sign bits of the double-precision values in the 128-bit
4807 ///    vector of [2 x double], zero-extends the value, and writes it to the
4808 ///    low-order bits of the destination.
4809 ///
4810 /// \headerfile <x86intrin.h>
4811 ///
4812 /// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
4813 ///
4814 /// \param __a
4815 ///    A 128-bit vector of [2 x double] containing the values with sign bits to
4816 ///    be extracted.
4817 /// \returns The sign bits from each of the double-precision elements in \a __a,
4818 ///    written to bits [1:0]. The remaining bits are assigned values of zero.
4819 static __inline__ int __DEFAULT_FN_ATTRS
_mm_movemask_pd(__m128d __a)4820 _mm_movemask_pd(__m128d __a)
4821 {
4822   return __builtin_ia32_movmskpd((__v2df)__a);
4823 }
4824 
4825 
/// Constructs a 128-bit floating-point vector of [2 x double] from two
///    128-bit vector parameters of [2 x double], using the immediate-value
///    parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param i
///    An 8-bit immediate value. The least significant two bits specify which
///    elements to copy from \a a and \a b: \n
///    Bit[0] = 0: lower element of \a a copied to lower element of result. \n
///    Bit[0] = 1: upper element of \a a copied to lower element of result. \n
///    Bit[1] = 0: lower element of \a b copied to upper element of result. \n
///    Bit[1] = 1: upper element of \a b copied to upper element of result.
/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
#define _mm_shuffle_pd(a, b, i) \
  ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
                                  (int)(i)))
4853 
4854 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4855 ///    floating-point vector of [4 x float].
4856 ///
4857 /// \headerfile <x86intrin.h>
4858 ///
4859 /// This intrinsic has no corresponding instruction.
4860 ///
4861 /// \param __a
4862 ///    A 128-bit floating-point vector of [2 x double].
4863 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4864 ///    bitwise pattern as the parameter.
4865 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_castpd_ps(__m128d __a)4866 _mm_castpd_ps(__m128d __a)
4867 {
4868   return (__m128)__a;
4869 }
4870 
4871 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4872 ///    integer vector.
4873 ///
4874 /// \headerfile <x86intrin.h>
4875 ///
4876 /// This intrinsic has no corresponding instruction.
4877 ///
4878 /// \param __a
4879 ///    A 128-bit floating-point vector of [2 x double].
4880 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4881 ///    parameter.
4882 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_castpd_si128(__m128d __a)4883 _mm_castpd_si128(__m128d __a)
4884 {
4885   return (__m128i)__a;
4886 }
4887 
4888 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4889 ///    floating-point vector of [2 x double].
4890 ///
4891 /// \headerfile <x86intrin.h>
4892 ///
4893 /// This intrinsic has no corresponding instruction.
4894 ///
4895 /// \param __a
4896 ///    A 128-bit floating-point vector of [4 x float].
4897 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4898 ///    bitwise pattern as the parameter.
4899 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_castps_pd(__m128 __a)4900 _mm_castps_pd(__m128 __a)
4901 {
4902   return (__m128d)__a;
4903 }
4904 
4905 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4906 ///    integer vector.
4907 ///
4908 /// \headerfile <x86intrin.h>
4909 ///
4910 /// This intrinsic has no corresponding instruction.
4911 ///
4912 /// \param __a
4913 ///    A 128-bit floating-point vector of [4 x float].
4914 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4915 ///    parameter.
4916 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_castps_si128(__m128 __a)4917 _mm_castps_si128(__m128 __a)
4918 {
4919   return (__m128i)__a;
4920 }
4921 
4922 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4923 ///    of [4 x float].
4924 ///
4925 /// \headerfile <x86intrin.h>
4926 ///
4927 /// This intrinsic has no corresponding instruction.
4928 ///
4929 /// \param __a
4930 ///    A 128-bit integer vector.
4931 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4932 ///    bitwise pattern as the parameter.
4933 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_castsi128_ps(__m128i __a)4934 _mm_castsi128_ps(__m128i __a)
4935 {
4936   return (__m128)__a;
4937 }
4938 
4939 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4940 ///    of [2 x double].
4941 ///
4942 /// \headerfile <x86intrin.h>
4943 ///
4944 /// This intrinsic has no corresponding instruction.
4945 ///
4946 /// \param __a
4947 ///    A 128-bit integer vector.
4948 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4949 ///    bitwise pattern as the parameter.
4950 static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_castsi128_pd(__m128i __a)4951 _mm_castsi128_pd(__m128i __a)
4952 {
4953   return (__m128d)__a;
4954 }
4955 
#if defined(__cplusplus)
extern "C" {
#endif

/// Indicates that a spin loop is being executed for the purposes of
///    optimizing power consumption during the loop.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
///
void _mm_pause(void);

#if defined(__cplusplus)
} // extern "C"
#endif
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_MMX

/* Packs two single-bit selectors into a 2-bit value: x becomes bit 1 and y
 * becomes bit 0. */
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))

/* Values of the denormals-are-zero field selected by
 * _MM_DENORMALS_ZERO_MASK. */
#define _MM_DENORMALS_ZERO_ON   (0x0040U)
#define _MM_DENORMALS_ZERO_OFF  (0x0000U)

#define _MM_DENORMALS_ZERO_MASK (0x0040U)

/* Read, or replace, only the denormals-are-zero field of the value accessed
 * through _mm_getcsr() / _mm_setcsr(); all other bits are left unchanged. */
#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
4984 
4985 #endif /* __EMMINTRIN_H */
4986