1 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 #ifndef __EMMINTRIN_H
11 #define __EMMINTRIN_H
12 
13 #if !defined(__i386__) && !defined(__x86_64__)
14 #error "This header is only meant to be used on x86 and x64 architecture"
15 #endif
16 
17 #include <xmmintrin.h>
18 
/* Public 128-bit vector types. __m128d is a vector of doubles and __m128i a
 * vector of long longs; both are 16 bytes wide and 16-byte aligned. */
typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));

/* Variants of the two types above declared with an alignment of 1 byte. */
typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __aligned__(1)));

/* Type defines.  */
/* Internal 16-byte vector types over double, long long, short and char
 * elements. */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Unsigned types */
/* Unsigned counterparts of the integer vector types above. */
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));
40 
#ifdef __SSE2__
/* Both _Float16 and __bf16 require SSE2 being enabled. */
/* 16-byte vectors of _Float16. __v8hf and __m128h are 16-byte aligned;
 * __m128h_u is declared with an alignment of 1 byte. */
typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));

/* 16-byte, 16-byte-aligned vectors of __bf16. */
typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
#endif
50 
/* Define the default attributes for the functions in this file.
 *
 * __DEFAULT_FN_ATTRS applies __always_inline__ and __nodebug__, selects the
 * "sse2,no-evex512" target and sets a minimum vector width of 128.
 *
 * __DEFAULT_FN_ATTRS_MMX applies the same attributes, but its target string
 * additionally lists "mmx" and its minimum vector width is 64. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("sse2,no-evex512"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_MMX                                                 \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("mmx,sse2,no-evex512"), __min_vector_width__(64)))
58 
59 /// Adds lower double-precision values in both operands and returns the
60 ///    sum in the lower 64 bits of the result. The upper 64 bits of the result
61 ///    are copied from the upper double-precision value of the first operand.
62 ///
63 /// \headerfile <x86intrin.h>
64 ///
65 /// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
66 ///
67 /// \param __a
68 ///    A 128-bit vector of [2 x double] containing one of the source operands.
69 /// \param __b
70 ///    A 128-bit vector of [2 x double] containing one of the source operands.
71 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
72 ///    sum of the lower 64 bits of both operands. The upper 64 bits are copied
73 ///    from the upper 64 bits of the first source operand.
74 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a,
75                                                         __m128d __b) {
76   __a[0] += __b[0];
77   return __a;
78 }
79 
80 /// Adds two 128-bit vectors of [2 x double].
81 ///
82 /// \headerfile <x86intrin.h>
83 ///
84 /// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
85 ///
86 /// \param __a
87 ///    A 128-bit vector of [2 x double] containing one of the source operands.
88 /// \param __b
89 ///    A 128-bit vector of [2 x double] containing one of the source operands.
90 /// \returns A 128-bit vector of [2 x double] containing the sums of both
91 ///    operands.
92 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a,
93                                                         __m128d __b) {
94   return (__m128d)((__v2df)__a + (__v2df)__b);
95 }
96 
97 /// Subtracts the lower double-precision value of the second operand
98 ///    from the lower double-precision value of the first operand and returns
99 ///    the difference in the lower 64 bits of the result. The upper 64 bits of
100 ///    the result are copied from the upper double-precision value of the first
101 ///    operand.
102 ///
103 /// \headerfile <x86intrin.h>
104 ///
105 /// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
106 ///
107 /// \param __a
108 ///    A 128-bit vector of [2 x double] containing the minuend.
109 /// \param __b
110 ///    A 128-bit vector of [2 x double] containing the subtrahend.
111 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
112 ///    difference of the lower 64 bits of both operands. The upper 64 bits are
113 ///    copied from the upper 64 bits of the first source operand.
114 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a,
115                                                         __m128d __b) {
116   __a[0] -= __b[0];
117   return __a;
118 }
119 
120 /// Subtracts two 128-bit vectors of [2 x double].
121 ///
122 /// \headerfile <x86intrin.h>
123 ///
124 /// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
125 ///
126 /// \param __a
127 ///    A 128-bit vector of [2 x double] containing the minuend.
128 /// \param __b
129 ///    A 128-bit vector of [2 x double] containing the subtrahend.
130 /// \returns A 128-bit vector of [2 x double] containing the differences between
131 ///    both operands.
132 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a,
133                                                         __m128d __b) {
134   return (__m128d)((__v2df)__a - (__v2df)__b);
135 }
136 
137 /// Multiplies lower double-precision values in both operands and returns
138 ///    the product in the lower 64 bits of the result. The upper 64 bits of the
139 ///    result are copied from the upper double-precision value of the first
140 ///    operand.
141 ///
142 /// \headerfile <x86intrin.h>
143 ///
144 /// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
145 ///
146 /// \param __a
147 ///    A 128-bit vector of [2 x double] containing one of the source operands.
148 /// \param __b
149 ///    A 128-bit vector of [2 x double] containing one of the source operands.
150 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
151 ///    product of the lower 64 bits of both operands. The upper 64 bits are
152 ///    copied from the upper 64 bits of the first source operand.
153 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a,
154                                                         __m128d __b) {
155   __a[0] *= __b[0];
156   return __a;
157 }
158 
159 /// Multiplies two 128-bit vectors of [2 x double].
160 ///
161 /// \headerfile <x86intrin.h>
162 ///
163 /// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
164 ///
165 /// \param __a
166 ///    A 128-bit vector of [2 x double] containing one of the operands.
167 /// \param __b
168 ///    A 128-bit vector of [2 x double] containing one of the operands.
169 /// \returns A 128-bit vector of [2 x double] containing the products of both
170 ///    operands.
171 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a,
172                                                         __m128d __b) {
173   return (__m128d)((__v2df)__a * (__v2df)__b);
174 }
175 
176 /// Divides the lower double-precision value of the first operand by the
177 ///    lower double-precision value of the second operand and returns the
178 ///    quotient in the lower 64 bits of the result. The upper 64 bits of the
179 ///    result are copied from the upper double-precision value of the first
180 ///    operand.
181 ///
182 /// \headerfile <x86intrin.h>
183 ///
184 /// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
185 ///
186 /// \param __a
187 ///    A 128-bit vector of [2 x double] containing the dividend.
188 /// \param __b
189 ///    A 128-bit vector of [2 x double] containing divisor.
190 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
191 ///    quotient of the lower 64 bits of both operands. The upper 64 bits are
192 ///    copied from the upper 64 bits of the first source operand.
193 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a,
194                                                         __m128d __b) {
195   __a[0] /= __b[0];
196   return __a;
197 }
198 
199 /// Performs an element-by-element division of two 128-bit vectors of
200 ///    [2 x double].
201 ///
202 /// \headerfile <x86intrin.h>
203 ///
204 /// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
205 ///
206 /// \param __a
207 ///    A 128-bit vector of [2 x double] containing the dividend.
208 /// \param __b
209 ///    A 128-bit vector of [2 x double] containing the divisor.
210 /// \returns A 128-bit vector of [2 x double] containing the quotients of both
211 ///    operands.
212 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a,
213                                                         __m128d __b) {
214   return (__m128d)((__v2df)__a / (__v2df)__b);
215 }
216 
217 /// Calculates the square root of the lower double-precision value of
218 ///    the second operand and returns it in the lower 64 bits of the result.
219 ///    The upper 64 bits of the result are copied from the upper
220 ///    double-precision value of the first operand.
221 ///
222 /// \headerfile <x86intrin.h>
223 ///
224 /// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
225 ///
226 /// \param __a
227 ///    A 128-bit vector of [2 x double] containing one of the operands. The
228 ///    upper 64 bits of this operand are copied to the upper 64 bits of the
229 ///    result.
230 /// \param __b
231 ///    A 128-bit vector of [2 x double] containing one of the operands. The
232 ///    square root is calculated using the lower 64 bits of this operand.
233 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
234 ///    square root of the lower 64 bits of operand \a __b, and whose upper 64
235 ///    bits are copied from the upper 64 bits of operand \a __a.
236 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
237                                                          __m128d __b) {
238   __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
239   return __extension__(__m128d){__c[0], __a[1]};
240 }
241 
242 /// Calculates the square root of the each of two values stored in a
243 ///    128-bit vector of [2 x double].
244 ///
245 /// \headerfile <x86intrin.h>
246 ///
247 /// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
248 ///
249 /// \param __a
250 ///    A 128-bit vector of [2 x double].
251 /// \returns A 128-bit vector of [2 x double] containing the square roots of the
252 ///    values in the operand.
253 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
254   return __builtin_ia32_sqrtpd((__v2df)__a);
255 }
256 
257 /// Compares lower 64-bit double-precision values of both operands, and
258 ///    returns the lesser of the pair of values in the lower 64-bits of the
259 ///    result. The upper 64 bits of the result are copied from the upper
260 ///    double-precision value of the first operand.
261 ///
262 /// \headerfile <x86intrin.h>
263 ///
264 /// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
265 ///
266 /// \param __a
267 ///    A 128-bit vector of [2 x double] containing one of the operands. The
268 ///    lower 64 bits of this operand are used in the comparison.
269 /// \param __b
270 ///    A 128-bit vector of [2 x double] containing one of the operands. The
271 ///    lower 64 bits of this operand are used in the comparison.
272 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
273 ///    minimum value between both operands. The upper 64 bits are copied from
274 ///    the upper 64 bits of the first source operand.
275 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a,
276                                                         __m128d __b) {
277   return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
278 }
279 
280 /// Performs element-by-element comparison of the two 128-bit vectors of
281 ///    [2 x double] and returns the vector containing the lesser of each pair of
282 ///    values.
283 ///
284 /// \headerfile <x86intrin.h>
285 ///
286 /// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
287 ///
288 /// \param __a
289 ///    A 128-bit vector of [2 x double] containing one of the operands.
290 /// \param __b
291 ///    A 128-bit vector of [2 x double] containing one of the operands.
292 /// \returns A 128-bit vector of [2 x double] containing the minimum values
293 ///    between both operands.
294 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a,
295                                                         __m128d __b) {
296   return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
297 }
298 
299 /// Compares lower 64-bit double-precision values of both operands, and
300 ///    returns the greater of the pair of values in the lower 64-bits of the
301 ///    result. The upper 64 bits of the result are copied from the upper
302 ///    double-precision value of the first operand.
303 ///
304 /// \headerfile <x86intrin.h>
305 ///
306 /// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
307 ///
308 /// \param __a
309 ///    A 128-bit vector of [2 x double] containing one of the operands. The
310 ///    lower 64 bits of this operand are used in the comparison.
311 /// \param __b
312 ///    A 128-bit vector of [2 x double] containing one of the operands. The
313 ///    lower 64 bits of this operand are used in the comparison.
314 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
315 ///    maximum value between both operands. The upper 64 bits are copied from
316 ///    the upper 64 bits of the first source operand.
317 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a,
318                                                         __m128d __b) {
319   return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
320 }
321 
322 /// Performs element-by-element comparison of the two 128-bit vectors of
323 ///    [2 x double] and returns the vector containing the greater of each pair
324 ///    of values.
325 ///
326 /// \headerfile <x86intrin.h>
327 ///
328 /// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
329 ///
330 /// \param __a
331 ///    A 128-bit vector of [2 x double] containing one of the operands.
332 /// \param __b
333 ///    A 128-bit vector of [2 x double] containing one of the operands.
334 /// \returns A 128-bit vector of [2 x double] containing the maximum values
335 ///    between both operands.
336 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a,
337                                                         __m128d __b) {
338   return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
339 }
340 
341 /// Performs a bitwise AND of two 128-bit vectors of [2 x double].
342 ///
343 /// \headerfile <x86intrin.h>
344 ///
345 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
346 ///
347 /// \param __a
348 ///    A 128-bit vector of [2 x double] containing one of the source operands.
349 /// \param __b
350 ///    A 128-bit vector of [2 x double] containing one of the source operands.
351 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
352 ///    values between both operands.
353 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a,
354                                                         __m128d __b) {
355   return (__m128d)((__v2du)__a & (__v2du)__b);
356 }
357 
358 /// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
359 ///    the one's complement of the values contained in the first source operand.
360 ///
361 /// \headerfile <x86intrin.h>
362 ///
363 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
364 ///
365 /// \param __a
366 ///    A 128-bit vector of [2 x double] containing the left source operand. The
367 ///    one's complement of this value is used in the bitwise AND.
368 /// \param __b
369 ///    A 128-bit vector of [2 x double] containing the right source operand.
370 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
371 ///    values in the second operand and the one's complement of the first
372 ///    operand.
373 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a,
374                                                            __m128d __b) {
375   return (__m128d)(~(__v2du)__a & (__v2du)__b);
376 }
377 
378 /// Performs a bitwise OR of two 128-bit vectors of [2 x double].
379 ///
380 /// \headerfile <x86intrin.h>
381 ///
382 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
383 ///
384 /// \param __a
385 ///    A 128-bit vector of [2 x double] containing one of the source operands.
386 /// \param __b
387 ///    A 128-bit vector of [2 x double] containing one of the source operands.
388 /// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
389 ///    values between both operands.
390 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a,
391                                                        __m128d __b) {
392   return (__m128d)((__v2du)__a | (__v2du)__b);
393 }
394 
395 /// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
396 ///
397 /// \headerfile <x86intrin.h>
398 ///
399 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
400 ///
401 /// \param __a
402 ///    A 128-bit vector of [2 x double] containing one of the source operands.
403 /// \param __b
404 ///    A 128-bit vector of [2 x double] containing one of the source operands.
405 /// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
406 ///    values between both operands.
407 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a,
408                                                         __m128d __b) {
409   return (__m128d)((__v2du)__a ^ (__v2du)__b);
410 }
411 
412 /// Compares each of the corresponding double-precision values of the
413 ///    128-bit vectors of [2 x double] for equality. Each comparison yields 0x0
414 ///    for false, 0xFFFFFFFFFFFFFFFF for true.
415 ///
416 /// \headerfile <x86intrin.h>
417 ///
418 /// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
419 ///
420 /// \param __a
421 ///    A 128-bit vector of [2 x double].
422 /// \param __b
423 ///    A 128-bit vector of [2 x double].
424 /// \returns A 128-bit vector containing the comparison results.
425 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a,
426                                                           __m128d __b) {
427   return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
428 }
429 
430 /// Compares each of the corresponding double-precision values of the
431 ///    128-bit vectors of [2 x double] to determine if the values in the first
432 ///    operand are less than those in the second operand. Each comparison
433 ///    yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
434 ///
435 /// \headerfile <x86intrin.h>
436 ///
437 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
438 ///
439 /// \param __a
440 ///    A 128-bit vector of [2 x double].
441 /// \param __b
442 ///    A 128-bit vector of [2 x double].
443 /// \returns A 128-bit vector containing the comparison results.
444 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a,
445                                                           __m128d __b) {
446   return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
447 }
448 
449 /// Compares each of the corresponding double-precision values of the
450 ///    128-bit vectors of [2 x double] to determine if the values in the first
451 ///    operand are less than or equal to those in the second operand.
452 ///
453 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
454 ///
455 /// \headerfile <x86intrin.h>
456 ///
457 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
458 ///
459 /// \param __a
460 ///    A 128-bit vector of [2 x double].
461 /// \param __b
462 ///    A 128-bit vector of [2 x double].
463 /// \returns A 128-bit vector containing the comparison results.
464 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a,
465                                                           __m128d __b) {
466   return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
467 }
468 
469 /// Compares each of the corresponding double-precision values of the
470 ///    128-bit vectors of [2 x double] to determine if the values in the first
471 ///    operand are greater than those in the second operand.
472 ///
473 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
474 ///
475 /// \headerfile <x86intrin.h>
476 ///
477 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
478 ///
479 /// \param __a
480 ///    A 128-bit vector of [2 x double].
481 /// \param __b
482 ///    A 128-bit vector of [2 x double].
483 /// \returns A 128-bit vector containing the comparison results.
484 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a,
485                                                           __m128d __b) {
486   return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
487 }
488 
489 /// Compares each of the corresponding double-precision values of the
490 ///    128-bit vectors of [2 x double] to determine if the values in the first
491 ///    operand are greater than or equal to those in the second operand.
492 ///
493 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
494 ///
495 /// \headerfile <x86intrin.h>
496 ///
497 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
498 ///
499 /// \param __a
500 ///    A 128-bit vector of [2 x double].
501 /// \param __b
502 ///    A 128-bit vector of [2 x double].
503 /// \returns A 128-bit vector containing the comparison results.
504 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a,
505                                                           __m128d __b) {
506   return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
507 }
508 
509 /// Compares each of the corresponding double-precision values of the
510 ///    128-bit vectors of [2 x double] to determine if the values in the first
511 ///    operand are ordered with respect to those in the second operand.
512 ///
513 ///    A pair of double-precision values are "ordered" with respect to each
514 ///    other if neither value is a NaN. Each comparison yields 0x0 for false,
515 ///    0xFFFFFFFFFFFFFFFF for true.
516 ///
517 /// \headerfile <x86intrin.h>
518 ///
519 /// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
520 ///
521 /// \param __a
522 ///    A 128-bit vector of [2 x double].
523 /// \param __b
524 ///    A 128-bit vector of [2 x double].
525 /// \returns A 128-bit vector containing the comparison results.
526 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a,
527                                                            __m128d __b) {
528   return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
529 }
530 
531 /// Compares each of the corresponding double-precision values of the
532 ///    128-bit vectors of [2 x double] to determine if the values in the first
533 ///    operand are unordered with respect to those in the second operand.
534 ///
535 ///    A pair of double-precision values are "unordered" with respect to each
536 ///    other if one or both values are NaN. Each comparison yields 0x0 for
537 ///    false, 0xFFFFFFFFFFFFFFFF for true.
538 ///
539 /// \headerfile <x86intrin.h>
540 ///
541 /// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
542 ///   instruction.
543 ///
544 /// \param __a
545 ///    A 128-bit vector of [2 x double].
546 /// \param __b
547 ///    A 128-bit vector of [2 x double].
548 /// \returns A 128-bit vector containing the comparison results.
549 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a,
550                                                              __m128d __b) {
551   return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
552 }
553 
554 /// Compares each of the corresponding double-precision values of the
555 ///    128-bit vectors of [2 x double] to determine if the values in the first
556 ///    operand are unequal to those in the second operand.
557 ///
558 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
559 ///
560 /// \headerfile <x86intrin.h>
561 ///
562 /// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
563 ///
564 /// \param __a
565 ///    A 128-bit vector of [2 x double].
566 /// \param __b
567 ///    A 128-bit vector of [2 x double].
568 /// \returns A 128-bit vector containing the comparison results.
569 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a,
570                                                            __m128d __b) {
571   return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
572 }
573 
574 /// Compares each of the corresponding double-precision values of the
575 ///    128-bit vectors of [2 x double] to determine if the values in the first
576 ///    operand are not less than those in the second operand.
577 ///
578 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
579 ///
580 /// \headerfile <x86intrin.h>
581 ///
582 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
583 ///
584 /// \param __a
585 ///    A 128-bit vector of [2 x double].
586 /// \param __b
587 ///    A 128-bit vector of [2 x double].
588 /// \returns A 128-bit vector containing the comparison results.
589 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a,
590                                                            __m128d __b) {
591   return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
592 }
593 
594 /// Compares each of the corresponding double-precision values of the
595 ///    128-bit vectors of [2 x double] to determine if the values in the first
596 ///    operand are not less than or equal to those in the second operand.
597 ///
598 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
599 ///
600 /// \headerfile <x86intrin.h>
601 ///
602 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
603 ///
604 /// \param __a
605 ///    A 128-bit vector of [2 x double].
606 /// \param __b
607 ///    A 128-bit vector of [2 x double].
608 /// \returns A 128-bit vector containing the comparison results.
609 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a,
610                                                            __m128d __b) {
611   return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
612 }
613 
614 /// Compares each of the corresponding double-precision values of the
615 ///    128-bit vectors of [2 x double] to determine if the values in the first
616 ///    operand are not greater than those in the second operand.
617 ///
618 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
619 ///
620 /// \headerfile <x86intrin.h>
621 ///
622 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
623 ///
624 /// \param __a
625 ///    A 128-bit vector of [2 x double].
626 /// \param __b
627 ///    A 128-bit vector of [2 x double].
628 /// \returns A 128-bit vector containing the comparison results.
629 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a,
630                                                            __m128d __b) {
631   return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
632 }
633 
634 /// Compares each of the corresponding double-precision values of the
635 ///    128-bit vectors of [2 x double] to determine if the values in the first
636 ///    operand are not greater than or equal to those in the second operand.
637 ///
638 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
639 ///
640 /// \headerfile <x86intrin.h>
641 ///
642 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
643 ///
644 /// \param __a
645 ///    A 128-bit vector of [2 x double].
646 /// \param __b
647 ///    A 128-bit vector of [2 x double].
648 /// \returns A 128-bit vector containing the comparison results.
649 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a,
650                                                            __m128d __b) {
651   return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
652 }
653 
654 /// Compares the lower double-precision floating-point values in each of
655 ///    the two 128-bit floating-point vectors of [2 x double] for equality.
656 ///
657 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
658 ///
659 /// \headerfile <x86intrin.h>
660 ///
661 /// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
662 ///
663 /// \param __a
664 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
665 ///    compared to the lower double-precision value of \a __b.
666 /// \param __b
667 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
668 ///    compared to the lower double-precision value of \a __a.
669 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
670 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
671 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a,
672                                                           __m128d __b) {
673   return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
674 }
675 
676 /// Compares the lower double-precision floating-point values in each of
677 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
678 ///    the value in the first parameter is less than the corresponding value in
679 ///    the second parameter.
680 ///
681 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
682 ///
683 /// \headerfile <x86intrin.h>
684 ///
685 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
686 ///
687 /// \param __a
688 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
689 ///    compared to the lower double-precision value of \a __b.
690 /// \param __b
691 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
692 ///    compared to the lower double-precision value of \a __a.
693 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
694 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
695 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a,
696                                                           __m128d __b) {
697   return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
698 }
699 
700 /// Compares the lower double-precision floating-point values in each of
701 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
702 ///    the value in the first parameter is less than or equal to the
703 ///    corresponding value in the second parameter.
704 ///
705 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
706 ///
707 /// \headerfile <x86intrin.h>
708 ///
709 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
710 ///
711 /// \param __a
712 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
713 ///    compared to the lower double-precision value of \a __b.
714 /// \param __b
715 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
716 ///    compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
719 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a,
720                                                           __m128d __b) {
721   return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
722 }
723 
724 /// Compares the lower double-precision floating-point values in each of
725 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
726 ///    the value in the first parameter is greater than the corresponding value
727 ///    in the second parameter.
728 ///
729 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
730 ///
731 /// \headerfile <x86intrin.h>
732 ///
/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction,
///   executed with the two operands swapped.
734 ///
735 /// \param __a
///    A 128-bit vector of [2 x double]. The lower double-precision value is
///    compared to the lower double-precision value of \a __b.
/// \param __b
///    A 128-bit vector of [2 x double]. The lower double-precision value is
///    compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
743 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a,
744                                                           __m128d __b) {
745   __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
746   return __extension__(__m128d){__c[0], __a[1]};
747 }
748 
749 /// Compares the lower double-precision floating-point values in each of
750 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
751 ///    the value in the first parameter is greater than or equal to the
752 ///    corresponding value in the second parameter.
753 ///
754 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
755 ///
756 /// \headerfile <x86intrin.h>
757 ///
/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction,
///   executed with the two operands swapped.
759 ///
760 /// \param __a
761 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
762 ///    compared to the lower double-precision value of \a __b.
763 /// \param __b
764 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
765 ///    compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
768 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a,
769                                                           __m128d __b) {
770   __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
771   return __extension__(__m128d){__c[0], __a[1]};
772 }
773 
774 /// Compares the lower double-precision floating-point values in each of
775 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
776 ///    the value in the first parameter is "ordered" with respect to the
777 ///    corresponding value in the second parameter.
778 ///
779 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
780 ///    of double-precision values are "ordered" with respect to each other if
781 ///    neither value is a NaN.
782 ///
783 /// \headerfile <x86intrin.h>
784 ///
785 /// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
786 ///
787 /// \param __a
788 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
789 ///    compared to the lower double-precision value of \a __b.
790 /// \param __b
791 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
792 ///    compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
795 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a,
796                                                            __m128d __b) {
797   return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
798 }
799 
800 /// Compares the lower double-precision floating-point values in each of
801 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
802 ///    the value in the first parameter is "unordered" with respect to the
803 ///    corresponding value in the second parameter.
804 ///
805 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
806 ///    of double-precision values are "unordered" with respect to each other if
807 ///    one or both values are NaN.
808 ///
809 /// \headerfile <x86intrin.h>
810 ///
811 /// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
812 ///   instruction.
813 ///
814 /// \param __a
815 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
816 ///    compared to the lower double-precision value of \a __b.
817 /// \param __b
818 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
819 ///    compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
822 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a,
823                                                              __m128d __b) {
824   return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
825 }
826 
827 /// Compares the lower double-precision floating-point values in each of
828 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
829 ///    the value in the first parameter is unequal to the corresponding value in
830 ///    the second parameter.
831 ///
832 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
833 ///
834 /// \headerfile <x86intrin.h>
835 ///
836 /// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
837 ///
838 /// \param __a
839 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
840 ///    compared to the lower double-precision value of \a __b.
841 /// \param __b
842 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
843 ///    compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
846 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a,
847                                                            __m128d __b) {
848   return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
849 }
850 
851 /// Compares the lower double-precision floating-point values in each of
852 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
853 ///    the value in the first parameter is not less than the corresponding
854 ///    value in the second parameter.
855 ///
856 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
857 ///
858 /// \headerfile <x86intrin.h>
859 ///
860 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
861 ///
862 /// \param __a
863 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
864 ///    compared to the lower double-precision value of \a __b.
865 /// \param __b
866 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
867 ///    compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
870 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a,
871                                                            __m128d __b) {
872   return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
873 }
874 
875 /// Compares the lower double-precision floating-point values in each of
876 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
877 ///    the value in the first parameter is not less than or equal to the
878 ///    corresponding value in the second parameter.
879 ///
880 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
881 ///
882 /// \headerfile <x86intrin.h>
883 ///
884 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
885 ///
886 /// \param __a
887 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
888 ///    compared to the lower double-precision value of \a __b.
889 /// \param __b
890 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
891 ///    compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
894 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a,
895                                                            __m128d __b) {
896   return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
897 }
898 
899 /// Compares the lower double-precision floating-point values in each of
900 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
901 ///    the value in the first parameter is not greater than the corresponding
902 ///    value in the second parameter.
903 ///
904 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
905 ///
906 /// \headerfile <x86intrin.h>
907 ///
/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction,
///   executed with the two operands swapped.
909 ///
910 /// \param __a
911 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
912 ///    compared to the lower double-precision value of \a __b.
913 /// \param __b
914 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
915 ///    compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
918 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a,
919                                                            __m128d __b) {
920   __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
921   return __extension__(__m128d){__c[0], __a[1]};
922 }
923 
924 /// Compares the lower double-precision floating-point values in each of
925 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
926 ///    the value in the first parameter is not greater than or equal to the
927 ///    corresponding value in the second parameter.
928 ///
929 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
930 ///
931 /// \headerfile <x86intrin.h>
932 ///
/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction,
///   executed with the two operands swapped.
934 ///
935 /// \param __a
936 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
937 ///    compared to the lower double-precision value of \a __b.
938 /// \param __b
939 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
940 ///    compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
943 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a,
944                                                            __m128d __b) {
945   __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
946   return __extension__(__m128d){__c[0], __a[1]};
947 }
948 
949 /// Compares the lower double-precision floating-point values in each of
950 ///    the two 128-bit floating-point vectors of [2 x double] for equality.
951 ///
952 ///    The comparison yields 0 for false, 1 for true. If either of the two
953 ///    lower double-precision values is NaN, 0 is returned.
954 ///
955 /// \headerfile <x86intrin.h>
956 ///
957 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
958 ///
959 /// \param __a
960 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
961 ///    compared to the lower double-precision value of \a __b.
962 /// \param __b
963 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
964 ///    compared to the lower double-precision value of \a __a.
965 /// \returns An integer containing the comparison results. If either of the two
966 ///    lower double-precision values is NaN, 0 is returned.
967 static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a,
968                                                        __m128d __b) {
969   return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
970 }
971 
972 /// Compares the lower double-precision floating-point values in each of
973 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
974 ///    the value in the first parameter is less than the corresponding value in
975 ///    the second parameter.
976 ///
977 ///    The comparison yields 0 for false, 1 for true. If either of the two
978 ///    lower double-precision values is NaN, 0 is returned.
979 ///
980 /// \headerfile <x86intrin.h>
981 ///
982 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
983 ///
984 /// \param __a
985 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
986 ///    compared to the lower double-precision value of \a __b.
987 /// \param __b
988 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
989 ///    compared to the lower double-precision value of \a __a.
990 /// \returns An integer containing the comparison results. If either of the two
991 ///     lower double-precision values is NaN, 0 is returned.
992 static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a,
993                                                        __m128d __b) {
994   return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
995 }
996 
997 /// Compares the lower double-precision floating-point values in each of
998 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
999 ///    the value in the first parameter is less than or equal to the
1000 ///    corresponding value in the second parameter.
1001 ///
1002 ///    The comparison yields 0 for false, 1 for true. If either of the two
1003 ///    lower double-precision values is NaN, 0 is returned.
1004 ///
1005 /// \headerfile <x86intrin.h>
1006 ///
1007 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1008 ///
1009 /// \param __a
1010 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1011 ///    compared to the lower double-precision value of \a __b.
1012 /// \param __b
1013 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
1014 ///     compared to the lower double-precision value of \a __a.
1015 /// \returns An integer containing the comparison results. If either of the two
1016 ///     lower double-precision values is NaN, 0 is returned.
1017 static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a,
1018                                                        __m128d __b) {
1019   return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
1020 }
1021 
1022 /// Compares the lower double-precision floating-point values in each of
1023 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1024 ///    the value in the first parameter is greater than the corresponding value
1025 ///    in the second parameter.
1026 ///
1027 ///    The comparison yields 0 for false, 1 for true. If either of the two
1028 ///    lower double-precision values is NaN, 0 is returned.
1029 ///
1030 /// \headerfile <x86intrin.h>
1031 ///
1032 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1033 ///
1034 /// \param __a
1035 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1036 ///    compared to the lower double-precision value of \a __b.
1037 /// \param __b
1038 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1039 ///    compared to the lower double-precision value of \a __a.
1040 /// \returns An integer containing the comparison results. If either of the two
1041 ///     lower double-precision values is NaN, 0 is returned.
1042 static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a,
1043                                                        __m128d __b) {
1044   return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
1045 }
1046 
1047 /// Compares the lower double-precision floating-point values in each of
1048 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1049 ///    the value in the first parameter is greater than or equal to the
1050 ///    corresponding value in the second parameter.
1051 ///
1052 ///    The comparison yields 0 for false, 1 for true. If either of the two
1053 ///    lower double-precision values is NaN, 0 is returned.
1054 ///
1055 /// \headerfile <x86intrin.h>
1056 ///
1057 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1058 ///
1059 /// \param __a
1060 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1061 ///    compared to the lower double-precision value of \a __b.
1062 /// \param __b
1063 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1064 ///    compared to the lower double-precision value of \a __a.
1065 /// \returns An integer containing the comparison results. If either of the two
1066 ///    lower double-precision values is NaN, 0 is returned.
1067 static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a,
1068                                                        __m128d __b) {
1069   return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
1070 }
1071 
1072 /// Compares the lower double-precision floating-point values in each of
1073 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1074 ///    the value in the first parameter is unequal to the corresponding value in
1075 ///    the second parameter.
1076 ///
1077 ///    The comparison yields 0 for false, 1 for true. If either of the two
1078 ///    lower double-precision values is NaN, 1 is returned.
1079 ///
1080 /// \headerfile <x86intrin.h>
1081 ///
1082 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1083 ///
1084 /// \param __a
1085 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1086 ///    compared to the lower double-precision value of \a __b.
1087 /// \param __b
1088 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1089 ///    compared to the lower double-precision value of \a __a.
1090 /// \returns An integer containing the comparison results. If either of the two
1091 ///     lower double-precision values is NaN, 1 is returned.
1092 static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a,
1093                                                         __m128d __b) {
1094   return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
1095 }
1096 
1097 /// Compares the lower double-precision floating-point values in each of
1098 ///    the two 128-bit floating-point vectors of [2 x double] for equality. The
1099 ///    comparison yields 0 for false, 1 for true.
1100 ///
1101 ///    If either of the two lower double-precision values is NaN, 0 is returned.
1102 ///
1103 /// \headerfile <x86intrin.h>
1104 ///
1105 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1106 ///
1107 /// \param __a
1108 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1109 ///    compared to the lower double-precision value of \a __b.
1110 /// \param __b
1111 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1112 ///    compared to the lower double-precision value of \a __a.
1113 /// \returns An integer containing the comparison results. If either of the two
1114 ///    lower double-precision values is NaN, 0 is returned.
1115 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a,
1116                                                         __m128d __b) {
1117   return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
1118 }
1119 
1120 /// Compares the lower double-precision floating-point values in each of
1121 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1122 ///    the value in the first parameter is less than the corresponding value in
1123 ///    the second parameter.
1124 ///
1125 ///    The comparison yields 0 for false, 1 for true. If either of the two lower
1126 ///    double-precision values is NaN, 0 is returned.
1127 ///
1128 /// \headerfile <x86intrin.h>
1129 ///
1130 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1131 ///
1132 /// \param __a
1133 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1134 ///    compared to the lower double-precision value of \a __b.
1135 /// \param __b
1136 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1137 ///    compared to the lower double-precision value of \a __a.
1138 /// \returns An integer containing the comparison results. If either of the two
1139 ///    lower double-precision values is NaN, 0 is returned.
1140 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a,
1141                                                         __m128d __b) {
1142   return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
1143 }
1144 
1145 /// Compares the lower double-precision floating-point values in each of
1146 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1147 ///    the value in the first parameter is less than or equal to the
1148 ///    corresponding value in the second parameter.
1149 ///
1150 ///    The comparison yields 0 for false, 1 for true. If either of the two lower
1151 ///    double-precision values is NaN, 0 is returned.
1152 ///
1153 /// \headerfile <x86intrin.h>
1154 ///
1155 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1156 ///
1157 /// \param __a
1158 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1159 ///    compared to the lower double-precision value of \a __b.
1160 /// \param __b
1161 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
1162 ///     compared to the lower double-precision value of \a __a.
1163 /// \returns An integer containing the comparison results. If either of the two
1164 ///     lower double-precision values is NaN, 0 is returned.
1165 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a,
1166                                                         __m128d __b) {
1167   return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
1168 }
1169 
1170 /// Compares the lower double-precision floating-point values in each of
1171 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1172 ///    the value in the first parameter is greater than the corresponding value
1173 ///    in the second parameter.
1174 ///
1175 ///    The comparison yields 0 for false, 1 for true. If either of the two lower
1176 ///    double-precision values is NaN, 0 is returned.
1177 ///
1178 /// \headerfile <x86intrin.h>
1179 ///
1180 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1181 ///
1182 /// \param __a
1183 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1184 ///    compared to the lower double-precision value of \a __b.
1185 /// \param __b
1186 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
1187 ///     compared to the lower double-precision value of \a __a.
1188 /// \returns An integer containing the comparison results. If either of the two
1189 ///     lower double-precision values is NaN, 0 is returned.
1190 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a,
1191                                                         __m128d __b) {
1192   return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
1193 }
1194 
1195 /// Compares the lower double-precision floating-point values in each of
1196 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1197 ///    the value in the first parameter is greater than or equal to the
1198 ///    corresponding value in the second parameter.
1199 ///
1200 ///    The comparison yields 0 for false, 1 for true.  If either of the two
1201 ///    lower double-precision values is NaN, 0 is returned.
1202 ///
1203 /// \headerfile <x86intrin.h>
1204 ///
1205 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1206 ///
1207 /// \param __a
1208 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1209 ///    compared to the lower double-precision value of \a __b.
1210 /// \param __b
1211 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1212 ///    compared to the lower double-precision value of \a __a.
1213 /// \returns An integer containing the comparison results. If either of the two
1214 ///    lower double-precision values is NaN, 0 is returned.
1215 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a,
1216                                                         __m128d __b) {
1217   return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
1218 }
1219 
1220 /// Compares the lower double-precision floating-point values in each of
1221 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1222 ///    the value in the first parameter is unequal to the corresponding value in
1223 ///    the second parameter.
1224 ///
1225 ///    The comparison yields 0 for false, 1 for true. If either of the two lower
1226 ///    double-precision values is NaN, 1 is returned.
1227 ///
1228 /// \headerfile <x86intrin.h>
1229 ///
1230 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1231 ///
1232 /// \param __a
1233 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1234 ///    compared to the lower double-precision value of \a __b.
1235 /// \param __b
1236 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1237 ///    compared to the lower double-precision value of \a __a.
1238 /// \returns An integer containing the comparison result. If either of the two
1239 ///    lower double-precision values is NaN, 1 is returned.
1240 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a,
1241                                                          __m128d __b) {
1242   return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
1243 }
1244 
1245 /// Converts the two double-precision floating-point elements of a
1246 ///    128-bit vector of [2 x double] into two single-precision floating-point
1247 ///    values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
1248 ///    The upper 64 bits of the result vector are set to zero.
1249 ///
1250 /// \headerfile <x86intrin.h>
1251 ///
1252 /// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
1253 ///
1254 /// \param __a
1255 ///    A 128-bit vector of [2 x double].
1256 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1257 ///    converted values. The upper 64 bits are set to zero.
1258 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) {
1259   return __builtin_ia32_cvtpd2ps((__v2df)__a);
1260 }
1261 
1262 /// Converts the lower two single-precision floating-point elements of a
1263 ///    128-bit vector of [4 x float] into two double-precision floating-point
1264 ///    values, returned in a 128-bit vector of [2 x double]. The upper two
1265 ///    elements of the input vector are unused.
1266 ///
1267 /// \headerfile <x86intrin.h>
1268 ///
1269 /// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
1270 ///
1271 /// \param __a
1272 ///    A 128-bit vector of [4 x float]. The lower two single-precision
1273 ///    floating-point elements are converted to double-precision values. The
1274 ///    upper two elements are unused.
1275 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1276 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) {
1277   return (__m128d) __builtin_convertvector(
1278       __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
1279 }
1280 
1281 /// Converts the lower two integer elements of a 128-bit vector of
1282 ///    [4 x i32] into two double-precision floating-point values, returned in a
1283 ///    128-bit vector of [2 x double].
1284 ///
1285 ///    The upper two elements of the input vector are unused.
1286 ///
1287 /// \headerfile <x86intrin.h>
1288 ///
1289 /// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
1290 ///
1291 /// \param __a
1292 ///    A 128-bit integer vector of [4 x i32]. The lower two integer elements are
1293 ///    converted to double-precision values.
1294 ///
1295 ///    The upper two elements are unused.
1296 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1297 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) {
1298   return (__m128d) __builtin_convertvector(
1299       __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
1300 }
1301 
1302 /// Converts the two double-precision floating-point elements of a
1303 ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1304 ///    returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
1305 ///    64 bits of the result vector are set to zero.
1306 ///
1307 /// \headerfile <x86intrin.h>
1308 ///
1309 /// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
1310 ///
1311 /// \param __a
1312 ///    A 128-bit vector of [2 x double].
1313 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1314 ///    converted values. The upper 64 bits are set to zero.
1315 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) {
1316   return __builtin_ia32_cvtpd2dq((__v2df)__a);
1317 }
1318 
1319 /// Converts the low-order element of a 128-bit vector of [2 x double]
1320 ///    into a 32-bit signed integer value.
1321 ///
1322 /// \headerfile <x86intrin.h>
1323 ///
1324 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
1325 ///
1326 /// \param __a
1327 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1328 ///    conversion.
1329 /// \returns A 32-bit signed integer containing the converted value.
1330 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a) {
1331   return __builtin_ia32_cvtsd2si((__v2df)__a);
1332 }
1333 
1334 /// Converts the lower double-precision floating-point element of a
1335 ///    128-bit vector of [2 x double], in the second parameter, into a
1336 ///    single-precision floating-point value, returned in the lower 32 bits of a
1337 ///    128-bit vector of [4 x float]. The upper 96 bits of the result vector are
1338 ///    copied from the upper 96 bits of the first parameter.
1339 ///
1340 /// \headerfile <x86intrin.h>
1341 ///
1342 /// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
1343 ///
1344 /// \param __a
1345 ///    A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
1346 ///    copied to the upper 96 bits of the result.
1347 /// \param __b
1348 ///    A 128-bit vector of [2 x double]. The lower double-precision
1349 ///    floating-point element is used in the conversion.
1350 /// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
1351 ///    converted value from the second parameter. The upper 96 bits are copied
1352 ///    from the upper 96 bits of the first parameter.
1353 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a,
1354                                                          __m128d __b) {
1355   return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
1356 }
1357 
1358 /// Converts a 32-bit signed integer value, in the second parameter, into
1359 ///    a double-precision floating-point value, returned in the lower 64 bits of
1360 ///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1361 ///    are copied from the upper 64 bits of the first parameter.
1362 ///
1363 /// \headerfile <x86intrin.h>
1364 ///
1365 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
1366 ///
1367 /// \param __a
1368 ///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1369 ///    copied to the upper 64 bits of the result.
1370 /// \param __b
1371 ///    A 32-bit signed integer containing the value to be converted.
1372 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1373 ///    converted value from the second parameter. The upper 64 bits are copied
1374 ///    from the upper 64 bits of the first parameter.
1375 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a,
1376                                                             int __b) {
1377   __a[0] = __b;
1378   return __a;
1379 }
1380 
1381 /// Converts the lower single-precision floating-point element of a
1382 ///    128-bit vector of [4 x float], in the second parameter, into a
1383 ///    double-precision floating-point value, returned in the lower 64 bits of
1384 ///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1385 ///    are copied from the upper 64 bits of the first parameter.
1386 ///
1387 /// \headerfile <x86intrin.h>
1388 ///
1389 /// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
1390 ///
1391 /// \param __a
1392 ///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1393 ///    copied to the upper 64 bits of the result.
1394 /// \param __b
1395 ///    A 128-bit vector of [4 x float]. The lower single-precision
1396 ///    floating-point element is used in the conversion.
1397 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1398 ///    converted value from the second parameter. The upper 64 bits are copied
1399 ///    from the upper 64 bits of the first parameter.
1400 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a,
1401                                                           __m128 __b) {
1402   __a[0] = __b[0];
1403   return __a;
1404 }
1405 
1406 /// Converts the two double-precision floating-point elements of a
1407 ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1408 ///    returned in the lower 64 bits of a 128-bit vector of [4 x i32].
1409 ///
1410 ///    If the result of either conversion is inexact, the result is truncated
1411 ///    (rounded towards zero) regardless of the current MXCSR setting. The upper
1412 ///    64 bits of the result vector are set to zero.
1413 ///
1414 /// \headerfile <x86intrin.h>
1415 ///
1416 /// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
1417 ///   instruction.
1418 ///
1419 /// \param __a
1420 ///    A 128-bit vector of [2 x double].
1421 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1422 ///    converted values. The upper 64 bits are set to zero.
1423 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a) {
1424   return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
1425 }
1426 
1427 /// Converts the low-order element of a [2 x double] vector into a 32-bit
1428 ///    signed integer value, truncating the result when it is inexact.
1429 ///
1430 /// \headerfile <x86intrin.h>
1431 ///
1432 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
1433 ///   instruction.
1434 ///
1435 /// \param __a
1436 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1437 ///    conversion.
1438 /// \returns A 32-bit signed integer containing the converted value.
1439 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) {
1440   return __builtin_ia32_cvttsd2si((__v2df)__a);
1441 }
1442 
1443 /// Converts the two double-precision floating-point elements of a
1444 ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1445 ///    returned in a 64-bit vector of [2 x i32].
1446 ///
1447 /// \headerfile <x86intrin.h>
1448 ///
1449 /// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
1450 ///
1451 /// \param __a
1452 ///    A 128-bit vector of [2 x double].
1453 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
1454 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) {
1455   return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
1456 }
1457 
1458 /// Converts the two double-precision floating-point elements of a
1459 ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1460 ///    returned in a 64-bit vector of [2 x i32].
1461 ///
1462 ///    If the result of either conversion is inexact, the result is truncated
1463 ///    (rounded towards zero) regardless of the current MXCSR setting.
1464 ///
1465 /// \headerfile <x86intrin.h>
1466 ///
1467 /// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
1468 ///
1469 /// \param __a
1470 ///    A 128-bit vector of [2 x double].
1471 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
1472 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) {
1473   return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
1474 }
1475 
1476 /// Converts the two signed 32-bit integer elements of a 64-bit vector of
1477 ///    [2 x i32] into two double-precision floating-point values, returned in a
1478 ///    128-bit vector of [2 x double].
1479 ///
1480 /// \headerfile <x86intrin.h>
1481 ///
1482 /// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
1483 ///
1484 /// \param __a
1485 ///    A 64-bit vector of [2 x i32].
1486 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1487 static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a) {
1488   return __builtin_ia32_cvtpi2pd((__v2si)__a);
1489 }
1490 
1491 /// Returns the low-order element of a 128-bit vector of [2 x double] as
1492 ///    a double-precision floating-point value.
1493 ///
1494 /// \headerfile <x86intrin.h>
1495 ///
1496 /// This intrinsic has no corresponding instruction.
1497 ///
1498 /// \param __a
1499 ///    A 128-bit vector of [2 x double]. The lower 64 bits are returned.
1500 /// \returns A double-precision floating-point value copied from the lower 64
1501 ///    bits of \a __a.
1502 static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a) {
1503   return __a[0];
1504 }
1505 
1506 /// Loads a 128-bit floating-point vector of [2 x double] from an aligned
1507 ///    memory location.
1508 ///
1509 /// \headerfile <x86intrin.h>
1510 ///
1511 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1512 ///
1513 /// \param __dp
1514 ///    A pointer to a 128-bit memory location. The address of the memory
1515 ///    location has to be 16-byte aligned.
1516 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
1517 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp) {
1518   return *(const __m128d *)__dp;
1519 }
1520 
1521 /// Loads a double-precision floating-point value from a specified memory
1522 ///    location and duplicates it to both vector elements of a 128-bit vector of
1523 ///    [2 x double].
1524 ///
1525 /// \headerfile <x86intrin.h>
1526 ///
1527 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
1528 ///
1529 /// \param __dp
1530 ///    A pointer to a memory location containing a double-precision value.
1531 /// \returns A 128-bit vector of [2 x double] containing the loaded and
1532 ///    duplicated values.
1533 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp) {
1534   struct __mm_load1_pd_struct {
1535     double __u;
1536   } __attribute__((__packed__, __may_alias__));
1537   double __u = ((const struct __mm_load1_pd_struct *)__dp)->__u;
1538   return __extension__(__m128d){__u, __u};
1539 }
1540 
/// Alternate spelling of _mm_load1_pd(); the pointer argument is forwarded
///    unchanged.
#define _mm_load_pd1(__dp) _mm_load1_pd(__dp)
1542 
1543 /// Loads two double-precision values, in reverse order, from an aligned
1544 ///    memory location into a 128-bit vector of [2 x double].
1545 ///
1546 /// \headerfile <x86intrin.h>
1547 ///
1548 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
1549 /// needed shuffling instructions. In AVX mode, the shuffling may be combined
1550 /// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
1551 ///
1552 /// \param __dp
1553 ///    A 16-byte aligned pointer to an array of double-precision values to be
1554 ///    loaded in reverse order.
1555 /// \returns A 128-bit vector of [2 x double] containing the reversed loaded
1556 ///    values.
1557 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp) {
1558   __m128d __u = *(const __m128d *)__dp;
1559   return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
1560 }
1561 
1562 /// Loads a 128-bit floating-point vector of [2 x double] from an
1563 ///    unaligned memory location.
1564 ///
1565 /// \headerfile <x86intrin.h>
1566 ///
1567 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1568 ///
1569 /// \param __dp
1570 ///    A pointer to a 128-bit memory location. The address of the memory
1571 ///    location does not have to be aligned.
1572 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
1573 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp) {
1574   struct __loadu_pd {
1575     __m128d_u __v;
1576   } __attribute__((__packed__, __may_alias__));
1577   return ((const struct __loadu_pd *)__dp)->__v;
1578 }
1579 
1580 /// Loads a 64-bit integer value to the low element of a 128-bit integer
1581 ///    vector and clears the upper element.
1582 ///
1583 /// \headerfile <x86intrin.h>
1584 ///
1585 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1586 ///
1587 /// \param __a
1588 ///    A pointer to a 64-bit memory location. The address of the memory
1589 ///    location does not have to be aligned.
1590 /// \returns A 128-bit vector of [2 x i64] containing the loaded value.
1591 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a) {
1592   struct __loadu_si64 {
1593     long long __v;
1594   } __attribute__((__packed__, __may_alias__));
1595   long long __u = ((const struct __loadu_si64 *)__a)->__v;
1596   return __extension__(__m128i)(__v2di){__u, 0LL};
1597 }
1598 
1599 /// Loads a 32-bit integer value to the low element of a 128-bit integer
1600 ///    vector and clears the upper element.
1601 ///
1602 /// \headerfile <x86intrin.h>
1603 ///
1604 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
1605 ///
1606 /// \param __a
1607 ///    A pointer to a 32-bit memory location. The address of the memory
1608 ///    location does not have to be aligned.
1609 /// \returns A 128-bit vector of [4 x i32] containing the loaded value.
1610 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a) {
1611   struct __loadu_si32 {
1612     int __v;
1613   } __attribute__((__packed__, __may_alias__));
1614   int __u = ((const struct __loadu_si32 *)__a)->__v;
1615   return __extension__(__m128i)(__v4si){__u, 0, 0, 0};
1616 }
1617 
1618 /// Loads a 16-bit integer value to the low element of a 128-bit integer
1619 ///    vector and clears the upper element.
1620 ///
1621 /// \headerfile <x86intrin.h>
1622 ///
1623 /// This intrinsic does not correspond to a specific instruction.
1624 ///
1625 /// \param __a
1626 ///    A pointer to a 16-bit memory location. The address of the memory
1627 ///    location does not have to be aligned.
1628 /// \returns A 128-bit vector of [8 x i16] containing the loaded value.
1629 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a) {
1630   struct __loadu_si16 {
1631     short __v;
1632   } __attribute__((__packed__, __may_alias__));
1633   short __u = ((const struct __loadu_si16 *)__a)->__v;
1634   return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
1635 }
1636 
1637 /// Loads a 64-bit double-precision value to the low element of a
1638 ///    128-bit integer vector and clears the upper element.
1639 ///
1640 /// \headerfile <x86intrin.h>
1641 ///
1642 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1643 ///
1644 /// \param __dp
1645 ///    A pointer to a memory location containing a double-precision value.
1646 ///    The address of the memory location does not have to be aligned.
1647 /// \returns A 128-bit vector of [2 x double] containing the loaded value.
1648 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp) {
1649   struct __mm_load_sd_struct {
1650     double __u;
1651   } __attribute__((__packed__, __may_alias__));
1652   double __u = ((const struct __mm_load_sd_struct *)__dp)->__u;
1653   return __extension__(__m128d){__u, 0};
1654 }
1655 
1656 /// Loads a double-precision value into the high-order bits of a 128-bit
1657 ///    vector of [2 x double]. The low-order bits are copied from the low-order
1658 ///    bits of the first operand.
1659 ///
1660 /// \headerfile <x86intrin.h>
1661 ///
1662 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1663 ///
1664 /// \param __a
1665 ///    A 128-bit vector of [2 x double]. \n
1666 ///    Bits [63:0] are written to bits [63:0] of the result.
1667 /// \param __dp
1668 ///    A pointer to a 64-bit memory location containing a double-precision
1669 ///    floating-point value that is loaded. The loaded value is written to bits
1670 ///    [127:64] of the result. The address of the memory location does not have
1671 ///    to be aligned.
1672 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1673 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a,
1674                                                           double const *__dp) {
1675   struct __mm_loadh_pd_struct {
1676     double __u;
1677   } __attribute__((__packed__, __may_alias__));
1678   double __u = ((const struct __mm_loadh_pd_struct *)__dp)->__u;
1679   return __extension__(__m128d){__a[0], __u};
1680 }
1681 
1682 /// Loads a double-precision value into the low-order bits of a 128-bit
1683 ///    vector of [2 x double]. The high-order bits are copied from the
1684 ///    high-order bits of the first operand.
1685 ///
1686 /// \headerfile <x86intrin.h>
1687 ///
1688 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1689 ///
1690 /// \param __a
1691 ///    A 128-bit vector of [2 x double]. \n
1692 ///    Bits [127:64] are written to bits [127:64] of the result.
1693 /// \param __dp
1694 ///    A pointer to a 64-bit memory location containing a double-precision
1695 ///    floating-point value that is loaded. The loaded value is written to bits
1696 ///    [63:0] of the result. The address of the memory location does not have to
1697 ///    be aligned.
1698 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1699 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a,
1700                                                           double const *__dp) {
1701   struct __mm_loadl_pd_struct {
1702     double __u;
1703   } __attribute__((__packed__, __may_alias__));
1704   double __u = ((const struct __mm_loadl_pd_struct *)__dp)->__u;
1705   return __extension__(__m128d){__u, __a[1]};
1706 }
1707 
1708 /// Constructs a 128-bit floating-point vector of [2 x double] with
1709 ///    unspecified content. This could be used as an argument to another
1710 ///    intrinsic function where the argument is required but the value is not
1711 ///    actually used.
1712 ///
1713 /// \headerfile <x86intrin.h>
1714 ///
1715 /// This intrinsic has no corresponding instruction.
1716 ///
1717 /// \returns A 128-bit floating-point vector of [2 x double] with unspecified
1718 ///    content.
1719 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) {
1720   return (__m128d)__builtin_ia32_undef128();
1721 }
1722 
1723 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1724 ///    64 bits of the vector are initialized with the specified double-precision
1725 ///    floating-point value. The upper 64 bits are set to zero.
1726 ///
1727 /// \headerfile <x86intrin.h>
1728 ///
1729 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1730 ///
1731 /// \param __w
1732 ///    A double-precision floating-point value used to initialize the lower 64
1733 ///    bits of the result.
1734 /// \returns An initialized 128-bit floating-point vector of [2 x double]. The
1735 ///    lower 64 bits contain the value of the parameter. The upper 64 bits are
1736 ///    set to zero.
1737 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) {
1738   return __extension__(__m128d){__w, 0};
1739 }
1740 
1741 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1742 ///    of the two double-precision floating-point vector elements set to the
1743 ///    specified double-precision floating-point value.
1744 ///
1745 /// \headerfile <x86intrin.h>
1746 ///
1747 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1748 ///
1749 /// \param __w
1750 ///    A double-precision floating-point value used to initialize each vector
1751 ///    element of the result.
1752 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1753 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) {
1754   return __extension__(__m128d){__w, __w};
1755 }
1756 
1757 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1758 ///    of the two double-precision floating-point vector elements set to the
1759 ///    specified double-precision floating-point value.
1760 ///
1761 /// \headerfile <x86intrin.h>
1762 ///
1763 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1764 ///
1765 /// \param __w
1766 ///    A double-precision floating-point value used to initialize each vector
1767 ///    element of the result.
1768 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1769 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w) {
1770   return _mm_set1_pd(__w);
1771 }
1772 
1773 /// Constructs a 128-bit floating-point vector of [2 x double]
1774 ///    initialized with the specified double-precision floating-point values.
1775 ///
1776 /// \headerfile <x86intrin.h>
1777 ///
1778 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1779 ///
1780 /// \param __w
1781 ///    A double-precision floating-point value used to initialize the upper 64
1782 ///    bits of the result.
1783 /// \param __x
1784 ///    A double-precision floating-point value used to initialize the lower 64
1785 ///    bits of the result.
1786 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1787 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w,
1788                                                         double __x) {
1789   return __extension__(__m128d){__x, __w};
1790 }
1791 
1792 /// Constructs a 128-bit floating-point vector of [2 x double],
1793 ///    initialized in reverse order with the specified double-precision
1794 ///    floating-point values.
1795 ///
1796 /// \headerfile <x86intrin.h>
1797 ///
1798 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1799 ///
1800 /// \param __w
1801 ///    A double-precision floating-point value used to initialize the lower 64
1802 ///    bits of the result.
1803 /// \param __x
1804 ///    A double-precision floating-point value used to initialize the upper 64
1805 ///    bits of the result.
1806 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1807 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w,
1808                                                          double __x) {
1809   return __extension__(__m128d){__w, __x};
1810 }
1811 
1812 /// Constructs a 128-bit floating-point vector of [2 x double]
1813 ///    initialized to zero.
1814 ///
1815 /// \headerfile <x86intrin.h>
1816 ///
1817 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1818 ///
1819 /// \returns An initialized 128-bit floating-point vector of [2 x double] with
1820 ///    all elements set to zero.
1821 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) {
1822   return __extension__(__m128d){0.0, 0.0};
1823 }
1824 
1825 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1826 ///    64 bits are set to the lower 64 bits of the second parameter. The upper
1827 ///    64 bits are set to the upper 64 bits of the first parameter.
1828 ///
1829 /// \headerfile <x86intrin.h>
1830 ///
1831 /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
1832 ///
1833 /// \param __a
1834 ///    A 128-bit vector of [2 x double]. The upper 64 bits are written to the
1835 ///    upper 64 bits of the result.
1836 /// \param __b
1837 ///    A 128-bit vector of [2 x double]. The lower 64 bits are written to the
1838 ///    lower 64 bits of the result.
1839 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1840 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a,
1841                                                          __m128d __b) {
1842   __a[0] = __b[0];
1843   return __a;
1844 }
1845 
1846 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1847 ///    memory location.
1848 ///
1849 /// \headerfile <x86intrin.h>
1850 ///
1851 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1852 ///
1853 /// \param __dp
1854 ///    A pointer to a 64-bit memory location.
1855 /// \param __a
1856 ///    A 128-bit vector of [2 x double] containing the value to be stored.
1857 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp,
1858                                                        __m128d __a) {
1859   struct __mm_store_sd_struct {
1860     double __u;
1861   } __attribute__((__packed__, __may_alias__));
1862   ((struct __mm_store_sd_struct *)__dp)->__u = __a[0];
1863 }
1864 
1865 /// Moves packed double-precision values from a 128-bit vector of
1866 ///    [2 x double] to a memory location.
1867 ///
1868 /// \headerfile <x86intrin.h>
1869 ///
1870 /// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
1871 ///
1872 /// \param __dp
1873 ///    A pointer to an aligned memory location that can store two
1874 ///    double-precision values.
1875 /// \param __a
1876 ///    A packed 128-bit vector of [2 x double] containing the values to be
1877 ///    moved.
1878 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp,
1879                                                        __m128d __a) {
1880   *(__m128d *)__dp = __a;
1881 }
1882 
1883 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1884 ///    the upper and lower 64 bits of a memory location.
1885 ///
1886 /// \headerfile <x86intrin.h>
1887 ///
1888 /// This intrinsic corresponds to the
1889 ///   <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1890 ///
1891 /// \param __dp
1892 ///    A pointer to a memory location that can store two double-precision
1893 ///    values.
1894 /// \param __a
1895 ///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1896 ///    of the values in \a __dp.
1897 static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp,
1898                                                         __m128d __a) {
1899   __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
1900   _mm_store_pd(__dp, __a);
1901 }
1902 
1903 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1904 ///    the upper and lower 64 bits of a memory location.
1905 ///
1906 /// \headerfile <x86intrin.h>
1907 ///
1908 /// This intrinsic corresponds to the
1909 ///   <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1910 ///
1911 /// \param __dp
1912 ///    A pointer to a memory location that can store two double-precision
1913 ///    values.
1914 /// \param __a
1915 ///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1916 ///    of the values in \a __dp.
1917 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp,
1918                                                         __m128d __a) {
1919   _mm_store1_pd(__dp, __a);
1920 }
1921 
1922 /// Stores a 128-bit vector of [2 x double] into an unaligned memory
1923 ///    location.
1924 ///
1925 /// \headerfile <x86intrin.h>
1926 ///
1927 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1928 ///
1929 /// \param __dp
1930 ///    A pointer to a 128-bit memory location. The address of the memory
1931 ///    location does not have to be aligned.
1932 /// \param __a
1933 ///    A 128-bit vector of [2 x double] containing the values to be stored.
1934 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp,
1935                                                         __m128d __a) {
1936   struct __storeu_pd {
1937     __m128d_u __v;
1938   } __attribute__((__packed__, __may_alias__));
1939   ((struct __storeu_pd *)__dp)->__v = __a;
1940 }
1941 
1942 /// Stores two double-precision values, in reverse order, from a 128-bit
1943 ///    vector of [2 x double] to a 16-byte aligned memory location.
1944 ///
1945 /// \headerfile <x86intrin.h>
1946 ///
1947 /// This intrinsic corresponds to a shuffling instruction followed by a
1948 /// <c> VMOVAPD / MOVAPD </c> instruction.
1949 ///
1950 /// \param __dp
1951 ///    A pointer to a 16-byte aligned memory location that can store two
1952 ///    double-precision values.
1953 /// \param __a
1954 ///    A 128-bit vector of [2 x double] containing the values to be reversed and
1955 ///    stored.
1956 static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp,
1957                                                         __m128d __a) {
1958   __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
1959   *(__m128d *)__dp = __a;
1960 }
1961 
1962 /// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
1963 ///    memory location.
1964 ///
1965 /// \headerfile <x86intrin.h>
1966 ///
1967 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1968 ///
1969 /// \param __dp
1970 ///    A pointer to a 64-bit memory location.
1971 /// \param __a
1972 ///    A 128-bit vector of [2 x double] containing the value to be stored.
1973 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp,
1974                                                         __m128d __a) {
1975   struct __mm_storeh_pd_struct {
1976     double __u;
1977   } __attribute__((__packed__, __may_alias__));
1978   ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[1];
1979 }
1980 
1981 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1982 ///    memory location.
1983 ///
1984 /// \headerfile <x86intrin.h>
1985 ///
1986 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1987 ///
1988 /// \param __dp
1989 ///    A pointer to a 64-bit memory location.
1990 /// \param __a
1991 ///    A 128-bit vector of [2 x double] containing the value to be stored.
1992 static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp,
1993                                                         __m128d __a) {
1994   struct __mm_storeh_pd_struct {
1995     double __u;
1996   } __attribute__((__packed__, __may_alias__));
1997   ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[0];
1998 }
1999 
2000 /// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
2001 ///    saving the lower 8 bits of each sum in the corresponding element of a
2002 ///    128-bit result vector of [16 x i8].
2003 ///
2004 ///    The integer elements of both parameters can be either signed or unsigned.
2005 ///
2006 /// \headerfile <x86intrin.h>
2007 ///
2008 /// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
2009 ///
2010 /// \param __a
2011 ///    A 128-bit vector of [16 x i8].
2012 /// \param __b
2013 ///    A 128-bit vector of [16 x i8].
2014 /// \returns A 128-bit vector of [16 x i8] containing the sums of both
2015 ///    parameters.
2016 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a,
2017                                                           __m128i __b) {
2018   return (__m128i)((__v16qu)__a + (__v16qu)__b);
2019 }
2020 
2021 /// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
2022 ///    saving the lower 16 bits of each sum in the corresponding element of a
2023 ///    128-bit result vector of [8 x i16].
2024 ///
2025 ///    The integer elements of both parameters can be either signed or unsigned.
2026 ///
2027 /// \headerfile <x86intrin.h>
2028 ///
2029 /// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
2030 ///
2031 /// \param __a
2032 ///    A 128-bit vector of [8 x i16].
2033 /// \param __b
2034 ///    A 128-bit vector of [8 x i16].
2035 /// \returns A 128-bit vector of [8 x i16] containing the sums of both
2036 ///    parameters.
2037 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a,
2038                                                            __m128i __b) {
2039   return (__m128i)((__v8hu)__a + (__v8hu)__b);
2040 }
2041 
2042 /// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
2043 ///    saving the lower 32 bits of each sum in the corresponding element of a
2044 ///    128-bit result vector of [4 x i32].
2045 ///
2046 ///    The integer elements of both parameters can be either signed or unsigned.
2047 ///
2048 /// \headerfile <x86intrin.h>
2049 ///
2050 /// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
2051 ///
2052 /// \param __a
2053 ///    A 128-bit vector of [4 x i32].
2054 /// \param __b
2055 ///    A 128-bit vector of [4 x i32].
2056 /// \returns A 128-bit vector of [4 x i32] containing the sums of both
2057 ///    parameters.
2058 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a,
2059                                                            __m128i __b) {
2060   return (__m128i)((__v4su)__a + (__v4su)__b);
2061 }
2062 
2063 /// Adds two signed or unsigned 64-bit integer values, returning the
2064 ///    lower 64 bits of the sum.
2065 ///
2066 /// \headerfile <x86intrin.h>
2067 ///
2068 /// This intrinsic corresponds to the <c> PADDQ </c> instruction.
2069 ///
2070 /// \param __a
2071 ///    A 64-bit integer.
2072 /// \param __b
2073 ///    A 64-bit integer.
2074 /// \returns A 64-bit integer containing the sum of both parameters.
2075 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a,
2076                                                             __m64 __b) {
2077   return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
2078 }
2079 
2080 /// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
2081 ///    saving the lower 64 bits of each sum in the corresponding element of a
2082 ///    128-bit result vector of [2 x i64].
2083 ///
2084 ///    The integer elements of both parameters can be either signed or unsigned.
2085 ///
2086 /// \headerfile <x86intrin.h>
2087 ///
2088 /// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
2089 ///
2090 /// \param __a
2091 ///    A 128-bit vector of [2 x i64].
2092 /// \param __b
2093 ///    A 128-bit vector of [2 x i64].
2094 /// \returns A 128-bit vector of [2 x i64] containing the sums of both
2095 ///    parameters.
2096 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a,
2097                                                            __m128i __b) {
2098   return (__m128i)((__v2du)__a + (__v2du)__b);
2099 }
2100 
2101 /// Adds, with saturation, the corresponding elements of two 128-bit
2102 ///    signed [16 x i8] vectors, saving each sum in the corresponding element of
2103 ///    a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are
2104 ///    saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80.
2105 ///
2106 /// \headerfile <x86intrin.h>
2107 ///
2108 /// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
2109 ///
2110 /// \param __a
2111 ///    A 128-bit signed [16 x i8] vector.
2112 /// \param __b
2113 ///    A 128-bit signed [16 x i8] vector.
2114 /// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
2115 ///    both parameters.
2116 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a,
2117                                                            __m128i __b) {
2118   return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b);
2119 }
2120 
2121 /// Adds, with saturation, the corresponding elements of two 128-bit
2122 ///    signed [8 x i16] vectors, saving each sum in the corresponding element of
2123 ///    a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF
2124 ///    are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
2125 ///    0x8000.
2126 ///
2127 /// \headerfile <x86intrin.h>
2128 ///
2129 /// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
2130 ///
2131 /// \param __a
2132 ///    A 128-bit signed [8 x i16] vector.
2133 /// \param __b
2134 ///    A 128-bit signed [8 x i16] vector.
2135 /// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
2136 ///    both parameters.
2137 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a,
2138                                                             __m128i __b) {
2139   return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b);
2140 }
2141 
2142 /// Adds, with saturation, the corresponding elements of two 128-bit
2143 ///    unsigned [16 x i8] vectors, saving each sum in the corresponding element
///    of a 128-bit result vector of [16 x i8]. Sums greater than 0xFF are
///    saturated to 0xFF; the operands are unsigned, so no sum is negative.
2146 ///
2147 /// \headerfile <x86intrin.h>
2148 ///
2149 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2150 ///
2151 /// \param __a
2152 ///    A 128-bit unsigned [16 x i8] vector.
2153 /// \param __b
2154 ///    A 128-bit unsigned [16 x i8] vector.
2155 /// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
2156 ///    of both parameters.
2157 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a,
2158                                                            __m128i __b) {
2159   return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b);
2160 }
2161 
2162 /// Adds, with saturation, the corresponding elements of two 128-bit
2163 ///    unsigned [8 x i16] vectors, saving each sum in the corresponding element
///    of a 128-bit result vector of [8 x i16]. Sums greater than 0xFFFF are
///    saturated to 0xFFFF; the operands are unsigned, so no sum is negative.
2166 ///
2167 /// \headerfile <x86intrin.h>
2168 ///
/// This intrinsic corresponds to the <c> VPADDUSW / PADDUSW </c> instruction.
2170 ///
2171 /// \param __a
2172 ///    A 128-bit unsigned [8 x i16] vector.
2173 /// \param __b
2174 ///    A 128-bit unsigned [8 x i16] vector.
2175 /// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
2176 ///    of both parameters.
2177 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a,
2178                                                             __m128i __b) {
2179   return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b);
2180 }
2181 
2182 /// Computes the rounded averages of corresponding elements of two
2183 ///    128-bit unsigned [16 x i8] vectors, saving each result in the
2184 ///    corresponding element of a 128-bit result vector of [16 x i8].
2185 ///
2186 /// \headerfile <x86intrin.h>
2187 ///
2188 /// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
2189 ///
2190 /// \param __a
2191 ///    A 128-bit unsigned [16 x i8] vector.
2192 /// \param __b
2193 ///    A 128-bit unsigned [16 x i8] vector.
2194 /// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
2195 ///    averages of both parameters.
2196 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a,
2197                                                           __m128i __b) {
2198   return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
2199 }
2200 
2201 /// Computes the rounded averages of corresponding elements of two
2202 ///    128-bit unsigned [8 x i16] vectors, saving each result in the
2203 ///    corresponding element of a 128-bit result vector of [8 x i16].
2204 ///
2205 /// \headerfile <x86intrin.h>
2206 ///
2207 /// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
2208 ///
2209 /// \param __a
2210 ///    A 128-bit unsigned [8 x i16] vector.
2211 /// \param __b
2212 ///    A 128-bit unsigned [8 x i16] vector.
2213 /// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
2214 ///    averages of both parameters.
2215 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a,
2216                                                            __m128i __b) {
2217   return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
2218 }
2219 
2220 /// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
2221 ///    vectors, producing eight intermediate 32-bit signed integer products, and
2222 ///    adds the consecutive pairs of 32-bit products to form a 128-bit signed
2223 ///    [4 x i32] vector.
2224 ///
2225 ///    For example, bits [15:0] of both parameters are multiplied producing a
2226 ///    32-bit product, bits [31:16] of both parameters are multiplied producing
2227 ///    a 32-bit product, and the sum of those two products becomes bits [31:0]
2228 ///    of the result.
2229 ///
2230 /// \headerfile <x86intrin.h>
2231 ///
2232 /// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
2233 ///
2234 /// \param __a
2235 ///    A 128-bit signed [8 x i16] vector.
2236 /// \param __b
2237 ///    A 128-bit signed [8 x i16] vector.
2238 /// \returns A 128-bit signed [4 x i32] vector containing the sums of products
2239 ///    of both parameters.
2240 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a,
2241                                                             __m128i __b) {
2242   return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
2243 }
2244 
2245 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2246 ///    vectors, saving the greater value from each comparison in the
2247 ///    corresponding element of a 128-bit result vector of [8 x i16].
2248 ///
2249 /// \headerfile <x86intrin.h>
2250 ///
2251 /// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
2252 ///
2253 /// \param __a
2254 ///    A 128-bit signed [8 x i16] vector.
2255 /// \param __b
2256 ///    A 128-bit signed [8 x i16] vector.
2257 /// \returns A 128-bit signed [8 x i16] vector containing the greater value of
2258 ///    each comparison.
2259 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a,
2260                                                            __m128i __b) {
2261   return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b);
2262 }
2263 
2264 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2265 ///    vectors, saving the greater value from each comparison in the
2266 ///    corresponding element of a 128-bit result vector of [16 x i8].
2267 ///
2268 /// \headerfile <x86intrin.h>
2269 ///
2270 /// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
2271 ///
2272 /// \param __a
2273 ///    A 128-bit unsigned [16 x i8] vector.
2274 /// \param __b
2275 ///    A 128-bit unsigned [16 x i8] vector.
2276 /// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
2277 ///    each comparison.
2278 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a,
2279                                                           __m128i __b) {
2280   return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b);
2281 }
2282 
2283 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2284 ///    vectors, saving the smaller value from each comparison in the
2285 ///    corresponding element of a 128-bit result vector of [8 x i16].
2286 ///
2287 /// \headerfile <x86intrin.h>
2288 ///
2289 /// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
2290 ///
2291 /// \param __a
2292 ///    A 128-bit signed [8 x i16] vector.
2293 /// \param __b
2294 ///    A 128-bit signed [8 x i16] vector.
2295 /// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
2296 ///    each comparison.
2297 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a,
2298                                                            __m128i __b) {
2299   return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b);
2300 }
2301 
2302 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2303 ///    vectors, saving the smaller value from each comparison in the
2304 ///    corresponding element of a 128-bit result vector of [16 x i8].
2305 ///
2306 /// \headerfile <x86intrin.h>
2307 ///
2308 /// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
2309 ///
2310 /// \param __a
2311 ///    A 128-bit unsigned [16 x i8] vector.
2312 /// \param __b
2313 ///    A 128-bit unsigned [16 x i8] vector.
2314 /// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
2315 ///    each comparison.
2316 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a,
2317                                                           __m128i __b) {
2318   return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b);
2319 }
2320 
2321 /// Multiplies the corresponding elements of two signed [8 x i16]
2322 ///    vectors, saving the upper 16 bits of each 32-bit product in the
2323 ///    corresponding element of a 128-bit signed [8 x i16] result vector.
2324 ///
2325 /// \headerfile <x86intrin.h>
2326 ///
2327 /// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
2328 ///
2329 /// \param __a
2330 ///    A 128-bit signed [8 x i16] vector.
2331 /// \param __b
2332 ///    A 128-bit signed [8 x i16] vector.
2333 /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
2334 ///    each of the eight 32-bit products.
2335 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a,
2336                                                              __m128i __b) {
2337   return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
2338 }
2339 
2340 /// Multiplies the corresponding elements of two unsigned [8 x i16]
2341 ///    vectors, saving the upper 16 bits of each 32-bit product in the
2342 ///    corresponding element of a 128-bit unsigned [8 x i16] result vector.
2343 ///
2344 /// \headerfile <x86intrin.h>
2345 ///
2346 /// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
2347 ///
2348 /// \param __a
2349 ///    A 128-bit unsigned [8 x i16] vector.
2350 /// \param __b
2351 ///    A 128-bit unsigned [8 x i16] vector.
2352 /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
2353 ///    of each of the eight 32-bit products.
2354 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a,
2355                                                              __m128i __b) {
2356   return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
2357 }
2358 
2359 /// Multiplies the corresponding elements of two signed [8 x i16]
2360 ///    vectors, saving the lower 16 bits of each 32-bit product in the
2361 ///    corresponding element of a 128-bit signed [8 x i16] result vector.
2362 ///
2363 /// \headerfile <x86intrin.h>
2364 ///
2365 /// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
2366 ///
2367 /// \param __a
2368 ///    A 128-bit signed [8 x i16] vector.
2369 /// \param __b
2370 ///    A 128-bit signed [8 x i16] vector.
2371 /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
2372 ///    each of the eight 32-bit products.
2373 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a,
2374                                                              __m128i __b) {
2375   return (__m128i)((__v8hu)__a * (__v8hu)__b);
2376 }
2377 
2378 /// Multiplies 32-bit unsigned integer values contained in the lower bits
2379 ///    of the two 64-bit integer vectors and returns the 64-bit unsigned
2380 ///    product.
2381 ///
2382 /// \headerfile <x86intrin.h>
2383 ///
2384 /// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
2385 ///
2386 /// \param __a
2387 ///    A 64-bit integer containing one of the source operands.
2388 /// \param __b
2389 ///    A 64-bit integer containing one of the source operands.
2390 /// \returns A 64-bit integer vector containing the product of both operands.
2391 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a,
2392                                                             __m64 __b) {
2393   return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
2394 }
2395 
2396 /// Multiplies 32-bit unsigned integer values contained in the lower
2397 ///    bits of the corresponding elements of two [2 x i64] vectors, and returns
2398 ///    the 64-bit products in the corresponding elements of a [2 x i64] vector.
2399 ///
2400 /// \headerfile <x86intrin.h>
2401 ///
2402 /// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
2403 ///
2404 /// \param __a
2405 ///    A [2 x i64] vector containing one of the source operands.
2406 /// \param __b
2407 ///    A [2 x i64] vector containing one of the source operands.
2408 /// \returns A [2 x i64] vector containing the product of both operands.
2409 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a,
2410                                                            __m128i __b) {
2411   return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
2412 }
2413 
2414 /// Computes the absolute differences of corresponding 8-bit integer
2415 ///    values in two 128-bit vectors. Sums the first 8 absolute differences, and
2416 ///    separately sums the second 8 absolute differences. Packs these two
2417 ///    unsigned 16-bit integer sums into the upper and lower elements of a
2418 ///    [2 x i64] vector.
2419 ///
2420 /// \headerfile <x86intrin.h>
2421 ///
2422 /// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
2423 ///
2424 /// \param __a
2425 ///    A 128-bit integer vector containing one of the source operands.
2426 /// \param __b
2427 ///    A 128-bit integer vector containing one of the source operands.
2428 /// \returns A [2 x i64] vector containing the sums of the sets of absolute
2429 ///    differences between both operands.
2430 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a,
2431                                                           __m128i __b) {
2432   return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
2433 }
2434 
2435 /// Subtracts the corresponding 8-bit integer values in the operands.
2436 ///
2437 /// \headerfile <x86intrin.h>
2438 ///
2439 /// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
2440 ///
2441 /// \param __a
2442 ///    A 128-bit integer vector containing the minuends.
2443 /// \param __b
2444 ///    A 128-bit integer vector containing the subtrahends.
2445 /// \returns A 128-bit integer vector containing the differences of the values
2446 ///    in the operands.
2447 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a,
2448                                                           __m128i __b) {
2449   return (__m128i)((__v16qu)__a - (__v16qu)__b);
2450 }
2451 
2452 /// Subtracts the corresponding 16-bit integer values in the operands.
2453 ///
2454 /// \headerfile <x86intrin.h>
2455 ///
2456 /// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
2457 ///
2458 /// \param __a
2459 ///    A 128-bit integer vector containing the minuends.
2460 /// \param __b
2461 ///    A 128-bit integer vector containing the subtrahends.
2462 /// \returns A 128-bit integer vector containing the differences of the values
2463 ///    in the operands.
2464 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a,
2465                                                            __m128i __b) {
2466   return (__m128i)((__v8hu)__a - (__v8hu)__b);
2467 }
2468 
2469 /// Subtracts the corresponding 32-bit integer values in the operands.
2470 ///
2471 /// \headerfile <x86intrin.h>
2472 ///
2473 /// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
2474 ///
2475 /// \param __a
2476 ///    A 128-bit integer vector containing the minuends.
2477 /// \param __b
2478 ///    A 128-bit integer vector containing the subtrahends.
2479 /// \returns A 128-bit integer vector containing the differences of the values
2480 ///    in the operands.
2481 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a,
2482                                                            __m128i __b) {
2483   return (__m128i)((__v4su)__a - (__v4su)__b);
2484 }
2485 
2486 /// Subtracts signed or unsigned 64-bit integer values and writes the
2487 ///    difference to the corresponding bits in the destination.
2488 ///
2489 /// \headerfile <x86intrin.h>
2490 ///
2491 /// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
2492 ///
2493 /// \param __a
2494 ///    A 64-bit integer vector containing the minuend.
2495 /// \param __b
2496 ///    A 64-bit integer vector containing the subtrahend.
2497 /// \returns A 64-bit integer vector containing the difference of the values in
2498 ///    the operands.
2499 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a,
2500                                                             __m64 __b) {
2501   return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
2502 }
2503 
2504 /// Subtracts the corresponding elements of two [2 x i64] vectors.
2505 ///
2506 /// \headerfile <x86intrin.h>
2507 ///
2508 /// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
2509 ///
2510 /// \param __a
2511 ///    A 128-bit integer vector containing the minuends.
2512 /// \param __b
2513 ///    A 128-bit integer vector containing the subtrahends.
2514 /// \returns A 128-bit integer vector containing the differences of the values
2515 ///    in the operands.
2516 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a,
2517                                                            __m128i __b) {
2518   return (__m128i)((__v2du)__a - (__v2du)__b);
2519 }
2520 
2521 /// Subtracts corresponding 8-bit signed integer values in the input and
2522 ///    returns the differences in the corresponding bytes in the destination.
2523 ///    Differences greater than 0x7F are saturated to 0x7F, and differences less
2524 ///    than 0x80 are saturated to 0x80.
2525 ///
2526 /// \headerfile <x86intrin.h>
2527 ///
2528 /// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
2529 ///
2530 /// \param __a
2531 ///    A 128-bit integer vector containing the minuends.
2532 /// \param __b
2533 ///    A 128-bit integer vector containing the subtrahends.
2534 /// \returns A 128-bit integer vector containing the differences of the values
2535 ///    in the operands.
2536 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a,
2537                                                            __m128i __b) {
2538   return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b);
2539 }
2540 
/// Subtracts corresponding 16-bit signed integer values in the input and
///    returns the differences in the corresponding 16-bit elements of the
///    destination. Differences greater than 0x7FFF are saturated to 0x7FFF,
///    and differences less than 0x8000 are saturated to 0x8000.
2545 ///
2546 /// \headerfile <x86intrin.h>
2547 ///
2548 /// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
2549 ///
2550 /// \param __a
2551 ///    A 128-bit integer vector containing the minuends.
2552 /// \param __b
2553 ///    A 128-bit integer vector containing the subtrahends.
2554 /// \returns A 128-bit integer vector containing the differences of the values
2555 ///    in the operands.
2556 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a,
2557                                                             __m128i __b) {
2558   return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b);
2559 }
2560 
2561 /// Subtracts corresponding 8-bit unsigned integer values in the input
2562 ///    and returns the differences in the corresponding bytes in the
2563 ///    destination. Differences less than 0x00 are saturated to 0x00.
2564 ///
2565 /// \headerfile <x86intrin.h>
2566 ///
2567 /// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
2568 ///
2569 /// \param __a
2570 ///    A 128-bit integer vector containing the minuends.
2571 /// \param __b
2572 ///    A 128-bit integer vector containing the subtrahends.
2573 /// \returns A 128-bit integer vector containing the unsigned integer
2574 ///    differences of the values in the operands.
2575 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a,
2576                                                            __m128i __b) {
2577   return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b);
2578 }
2579 
/// Subtracts corresponding 16-bit unsigned integer values in the input
///    and returns the differences in the corresponding 16-bit elements of the
///    destination. Differences less than 0x0000 are saturated to 0x0000.
2583 ///
2584 /// \headerfile <x86intrin.h>
2585 ///
2586 /// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
2587 ///
2588 /// \param __a
2589 ///    A 128-bit integer vector containing the minuends.
2590 /// \param __b
2591 ///    A 128-bit integer vector containing the subtrahends.
2592 /// \returns A 128-bit integer vector containing the unsigned integer
2593 ///    differences of the values in the operands.
2594 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a,
2595                                                             __m128i __b) {
2596   return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b);
2597 }
2598 
2599 /// Performs a bitwise AND of two 128-bit integer vectors.
2600 ///
2601 /// \headerfile <x86intrin.h>
2602 ///
2603 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
2604 ///
2605 /// \param __a
2606 ///    A 128-bit integer vector containing one of the source operands.
2607 /// \param __b
2608 ///    A 128-bit integer vector containing one of the source operands.
2609 /// \returns A 128-bit integer vector containing the bitwise AND of the values
2610 ///    in both operands.
2611 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a,
2612                                                            __m128i __b) {
2613   return (__m128i)((__v2du)__a & (__v2du)__b);
2614 }
2615 
2616 /// Performs a bitwise AND of two 128-bit integer vectors, using the
2617 ///    one's complement of the values contained in the first source operand.
2618 ///
2619 /// \headerfile <x86intrin.h>
2620 ///
2621 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
2622 ///
2623 /// \param __a
2624 ///    A 128-bit vector containing the left source operand. The one's complement
2625 ///    of this value is used in the bitwise AND.
2626 /// \param __b
2627 ///    A 128-bit vector containing the right source operand.
2628 /// \returns A 128-bit integer vector containing the bitwise AND of the one's
2629 ///    complement of the first operand and the values in the second operand.
2630 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a,
2631                                                               __m128i __b) {
2632   return (__m128i)(~(__v2du)__a & (__v2du)__b);
2633 }
2634 /// Performs a bitwise OR of two 128-bit integer vectors.
2635 ///
2636 /// \headerfile <x86intrin.h>
2637 ///
2638 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
2639 ///
2640 /// \param __a
2641 ///    A 128-bit integer vector containing one of the source operands.
2642 /// \param __b
2643 ///    A 128-bit integer vector containing one of the source operands.
2644 /// \returns A 128-bit integer vector containing the bitwise OR of the values
2645 ///    in both operands.
2646 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a,
2647                                                           __m128i __b) {
2648   return (__m128i)((__v2du)__a | (__v2du)__b);
2649 }
2650 
2651 /// Performs a bitwise exclusive OR of two 128-bit integer vectors.
2652 ///
2653 /// \headerfile <x86intrin.h>
2654 ///
2655 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
2656 ///
2657 /// \param __a
2658 ///    A 128-bit integer vector containing one of the source operands.
2659 /// \param __b
2660 ///    A 128-bit integer vector containing one of the source operands.
2661 /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
2662 ///    values in both operands.
2663 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a,
2664                                                            __m128i __b) {
2665   return (__m128i)((__v2du)__a ^ (__v2du)__b);
2666 }
2667 
2668 /// Left-shifts the 128-bit integer vector operand by the specified
2669 ///    number of bytes. Low-order bits are cleared.
2670 ///
2671 /// \headerfile <x86intrin.h>
2672 ///
2673 /// \code
2674 /// __m128i _mm_slli_si128(__m128i a, const int imm);
2675 /// \endcode
2676 ///
2677 /// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
2678 ///
2679 /// \param a
2680 ///    A 128-bit integer vector containing the source operand.
2681 /// \param imm
2682 ///    An immediate value specifying the number of bytes to left-shift operand
2683 ///    \a a.
2684 /// \returns A 128-bit integer vector containing the left-shifted value.
#define _mm_slli_si128(a, imm)                                                 \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift(                               \
      (__v2di)(__m128i)(a), (int)(imm)))
2688 
/// Left-shifts the 128-bit integer vector operand by the specified
///    number of bytes. This macro takes the same arguments as
///    \c _mm_slli_si128 and expands to the same byte-shift builtin call.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_bslli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to left-shift operand
///    \a a.
/// \returns A 128-bit integer vector containing the left-shifted value.
#define _mm_bslli_si128(a, imm) _mm_slli_si128((a), (imm))
2692 
2693 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2694 ///    by the specified number of bits. Low-order bits are cleared.
2695 ///
2696 /// \headerfile <x86intrin.h>
2697 ///
2698 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2699 ///
2700 /// \param __a
2701 ///    A 128-bit integer vector containing the source operand.
2702 /// \param __count
2703 ///    An integer value specifying the number of bits to left-shift each value
2704 ///    in operand \a __a.
2705 /// \returns A 128-bit integer vector containing the left-shifted values.
2706 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a,
2707                                                             int __count) {
2708   return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
2709 }
2710 
2711 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2712 ///    by the specified number of bits. Low-order bits are cleared.
2713 ///
2714 /// \headerfile <x86intrin.h>
2715 ///
2716 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2717 ///
2718 /// \param __a
2719 ///    A 128-bit integer vector containing the source operand.
2720 /// \param __count
2721 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2722 ///    to left-shift each value in operand \a __a.
2723 /// \returns A 128-bit integer vector containing the left-shifted values.
2724 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a,
2725                                                            __m128i __count) {
2726   return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
2727 }
2728 
2729 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2730 ///    by the specified number of bits. Low-order bits are cleared.
2731 ///
2732 /// \headerfile <x86intrin.h>
2733 ///
2734 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2735 ///
2736 /// \param __a
2737 ///    A 128-bit integer vector containing the source operand.
2738 /// \param __count
2739 ///    An integer value specifying the number of bits to left-shift each value
2740 ///    in operand \a __a.
2741 /// \returns A 128-bit integer vector containing the left-shifted values.
2742 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a,
2743                                                             int __count) {
2744   return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
2745 }
2746 
2747 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2748 ///    by the specified number of bits. Low-order bits are cleared.
2749 ///
2750 /// \headerfile <x86intrin.h>
2751 ///
2752 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2753 ///
2754 /// \param __a
2755 ///    A 128-bit integer vector containing the source operand.
2756 /// \param __count
2757 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2758 ///    to left-shift each value in operand \a __a.
2759 /// \returns A 128-bit integer vector containing the left-shifted values.
2760 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a,
2761                                                            __m128i __count) {
2762   return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
2763 }
2764 
2765 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2766 ///    by the specified number of bits. Low-order bits are cleared.
2767 ///
2768 /// \headerfile <x86intrin.h>
2769 ///
2770 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2771 ///
2772 /// \param __a
2773 ///    A 128-bit integer vector containing the source operand.
2774 /// \param __count
2775 ///    An integer value specifying the number of bits to left-shift each value
2776 ///    in operand \a __a.
2777 /// \returns A 128-bit integer vector containing the left-shifted values.
2778 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a,
2779                                                             int __count) {
2780   return __builtin_ia32_psllqi128((__v2di)__a, __count);
2781 }
2782 
2783 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2784 ///    by the specified number of bits. Low-order bits are cleared.
2785 ///
2786 /// \headerfile <x86intrin.h>
2787 ///
2788 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2789 ///
2790 /// \param __a
2791 ///    A 128-bit integer vector containing the source operand.
2792 /// \param __count
2793 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2794 ///    to left-shift each value in operand \a __a.
2795 /// \returns A 128-bit integer vector containing the left-shifted values.
2796 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a,
2797                                                            __m128i __count) {
2798   return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
2799 }
2800 
2801 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2802 ///    by the specified number of bits. High-order bits are filled with the sign
2803 ///    bit of the initial value.
2804 ///
2805 /// \headerfile <x86intrin.h>
2806 ///
2807 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2808 ///
2809 /// \param __a
2810 ///    A 128-bit integer vector containing the source operand.
2811 /// \param __count
2812 ///    An integer value specifying the number of bits to right-shift each value
2813 ///    in operand \a __a.
2814 /// \returns A 128-bit integer vector containing the right-shifted values.
2815 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a,
2816                                                             int __count) {
2817   return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
2818 }
2819 
2820 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2821 ///    by the specified number of bits. High-order bits are filled with the sign
2822 ///    bit of the initial value.
2823 ///
2824 /// \headerfile <x86intrin.h>
2825 ///
2826 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2827 ///
2828 /// \param __a
2829 ///    A 128-bit integer vector containing the source operand.
2830 /// \param __count
2831 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2832 ///    to right-shift each value in operand \a __a.
2833 /// \returns A 128-bit integer vector containing the right-shifted values.
2834 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a,
2835                                                            __m128i __count) {
2836   return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
2837 }
2838 
2839 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
2840 ///    by the specified number of bits. High-order bits are filled with the sign
2841 ///    bit of the initial value.
2842 ///
2843 /// \headerfile <x86intrin.h>
2844 ///
2845 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2846 ///
2847 /// \param __a
2848 ///    A 128-bit integer vector containing the source operand.
2849 /// \param __count
2850 ///    An integer value specifying the number of bits to right-shift each value
2851 ///    in operand \a __a.
2852 /// \returns A 128-bit integer vector containing the right-shifted values.
2853 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a,
2854                                                             int __count) {
2855   return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
2856 }
2857 
2858 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
2859 ///    by the specified number of bits. High-order bits are filled with the sign
2860 ///    bit of the initial value.
2861 ///
2862 /// \headerfile <x86intrin.h>
2863 ///
2864 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2865 ///
2866 /// \param __a
2867 ///    A 128-bit integer vector containing the source operand.
2868 /// \param __count
2869 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2870 ///    to right-shift each value in operand \a __a.
2871 /// \returns A 128-bit integer vector containing the right-shifted values.
2872 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a,
2873                                                            __m128i __count) {
2874   return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
2875 }
2876 
/// Right-shifts the 128-bit integer vector operand by the specified
///    number of bytes. High-order bits are cleared.
///
///    This is a macro: \a a is converted to \c __m128i and \a imm to \c int
///    before both are forwarded to the underlying builtin.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_srli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to right-shift operand
///    \a a.
/// \returns A 128-bit integer vector containing the right-shifted value.
#define _mm_srli_si128(a, imm)                                                 \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a),          \
                                                (int)(imm)))
2897 
/// Right-shifts the 128-bit integer vector operand by the specified
///    number of bytes. High-order bits are cleared.
///
///    The expansion of this macro is identical to that of
///    \c _mm_srli_si128.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_bsrli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to right-shift operand
///    \a a.
/// \returns A 128-bit integer vector containing the right-shifted value.
#define _mm_bsrli_si128(a, imm)                                                \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a),          \
                                                (int)(imm)))
2901 
2902 /// Right-shifts each of 16-bit values in the 128-bit integer vector
2903 ///    operand by the specified number of bits. High-order bits are cleared.
2904 ///
2905 /// \headerfile <x86intrin.h>
2906 ///
2907 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2908 ///
2909 /// \param __a
2910 ///    A 128-bit integer vector containing the source operand.
2911 /// \param __count
2912 ///    An integer value specifying the number of bits to right-shift each value
2913 ///    in operand \a __a.
2914 /// \returns A 128-bit integer vector containing the right-shifted values.
2915 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a,
2916                                                             int __count) {
2917   return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
2918 }
2919 
2920 /// Right-shifts each of 16-bit values in the 128-bit integer vector
2921 ///    operand by the specified number of bits. High-order bits are cleared.
2922 ///
2923 /// \headerfile <x86intrin.h>
2924 ///
2925 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2926 ///
2927 /// \param __a
2928 ///    A 128-bit integer vector containing the source operand.
2929 /// \param __count
2930 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2931 ///    to right-shift each value in operand \a __a.
2932 /// \returns A 128-bit integer vector containing the right-shifted values.
2933 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a,
2934                                                            __m128i __count) {
2935   return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
2936 }
2937 
2938 /// Right-shifts each of 32-bit values in the 128-bit integer vector
2939 ///    operand by the specified number of bits. High-order bits are cleared.
2940 ///
2941 /// \headerfile <x86intrin.h>
2942 ///
2943 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2944 ///
2945 /// \param __a
2946 ///    A 128-bit integer vector containing the source operand.
2947 /// \param __count
2948 ///    An integer value specifying the number of bits to right-shift each value
2949 ///    in operand \a __a.
2950 /// \returns A 128-bit integer vector containing the right-shifted values.
2951 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a,
2952                                                             int __count) {
2953   return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
2954 }
2955 
2956 /// Right-shifts each of 32-bit values in the 128-bit integer vector
2957 ///    operand by the specified number of bits. High-order bits are cleared.
2958 ///
2959 /// \headerfile <x86intrin.h>
2960 ///
2961 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2962 ///
2963 /// \param __a
2964 ///    A 128-bit integer vector containing the source operand.
2965 /// \param __count
2966 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2967 ///    to right-shift each value in operand \a __a.
2968 /// \returns A 128-bit integer vector containing the right-shifted values.
2969 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a,
2970                                                            __m128i __count) {
2971   return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
2972 }
2973 
2974 /// Right-shifts each of 64-bit values in the 128-bit integer vector
2975 ///    operand by the specified number of bits. High-order bits are cleared.
2976 ///
2977 /// \headerfile <x86intrin.h>
2978 ///
2979 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
2980 ///
2981 /// \param __a
2982 ///    A 128-bit integer vector containing the source operand.
2983 /// \param __count
2984 ///    An integer value specifying the number of bits to right-shift each value
2985 ///    in operand \a __a.
2986 /// \returns A 128-bit integer vector containing the right-shifted values.
2987 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a,
2988                                                             int __count) {
2989   return __builtin_ia32_psrlqi128((__v2di)__a, __count);
2990 }
2991 
2992 /// Right-shifts each of 64-bit values in the 128-bit integer vector
2993 ///    operand by the specified number of bits. High-order bits are cleared.
2994 ///
2995 /// \headerfile <x86intrin.h>
2996 ///
2997 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
2998 ///
2999 /// \param __a
3000 ///    A 128-bit integer vector containing the source operand.
3001 /// \param __count
3002 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
3003 ///    to right-shift each value in operand \a __a.
3004 /// \returns A 128-bit integer vector containing the right-shifted values.
3005 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a,
3006                                                            __m128i __count) {
3007   return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
3008 }
3009 
3010 /// Compares each of the corresponding 8-bit values of the 128-bit
3011 ///    integer vectors for equality. Each comparison yields 0x0 for false, 0xFF
3012 ///    for true.
3013 ///
3014 /// \headerfile <x86intrin.h>
3015 ///
3016 /// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
3017 ///
3018 /// \param __a
3019 ///    A 128-bit integer vector.
3020 /// \param __b
3021 ///    A 128-bit integer vector.
3022 /// \returns A 128-bit integer vector containing the comparison results.
3023 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a,
3024                                                             __m128i __b) {
3025   return (__m128i)((__v16qi)__a == (__v16qi)__b);
3026 }
3027 
3028 /// Compares each of the corresponding 16-bit values of the 128-bit
3029 ///    integer vectors for equality. Each comparison yields 0x0 for false,
3030 ///    0xFFFF for true.
3031 ///
3032 /// \headerfile <x86intrin.h>
3033 ///
3034 /// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
3035 ///
3036 /// \param __a
3037 ///    A 128-bit integer vector.
3038 /// \param __b
3039 ///    A 128-bit integer vector.
3040 /// \returns A 128-bit integer vector containing the comparison results.
3041 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a,
3042                                                              __m128i __b) {
3043   return (__m128i)((__v8hi)__a == (__v8hi)__b);
3044 }
3045 
3046 /// Compares each of the corresponding 32-bit values of the 128-bit
3047 ///    integer vectors for equality. Each comparison yields 0x0 for false,
3048 ///    0xFFFFFFFF for true.
3049 ///
3050 /// \headerfile <x86intrin.h>
3051 ///
3052 /// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
3053 ///
3054 /// \param __a
3055 ///    A 128-bit integer vector.
3056 /// \param __b
3057 ///    A 128-bit integer vector.
3058 /// \returns A 128-bit integer vector containing the comparison results.
3059 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a,
3060                                                              __m128i __b) {
3061   return (__m128i)((__v4si)__a == (__v4si)__b);
3062 }
3063 
3064 /// Compares each of the corresponding signed 8-bit values of the 128-bit
3065 ///    integer vectors to determine if the values in the first operand are
3066 ///    greater than those in the second operand. Each comparison yields 0x0 for
3067 ///    false, 0xFF for true.
3068 ///
3069 /// \headerfile <x86intrin.h>
3070 ///
3071 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3072 ///
3073 /// \param __a
3074 ///    A 128-bit integer vector.
3075 /// \param __b
3076 ///    A 128-bit integer vector.
3077 /// \returns A 128-bit integer vector containing the comparison results.
3078 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a,
3079                                                             __m128i __b) {
3080   /* This function always performs a signed comparison, but __v16qi is a char
3081      which may be signed or unsigned, so use __v16qs. */
3082   return (__m128i)((__v16qs)__a > (__v16qs)__b);
3083 }
3084 
3085 /// Compares each of the corresponding signed 16-bit values of the
3086 ///    128-bit integer vectors to determine if the values in the first operand
3087 ///    are greater than those in the second operand.
3088 ///
3089 ///    Each comparison yields 0x0 for false, 0xFFFF for true.
3090 ///
3091 /// \headerfile <x86intrin.h>
3092 ///
3093 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3094 ///
3095 /// \param __a
3096 ///    A 128-bit integer vector.
3097 /// \param __b
3098 ///    A 128-bit integer vector.
3099 /// \returns A 128-bit integer vector containing the comparison results.
3100 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a,
3101                                                              __m128i __b) {
3102   return (__m128i)((__v8hi)__a > (__v8hi)__b);
3103 }
3104 
3105 /// Compares each of the corresponding signed 32-bit values of the
3106 ///    128-bit integer vectors to determine if the values in the first operand
3107 ///    are greater than those in the second operand.
3108 ///
3109 ///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3110 ///
3111 /// \headerfile <x86intrin.h>
3112 ///
3113 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3114 ///
3115 /// \param __a
3116 ///    A 128-bit integer vector.
3117 /// \param __b
3118 ///    A 128-bit integer vector.
3119 /// \returns A 128-bit integer vector containing the comparison results.
3120 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a,
3121                                                              __m128i __b) {
3122   return (__m128i)((__v4si)__a > (__v4si)__b);
3123 }
3124 
3125 /// Compares each of the corresponding signed 8-bit values of the 128-bit
3126 ///    integer vectors to determine if the values in the first operand are less
3127 ///    than those in the second operand.
3128 ///
3129 ///    Each comparison yields 0x0 for false, 0xFF for true.
3130 ///
3131 /// \headerfile <x86intrin.h>
3132 ///
3133 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3134 ///
3135 /// \param __a
3136 ///    A 128-bit integer vector.
3137 /// \param __b
3138 ///    A 128-bit integer vector.
3139 /// \returns A 128-bit integer vector containing the comparison results.
3140 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a,
3141                                                             __m128i __b) {
3142   return _mm_cmpgt_epi8(__b, __a);
3143 }
3144 
3145 /// Compares each of the corresponding signed 16-bit values of the
3146 ///    128-bit integer vectors to determine if the values in the first operand
3147 ///    are less than those in the second operand.
3148 ///
3149 ///    Each comparison yields 0x0 for false, 0xFFFF for true.
3150 ///
3151 /// \headerfile <x86intrin.h>
3152 ///
3153 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3154 ///
3155 /// \param __a
3156 ///    A 128-bit integer vector.
3157 /// \param __b
3158 ///    A 128-bit integer vector.
3159 /// \returns A 128-bit integer vector containing the comparison results.
3160 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a,
3161                                                              __m128i __b) {
3162   return _mm_cmpgt_epi16(__b, __a);
3163 }
3164 
3165 /// Compares each of the corresponding signed 32-bit values of the
3166 ///    128-bit integer vectors to determine if the values in the first operand
3167 ///    are less than those in the second operand.
3168 ///
3169 ///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3170 ///
3171 /// \headerfile <x86intrin.h>
3172 ///
3173 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3174 ///
3175 /// \param __a
3176 ///    A 128-bit integer vector.
3177 /// \param __b
3178 ///    A 128-bit integer vector.
3179 /// \returns A 128-bit integer vector containing the comparison results.
3180 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a,
3181                                                              __m128i __b) {
3182   return _mm_cmpgt_epi32(__b, __a);
3183 }
3184 
3185 #ifdef __x86_64__
3186 /// Converts a 64-bit signed integer value from the second operand into a
3187 ///    double-precision value and returns it in the lower element of a [2 x
3188 ///    double] vector; the upper element of the returned vector is copied from
3189 ///    the upper element of the first operand.
3190 ///
3191 /// \headerfile <x86intrin.h>
3192 ///
3193 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
3194 ///
3195 /// \param __a
3196 ///    A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
3197 ///    copied to the upper 64 bits of the destination.
3198 /// \param __b
3199 ///    A 64-bit signed integer operand containing the value to be converted.
3200 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
3201 ///    converted value of the second operand. The upper 64 bits are copied from
3202 ///    the upper 64 bits of the first operand.
3203 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi64_sd(__m128d __a,
3204                                                             long long __b) {
3205   __a[0] = __b;
3206   return __a;
3207 }
3208 
3209 /// Converts the first (lower) element of a vector of [2 x double] into a
3210 ///    64-bit signed integer value, according to the current rounding mode.
3211 ///
3212 /// \headerfile <x86intrin.h>
3213 ///
3214 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
3215 ///
3216 /// \param __a
3217 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3218 ///    conversion.
3219 /// \returns A 64-bit signed integer containing the converted value.
3220 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsd_si64(__m128d __a) {
3221   return __builtin_ia32_cvtsd2si64((__v2df)__a);
3222 }
3223 
3224 /// Converts the first (lower) element of a vector of [2 x double] into a
3225 ///    64-bit signed integer value, truncating the result when it is inexact.
3226 ///
3227 /// \headerfile <x86intrin.h>
3228 ///
3229 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
3230 ///   instruction.
3231 ///
3232 /// \param __a
3233 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3234 ///    conversion.
3235 /// \returns A 64-bit signed integer containing the converted value.
3236 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_si64(__m128d __a) {
3237   return __builtin_ia32_cvttsd2si64((__v2df)__a);
3238 }
3239 #endif
3240 
3241 /// Converts a vector of [4 x i32] into a vector of [4 x float].
3242 ///
3243 /// \headerfile <x86intrin.h>
3244 ///
3245 /// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
3246 ///
3247 /// \param __a
3248 ///    A 128-bit integer vector.
3249 /// \returns A 128-bit vector of [4 x float] containing the converted values.
3250 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a) {
3251   return (__m128) __builtin_convertvector((__v4si)__a, __v4sf);
3252 }
3253 
3254 /// Converts a vector of [4 x float] into a vector of [4 x i32].
3255 ///
3256 /// \headerfile <x86intrin.h>
3257 ///
3258 /// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
3259 ///
3260 /// \param __a
3261 ///    A 128-bit vector of [4 x float].
3262 /// \returns A 128-bit integer vector of [4 x i32] containing the converted
3263 ///    values.
3264 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a) {
3265   return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
3266 }
3267 
3268 /// Converts a vector of [4 x float] into a vector of [4 x i32],
3269 ///    truncating the result when it is inexact.
3270 ///
3271 /// \headerfile <x86intrin.h>
3272 ///
3273 /// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
3274 ///   instruction.
3275 ///
3276 /// \param __a
3277 ///    A 128-bit vector of [4 x float].
3278 /// \returns A 128-bit vector of [4 x i32] containing the converted values.
3279 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a) {
3280   return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
3281 }
3282 
3283 /// Returns a vector of [4 x i32] where the lowest element is the input
3284 ///    operand and the remaining elements are zero.
3285 ///
3286 /// \headerfile <x86intrin.h>
3287 ///
3288 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3289 ///
3290 /// \param __a
3291 ///    A 32-bit signed integer operand.
3292 /// \returns A 128-bit vector of [4 x i32].
3293 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) {
3294   return __extension__(__m128i)(__v4si){__a, 0, 0, 0};
3295 }
3296 
3297 /// Returns a vector of [2 x i64] where the lower element is the input
3298 ///    operand and the upper element is zero.
3299 ///
3300 /// \headerfile <x86intrin.h>
3301 ///
3302 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction
3303 /// in 64-bit mode.
3304 ///
3305 /// \param __a
3306 ///    A 64-bit signed integer operand containing the value to be converted.
3307 /// \returns A 128-bit vector of [2 x i64] containing the converted value.
3308 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) {
3309   return __extension__(__m128i)(__v2di){__a, 0};
3310 }
3311 
3312 /// Moves the least significant 32 bits of a vector of [4 x i32] to a
3313 ///    32-bit signed integer value.
3314 ///
3315 /// \headerfile <x86intrin.h>
3316 ///
3317 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3318 ///
3319 /// \param __a
3320 ///    A vector of [4 x i32]. The least significant 32 bits are moved to the
3321 ///    destination.
3322 /// \returns A 32-bit signed integer containing the moved value.
3323 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a) {
3324   __v4si __b = (__v4si)__a;
3325   return __b[0];
3326 }
3327 
3328 /// Moves the least significant 64 bits of a vector of [2 x i64] to a
3329 ///    64-bit signed integer value.
3330 ///
3331 /// \headerfile <x86intrin.h>
3332 ///
3333 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3334 ///
3335 /// \param __a
3336 ///    A vector of [2 x i64]. The least significant 64 bits are moved to the
3337 ///    destination.
3338 /// \returns A 64-bit signed integer containing the moved value.
3339 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a) {
3340   return __a[0];
3341 }
3342 
3343 /// Moves packed integer values from an aligned 128-bit memory location
3344 ///    to elements in a 128-bit integer vector.
3345 ///
3346 /// \headerfile <x86intrin.h>
3347 ///
3348 /// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
3349 ///
3350 /// \param __p
3351 ///    An aligned pointer to a memory location containing integer values.
3352 /// \returns A 128-bit integer vector containing the moved values.
3353 static __inline__ __m128i __DEFAULT_FN_ATTRS
3354 _mm_load_si128(__m128i const *__p) {
3355   return *__p;
3356 }
3357 
3358 /// Moves packed integer values from an unaligned 128-bit memory location
3359 ///    to elements in a 128-bit integer vector.
3360 ///
3361 /// \headerfile <x86intrin.h>
3362 ///
3363 /// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
3364 ///
3365 /// \param __p
3366 ///    A pointer to a memory location containing integer values.
3367 /// \returns A 128-bit integer vector containing the moved values.
3368 static __inline__ __m128i __DEFAULT_FN_ATTRS
3369 _mm_loadu_si128(__m128i_u const *__p) {
3370   struct __loadu_si128 {
3371     __m128i_u __v;
3372   } __attribute__((__packed__, __may_alias__));
3373   return ((const struct __loadu_si128 *)__p)->__v;
3374 }
3375 
3376 /// Returns a vector of [2 x i64] where the lower element is taken from
3377 ///    the lower element of the operand, and the upper element is zero.
3378 ///
3379 /// \headerfile <x86intrin.h>
3380 ///
3381 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3382 ///
3383 /// \param __p
3384 ///    A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
3385 ///    the destination.
3386 /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
3387 ///    moved value. The higher order bits are cleared.
3388 static __inline__ __m128i __DEFAULT_FN_ATTRS
3389 _mm_loadl_epi64(__m128i_u const *__p) {
3390   struct __mm_loadl_epi64_struct {
3391     long long __u;
3392   } __attribute__((__packed__, __may_alias__));
3393   return __extension__(__m128i){
3394       ((const struct __mm_loadl_epi64_struct *)__p)->__u, 0};
3395 }
3396 
3397 /// Generates a 128-bit vector of [4 x i32] with unspecified content.
3398 ///    This could be used as an argument to another intrinsic function where the
3399 ///    argument is required but the value is not actually used.
3400 ///
3401 /// \headerfile <x86intrin.h>
3402 ///
3403 /// This intrinsic has no corresponding instruction.
3404 ///
3405 /// \returns A 128-bit vector of [4 x i32] with unspecified content.
3406 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void) {
3407   return (__m128i)__builtin_ia32_undef128();
3408 }
3409 
3410 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3411 ///    the specified 64-bit integer values.
3412 ///
3413 /// \headerfile <x86intrin.h>
3414 ///
3415 /// This intrinsic is a utility function and does not correspond to a specific
3416 ///    instruction.
3417 ///
3418 /// \param __q1
3419 ///    A 64-bit integer value used to initialize the upper 64 bits of the
3420 ///    destination vector of [2 x i64].
3421 /// \param __q0
3422 ///    A 64-bit integer value used to initialize the lower 64 bits of the
3423 ///    destination vector of [2 x i64].
3424 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3425 ///    provided in the operands.
3426 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1,
3427                                                             long long __q0) {
3428   return __extension__(__m128i)(__v2di){__q0, __q1};
3429 }
3430 
3431 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3432 ///    the specified 64-bit integer values.
3433 ///
3434 /// \headerfile <x86intrin.h>
3435 ///
3436 /// This intrinsic is a utility function and does not correspond to a specific
3437 ///    instruction.
3438 ///
3439 /// \param __q1
3440 ///    A 64-bit integer value used to initialize the upper 64 bits of the
3441 ///    destination vector of [2 x i64].
3442 /// \param __q0
3443 ///    A 64-bit integer value used to initialize the lower 64 bits of the
3444 ///    destination vector of [2 x i64].
3445 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3446 ///    provided in the operands.
3447 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1,
3448                                                            __m64 __q0) {
3449   return _mm_set_epi64x((long long)__q1, (long long)__q0);
3450 }
3451 
3452 /// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
3453 ///    the specified 32-bit integer values.
3454 ///
3455 /// \headerfile <x86intrin.h>
3456 ///
3457 /// This intrinsic is a utility function and does not correspond to a specific
3458 ///    instruction.
3459 ///
3460 /// \param __i3
3461 ///    A 32-bit integer value used to initialize bits [127:96] of the
3462 ///    destination vector.
3463 /// \param __i2
3464 ///    A 32-bit integer value used to initialize bits [95:64] of the destination
3465 ///    vector.
3466 /// \param __i1
3467 ///    A 32-bit integer value used to initialize bits [63:32] of the destination
3468 ///    vector.
3469 /// \param __i0
3470 ///    A 32-bit integer value used to initialize bits [31:0] of the destination
3471 ///    vector.
3472 /// \returns An initialized 128-bit vector of [4 x i32] containing the values
3473 ///    provided in the operands.
3474 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2,
3475                                                            int __i1, int __i0) {
3476   return __extension__(__m128i)(__v4si){__i0, __i1, __i2, __i3};
3477 }
3478 
3479 /// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
3480 ///    the specified 16-bit integer values.
3481 ///
3482 /// \headerfile <x86intrin.h>
3483 ///
3484 /// This intrinsic is a utility function and does not correspond to a specific
3485 ///    instruction.
3486 ///
3487 /// \param __w7
3488 ///    A 16-bit integer value used to initialize bits [127:112] of the
3489 ///    destination vector.
3490 /// \param __w6
3491 ///    A 16-bit integer value used to initialize bits [111:96] of the
3492 ///    destination vector.
3493 /// \param __w5
3494 ///    A 16-bit integer value used to initialize bits [95:80] of the destination
3495 ///    vector.
3496 /// \param __w4
3497 ///    A 16-bit integer value used to initialize bits [79:64] of the destination
3498 ///    vector.
3499 /// \param __w3
3500 ///    A 16-bit integer value used to initialize bits [63:48] of the destination
3501 ///    vector.
3502 /// \param __w2
3503 ///    A 16-bit integer value used to initialize bits [47:32] of the destination
3504 ///    vector.
3505 /// \param __w1
3506 ///    A 16-bit integer value used to initialize bits [31:16] of the destination
3507 ///    vector.
3508 /// \param __w0
3509 ///    A 16-bit integer value used to initialize bits [15:0] of the destination
3510 ///    vector.
3511 /// \returns An initialized 128-bit vector of [8 x i16] containing the values
3512 ///    provided in the operands.
3513 static __inline__ __m128i __DEFAULT_FN_ATTRS
3514 _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3,
3515               short __w2, short __w1, short __w0) {
3516   return __extension__(__m128i)(__v8hi){__w0, __w1, __w2, __w3,
3517                                         __w4, __w5, __w6, __w7};
3518 }
3519 
3520 /// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
3521 ///    the specified 8-bit integer values.
3522 ///
3523 /// \headerfile <x86intrin.h>
3524 ///
3525 /// This intrinsic is a utility function and does not correspond to a specific
3526 ///    instruction.
3527 ///
3528 /// \param __b15
3529 ///    Initializes bits [127:120] of the destination vector.
3530 /// \param __b14
3531 ///    Initializes bits [119:112] of the destination vector.
3532 /// \param __b13
3533 ///    Initializes bits [111:104] of the destination vector.
3534 /// \param __b12
3535 ///    Initializes bits [103:96] of the destination vector.
3536 /// \param __b11
3537 ///    Initializes bits [95:88] of the destination vector.
3538 /// \param __b10
3539 ///    Initializes bits [87:80] of the destination vector.
3540 /// \param __b9
3541 ///    Initializes bits [79:72] of the destination vector.
3542 /// \param __b8
3543 ///    Initializes bits [71:64] of the destination vector.
3544 /// \param __b7
3545 ///    Initializes bits [63:56] of the destination vector.
3546 /// \param __b6
3547 ///    Initializes bits [55:48] of the destination vector.
3548 /// \param __b5
3549 ///    Initializes bits [47:40] of the destination vector.
3550 /// \param __b4
3551 ///    Initializes bits [39:32] of the destination vector.
3552 /// \param __b3
3553 ///    Initializes bits [31:24] of the destination vector.
3554 /// \param __b2
3555 ///    Initializes bits [23:16] of the destination vector.
3556 /// \param __b1
3557 ///    Initializes bits [15:8] of the destination vector.
3558 /// \param __b0
3559 ///    Initializes bits [7:0] of the destination vector.
3560 /// \returns An initialized 128-bit vector of [16 x i8] containing the values
3561 ///    provided in the operands.
3562 static __inline__ __m128i __DEFAULT_FN_ATTRS
3563 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11,
3564              char __b10, char __b9, char __b8, char __b7, char __b6, char __b5,
3565              char __b4, char __b3, char __b2, char __b1, char __b0) {
3566   return __extension__(__m128i)(__v16qi){
3567       __b0, __b1, __b2,  __b3,  __b4,  __b5,  __b6,  __b7,
3568       __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15};
3569 }
3570 
3571 /// Initializes both values in a 128-bit integer vector with the
3572 ///    specified 64-bit integer value.
3573 ///
3574 /// \headerfile <x86intrin.h>
3575 ///
3576 /// This intrinsic is a utility function and does not correspond to a specific
3577 ///    instruction.
3578 ///
3579 /// \param __q
3580 ///    Integer value used to initialize the elements of the destination integer
3581 ///    vector.
3582 /// \returns An initialized 128-bit integer vector of [2 x i64] with both
3583 ///    elements containing the value provided in the operand.
3584 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q) {
3585   return _mm_set_epi64x(__q, __q);
3586 }
3587 
3588 /// Initializes both values in a 128-bit vector of [2 x i64] with the
3589 ///    specified 64-bit value.
3590 ///
3591 /// \headerfile <x86intrin.h>
3592 ///
3593 /// This intrinsic is a utility function and does not correspond to a specific
3594 ///    instruction.
3595 ///
3596 /// \param __q
3597 ///    A 64-bit value used to initialize the elements of the destination integer
3598 ///    vector.
3599 /// \returns An initialized 128-bit vector of [2 x i64] with all elements
3600 ///    containing the value provided in the operand.
3601 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q) {
3602   return _mm_set_epi64(__q, __q);
3603 }
3604 
3605 /// Initializes all values in a 128-bit vector of [4 x i32] with the
3606 ///    specified 32-bit value.
3607 ///
3608 /// \headerfile <x86intrin.h>
3609 ///
3610 /// This intrinsic is a utility function and does not correspond to a specific
3611 ///    instruction.
3612 ///
3613 /// \param __i
3614 ///    A 32-bit value used to initialize the elements of the destination integer
3615 ///    vector.
3616 /// \returns An initialized 128-bit vector of [4 x i32] with all elements
3617 ///    containing the value provided in the operand.
3618 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i) {
3619   return _mm_set_epi32(__i, __i, __i, __i);
3620 }
3621 
3622 /// Initializes all values in a 128-bit vector of [8 x i16] with the
3623 ///    specified 16-bit value.
3624 ///
3625 /// \headerfile <x86intrin.h>
3626 ///
3627 /// This intrinsic is a utility function and does not correspond to a specific
3628 ///    instruction.
3629 ///
3630 /// \param __w
3631 ///    A 16-bit value used to initialize the elements of the destination integer
3632 ///    vector.
3633 /// \returns An initialized 128-bit vector of [8 x i16] with all elements
3634 ///    containing the value provided in the operand.
3635 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w) {
3636   return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
3637 }
3638 
3639 /// Initializes all values in a 128-bit vector of [16 x i8] with the
3640 ///    specified 8-bit value.
3641 ///
3642 /// \headerfile <x86intrin.h>
3643 ///
3644 /// This intrinsic is a utility function and does not correspond to a specific
3645 ///    instruction.
3646 ///
3647 /// \param __b
3648 ///    An 8-bit value used to initialize the elements of the destination integer
3649 ///    vector.
3650 /// \returns An initialized 128-bit vector of [16 x i8] with all elements
3651 ///    containing the value provided in the operand.
3652 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b) {
3653   return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
3654                       __b, __b, __b, __b, __b);
3655 }
3656 
3657 /// Constructs a 128-bit integer vector, initialized in reverse order
3658 ///     with the specified 64-bit integral values.
3659 ///
3660 /// \headerfile <x86intrin.h>
3661 ///
3662 /// This intrinsic does not correspond to a specific instruction.
3663 ///
3664 /// \param __q0
3665 ///    A 64-bit integral value used to initialize the lower 64 bits of the
3666 ///    result.
3667 /// \param __q1
3668 ///    A 64-bit integral value used to initialize the upper 64 bits of the
3669 ///    result.
3670 /// \returns An initialized 128-bit integer vector.
3671 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0,
3672                                                             __m64 __q1) {
3673   return _mm_set_epi64(__q1, __q0);
3674 }
3675 
3676 /// Constructs a 128-bit integer vector, initialized in reverse order
3677 ///     with the specified 32-bit integral values.
3678 ///
3679 /// \headerfile <x86intrin.h>
3680 ///
3681 /// This intrinsic is a utility function and does not correspond to a specific
3682 ///    instruction.
3683 ///
3684 /// \param __i0
3685 ///    A 32-bit integral value used to initialize bits [31:0] of the result.
3686 /// \param __i1
3687 ///    A 32-bit integral value used to initialize bits [63:32] of the result.
3688 /// \param __i2
3689 ///    A 32-bit integral value used to initialize bits [95:64] of the result.
3690 /// \param __i3
3691 ///    A 32-bit integral value used to initialize bits [127:96] of the result.
3692 /// \returns An initialized 128-bit integer vector.
3693 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1,
3694                                                             int __i2,
3695                                                             int __i3) {
3696   return _mm_set_epi32(__i3, __i2, __i1, __i0);
3697 }
3698 
3699 /// Constructs a 128-bit integer vector, initialized in reverse order
3700 ///     with the specified 16-bit integral values.
3701 ///
3702 /// \headerfile <x86intrin.h>
3703 ///
3704 /// This intrinsic is a utility function and does not correspond to a specific
3705 ///    instruction.
3706 ///
3707 /// \param __w0
3708 ///    A 16-bit integral value used to initialize bits [15:0] of the result.
3709 /// \param __w1
3710 ///    A 16-bit integral value used to initialize bits [31:16] of the result.
3711 /// \param __w2
3712 ///    A 16-bit integral value used to initialize bits [47:32] of the result.
3713 /// \param __w3
3714 ///    A 16-bit integral value used to initialize bits [63:48] of the result.
3715 /// \param __w4
3716 ///    A 16-bit integral value used to initialize bits [79:64] of the result.
3717 /// \param __w5
3718 ///    A 16-bit integral value used to initialize bits [95:80] of the result.
3719 /// \param __w6
3720 ///    A 16-bit integral value used to initialize bits [111:96] of the result.
3721 /// \param __w7
3722 ///    A 16-bit integral value used to initialize bits [127:112] of the result.
3723 /// \returns An initialized 128-bit integer vector.
3724 static __inline__ __m128i __DEFAULT_FN_ATTRS
3725 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4,
3726                short __w5, short __w6, short __w7) {
3727   return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
3728 }
3729 
3730 /// Constructs a 128-bit integer vector, initialized in reverse order
3731 ///     with the specified 8-bit integral values.
3732 ///
3733 /// \headerfile <x86intrin.h>
3734 ///
3735 /// This intrinsic is a utility function and does not correspond to a specific
3736 ///    instruction.
3737 ///
3738 /// \param __b0
3739 ///    An 8-bit integral value used to initialize bits [7:0] of the result.
3740 /// \param __b1
3741 ///    An 8-bit integral value used to initialize bits [15:8] of the result.
3742 /// \param __b2
3743 ///    An 8-bit integral value used to initialize bits [23:16] of the result.
3744 /// \param __b3
3745 ///    An 8-bit integral value used to initialize bits [31:24] of the result.
3746 /// \param __b4
3747 ///    An 8-bit integral value used to initialize bits [39:32] of the result.
3748 /// \param __b5
3749 ///    An 8-bit integral value used to initialize bits [47:40] of the result.
3750 /// \param __b6
3751 ///    An 8-bit integral value used to initialize bits [55:48] of the result.
3752 /// \param __b7
3753 ///    An 8-bit integral value used to initialize bits [63:56] of the result.
3754 /// \param __b8
3755 ///    An 8-bit integral value used to initialize bits [71:64] of the result.
3756 /// \param __b9
3757 ///    An 8-bit integral value used to initialize bits [79:72] of the result.
3758 /// \param __b10
3759 ///    An 8-bit integral value used to initialize bits [87:80] of the result.
3760 /// \param __b11
3761 ///    An 8-bit integral value used to initialize bits [95:88] of the result.
3762 /// \param __b12
3763 ///    An 8-bit integral value used to initialize bits [103:96] of the result.
3764 /// \param __b13
3765 ///    An 8-bit integral value used to initialize bits [111:104] of the result.
3766 /// \param __b14
3767 ///    An 8-bit integral value used to initialize bits [119:112] of the result.
3768 /// \param __b15
3769 ///    An 8-bit integral value used to initialize bits [127:120] of the result.
3770 /// \returns An initialized 128-bit integer vector.
3771 static __inline__ __m128i __DEFAULT_FN_ATTRS
3772 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
3773               char __b6, char __b7, char __b8, char __b9, char __b10,
3774               char __b11, char __b12, char __b13, char __b14, char __b15) {
3775   return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8,
3776                       __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
3777 }
3778 
3779 /// Creates a 128-bit integer vector initialized to zero.
3780 ///
3781 /// \headerfile <x86intrin.h>
3782 ///
3783 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
3784 ///
3785 /// \returns An initialized 128-bit integer vector with all elements set to
3786 ///    zero.
3787 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void) {
3788   return __extension__(__m128i)(__v2di){0LL, 0LL};
3789 }
3790 
3791 /// Stores a 128-bit integer vector to a memory location aligned on a
3792 ///    128-bit boundary.
3793 ///
3794 /// \headerfile <x86intrin.h>
3795 ///
3796 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
3797 ///
3798 /// \param __p
3799 ///    A pointer to an aligned memory location that will receive the integer
3800 ///    values.
3801 /// \param __b
3802 ///    A 128-bit integer vector containing the values to be moved.
3803 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p,
3804                                                           __m128i __b) {
3805   *__p = __b;
3806 }
3807 
3808 /// Stores a 128-bit integer vector to an unaligned memory location.
3809 ///
3810 /// \headerfile <x86intrin.h>
3811 ///
3812 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
3813 ///
3814 /// \param __p
3815 ///    A pointer to a memory location that will receive the integer values.
3816 /// \param __b
3817 ///    A 128-bit integer vector containing the values to be moved.
3818 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p,
3819                                                            __m128i __b) {
3820   struct __storeu_si128 {
3821     __m128i_u __v;
3822   } __attribute__((__packed__, __may_alias__));
3823   ((struct __storeu_si128 *)__p)->__v = __b;
3824 }
3825 
3826 /// Stores a 64-bit integer value from the low element of a 128-bit integer
3827 ///    vector.
3828 ///
3829 /// \headerfile <x86intrin.h>
3830 ///
3831 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3832 ///
3833 /// \param __p
3834 ///    A pointer to a 64-bit memory location. The address of the memory
3835 ///    location does not have to be aligned.
3836 /// \param __b
3837 ///    A 128-bit integer vector containing the value to be stored.
3838 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p,
3839                                                           __m128i __b) {
3840   struct __storeu_si64 {
3841     long long __v;
3842   } __attribute__((__packed__, __may_alias__));
3843   ((struct __storeu_si64 *)__p)->__v = ((__v2di)__b)[0];
3844 }
3845 
3846 /// Stores a 32-bit integer value from the low element of a 128-bit integer
3847 ///    vector.
3848 ///
3849 /// \headerfile <x86intrin.h>
3850 ///
3851 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3852 ///
3853 /// \param __p
3854 ///    A pointer to a 32-bit memory location. The address of the memory
3855 ///    location does not have to be aligned.
3856 /// \param __b
3857 ///    A 128-bit integer vector containing the value to be stored.
3858 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p,
3859                                                           __m128i __b) {
3860   struct __storeu_si32 {
3861     int __v;
3862   } __attribute__((__packed__, __may_alias__));
3863   ((struct __storeu_si32 *)__p)->__v = ((__v4si)__b)[0];
3864 }
3865 
3866 /// Stores a 16-bit integer value from the low element of a 128-bit integer
3867 ///    vector.
3868 ///
3869 /// \headerfile <x86intrin.h>
3870 ///
3871 /// This intrinsic does not correspond to a specific instruction.
3872 ///
3873 /// \param __p
3874 ///    A pointer to a 16-bit memory location. The address of the memory
3875 ///    location does not have to be aligned.
3876 /// \param __b
3877 ///    A 128-bit integer vector containing the value to be stored.
3878 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p,
3879                                                           __m128i __b) {
3880   struct __storeu_si16 {
3881     short __v;
3882   } __attribute__((__packed__, __may_alias__));
3883   ((struct __storeu_si16 *)__p)->__v = ((__v8hi)__b)[0];
3884 }
3885 
3886 /// Moves bytes selected by the mask from the first operand to the
3887 ///    specified unaligned memory location. When a mask bit is 1, the
3888 ///    corresponding byte is written, otherwise it is not written.
3889 ///
3890 ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
3891 ///    used again soon). Exception and trap behavior for elements not selected
3892 ///    for storage to memory are implementation dependent.
3893 ///
3894 /// \headerfile <x86intrin.h>
3895 ///
3896 /// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
3897 ///   instruction.
3898 ///
3899 /// \param __d
3900 ///    A 128-bit integer vector containing the values to be moved.
3901 /// \param __n
3902 ///    A 128-bit integer vector containing the mask. The most significant bit of
3903 ///    each byte represents the mask bits.
3904 /// \param __p
3905 ///    A pointer to an unaligned 128-bit memory location where the specified
3906 ///    values are moved.
3907 static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d,
3908                                                               __m128i __n,
3909                                                               char *__p) {
3910   __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
3911 }
3912 
3913 /// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
3914 ///    a memory location.
3915 ///
3916 /// \headerfile <x86intrin.h>
3917 ///
3918 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
3919 ///
3920 /// \param __p
3921 ///    A pointer to a 64-bit memory location that will receive the lower 64 bits
3922 ///    of the integer vector parameter.
3923 /// \param __a
3924 ///    A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
3925 ///    value to be stored.
3926 static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p,
3927                                                            __m128i __a) {
3928   struct __mm_storel_epi64_struct {
3929     long long __u;
3930   } __attribute__((__packed__, __may_alias__));
3931   ((struct __mm_storel_epi64_struct *)__p)->__u = __a[0];
3932 }
3933 
3934 /// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
3935 ///    aligned memory location.
3936 ///
3937 ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
3938 ///    used again soon).
3939 ///
3940 /// \headerfile <x86intrin.h>
3941 ///
3942 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
3943 ///
3944 /// \param __p
3945 ///    A pointer to the 128-bit aligned memory location used to store the value.
3946 /// \param __a
3947 ///    A vector of [2 x double] containing the 64-bit values to be stored.
3948 static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p,
3949                                                         __m128d __a) {
3950   __builtin_nontemporal_store((__v2df)__a, (__v2df *)__p);
3951 }
3952 
3953 /// Stores a 128-bit integer vector to a 128-bit aligned memory location.
3954 ///
3955 ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
3956 ///    used again soon).
3957 ///
3958 /// \headerfile <x86intrin.h>
3959 ///
3960 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
3961 ///
3962 /// \param __p
3963 ///    A pointer to the 128-bit aligned memory location used to store the value.
3964 /// \param __a
3965 ///    A 128-bit integer vector containing the values to be stored.
3966 static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p,
3967                                                            __m128i __a) {
3968   __builtin_nontemporal_store((__v2di)__a, (__v2di *)__p);
3969 }
3970 
/// Writes a 32-bit integer value to the specified memory location using a
///    non-temporal store.
///
///    To minimize caching, the data is flagged as non-temporal (unlikely to be
///    used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
///
/// \param __p
///    A pointer to the 32-bit memory location that receives the value.
/// \param __a
///    The 32-bit integer value to be stored.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si32(void *__p, int __a) {
  int *__dst = (int *)__p;
  __builtin_ia32_movnti(__dst, __a);
}
3989 
#ifdef __x86_64__
/// Writes a 64-bit integer value to the specified memory location using a
///    non-temporal store.
///
///    To minimize caching, the data is flagged as non-temporal (unlikely to be
///    used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
///
/// \param __p
///    A pointer to the 64-bit memory location that receives the value.
/// \param __a
///    The 64-bit integer value to be stored.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si64(void *__p, long long __a) {
  long long *__dst = (long long *)__p;
  __builtin_ia32_movnti64(__dst, __a);
}
#endif
4010 
#ifdef __cplusplus
extern "C" {
#endif

/// Flushes and invalidates the cache line containing \a __p from all caches
///    in the coherency domain.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
///
/// \param __p
///    A pointer to the memory location that identifies the cache line to be
///    flushed.
void _mm_clflush(const void *__p);

/// Forces strong memory ordering (serialization) between the load
///    instructions that precede this instruction and the load instructions
///    that follow it, so that the system completes all previous loads before
///    executing subsequent loads.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
///
void _mm_lfence(void);

/// Forces strong memory ordering (serialization) between the load and store
///    instructions that precede this instruction and the load and store
///    instructions that follow it, so that the system completes all previous
///    memory accesses before executing subsequent memory accesses.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
///
void _mm_mfence(void);

#ifdef __cplusplus
} // extern "C"
#endif
4052 
4053 /// Converts 16-bit signed integers from both 128-bit integer vector
4054 ///    operands into 8-bit signed integers, and packs the results into the
4055 ///    destination. Positive values greater than 0x7F are saturated to 0x7F.
4056 ///    Negative values less than 0x80 are saturated to 0x80.
4057 ///
4058 /// \headerfile <x86intrin.h>
4059 ///
4060 /// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
4061 ///
4062 /// \param __a
4063 ///   A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4064 ///   a signed integer and is converted to a 8-bit signed integer with
4065 ///   saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4066 ///   than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4067 ///   written to the lower 64 bits of the result.
4068 /// \param __b
4069 ///   A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4070 ///   a signed integer and is converted to a 8-bit signed integer with
4071 ///   saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4072 ///   than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4073 ///   written to the higher 64 bits of the result.
4074 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4075 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
4076                                                              __m128i __b) {
4077   return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
4078 }
4079 
4080 /// Converts 32-bit signed integers from both 128-bit integer vector
4081 ///    operands into 16-bit signed integers, and packs the results into the
4082 ///    destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
4083 ///    Negative values less than 0x8000 are saturated to 0x8000.
4084 ///
4085 /// \headerfile <x86intrin.h>
4086 ///
4087 /// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
4088 ///
4089 /// \param __a
4090 ///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4091 ///    a signed integer and is converted to a 16-bit signed integer with
4092 ///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4093 ///    less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4094 ///    are written to the lower 64 bits of the result.
4095 /// \param __b
4096 ///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4097 ///    a signed integer and is converted to a 16-bit signed integer with
4098 ///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4099 ///    less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4100 ///    are written to the higher 64 bits of the result.
4101 /// \returns A 128-bit vector of [8 x i16] containing the converted values.
4102 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
4103                                                              __m128i __b) {
4104   return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
4105 }
4106 
4107 /// Converts 16-bit signed integers from both 128-bit integer vector
4108 ///    operands into 8-bit unsigned integers, and packs the results into the
4109 ///    destination. Values greater than 0xFF are saturated to 0xFF. Values less
4110 ///    than 0x00 are saturated to 0x00.
4111 ///
4112 /// \headerfile <x86intrin.h>
4113 ///
4114 /// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
4115 ///
4116 /// \param __a
4117 ///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4118 ///    a signed integer and is converted to an 8-bit unsigned integer with
4119 ///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4120 ///    than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4121 ///    written to the lower 64 bits of the result.
4122 /// \param __b
4123 ///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4124 ///    a signed integer and is converted to an 8-bit unsigned integer with
4125 ///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4126 ///    than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4127 ///    written to the higher 64 bits of the result.
4128 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4129 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a,
4130                                                               __m128i __b) {
4131   return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
4132 }
4133 
/// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
///    the immediate-value parameter as a selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 128-bit integer vector.
/// \param imm
///    An immediate value. Bits [2:0] select the 16-bit element of \a a that
///    is assigned to bits [15:0] of the result. \n
///    000: assign values from bits [15:0] of \a a. \n
///    001: assign values from bits [31:16] of \a a. \n
///    010: assign values from bits [47:32] of \a a. \n
///    011: assign values from bits [63:48] of \a a. \n
///    100: assign values from bits [79:64] of \a a. \n
///    101: assign values from bits [95:80] of \a a. \n
///    110: assign values from bits [111:96] of \a a. \n
///    111: assign values from bits [127:112] of \a a.
/// \returns An integer, whose lower 16 bits are selected from the 128-bit
///    integer vector parameter and the remaining bits are assigned zeros.
/* The cast through unsigned short zero-extends the extracted element, so the
   upper bits of the int result are zero even when the element is negative. */
#define _mm_extract_epi16(a, imm)                                              \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a),      \
                                                    (int)(imm)))
4163 
/// Constructs a 128-bit integer vector by first making a copy of the
///    128-bit integer vector parameter, and then replacing one of its eight
///    16-bit elements, selected by the immediate-value parameter, with the
///    lower 16 bits of an integer parameter.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. This vector is copied to the
///    result and then one of the eight elements in the result is replaced by
///    the lower 16 bits of \a b.
/// \param b
///    An integer. The lower 16 bits of this parameter are written to the
///    element of the result selected by \a imm.
/// \param imm
///    An immediate value giving the index (0 through 7) of the 16-bit element
///    of the result that is replaced. Index 0 selects bits [15:0], index 1
///    selects bits [31:16], and so on up to index 7, which selects bits
///    [127:112].
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi16(a, b, imm)                                            \
  ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b),        \
                                        (int)(imm)))
4191 
4192 /// Copies the values of the most significant bits from each 8-bit
4193 ///    element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
4194 ///    value, zero-extends the value, and writes it to the destination.
4195 ///
4196 /// \headerfile <x86intrin.h>
4197 ///
4198 /// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
4199 ///
4200 /// \param __a
4201 ///    A 128-bit integer vector containing the values with bits to be extracted.
4202 /// \returns The most significant bits from each 8-bit element in \a __a,
4203 ///    written to bits [15:0]. The other bits are assigned zeros.
4204 static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) {
4205   return __builtin_ia32_pmovmskb128((__v16qi)__a);
4206 }
4207 
/// Constructs a 128-bit integer vector by shuffling four 32-bit
///    elements of a 128-bit integer vector parameter, using the immediate-value
///    parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the values to be copied.
/// \param imm
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a. Each 2-bit field of the immediate selects the source
///    element for one 32-bit element of the result: \n
///    Bits [1:0] are used to assign values to bits [31:0] of the result. \n
///    Bits [3:2] are used to assign values to bits [63:32] of the result. \n
///    Bits [5:4] are used to assign values to bits [95:64] of the result. \n
///    Bits [7:6] are used to assign values to bits [127:96] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [31:0] of \a a. \n
///    01: assign values from bits [63:32] of \a a. \n
///    10: assign values from bits [95:64] of \a a. \n
///    11: assign values from bits [127:96] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shuffle_epi32(a, imm)                                              \
  ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
4241 
/// Constructs a 128-bit integer vector by shuffling the four lower 16-bit
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
///    value parameter as a specifier. The upper four elements are copied
///    unchanged.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
///    [127:64] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits [1:0] are used to assign values to bits [15:0] of the result. \n
///    Bits [3:2] are used to assign values to bits [31:16] of the result. \n
///    Bits [5:4] are used to assign values to bits [47:32] of the result. \n
///    Bits [7:6] are used to assign values to bits [63:48] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [15:0] of \a a. \n
///    01: assign values from bits [31:16] of \a a. \n
///    10: assign values from bits [47:32] of \a a. \n
///    11: assign values from bits [63:48] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflelo_epi16(a, imm)                                            \
  ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
4274 
/// Constructs a 128-bit integer vector by shuffling the four upper 16-bit
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
///    value parameter as a specifier. The lower four elements are copied
///    unchanged.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
///    [63:0] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits [1:0] are used to assign values to bits [79:64] of the result. \n
///    Bits [3:2] are used to assign values to bits [95:80] of the result. \n
///    Bits [5:4] are used to assign values to bits [111:96] of the result. \n
///    Bits [7:6] are used to assign values to bits [127:112] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [79:64] of \a a. \n
///    01: assign values from bits [95:80] of \a a. \n
///    10: assign values from bits [111:96] of \a a. \n
///    11: assign values from bits [127:112] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflehi_epi16(a, imm)                                            \
  ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
4307 
4308 /// Unpacks the high-order (index 8-15) values from two 128-bit vectors
4309 ///    of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4310 ///
4311 /// \headerfile <x86intrin.h>
4312 ///
4313 /// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
4314 ///   instruction.
4315 ///
4316 /// \param __a
4317 ///    A 128-bit vector of [16 x i8].
4318 ///    Bits [71:64] are written to bits [7:0] of the result. \n
4319 ///    Bits [79:72] are written to bits [23:16] of the result. \n
4320 ///    Bits [87:80] are written to bits [39:32] of the result. \n
4321 ///    Bits [95:88] are written to bits [55:48] of the result. \n
4322 ///    Bits [103:96] are written to bits [71:64] of the result. \n
4323 ///    Bits [111:104] are written to bits [87:80] of the result. \n
4324 ///    Bits [119:112] are written to bits [103:96] of the result. \n
4325 ///    Bits [127:120] are written to bits [119:112] of the result.
4326 /// \param __b
4327 ///    A 128-bit vector of [16 x i8]. \n
4328 ///    Bits [71:64] are written to bits [15:8] of the result. \n
4329 ///    Bits [79:72] are written to bits [31:24] of the result. \n
4330 ///    Bits [87:80] are written to bits [47:40] of the result. \n
4331 ///    Bits [95:88] are written to bits [63:56] of the result. \n
4332 ///    Bits [103:96] are written to bits [79:72] of the result. \n
4333 ///    Bits [111:104] are written to bits [95:88] of the result. \n
4334 ///    Bits [119:112] are written to bits [111:104] of the result. \n
4335 ///    Bits [127:120] are written to bits [127:120] of the result.
4336 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4337 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a,
4338                                                                __m128i __b) {
4339   return (__m128i)__builtin_shufflevector(
4340       (__v16qi)__a, (__v16qi)__b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
4341       16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
4342 }
4343 
4344 /// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
4345 ///    [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
4346 ///
4347 /// \headerfile <x86intrin.h>
4348 ///
4349 /// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
4350 ///   instruction.
4351 ///
4352 /// \param __a
4353 ///    A 128-bit vector of [8 x i16].
4354 ///    Bits [79:64] are written to bits [15:0] of the result. \n
4355 ///    Bits [95:80] are written to bits [47:32] of the result. \n
4356 ///    Bits [111:96] are written to bits [79:64] of the result. \n
4357 ///    Bits [127:112] are written to bits [111:96] of the result.
4358 /// \param __b
4359 ///    A 128-bit vector of [8 x i16].
4360 ///    Bits [79:64] are written to bits [31:16] of the result. \n
4361 ///    Bits [95:80] are written to bits [63:48] of the result. \n
4362 ///    Bits [111:96] are written to bits [95:80] of the result. \n
4363 ///    Bits [127:112] are written to bits [127:112] of the result.
4364 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4365 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a,
4366                                                                 __m128i __b) {
4367   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8 + 4, 5,
4368                                           8 + 5, 6, 8 + 6, 7, 8 + 7);
4369 }
4370 
4371 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
4372 ///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4373 ///
4374 /// \headerfile <x86intrin.h>
4375 ///
4376 /// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
4377 ///   instruction.
4378 ///
4379 /// \param __a
4380 ///    A 128-bit vector of [4 x i32]. \n
4381 ///    Bits [95:64] are written to bits [31:0] of the destination. \n
4382 ///    Bits [127:96] are written to bits [95:64] of the destination.
4383 /// \param __b
4384 ///    A 128-bit vector of [4 x i32]. \n
4385 ///    Bits [95:64] are written to bits [64:32] of the destination. \n
4386 ///    Bits [127:96] are written to bits [127:96] of the destination.
4387 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4388 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a,
4389                                                                 __m128i __b) {
4390   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4 + 2, 3,
4391                                           4 + 3);
4392 }
4393 
4394 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4395 ///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4396 ///
4397 /// \headerfile <x86intrin.h>
4398 ///
4399 /// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
4400 ///   instruction.
4401 ///
4402 /// \param __a
4403 ///    A 128-bit vector of [2 x i64]. \n
4404 ///    Bits [127:64] are written to bits [63:0] of the destination.
4405 /// \param __b
4406 ///    A 128-bit vector of [2 x i64]. \n
4407 ///    Bits [127:64] are written to bits [127:64] of the destination.
4408 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4409 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a,
4410                                                                 __m128i __b) {
4411   return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2 + 1);
4412 }
4413 
4414 /// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
4415 ///    [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4416 ///
4417 /// \headerfile <x86intrin.h>
4418 ///
4419 /// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
4420 ///   instruction.
4421 ///
4422 /// \param __a
4423 ///    A 128-bit vector of [16 x i8]. \n
4424 ///    Bits [7:0] are written to bits [7:0] of the result. \n
4425 ///    Bits [15:8] are written to bits [23:16] of the result. \n
4426 ///    Bits [23:16] are written to bits [39:32] of the result. \n
4427 ///    Bits [31:24] are written to bits [55:48] of the result. \n
4428 ///    Bits [39:32] are written to bits [71:64] of the result. \n
4429 ///    Bits [47:40] are written to bits [87:80] of the result. \n
4430 ///    Bits [55:48] are written to bits [103:96] of the result. \n
4431 ///    Bits [63:56] are written to bits [119:112] of the result.
4432 /// \param __b
4433 ///    A 128-bit vector of [16 x i8].
4434 ///    Bits [7:0] are written to bits [15:8] of the result. \n
4435 ///    Bits [15:8] are written to bits [31:24] of the result. \n
4436 ///    Bits [23:16] are written to bits [47:40] of the result. \n
4437 ///    Bits [31:24] are written to bits [63:56] of the result. \n
4438 ///    Bits [39:32] are written to bits [79:72] of the result. \n
4439 ///    Bits [47:40] are written to bits [95:88] of the result. \n
4440 ///    Bits [55:48] are written to bits [111:104] of the result. \n
4441 ///    Bits [63:56] are written to bits [127:120] of the result.
4442 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4443 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a,
4444                                                                __m128i __b) {
4445   return (__m128i)__builtin_shufflevector(
4446       (__v16qi)__a, (__v16qi)__b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
4447       16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
4448 }
4449 
4450 /// Unpacks the low-order (index 0-3) values from each of the two 128-bit
4451 ///    vectors of [8 x i16] and interleaves them into a 128-bit vector of
4452 ///    [8 x i16].
4453 ///
4454 /// \headerfile <x86intrin.h>
4455 ///
4456 /// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
4457 ///   instruction.
4458 ///
4459 /// \param __a
4460 ///    A 128-bit vector of [8 x i16].
4461 ///    Bits [15:0] are written to bits [15:0] of the result. \n
4462 ///    Bits [31:16] are written to bits [47:32] of the result. \n
4463 ///    Bits [47:32] are written to bits [79:64] of the result. \n
4464 ///    Bits [63:48] are written to bits [111:96] of the result.
4465 /// \param __b
4466 ///    A 128-bit vector of [8 x i16].
4467 ///    Bits [15:0] are written to bits [31:16] of the result. \n
4468 ///    Bits [31:16] are written to bits [63:48] of the result. \n
4469 ///    Bits [47:32] are written to bits [95:80] of the result. \n
4470 ///    Bits [63:48] are written to bits [127:112] of the result.
4471 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4472 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a,
4473                                                                 __m128i __b) {
4474   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8 + 0, 1,
4475                                           8 + 1, 2, 8 + 2, 3, 8 + 3);
4476 }
4477 
4478 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
4479 ///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4480 ///
4481 /// \headerfile <x86intrin.h>
4482 ///
4483 /// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
4484 ///   instruction.
4485 ///
4486 /// \param __a
4487 ///    A 128-bit vector of [4 x i32]. \n
4488 ///    Bits [31:0] are written to bits [31:0] of the destination. \n
4489 ///    Bits [63:32] are written to bits [95:64] of the destination.
4490 /// \param __b
4491 ///    A 128-bit vector of [4 x i32]. \n
4492 ///    Bits [31:0] are written to bits [64:32] of the destination. \n
4493 ///    Bits [63:32] are written to bits [127:96] of the destination.
4494 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4495 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a,
4496                                                                 __m128i __b) {
4497   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4 + 0, 1,
4498                                           4 + 1);
4499 }
4500 
4501 /// Unpacks the low-order 64-bit elements from two 128-bit vectors of
4502 ///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4503 ///
4504 /// \headerfile <x86intrin.h>
4505 ///
4506 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
4507 ///   instruction.
4508 ///
4509 /// \param __a
4510 ///    A 128-bit vector of [2 x i64]. \n
4511 ///    Bits [63:0] are written to bits [63:0] of the destination. \n
4512 /// \param __b
4513 ///    A 128-bit vector of [2 x i64]. \n
4514 ///    Bits [63:0] are written to bits [127:64] of the destination. \n
4515 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4516 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a,
4517                                                                 __m128i __b) {
4518   return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2 + 0);
4519 }
4520 
4521 /// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
4522 ///    integer.
4523 ///
4524 /// \headerfile <x86intrin.h>
4525 ///
4526 /// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
4527 ///
4528 /// \param __a
4529 ///    A 128-bit integer vector operand. The lower 64 bits are moved to the
4530 ///    destination.
4531 /// \returns A 64-bit integer containing the lower 64 bits of the parameter.
4532 static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a) {
4533   return (__m64)__a[0];
4534 }
4535 
4536 /// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
4537 ///    upper bits.
4538 ///
4539 /// \headerfile <x86intrin.h>
4540 ///
4541 /// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
4542 ///
4543 /// \param __a
4544 ///    A 64-bit value.
4545 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4546 ///    the operand. The upper 64 bits are assigned zeros.
4547 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) {
4548   return __extension__(__m128i)(__v2di){(long long)__a, 0};
4549 }
4550 
4551 /// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
4552 ///    integer vector, zeroing the upper bits.
4553 ///
4554 /// \headerfile <x86intrin.h>
4555 ///
4556 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4557 ///
4558 /// \param __a
4559 ///    A 128-bit integer vector operand. The lower 64 bits are moved to the
4560 ///    destination.
4561 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4562 ///    the operand. The upper 64 bits are assigned zeros.
4563 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) {
4564   return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
4565 }
4566 
4567 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4568 ///    [2 x double] and interleaves them into a 128-bit vector of [2 x
4569 ///    double].
4570 ///
4571 /// \headerfile <x86intrin.h>
4572 ///
4573 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
4574 ///
4575 /// \param __a
4576 ///    A 128-bit vector of [2 x double]. \n
4577 ///    Bits [127:64] are written to bits [63:0] of the destination.
4578 /// \param __b
4579 ///    A 128-bit vector of [2 x double]. \n
4580 ///    Bits [127:64] are written to bits [127:64] of the destination.
4581 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4582 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a,
4583                                                              __m128d __b) {
4584   return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2 + 1);
4585 }
4586 
4587 /// Unpacks the low-order 64-bit elements from two 128-bit vectors
4588 ///    of [2 x double] and interleaves them into a 128-bit vector of [2 x
4589 ///    double].
4590 ///
4591 /// \headerfile <x86intrin.h>
4592 ///
4593 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
4594 ///
4595 /// \param __a
4596 ///    A 128-bit vector of [2 x double]. \n
4597 ///    Bits [63:0] are written to bits [63:0] of the destination.
4598 /// \param __b
4599 ///    A 128-bit vector of [2 x double]. \n
4600 ///    Bits [63:0] are written to bits [127:64] of the destination.
4601 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4602 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a,
4603                                                              __m128d __b) {
4604   return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2 + 0);
4605 }
4606 
4607 /// Extracts the sign bits of the double-precision values in the 128-bit
4608 ///    vector of [2 x double], zero-extends the value, and writes it to the
4609 ///    low-order bits of the destination.
4610 ///
4611 /// \headerfile <x86intrin.h>
4612 ///
4613 /// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
4614 ///
4615 /// \param __a
4616 ///    A 128-bit vector of [2 x double] containing the values with sign bits to
4617 ///    be extracted.
4618 /// \returns The sign bits from each of the double-precision elements in \a __a,
4619 ///    written to bits [1:0]. The remaining bits are assigned values of zero.
4620 static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) {
4621   return __builtin_ia32_movmskpd((__v2df)__a);
4622 }
4623 
/// Constructs a 128-bit floating-point vector of [2 x double] from two
///    128-bit vector parameters of [2 x double], using the immediate-value
///    parameter as a specifier. The lower element of the result is taken from
///    the first parameter and the upper element from the second.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param i
///    An 8-bit immediate value. The least significant two bits specify which
///    elements to copy from \a a and \a b: \n
///    Bit[0] = 0: lower element of \a a copied to lower element of result. \n
///    Bit[0] = 1: upper element of \a a copied to lower element of result. \n
///    Bit[1] = 0: lower element of \a b copied to upper element of result. \n
///    Bit[1] = 1: upper element of \a b copied to upper element of result. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE2 macro.
///    <c>_MM_SHUFFLE2(b1, b0)</c> can create a 2-bit mask of the form
///    <c>[b1, b0]</c>.
/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
#define _mm_shuffle_pd(a, b, i)                                                \
  ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b),  \
                                  (int)(i)))
4654 
4655 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4656 ///    floating-point vector of [4 x float].
4657 ///
4658 /// \headerfile <x86intrin.h>
4659 ///
4660 /// This intrinsic has no corresponding instruction.
4661 ///
4662 /// \param __a
4663 ///    A 128-bit floating-point vector of [2 x double].
4664 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4665 ///    bitwise pattern as the parameter.
4666 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a) {
4667   return (__m128)__a;
4668 }
4669 
4670 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4671 ///    integer vector.
4672 ///
4673 /// \headerfile <x86intrin.h>
4674 ///
4675 /// This intrinsic has no corresponding instruction.
4676 ///
4677 /// \param __a
4678 ///    A 128-bit floating-point vector of [2 x double].
4679 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4680 ///    parameter.
4681 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a) {
4682   return (__m128i)__a;
4683 }
4684 
4685 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4686 ///    floating-point vector of [2 x double].
4687 ///
4688 /// \headerfile <x86intrin.h>
4689 ///
4690 /// This intrinsic has no corresponding instruction.
4691 ///
4692 /// \param __a
4693 ///    A 128-bit floating-point vector of [4 x float].
4694 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4695 ///    bitwise pattern as the parameter.
4696 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a) {
4697   return (__m128d)__a;
4698 }
4699 
4700 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4701 ///    integer vector.
4702 ///
4703 /// \headerfile <x86intrin.h>
4704 ///
4705 /// This intrinsic has no corresponding instruction.
4706 ///
4707 /// \param __a
4708 ///    A 128-bit floating-point vector of [4 x float].
4709 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4710 ///    parameter.
4711 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a) {
4712   return (__m128i)__a;
4713 }
4714 
4715 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4716 ///    of [4 x float].
4717 ///
4718 /// \headerfile <x86intrin.h>
4719 ///
4720 /// This intrinsic has no corresponding instruction.
4721 ///
4722 /// \param __a
4723 ///    A 128-bit integer vector.
4724 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4725 ///    bitwise pattern as the parameter.
4726 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a) {
4727   return (__m128)__a;
4728 }
4729 
4730 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4731 ///    of [2 x double].
4732 ///
4733 /// \headerfile <x86intrin.h>
4734 ///
4735 /// This intrinsic has no corresponding instruction.
4736 ///
4737 /// \param __a
4738 ///    A 128-bit integer vector.
4739 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4740 ///    bitwise pattern as the parameter.
4741 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a) {
4742   return (__m128d)__a;
4743 }
4744 
/* The declaration below is given C language linkage when this header is
 * compiled as C++. */
#if defined(__cplusplus)
extern "C" {
#endif

/// Indicates that a spin loop is being executed for the purposes of
///    optimizing power consumption during the loop.
///
///    Only a declaration is provided here; the function takes no arguments
///    and returns no value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
///
void _mm_pause(void);

#if defined(__cplusplus)
} // extern "C"
#endif
/* Remove the function-attribute helper macros so they do not remain defined
 * after this header has been included. */
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_MMX
4763 
/* Builds a two-bit selector from two single-bit values: x becomes bit 1 and
 * y becomes bit 0. */
#define _MM_SHUFFLE2(x, y) ((y) | ((x) << 1))
4765 
/* Mode values for the denormals-are-zero flag: the flag bit set (ON) or
 * clear (OFF). */
#define _MM_DENORMALS_ZERO_ON (0x0040U)
#define _MM_DENORMALS_ZERO_OFF (0x0000U)

/* Mask isolating the denormals-are-zero flag (bit 6) in the value read by
 * _mm_getcsr() and written by _mm_setcsr(). */
#define _MM_DENORMALS_ZERO_MASK (0x0040U)

/* Reads the current mode: the _mm_getcsr() value with every bit other than
 * the denormals-are-zero flag cleared. */
#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
/* Sets the mode: clears the flag bit in the current _mm_getcsr() value, ORs in
 * x, and passes the result to _mm_setcsr(). x is not masked, so any other bits
 * set in x are written as well; pass one of the _MM_DENORMALS_ZERO_* values. */
#define _MM_SET_DENORMALS_ZERO_MODE(x)                                         \
  (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
4774 
4775 #endif /* __EMMINTRIN_H */
4776