1 /* Copyright (C) 2008, 2009 Free Software Foundation, Inc.
2 
3    This file is part of GCC.
4 
5    GCC is free software; you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; either version 3, or (at your option)
8    any later version.
9 
10    GCC is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    Under Section 7 of GPL version 3, you are granted additional
16    permissions described in the GCC Runtime Library Exception, version
17    3.1, as published by the Free Software Foundation.
18 
19    You should have received a copy of the GNU General Public License and
20    a copy of the GCC Runtime Library Exception along with this program;
21    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
22    <http://www.gnu.org/licenses/>.  */
23 
24 /* Implemented from the specification included in the Intel C++ Compiler
25    User Guide and Reference, version 11.0.  */
26 
27 #ifndef _IMMINTRIN_H_INCLUDED
28 # error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
29 #endif
30 
/* Internal element-typed views used when calling the builtins.  Every
   type below is a 32-byte GCC vector; the name spells out the element
   count and the element kind.  */
typedef double
  __v4df __attribute__ ((__vector_size__ (32)));	/* double elements  */
typedef float
  __v8sf __attribute__ ((__vector_size__ (32)));	/* float elements  */
typedef long long
  __v4di __attribute__ ((__vector_size__ (32)));	/* long long elements  */
typedef int
  __v8si __attribute__ ((__vector_size__ (32)));	/* int elements  */
typedef short
  __v16hi __attribute__ ((__vector_size__ (32)));	/* short elements  */
typedef char
  __v32qi __attribute__ ((__vector_size__ (32)));	/* char elements  */
38 
/* Public 256-bit vector types.  The Intel API is flexible enough that
   these must be allowed to alias other vector types and their scalar
   components, hence __may_alias__ on each of them.  */
typedef float __m256
  __attribute__ ((__vector_size__ (32), __may_alias__));
typedef long long __m256i
  __attribute__ ((__vector_size__ (32), __may_alias__));
typedef double __m256d
  __attribute__ ((__vector_size__ (32), __may_alias__));
47 
/* Compare predicates for scalar and packed compare intrinsics.
   Each value is passed as the predicate operand of the _mm*_cmp_*
   intrinsics.  The first sixteen and the last sixteen predicates pair
   up: adding 0x10 keeps the relation and flips signaling <->
   non-signaling.  */

#define _CMP_EQ_OQ	0x00	/* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS	0x01	/* Less-than (ordered, signaling)  */
#define _CMP_LE_OS	0x02	/* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q	0x03	/* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ	0x04	/* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US	0x05	/* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US	0x06	/* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q	0x07	/* Ordered (non-signaling)  */
#define _CMP_EQ_UQ	0x08	/* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US	0x09	/* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGT_US	0x0a	/* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ	0x0b	/* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ	0x0c	/* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS	0x0d	/* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS	0x0e	/* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ	0x0f	/* True (unordered, non-signaling)  */

#define _CMP_EQ_OS	0x10	/* Equal (ordered, signaling)  */
#define _CMP_LT_OQ	0x11	/* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ	0x12	/* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S	0x13	/* Unordered (signaling)  */
#define _CMP_NEQ_US	0x14	/* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ	0x15	/* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ	0x16	/* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_ORD_S	0x17	/* Ordered (signaling)  */
#define _CMP_EQ_US	0x18	/* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ	0x19	/* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGT_UQ	0x1a	/* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS	0x1b	/* False (ordered, signaling)  */
#define _CMP_NEQ_OS	0x1c	/* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ	0x1d	/* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ	0x1e	/* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US	0x1f	/* True (unordered, signaling)  */
114 
115 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
116 _mm256_add_pd (__m256d __A, __m256d __B)
117 {
118   return (__m256d) __builtin_ia32_addpd256 ((__v4df)__A, (__v4df)__B);
119 }
120 
121 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
122 _mm256_add_ps (__m256 __A, __m256 __B)
123 {
124   return (__m256) __builtin_ia32_addps256 ((__v8sf)__A, (__v8sf)__B);
125 }
126 
127 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
128 _mm256_addsub_pd (__m256d __A, __m256d __B)
129 {
130   return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
131 }
132 
133 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
134 _mm256_addsub_ps (__m256 __A, __m256 __B)
135 {
136   return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
137 }
138 
139 
140 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
141 _mm256_and_pd (__m256d __A, __m256d __B)
142 {
143   return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
144 }
145 
146 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
147 _mm256_and_ps (__m256 __A, __m256 __B)
148 {
149   return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
150 }
151 
152 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
153 _mm256_andnot_pd (__m256d __A, __m256d __B)
154 {
155   return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
156 }
157 
158 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
159 _mm256_andnot_ps (__m256 __A, __m256 __B)
160 {
161   return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
162 }
163 
164 /* Double/single precision floating point blend instructions - select
165    data from 2 sources using constant/variable mask.  */
166 
/* Constant-mask blends.  Bit i of the immediate __M picks element i
   of the result from __Y when set and from __X when clear.  The
   builtins need __M as an immediate, so the inline functions are only
   used when optimizing (the parameter is then folded to a constant);
   otherwise equivalent macros are provided.  */
#ifdef __OPTIMIZE__
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
{
  __v4df __if_clear = (__v4df) __X;
  __v4df __if_set = (__v4df) __Y;

  return (__m256d) __builtin_ia32_blendpd256 (__if_clear, __if_set, __M);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
{
  __v8sf __if_clear = (__v8sf) __X;
  __v8sf __if_set = (__v8sf) __Y;

  return (__m256) __builtin_ia32_blendps256 (__if_clear, __if_set, __M);
}
#else
#define _mm256_blend_pd(X, Y, M) \
  ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X), \
                                        (__v4df)(__m256d)(Y), \
                                        (int)(M)))

#define _mm256_blend_ps(X, Y, M) \
  ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X), \
                                       (__v8sf)(__m256)(Y), \
                                       (int)(M)))
#endif
192 
193 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
194 _mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
195 {
196   return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X,
197 					       (__v4df)__Y,
198 					       (__v4df)__M);
199 }
200 
201 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
202 _mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
203 {
204   return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
205 					      (__v8sf)__Y,
206 					      (__v8sf)__M);
207 }
208 
209 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
210 _mm256_div_pd (__m256d __A, __m256d __B)
211 {
212   return (__m256d) __builtin_ia32_divpd256 ((__v4df)__A, (__v4df)__B);
213 }
214 
215 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
216 _mm256_div_ps (__m256 __A, __m256 __B)
217 {
218   return (__m256) __builtin_ia32_divps256 ((__v8sf)__A, (__v8sf)__B);
219 }
220 
221 /* Dot product instructions with mask-defined summing and zeroing parts
222    of result.  */
223 
/* Dot product of packed floats (dpps256 builtin).  The immediate __M
   defines which parts are summed and which parts of the result are
   zeroed.  __M must be a compile-time constant: the inline function is
   used only when optimizing, the macro otherwise.  */
#ifdef __OPTIMIZE__
extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
{
  __v8sf __lhs = (__v8sf) __X;
  __v8sf __rhs = (__v8sf) __Y;

  return (__m256) __builtin_ia32_dpps256 (__lhs, __rhs, __M);
}
#else
#define _mm256_dp_ps(X, Y, M) \
  ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X), \
                                    (__v8sf)(__m256)(Y), \
                                    (int)(M)))
#endif
237 
238 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
239 _mm256_hadd_pd (__m256d __X, __m256d __Y)
240 {
241   return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
242 }
243 
244 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
245 _mm256_hadd_ps (__m256 __X, __m256 __Y)
246 {
247   return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
248 }
249 
250 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
251 _mm256_hsub_pd (__m256d __X, __m256d __Y)
252 {
253   return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
254 }
255 
256 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
257 _mm256_hsub_ps (__m256 __X, __m256 __Y)
258 {
259   return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
260 }
261 
262 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
263 _mm256_max_pd (__m256d __A, __m256d __B)
264 {
265   return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
266 }
267 
268 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
269 _mm256_max_ps (__m256 __A, __m256 __B)
270 {
271   return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
272 }
273 
274 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
275 _mm256_min_pd (__m256d __A, __m256d __B)
276 {
277   return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
278 }
279 
280 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
281 _mm256_min_ps (__m256 __A, __m256 __B)
282 {
283   return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
284 }
285 
286 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
287 _mm256_mul_pd (__m256d __A, __m256d __B)
288 {
289   return (__m256d) __builtin_ia32_mulpd256 ((__v4df)__A, (__v4df)__B);
290 }
291 
292 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
293 _mm256_mul_ps (__m256 __A, __m256 __B)
294 {
295   return (__m256) __builtin_ia32_mulps256 ((__v8sf)__A, (__v8sf)__B);
296 }
297 
298 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
299 _mm256_or_pd (__m256d __A, __m256d __B)
300 {
301   return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
302 }
303 
304 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
305 _mm256_or_ps (__m256 __A, __m256 __B)
306 {
307   return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
308 }
309 
/* Shuffles: build the result from elements of __A and __B as directed
   by the immediate selector (shufpd256 / shufps256 builtins).  The
   selector must be a compile-time constant, so inline functions are
   used when optimizing and macros otherwise.  */
#ifdef __OPTIMIZE__
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
{
  __v4df __first = (__v4df) __A;
  __v4df __second = (__v4df) __B;

  return (__m256d) __builtin_ia32_shufpd256 (__first, __second, __mask);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
{
  __v8sf __first = (__v8sf) __A;
  __v8sf __second = (__v8sf) __B;

  return (__m256) __builtin_ia32_shufps256 (__first, __second, __mask);
}
#else
#define _mm256_shuffle_pd(A, B, N) \
  ((__m256d) __builtin_ia32_shufpd256 ((__v4df)(__m256d)(A), \
                                       (__v4df)(__m256d)(B), \
                                       (int)(N)))

#define _mm256_shuffle_ps(A, B, N) \
  ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A), \
                                      (__v8sf)(__m256)(B), \
                                      (int)(N)))
#endif
333 
334 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
335 _mm256_sub_pd (__m256d __A, __m256d __B)
336 {
337   return (__m256d) __builtin_ia32_subpd256 ((__v4df)__A, (__v4df)__B);
338 }
339 
340 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
341 _mm256_sub_ps (__m256 __A, __m256 __B)
342 {
343   return (__m256) __builtin_ia32_subps256 ((__v8sf)__A, (__v8sf)__B);
344 }
345 
346 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
347 _mm256_xor_pd (__m256d __A, __m256d __B)
348 {
349   return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
350 }
351 
352 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
353 _mm256_xor_ps (__m256 __A, __m256 __B)
354 {
355   return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
356 }
357 
/* Generalized compares.  __P is the comparison predicate, normally one
   of the _CMP_* constants, and must be a compile-time constant; the
   inline functions are therefore only available when optimizing, with
   equivalent macros provided otherwise.  The _pd/_ps forms compare
   packed elements (128-bit for _mm_, 256-bit for _mm256_); _sd/_ss are
   the scalar forms.  */
#ifdef __OPTIMIZE__
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P)
{
  __v2df __lhs = (__v2df) __X;
  __v2df __rhs = (__v2df) __Y;

  return (__m128d) __builtin_ia32_cmppd (__lhs, __rhs, __P);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P)
{
  __v4sf __lhs = (__v4sf) __X;
  __v4sf __rhs = (__v4sf) __Y;

  return (__m128) __builtin_ia32_cmpps (__lhs, __rhs, __P);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
{
  __v4df __lhs = (__v4df) __X;
  __v4df __rhs = (__v4df) __Y;

  return (__m256d) __builtin_ia32_cmppd256 (__lhs, __rhs, __P);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
{
  __v8sf __lhs = (__v8sf) __X;
  __v8sf __rhs = (__v8sf) __Y;

  return (__m256) __builtin_ia32_cmpps256 (__lhs, __rhs, __P);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P)
{
  __v2df __lhs = (__v2df) __X;
  __v2df __rhs = (__v2df) __Y;

  return (__m128d) __builtin_ia32_cmpsd (__lhs, __rhs, __P);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P)
{
  __v4sf __lhs = (__v4sf) __X;
  __v4sf __rhs = (__v4sf) __Y;

  return (__m128) __builtin_ia32_cmpss (__lhs, __rhs, __P);
}
#else
#define _mm_cmp_pd(X, Y, P) \
  ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X), \
                                   (__v2df)(__m128d)(Y), \
                                   (int)(P)))

#define _mm_cmp_ps(X, Y, P) \
  ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X), \
                                  (__v4sf)(__m128)(Y), \
                                  (int)(P)))

#define _mm256_cmp_pd(X, Y, P) \
  ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X), \
                                      (__v4df)(__m256d)(Y), \
                                      (int)(P)))

#define _mm256_cmp_ps(X, Y, P) \
  ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X), \
                                     (__v8sf)(__m256)(Y), \
                                     (int)(P)))

#define _mm_cmp_sd(X, Y, P) \
  ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X), \
                                   (__v2df)(__m128d)(Y), \
                                   (int)(P)))

#define _mm_cmp_ss(X, Y, P) \
  ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X), \
                                  (__v4sf)(__m128)(Y), \
                                  (int)(P)))
#endif
421 
422 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
423 _mm256_cvtepi32_pd (__m128i __A)
424 {
425   return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
426 }
427 
428 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
429 _mm256_cvtepi32_ps (__m256i __A)
430 {
431   return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
432 }
433 
434 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
435 _mm256_cvtpd_ps (__m256d __A)
436 {
437   return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
438 }
439 
440 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
441 _mm256_cvtps_epi32 (__m256 __A)
442 {
443   return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
444 }
445 
446 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
447 _mm256_cvtps_pd (__m128 __A)
448 {
449   return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
450 }
451 
452 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
453 _mm256_cvttpd_epi32 (__m256d __A)
454 {
455   return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
456 }
457 
458 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
459 _mm256_cvtpd_epi32 (__m256d __A)
460 {
461   return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
462 }
463 
464 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
465 _mm256_cvttps_epi32 (__m256 __A)
466 {
467   return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
468 }
469 
/* Extraction.  _mm256_extractf128_* return the 128-bit half of __X
   selected by the immediate __N.  _mm256_extract_epi{8,16,32,64} return
   one integer element: they first pull out the 128-bit half holding
   element __N (__N shifted right by log2 of the elements per half) and
   then extract position __N modulo the elements per half from it.
   __N must be a compile-time constant, so inline functions are used
   when optimizing and macros otherwise.  */
#ifdef __OPTIMIZE__
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_pd (__m256d __X, const int __N)
{
  __v4df __whole = (__v4df) __X;

  return (__m128d) __builtin_ia32_vextractf128_pd256 (__whole, __N);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_ps (__m256 __X, const int __N)
{
  __v8sf __whole = (__v8sf) __X;

  return (__m128) __builtin_ia32_vextractf128_ps256 (__whole, __N);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_si256 (__m256i __X, const int __N)
{
  __v8si __whole = (__v8si) __X;

  return (__m128i) __builtin_ia32_vextractf128_si256 (__whole, __N);
}

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi32 (__m256i __X, int const __N)
{
  /* Four 32-bit elements per 128-bit half.  */
  return _mm_extract_epi32 (_mm256_extractf128_si256 (__X, __N >> 2),
			    __N % 4);
}

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi16 (__m256i __X, int const __N)
{
  /* Eight 16-bit elements per 128-bit half.  */
  return _mm_extract_epi16 (_mm256_extractf128_si256 (__X, __N >> 3),
			    __N % 8);
}

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi8 (__m256i __X, int const __N)
{
  /* Sixteen 8-bit elements per 128-bit half.  */
  return _mm_extract_epi8 (_mm256_extractf128_si256 (__X, __N >> 4),
			   __N % 16);
}

#ifdef __x86_64__
extern __inline long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi64 (__m256i __X, const int __N)
{
  /* Two 64-bit elements per 128-bit half.  */
  return _mm_extract_epi64 (_mm256_extractf128_si256 (__X, __N >> 1),
			    __N % 2);
}
#endif
#else
#define _mm256_extractf128_pd(X, N) \
  ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X), \
                                                (int)(N)))

#define _mm256_extractf128_ps(X, N) \
  ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X), \
                                               (int)(N)))

#define _mm256_extractf128_si256(X, N) \
  ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X), \
                                                (int)(N)))

#define _mm256_extract_epi32(X, N) \
  (_mm_extract_epi32 (_mm256_extractf128_si256 ((X), (N) >> 2), \
                      (N) % 4))

#define _mm256_extract_epi16(X, N) \
  (_mm_extract_epi16 (_mm256_extractf128_si256 ((X), (N) >> 3), \
                      (N) % 8))

#define _mm256_extract_epi8(X, N) \
  (_mm_extract_epi8 (_mm256_extractf128_si256 ((X), (N) >> 4), \
                     (N) % 16))

#ifdef __x86_64__
#define _mm256_extract_epi64(X, N) \
  (_mm_extract_epi64 (_mm256_extractf128_si256 ((X), (N) >> 1), \
                      (N) % 2))
#endif
#endif
561 
/* Issue the vzeroall builtin, which clears the AVX vector registers.
   Takes no operands and returns nothing.  */
extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroall (void)
{
  __builtin_ia32_vzeroall ();
}
567 
/* Issue the vzeroupper builtin, which clears the upper halves of the
   AVX vector registers.  Takes no operands and returns nothing.  */
extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroupper (void)
{
  __builtin_ia32_vzeroupper ();
}
573 
574 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
575 _mm_permutevar_pd (__m128d __A, __m128i __C)
576 {
577   return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
578 						(__v2di)__C);
579 }
580 
581 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
582 _mm256_permutevar_pd (__m256d __A, __m256i __C)
583 {
584   return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
585 						   (__v4di)__C);
586 }
587 
588 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
589 _mm_permutevar_ps (__m128 __A, __m128i __C)
590 {
591   return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
592 					       (__v4si)__C);
593 }
594 
595 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
596 _mm256_permutevar_ps (__m256 __A, __m256i __C)
597 {
598   return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
599 						  (__v8si)__C);
600 }
601 
/* Immediate-controlled permutes of __X (vpermilpd / vpermilps builtins
   and their 256-bit forms).  The selector __C must be a compile-time
   constant, so inline functions are used when optimizing and macros
   otherwise.  */
#ifdef __OPTIMIZE__
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_pd (__m128d __X, const int __C)
{
  __v2df __data = (__v2df) __X;

  return (__m128d) __builtin_ia32_vpermilpd (__data, __C);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_pd (__m256d __X, const int __C)
{
  __v4df __data = (__v4df) __X;

  return (__m256d) __builtin_ia32_vpermilpd256 (__data, __C);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_ps (__m128 __X, const int __C)
{
  __v4sf __data = (__v4sf) __X;

  return (__m128) __builtin_ia32_vpermilps (__data, __C);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_ps (__m256 __X, const int __C)
{
  __v8sf __data = (__v8sf) __X;

  return (__m256) __builtin_ia32_vpermilps256 (__data, __C);
}
#else
#define _mm_permute_pd(X, C) \
  ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), \
                                       (int)(C)))

#define _mm256_permute_pd(X, C) \
  ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X), \
                                          (int)(C)))

#define _mm_permute_ps(X, C) \
  ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), \
                                      (int)(C)))

#define _mm256_permute_ps(X, C) \
  ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), \
                                         (int)(C)))
#endif
639 
/* Two-source 128-bit permutes (vperm2f128 builtins): build a 256-bit
   result from 128-bit halves of __X and __Y as directed by the
   immediate __C.  __C must be a compile-time constant, so inline
   functions are used when optimizing and macros otherwise.  */
#ifdef __OPTIMIZE__
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
{
  __v4df __first = (__v4df) __X;
  __v4df __second = (__v4df) __Y;

  return (__m256d) __builtin_ia32_vperm2f128_pd256 (__first, __second,
						    __C);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
{
  __v8sf __first = (__v8sf) __X;
  __v8sf __second = (__v8sf) __Y;

  return (__m256) __builtin_ia32_vperm2f128_ps256 (__first, __second,
						   __C);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
{
  __v8si __first = (__v8si) __X;
  __v8si __second = (__v8si) __Y;

  return (__m256i) __builtin_ia32_vperm2f128_si256 (__first, __second,
						    __C);
}
#else
#define _mm256_permute2f128_pd(X, Y, C) \
  ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X), \
                                              (__v4df)(__m256d)(Y), \
                                              (int)(C)))

#define _mm256_permute2f128_ps(X, Y, C) \
  ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X), \
                                             (__v8sf)(__m256)(Y), \
                                             (int)(C)))

#define _mm256_permute2f128_si256(X, Y, C) \
  ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X), \
                                              (__v8si)(__m256i)(Y), \
                                              (int)(C)))
#endif
680 
681 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
682 _mm_broadcast_ss (float const *__X)
683 {
684   return (__m128) __builtin_ia32_vbroadcastss (__X);
685 }
686 
687 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
688 _mm256_broadcast_sd (double const *__X)
689 {
690   return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
691 }
692 
693 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
694 _mm256_broadcast_ss (float const *__X)
695 {
696   return (__m256) __builtin_ia32_vbroadcastss256 (__X);
697 }
698 
699 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
700 _mm256_broadcast_pd (__m128d const *__X)
701 {
702   return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
703 }
704 
705 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
706 _mm256_broadcast_ps (__m128 const *__X)
707 {
708   return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
709 }
710 
/* Insertion.  _mm256_insertf128_* return __X with the 128-bit half
   selected by the immediate __O replaced by __Y.
   _mm256_insert_epi{8,16,32,64} return __X with integer element __N
   replaced by __D: the 128-bit half holding the element is extracted,
   the element is replaced inside that half, and the half is written
   back into a copy of __X.

   The immediates must be compile-time constants, so inline functions
   are used when optimizing and macros otherwise.  The element-insert
   macros capture their vector argument in a local first, so that X is
   evaluated exactly once, as it is in the inline-function forms; the
   locals use long reserved names to stay clear of identifiers that may
   appear in the macro arguments.  */
#ifdef __OPTIMIZE__
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
{
  return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df) __X,
						     (__v2df) __Y,
						     __O);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
{
  return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf) __X,
						    (__v4sf) __Y,
						    __O);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
{
  return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si) __X,
						     (__v4si) __Y,
						     __O);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi32 (__m256i __X, int __D, int const __N)
{
  /* Four 32-bit elements per 128-bit half.  */
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  __Y = _mm_insert_epi32 (__Y, __D, __N % 4);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi16 (__m256i __X, int __D, int const __N)
{
  /* Eight 16-bit elements per 128-bit half.  */
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi8 (__m256i __X, int __D, int const __N)
{
  /* Sixteen 8-bit elements per 128-bit half.  */
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
}

#ifdef __x86_64__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi64 (__m256i __X, long long __D, int const __N)
{
  /* Two 64-bit elements per 128-bit half.  */
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  __Y = _mm_insert_epi64 (__Y, __D, __N % 2);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
}
#endif
#else
#define _mm256_insertf128_pd(X, Y, O) \
  ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X), \
                                               (__v2df)(__m128d)(Y), \
                                               (int)(O)))

#define _mm256_insertf128_ps(X, Y, O) \
  ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X), \
                                              (__v4sf)(__m128)(Y), \
                                              (int)(O)))

#define _mm256_insertf128_si256(X, Y, O) \
  ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X), \
                                               (__v4si)(__m128i)(Y), \
                                               (int)(O)))

#define _mm256_insert_epi32(X, D, N) \
  (__extension__ \
   ({ \
      __m256i __mm256_ins_whole = (__m256i)(X); \
      __m128i __mm256_ins_half \
        = _mm256_extractf128_si256 (__mm256_ins_whole, (N) >> 2); \
      __mm256_ins_half \
        = _mm_insert_epi32 (__mm256_ins_half, (D), (N) % 4); \
      _mm256_insertf128_si256 (__mm256_ins_whole, __mm256_ins_half, \
                               (N) >> 2); \
    }))

#define _mm256_insert_epi16(X, D, N) \
  (__extension__ \
   ({ \
      __m256i __mm256_ins_whole = (__m256i)(X); \
      __m128i __mm256_ins_half \
        = _mm256_extractf128_si256 (__mm256_ins_whole, (N) >> 3); \
      __mm256_ins_half \
        = _mm_insert_epi16 (__mm256_ins_half, (D), (N) % 8); \
      _mm256_insertf128_si256 (__mm256_ins_whole, __mm256_ins_half, \
                               (N) >> 3); \
    }))

#define _mm256_insert_epi8(X, D, N) \
  (__extension__ \
   ({ \
      __m256i __mm256_ins_whole = (__m256i)(X); \
      __m128i __mm256_ins_half \
        = _mm256_extractf128_si256 (__mm256_ins_whole, (N) >> 4); \
      __mm256_ins_half \
        = _mm_insert_epi8 (__mm256_ins_half, (D), (N) % 16); \
      _mm256_insertf128_si256 (__mm256_ins_whole, __mm256_ins_half, \
                               (N) >> 4); \
    }))

#ifdef __x86_64__
#define _mm256_insert_epi64(X, D, N) \
  (__extension__ \
   ({ \
      __m256i __mm256_ins_whole = (__m256i)(X); \
      __m128i __mm256_ins_half \
        = _mm256_extractf128_si256 (__mm256_ins_whole, (N) >> 1); \
      __mm256_ins_half \
        = _mm_insert_epi64 (__mm256_ins_half, (D), (N) % 2); \
      _mm256_insertf128_si256 (__mm256_ins_whole, __mm256_ins_half, \
                               (N) >> 1); \
    }))
#endif
#endif
819 
820 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
821 _mm256_load_pd (double const *__P)
822 {
823   return *(__m256d *)__P;
824 }
825 
826 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
827 _mm256_store_pd (double *__P, __m256d __A)
828 {
829   *(__m256d *)__P = __A;
830 }
831 
832 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
833 _mm256_load_ps (float const *__P)
834 {
835   return *(__m256 *)__P;
836 }
837 
838 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
839 _mm256_store_ps (float *__P, __m256 __A)
840 {
841   *(__m256 *)__P = __A;
842 }
843 
844 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
845 _mm256_loadu_pd (double const *__P)
846 {
847   return (__m256d) __builtin_ia32_loadupd256 (__P);
848 }
849 
850 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
851 _mm256_storeu_pd (double *__P, __m256d __A)
852 {
853   __builtin_ia32_storeupd256 (__P, (__v4df)__A);
854 }
855 
856 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
857 _mm256_loadu_ps (float const *__P)
858 {
859   return (__m256) __builtin_ia32_loadups256 (__P);
860 }
861 
862 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
863 _mm256_storeu_ps (float *__P, __m256 __A)
864 {
865   __builtin_ia32_storeups256 (__P, (__v8sf)__A);
866 }
867 
868 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
869 _mm256_load_si256 (__m256i const *__P)
870 {
871   return *__P;
872 }
873 
874 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
875 _mm256_store_si256 (__m256i *__P, __m256i __A)
876 {
877   *__P = __A;
878 }
879 
880 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
881 _mm256_loadu_si256 (__m256i const *__P)
882 {
883   return (__m256i) __builtin_ia32_loaddqu256 ((char const *)__P);
884 }
885 
886 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
887 _mm256_storeu_si256 (__m256i *__P, __m256i __A)
888 {
889   __builtin_ia32_storedqu256 ((char *)__P, (__v32qi)__A);
890 }
891 
892 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
893 _mm_maskload_pd (double const *__P, __m128i __M)
894 {
895   return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
896 					      (__v2di)__M);
897 }
898 
899 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
900 _mm_maskstore_pd (double *__P, __m128i __M, __m128d __A)
901 {
902   __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2di)__M, (__v2df)__A);
903 }
904 
905 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
906 _mm256_maskload_pd (double const *__P, __m256i __M)
907 {
908   return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
909 						 (__v4di)__M);
910 }
911 
912 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
913 _mm256_maskstore_pd (double *__P, __m256i __M, __m256d __A)
914 {
915   __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4di)__M, (__v4df)__A);
916 }
917 
918 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
919 _mm_maskload_ps (float const *__P, __m128i __M)
920 {
921   return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
922 					     (__v4si)__M);
923 }
924 
925 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
926 _mm_maskstore_ps (float *__P, __m128i __M, __m128 __A)
927 {
928   __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4si)__M, (__v4sf)__A);
929 }
930 
931 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
932 _mm256_maskload_ps (float const *__P, __m256i __M)
933 {
934   return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
935 						(__v8si)__M);
936 }
937 
938 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
939 _mm256_maskstore_ps (float *__P, __m256i __M, __m256 __A)
940 {
941   __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8si)__M, (__v8sf)__A);
942 }
943 
944 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
945 _mm256_movehdup_ps (__m256 __X)
946 {
947   return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
948 }
949 
950 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
951 _mm256_moveldup_ps (__m256 __X)
952 {
953   return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
954 }
955 
956 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
957 _mm256_movedup_pd (__m256d __X)
958 {
959   return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
960 }
961 
962 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
963 _mm256_lddqu_si256 (__m256i const *__P)
964 {
965   return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
966 }
967 
968 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
969 _mm256_stream_si256 (__m256i *__A, __m256i __B)
970 {
971   __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B);
972 }
973 
974 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
975 _mm256_stream_pd (double *__A, __m256d __B)
976 {
977   __builtin_ia32_movntpd256 (__A, (__v4df)__B);
978 }
979 
980 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
981 _mm256_stream_ps (float *__P, __m256 __A)
982 {
983   __builtin_ia32_movntps256 (__P, (__v8sf)__A);
984 }
985 
986 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
987 _mm256_rcp_ps (__m256 __A)
988 {
989   return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
990 }
991 
992 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
993 _mm256_rsqrt_ps (__m256 __A)
994 {
995   return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
996 }
997 
998 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
999 _mm256_sqrt_pd (__m256d __A)
1000 {
1001   return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
1002 }
1003 
1004 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1005 _mm256_sqrt_ps (__m256 __A)
1006 {
1007   return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
1008 }
1009 
1010 #ifdef __OPTIMIZE__
1011 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1012 _mm256_round_pd (__m256d __V, const int __M)
1013 {
1014   return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
1015 }
1016 
1017 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1018 _mm256_round_ps (__m256 __V, const int __M)
1019 {
1020   return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
1021 }
1022 #else
/* Macro form, used when __OPTIMIZE__ is not defined: expands directly to
   the builtin call so that M reaches __builtin_ia32_roundpd256 as written
   by the caller.  V is cast to __m256d and then __v4df, M to int.  */
#define _mm256_round_pd(V, M) \
  ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))
1025 
/* Macro form, used when __OPTIMIZE__ is not defined: expands directly to
   the builtin call so that M reaches __builtin_ia32_roundps256 as written
   by the caller.  V is cast to __m256 and then __v8sf, M to int.  */
#define _mm256_round_ps(V, M) \
  ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
1028 #endif
1029 
/* Convenience wrappers: each forwards V to the matching _mm256_round_pd
   or _mm256_round_ps with the rounding mode fixed to _MM_FROUND_CEIL or
   _MM_FROUND_FLOOR (both defined outside this section).  */
#define _mm256_ceil_pd(V)	_mm256_round_pd ((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V)	_mm256_round_pd ((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)	_mm256_round_ps ((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V)	_mm256_round_ps ((V), _MM_FROUND_FLOOR)
1034 
1035 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1036 _mm256_unpackhi_pd (__m256d __A, __m256d __B)
1037 {
1038   return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
1039 }
1040 
1041 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1042 _mm256_unpacklo_pd (__m256d __A, __m256d __B)
1043 {
1044   return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
1045 }
1046 
1047 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1048 _mm256_unpackhi_ps (__m256 __A, __m256 __B)
1049 {
1050   return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
1051 }
1052 
1053 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1054 _mm256_unpacklo_ps (__m256 __A, __m256 __B)
1055 {
1056   return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
1057 }
1058 
1059 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1060 _mm_testz_pd (__m128d __M, __m128d __V)
1061 {
1062   return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
1063 }
1064 
1065 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1066 _mm_testc_pd (__m128d __M, __m128d __V)
1067 {
1068   return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
1069 }
1070 
1071 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1072 _mm_testnzc_pd (__m128d __M, __m128d __V)
1073 {
1074   return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
1075 }
1076 
1077 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1078 _mm_testz_ps (__m128 __M, __m128 __V)
1079 {
1080   return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
1081 }
1082 
1083 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1084 _mm_testc_ps (__m128 __M, __m128 __V)
1085 {
1086   return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
1087 }
1088 
1089 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1090 _mm_testnzc_ps (__m128 __M, __m128 __V)
1091 {
1092   return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
1093 }
1094 
1095 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1096 _mm256_testz_pd (__m256d __M, __m256d __V)
1097 {
1098   return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
1099 }
1100 
1101 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1102 _mm256_testc_pd (__m256d __M, __m256d __V)
1103 {
1104   return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
1105 }
1106 
1107 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1108 _mm256_testnzc_pd (__m256d __M, __m256d __V)
1109 {
1110   return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
1111 }
1112 
1113 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1114 _mm256_testz_ps (__m256 __M, __m256 __V)
1115 {
1116   return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
1117 }
1118 
1119 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1120 _mm256_testc_ps (__m256 __M, __m256 __V)
1121 {
1122   return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
1123 }
1124 
1125 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1126 _mm256_testnzc_ps (__m256 __M, __m256 __V)
1127 {
1128   return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
1129 }
1130 
1131 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1132 _mm256_testz_si256 (__m256i __M, __m256i __V)
1133 {
1134   return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
1135 }
1136 
1137 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1138 _mm256_testc_si256 (__m256i __M, __m256i __V)
1139 {
1140   return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
1141 }
1142 
1143 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1144 _mm256_testnzc_si256 (__m256i __M, __m256i __V)
1145 {
1146   return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
1147 }
1148 
1149 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1150 _mm256_movemask_pd (__m256d __A)
1151 {
1152   return __builtin_ia32_movmskpd256 ((__v4df)__A);
1153 }
1154 
1155 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1156 _mm256_movemask_ps (__m256 __A)
1157 {
1158   return __builtin_ia32_movmskps256 ((__v8sf)__A);
1159 }
1160 
1161 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1162 _mm256_setzero_pd (void)
1163 {
1164   return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
1165 }
1166 
1167 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1168 _mm256_setzero_ps (void)
1169 {
1170   return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
1171 				 0.0, 0.0, 0.0, 0.0 };
1172 }
1173 
1174 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1175 _mm256_setzero_si256 (void)
1176 {
1177   return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
1178 }
1179 
1180 /* Create the vector [A B C D].  */
1181 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1182 _mm256_set_pd (double __A, double __B, double __C, double __D)
1183 {
1184   return __extension__ (__m256d){ __D, __C, __B, __A };
1185 }
1186 
1187 /* Create the vector [A B C D E F G H].  */
1188 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1189 _mm256_set_ps (float __A, float __B, float __C, float __D,
1190 	       float __E, float __F, float __G, float __H)
1191 {
1192   return __extension__ (__m256){ __H, __G, __F, __E,
1193 				 __D, __C, __B, __A };
1194 }
1195 
1196 /* Create the vector [A B C D E F G H].  */
1197 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1198 _mm256_set_epi32 (int __A, int __B, int __C, int __D,
1199 		  int __E, int __F, int __G, int __H)
1200 {
1201   return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
1202 					  __D, __C, __B, __A };
1203 }
1204 
1205 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1206 _mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
1207 		  short __q11, short __q10, short __q09, short __q08,
1208 		  short __q07, short __q06, short __q05, short __q04,
1209 		  short __q03, short __q02, short __q01, short __q00)
1210 {
1211   return __extension__ (__m256i)(__v16hi){
1212     __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
1213     __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
1214   };
1215 }
1216 
1217 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1218 _mm256_set_epi8  (char __q31, char __q30, char __q29, char __q28,
1219 		  char __q27, char __q26, char __q25, char __q24,
1220 		  char __q23, char __q22, char __q21, char __q20,
1221 		  char __q19, char __q18, char __q17, char __q16,
1222 		  char __q15, char __q14, char __q13, char __q12,
1223 		  char __q11, char __q10, char __q09, char __q08,
1224 		  char __q07, char __q06, char __q05, char __q04,
1225 		  char __q03, char __q02, char __q01, char __q00)
1226 {
1227   return __extension__ (__m256i)(__v32qi){
1228     __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
1229     __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
1230     __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
1231     __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
1232   };
1233 }
1234 
1235 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1236 _mm256_set_epi64x (long long __A, long long __B, long long __C,
1237 		   long long __D)
1238 {
1239   return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
1240 }
1241 
1242 /* Create a vector with all elements equal to A.  */
1243 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1244 _mm256_set1_pd (double __A)
1245 {
1246   return __extension__ (__m256d){ __A, __A, __A, __A };
1247 }
1248 
1249 /* Create a vector with all elements equal to A.  */
1250 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1251 _mm256_set1_ps (float __A)
1252 {
1253   return __extension__ (__m256){ __A, __A, __A, __A,
1254 				 __A, __A, __A, __A };
1255 }
1256 
1257 /* Create a vector with all elements equal to A.  */
1258 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1259 _mm256_set1_epi32 (int __A)
1260 {
1261   return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
1262 					  __A, __A, __A, __A };
1263 }
1264 
1265 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1266 _mm256_set1_epi16 (short __A)
1267 {
1268   return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
1269 			   __A, __A, __A, __A, __A, __A, __A, __A);
1270 }
1271 
1272 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1273 _mm256_set1_epi8 (char __A)
1274 {
1275   return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
1276 			  __A, __A, __A, __A, __A, __A, __A, __A,
1277 			  __A, __A, __A, __A, __A, __A, __A, __A,
1278 			  __A, __A, __A, __A, __A, __A, __A, __A);
1279 }
1280 
1281 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1282 _mm256_set1_epi64x (long long __A)
1283 {
1284   return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
1285 }
1286 
1287 /* Create vectors of elements in the reversed order from the
1288    _mm256_set_XXX functions.  */
1289 
1290 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1291 _mm256_setr_pd (double __A, double __B, double __C, double __D)
1292 {
1293   return _mm256_set_pd (__D, __C, __B, __A);
1294 }
1295 
1296 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1297 _mm256_setr_ps (float __A, float __B, float __C, float __D,
1298 		float __E, float __F, float __G, float __H)
1299 {
1300   return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
1301 }
1302 
1303 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1304 _mm256_setr_epi32 (int __A, int __B, int __C, int __D,
1305 		   int __E, int __F, int __G, int __H)
1306 {
1307   return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
1308 }
1309 
1310 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1311 _mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
1312 		   short __q11, short __q10, short __q09, short __q08,
1313 		   short __q07, short __q06, short __q05, short __q04,
1314 		   short __q03, short __q02, short __q01, short __q00)
1315 {
1316   return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
1317 			   __q04, __q05, __q06, __q07,
1318 			   __q08, __q09, __q10, __q11,
1319 			   __q12, __q13, __q14, __q15);
1320 }
1321 
1322 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1323 _mm256_setr_epi8  (char __q31, char __q30, char __q29, char __q28,
1324 		   char __q27, char __q26, char __q25, char __q24,
1325 		   char __q23, char __q22, char __q21, char __q20,
1326 		   char __q19, char __q18, char __q17, char __q16,
1327 		   char __q15, char __q14, char __q13, char __q12,
1328 		   char __q11, char __q10, char __q09, char __q08,
1329 		   char __q07, char __q06, char __q05, char __q04,
1330 		   char __q03, char __q02, char __q01, char __q00)
1331 {
1332   return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
1333 			  __q04, __q05, __q06, __q07,
1334 			  __q08, __q09, __q10, __q11,
1335 			  __q12, __q13, __q14, __q15,
1336 			  __q16, __q17, __q18, __q19,
1337 			  __q20, __q21, __q22, __q23,
1338 			  __q24, __q25, __q26, __q27,
1339 			  __q28, __q29, __q30, __q31);
1340 }
1341 
1342 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1343 _mm256_setr_epi64x (long long __A, long long __B, long long __C,
1344 		    long long __D)
1345 {
1346   return _mm256_set_epi64x (__D, __C, __B, __A);
1347 }
1348 
1349 /* Casts between various SP, DP, INT vector types.  Note that these do no
1350    conversion of values, they just change the type.  */
1351 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1352 _mm256_castpd_ps (__m256d __A)
1353 {
1354   return (__m256) __A;
1355 }
1356 
1357 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1358 _mm256_castpd_si256 (__m256d __A)
1359 {
1360   return (__m256i) __A;
1361 }
1362 
1363 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1364 _mm256_castps_pd (__m256 __A)
1365 {
1366   return (__m256d) __A;
1367 }
1368 
1369 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1370 _mm256_castps_si256(__m256 __A)
1371 {
1372   return (__m256i) __A;
1373 }
1374 
1375 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1376 _mm256_castsi256_ps (__m256i __A)
1377 {
1378   return (__m256) __A;
1379 }
1380 
1381 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1382 _mm256_castsi256_pd (__m256i __A)
1383 {
1384   return (__m256d) __A;
1385 }
1386 
1387 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1388 _mm256_castpd256_pd128 (__m256d __A)
1389 {
1390   return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
1391 }
1392 
1393 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1394 _mm256_castps256_ps128 (__m256 __A)
1395 {
1396   return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
1397 }
1398 
1399 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1400 _mm256_castsi256_si128 (__m256i __A)
1401 {
1402   return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
1403 }
1404 
/* When a cast is done from a 128-bit to a 256-bit type, the low 128 bits
   of the 256-bit result contain the source parameter value and the upper
   128 bits of the result are undefined.  These intrinsics shouldn't
   generate any extra moves.  */
1409 
1410 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1411 _mm256_castpd128_pd256 (__m128d __A)
1412 {
1413   return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
1414 }
1415 
1416 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1417 _mm256_castps128_ps256 (__m128 __A)
1418 {
1419   return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
1420 }
1421 
1422 extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1423 _mm256_castsi128_si256 (__m128i __A)
1424 {
1425   return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
1426 }
1427