/* Copyright (C) 2008-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 11.0.  */

#ifndef _IMMINTRIN_H_INCLUDED
# error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _AVXINTRIN_H_INCLUDED
#define _AVXINTRIN_H_INCLUDED

#ifndef __AVX__
#pragma GCC push_options
#pragma GCC target("avx")
#define __DISABLE_AVX__
#endif /* __AVX__ */

/* Internal data types for implementing the intrinsics.  */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));
typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m256 __attribute__ ((__vector_size__ (32),
				     __may_alias__));
typedef long long __m256i __attribute__ ((__vector_size__ (32),
					  __may_alias__));
typedef double __m256d __attribute__ ((__vector_size__ (32),
				       __may_alias__));

/* Unaligned versions of the same types.  */
typedef float __m256_u __attribute__ ((__vector_size__ (32),
				       __may_alias__,
				       __aligned__ (1)));
typedef long long __m256i_u __attribute__ ((__vector_size__ (32),
					    __may_alias__,
					    __aligned__ (1)));
typedef double __m256d_u __attribute__ ((__vector_size__ (32),
					 __may_alias__,
					 __aligned__ (1)));

/* Compare predicates for scalar and packed compare intrinsics.  */

/* Equal (ordered, non-signaling)  */
#define _CMP_EQ_OQ	0x00
/* Less-than (ordered, signaling)  */
#define _CMP_LT_OS	0x01
/* Less-than-or-equal (ordered, signaling)  */
#define _CMP_LE_OS	0x02
/* Unordered (non-signaling)  */
#define _CMP_UNORD_Q	0x03
/* Not-equal (unordered, non-signaling)  */
#define _CMP_NEQ_UQ	0x04
/* Not-less-than (unordered, signaling)  */
#define _CMP_NLT_US	0x05
/* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_NLE_US	0x06
/* Ordered (non-signaling)  */
#define _CMP_ORD_Q	0x07
/* Equal (unordered, non-signaling)  */
#define _CMP_EQ_UQ	0x08
/* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGE_US	0x09
/* Not-greater-than (unordered, signaling)  */
#define _CMP_NGT_US	0x0a
/* False (ordered, non-signaling)  */
#define _CMP_FALSE_OQ	0x0b
/* Not-equal (ordered, non-signaling)  */
#define _CMP_NEQ_OQ	0x0c
/* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GE_OS	0x0d
/* Greater-than (ordered, signaling)  */
#define _CMP_GT_OS	0x0e
/* True (unordered, non-signaling)  */
#define _CMP_TRUE_UQ	0x0f
/* Equal (ordered, signaling)  */
#define _CMP_EQ_OS	0x10
/* Less-than (ordered, non-signaling)  */
#define _CMP_LT_OQ	0x11
/* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_LE_OQ	0x12
/* Unordered (signaling)  */
#define _CMP_UNORD_S	0x13
/* Not-equal (unordered, signaling)  */
#define _CMP_NEQ_US	0x14
/* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLT_UQ	0x15
/* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_NLE_UQ	0x16
/* Ordered (signaling)  */
#define _CMP_ORD_S	0x17
/* Equal (unordered, signaling)  */
#define _CMP_EQ_US	0x18
/* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGE_UQ	0x19
/* Not-greater-than (unordered, non-signaling)  */
#define _CMP_NGT_UQ	0x1a
/* False (ordered, signaling)  */
#define _CMP_FALSE_OS	0x1b
/* Not-equal (ordered, signaling)  */
#define _CMP_NEQ_OS	0x1c
/* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GE_OQ	0x1d
/* Greater-than (ordered, non-signaling)  */
#define _CMP_GT_OQ	0x1e
/* True (unordered, signaling)  */
#define _CMP_TRUE_US	0x1f
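
/* Usage sketch (illustrative only, not part of the API): the predicate
   macros above supply the immediate argument of the VCMP intrinsics
   defined further below, e.g.

     __m256d lt_mask = _mm256_cmp_pd (a, b, _CMP_LT_OS);

   where a and b are caller-supplied __m256d values.  Each 64-bit lane
   of lt_mask is all-ones where a < b and all-zeros elsewhere.  */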

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A + (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A + (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
}


extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
}

/* Double/single precision floating point blend instructions - select
   data from 2 sources using constant/variable mask.  */

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
{
  return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X,
					      (__v4df)__Y,
					      __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X,
					     (__v8sf)__Y,
					     __M);
}
#else
#define _mm256_blend_pd(X, Y, M)					\
  ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X),		\
					(__v4df)(__m256d)(Y), (int)(M)))

#define _mm256_blend_ps(X, Y, M)					\
  ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X),		\
				       (__v8sf)(__m256)(Y), (int)(M)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
{
  return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X,
					       (__v4df)__Y,
					       (__v4df)__M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
{
  return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
					      (__v8sf)__Y,
					      (__v8sf)__M);
}
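
/* Illustrative example (a sketch, not part of the header proper): with
   a constant mask, bit i of the immediate selects lane i of Y, so
   _mm256_blend_pd (x, y, 0x5) yields { y[0], x[1], y[2], x[3] }.  The
   variable-mask forms consult the sign bit of each lane of the mask
   vector instead:

     __m256d m = _mm256_cmp_pd (a, b, _CMP_LT_OS);
     __m256d r = _mm256_blendv_pd (b, a, m);   // per-lane minimum of a and b

   Here a and b are caller-supplied __m256d values.  */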

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A / (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A / (__v8sf)__B);
}

/* Dot product instructions with mask-defined summing and zeroing parts
   of result.  */

#ifdef __OPTIMIZE__
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
					  (__v8sf)__Y,
					  __M);
}
#else
#define _mm256_dp_ps(X, Y, M)						\
  ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X),		\
				    (__v8sf)(__m256)(Y), (int)(M)))
#endif
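
/* Sketch of how the 8-bit mask of _mm256_dp_ps is interpreted, assuming
   caller-supplied vectors a and b: the operation acts independently in
   each 128-bit half.  The high four mask bits select which products
   a[i]*b[i] are summed; the low four bits select which result lanes
   receive the sum (the rest are zeroed).  For example, a four-element
   dot product in each half, broadcast to every lane of that half:

     __m256 dp = _mm256_dp_ps (a, b, 0xff);
*/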

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A * (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A * (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
{
  return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B,
					     __mask);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
{
  return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B,
					    __mask);
}
#else
#define _mm256_shuffle_pd(A, B, N)					\
  ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A),		\
				      (__v4df)(__m256d)(B), (int)(N)))

#define _mm256_shuffle_ps(A, B, N)					\
  ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A),		\
				      (__v8sf)(__m256)(B), (int)(N)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A - (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A - (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
{
  return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y,
					    __P);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
{
  return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y,
					   __P);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P);
}
#else
#define _mm_cmp_pd(X, Y, P)						\
  ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X),		\
				   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ps(X, Y, P)						\
  ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X),			\
				  (__v4sf)(__m128)(Y), (int)(P)))

#define _mm256_cmp_pd(X, Y, P)						\
  ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X),		\
				      (__v4df)(__m256d)(Y), (int)(P)))

#define _mm256_cmp_ps(X, Y, P)						\
  ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X),		\
				     (__v8sf)(__m256)(Y), (int)(P)))

#define _mm_cmp_sd(X, Y, P)						\
  ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X),		\
				   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ss(X, Y, P)						\
  ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X),			\
				  (__v4sf)(__m128)(Y), (int)(P)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_pd (__m128i __A)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_ps (__m256i __A)
{
  return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_ps (__m256d __A)
{
  return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_pd (__m128 __A)
{
  return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
}

extern __inline double
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtsd_f64 (__m256d __A)
{
  return __A[0];
}

extern __inline float
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtss_f32 (__m256 __A)
{
  return __A[0];
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_pd (__m256d __X, const int __N)
{
  return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_ps (__m256 __X, const int __N)
{
  return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_si256 (__m256i __X, const int __N)
{
  return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi32 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  return _mm_extract_epi32 (__Y, __N % 4);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi16 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  return _mm_extract_epi16 (__Y, __N % 8);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi8 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  return _mm_extract_epi8 (__Y, __N % 16);
}

#ifdef __x86_64__
extern __inline long long  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi64 (__m256i __X, const int __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  return _mm_extract_epi64 (__Y, __N % 2);
}
#endif
#else
#define _mm256_extractf128_pd(X, N)					\
  ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X),	\
						(int)(N)))

#define _mm256_extractf128_ps(X, N)					\
  ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X),	\
					       (int)(N)))

#define _mm256_extractf128_si256(X, N)					\
  ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X),	\
						(int)(N)))

#define _mm256_extract_epi32(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2);		\
      _mm_extract_epi32 (__Y, (N) % 4);					\
    }))

#define _mm256_extract_epi16(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3);		\
      _mm_extract_epi16 (__Y, (N) % 8);					\
    }))

#define _mm256_extract_epi8(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4);		\
      _mm_extract_epi8 (__Y, (N) % 16);					\
    }))

#ifdef __x86_64__
#define _mm256_extract_epi64(X, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1);		\
      _mm_extract_epi64 (__Y, (N) % 2);					\
    }))
#endif
#endif
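
/* Illustrative note: the element extractors above are composed from a
   128-bit lane extraction followed by the corresponding SSE4.1 extract,
   so the index must be a compile-time constant in the valid range, e.g.

     int e5 = _mm256_extract_epi32 (v, 5);   // element 5 of a caller's __m256i v
*/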

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroall (void)
{
  __builtin_ia32_vzeroall ();
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroupper (void)
{
  __builtin_ia32_vzeroupper ();
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_pd (__m128d __A, __m128i __C)
{
  return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
						(__v2di)__C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_pd (__m256d __A, __m256i __C)
{
  return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
						   (__v4di)__C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_ps (__m128 __A, __m128i __C)
{
  return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
					       (__v4si)__C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_ps (__m256 __A, __m256i __C)
{
  return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
						  (__v8si)__C);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_pd (__m128d __X, const int __C)
{
  return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_pd (__m256d __X, const int __C)
{
  return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_ps (__m128 __X, const int __C)
{
  return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_ps (__m256 __X, const int __C)
{
  return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C);
}
#else
#define _mm_permute_pd(X, C)						\
  ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))

#define _mm256_permute_pd(X, C)						\
  ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X),	(int)(C)))

#define _mm_permute_ps(X, C)						\
  ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))

#define _mm256_permute_ps(X, C)						\
  ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
{
  return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
						    (__v4df)__Y,
						    __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
{
  return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
						   (__v8sf)__Y,
						   __C);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
{
  return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
						    (__v8si)__Y,
						    __C);
}
#else
#define _mm256_permute2f128_pd(X, Y, C)					\
  ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X),	\
					      (__v4df)(__m256d)(Y),	\
					      (int)(C)))

#define _mm256_permute2f128_ps(X, Y, C)					\
  ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X),	\
					     (__v8sf)(__m256)(Y),	\
					     (int)(C)))

#define _mm256_permute2f128_si256(X, Y, C)				\
  ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X),	\
					      (__v8si)(__m256i)(Y),	\
					      (int)(C)))
#endif
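
/* Sketch of the VPERM2F128 control byte used above: bits 1:0 select the
   source of the low 128-bit half of the result (0/1 = low/high half of
   X, 2/3 = low/high half of Y), bits 5:4 do the same for the high half,
   and bits 3 and 7 zero the corresponding half instead.  For instance,
   swapping the two halves of a caller-supplied vector x:

     __m256d swapped = _mm256_permute2f128_pd (x, x, 0x01);
*/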

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcast_ss (float const *__X)
{
  return (__m128) __builtin_ia32_vbroadcastss (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_sd (double const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ss (float const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastss256 (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_pd (__m128d const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ps (__m128 const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
{
  return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X,
						     (__v2df)__Y,
						     __O);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
{
  return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X,
						    (__v4sf)__Y,
						    __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
{
  return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X,
						     (__v4si)__Y,
						     __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi32 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  __Y = _mm_insert_epi32 (__Y, __D, __N % 4);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi16 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi8 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
}

#ifdef __x86_64__
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi64 (__m256i __X, long long __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  __Y = _mm_insert_epi64 (__Y, __D, __N % 2);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
}
#endif
#else
#define _mm256_insertf128_pd(X, Y, O)					\
  ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X),	\
					       (__v2df)(__m128d)(Y),	\
					       (int)(O)))

#define _mm256_insertf128_ps(X, Y, O)					\
  ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X),	\
					      (__v4sf)(__m128)(Y),	\
					      (int)(O)))

#define _mm256_insertf128_si256(X, Y, O)				\
  ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X),	\
					       (__v4si)(__m128i)(Y),	\
					       (int)(O)))

#define _mm256_insert_epi32(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2);		\
      __Y = _mm_insert_epi32 (__Y, (D), (N) % 4);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 2);			\
    }))

#define _mm256_insert_epi16(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3);		\
      __Y = _mm_insert_epi16 (__Y, (D), (N) % 8);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 3);			\
    }))

#define _mm256_insert_epi8(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4);		\
      __Y = _mm_insert_epi8 (__Y, (D), (N) % 16);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 4);			\
    }))

#ifdef __x86_64__
#define _mm256_insert_epi64(X, D, N)					\
  (__extension__							\
   ({									\
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1);		\
      __Y = _mm_insert_epi64 (__Y, (D), (N) % 2);			\
      _mm256_insertf128_si256 ((X), __Y, (N) >> 1);			\
    }))
#endif
#endif
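
/* Illustrative example: like the extractors, the element inserters are
   built from a lane extract, an SSE4.1 insert and a lane re-insert, so
   the index must again be a constant, e.g.

     __m256i w = _mm256_insert_epi16 (v, 42, 9);   // replace element 9 of a caller's v
*/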

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_pd (double const *__P)
{
  return *(__m256d *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_pd (double *__P, __m256d __A)
{
  *(__m256d *)__P = __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_ps (float const *__P)
{
  return *(__m256 *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_ps (float *__P, __m256 __A)
{
  *(__m256 *)__P = __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_pd (double const *__P)
{
  return *(__m256d_u *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_pd (double *__P, __m256d __A)
{
  *(__m256d_u *)__P = __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_ps (float const *__P)
{
  return *(__m256_u *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_ps (float *__P, __m256 __A)
{
  *(__m256_u *)__P = __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_si256 (__m256i const *__P)
{
  return *__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_si256 (__m256i *__P, __m256i __A)
{
  *__P = __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_si256 (__m256i_u const *__P)
{
  return *__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_si256 (__m256i_u *__P, __m256i __A)
{
  *__P = __A;
}
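
/* Usage note (summary, no new behaviour): the _mm256_load_* and
   _mm256_store_* forms above require 32-byte-aligned pointers, while
   the *_loadu_* / *_storeu_* forms accept any alignment by going
   through the 1-byte-aligned __m256*_u types defined earlier.  A
   minimal sketch:

     float buf[8];                        // arbitrary alignment
     _mm256_storeu_ps (buf, _mm256_set1_ps (1.0f));
*/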

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_pd (double const *__P, __m128i __M)
{
  return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
					      (__v2di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_pd (double *__P, __m128i __M, __m128d __A)
{
  __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2di)__M, (__v2df)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_pd (double const *__P, __m256i __M)
{
  return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
						 (__v4di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_pd (double *__P, __m256i __M, __m256d __A)
{
  __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4di)__M, (__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_ps (float const *__P, __m128i __M)
{
  return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
					     (__v4si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_ps (float *__P, __m128i __M, __m128 __A)
{
  __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4si)__M, (__v4sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_ps (float const *__P, __m256i __M)
{
  return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
						(__v8si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_ps (float *__P, __m256i __M, __m256 __A)
{
  __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8si)__M, (__v8sf)__A);
}
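
/* Sketch of the mask convention (assuming caller-provided data): only
   the most significant bit of each mask element is consulted; lanes
   whose mask bit is clear read as zero on a maskload and are left
   untouched by a maskstore.  For example, loading just the first two
   floats of an array p:

     __m128i m   = _mm_set_epi32 (0, 0, -1, -1);
     __m128  lo2 = _mm_maskload_ps (p, m);
*/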

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movehdup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_moveldup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movedup_pd (__m256d __X)
{
  return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_lddqu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_si256 (__m256i *__A, __m256i __B)
{
  __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_pd (double *__A, __m256d __B)
{
  __builtin_ia32_movntpd256 (__A, (__v4df)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_ps (float *__P, __m256 __A)
{
  __builtin_ia32_movntps256 (__P, (__v8sf)__A);
}
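
/* Behavioural summary (informational): the _mm256_stream_* intrinsics
   perform non-temporal stores (VMOVNTDQ/VMOVNTPD/VMOVNTPS) that bypass
   the cache hierarchy; the destination must be 32-byte aligned, and a
   store fence such as _mm_sfence () is normally needed before another
   agent reads the data.  */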

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rcp_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rsqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_pd (__m256d __A)
{
  return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_pd (__m256d __V, const int __M)
{
  return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_ps (__m256 __V, const int __M)
{
  return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
}
#else
#define _mm256_round_pd(V, M) \
  ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))

#define _mm256_round_ps(V, M) \
  ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
#endif

#define _mm256_ceil_pd(V)	_mm256_round_pd ((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V)	_mm256_round_pd ((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)	_mm256_round_ps ((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V)	_mm256_round_ps ((V), _MM_FROUND_FLOOR)
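
/* Illustrative example: the rounding-mode macros come from
   <smmintrin.h> (available through <immintrin.h>); truncation toward
   zero of each element of a caller-supplied vector v can be written as

     __m256d t = _mm256_round_pd (v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
*/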

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_pd (__m256d __A)
{
  return __builtin_ia32_movmskpd256 ((__v4df)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_ps (__m256 __A)
{
  return __builtin_ia32_movmskps256 ((__v8sf)__A);
}

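/* The _mm256_undefined_* intrinsics below return a vector with
   unspecified contents; the self-initialisation is a deliberate idiom
   that generates no code while avoiding "used uninitialized"
   warnings.  */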
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_pd (void)
{
  __m256d __Y = __Y;
  return __Y;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_ps (void)
{
  __m256 __Y = __Y;
  return __Y;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_si256 (void)
{
  __m256i __Y = __Y;
  return __Y;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_pd (void)
{
  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_ps (void)
{
  return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
				 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_si256 (void)
{
  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
}

/* Create the vector [A B C D].  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_pd (double __A, double __B, double __C, double __D)
{
  return __extension__ (__m256d){ __D, __C, __B, __A };
}
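
/* Argument-order note with a tiny example: the _mm256_set_* functions
   take elements from most significant to least significant, so

     __m256d v = _mm256_set_pd (3.0, 2.0, 1.0, 0.0);

   stores 0.0 in element 0 (the lowest lane); the _mm256_setr_*
   variants further below take the same values in memory order.  */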

/* Create the vector [A B C D E F G H].  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_ps (float __A, float __B, float __C, float __D,
	       float __E, float __F, float __G, float __H)
{
  return __extension__ (__m256){ __H, __G, __F, __E,
				 __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi32 (int __A, int __B, int __C, int __D,
		  int __E, int __F, int __G, int __H)
{
  return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
					  __D, __C, __B, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
		  short __q11, short __q10, short __q09, short __q08,
		  short __q07, short __q06, short __q05, short __q04,
		  short __q03, short __q02, short __q01, short __q00)
{
  return __extension__ (__m256i)(__v16hi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi8  (char __q31, char __q30, char __q29, char __q28,
		  char __q27, char __q26, char __q25, char __q24,
		  char __q23, char __q22, char __q21, char __q20,
		  char __q19, char __q18, char __q17, char __q16,
		  char __q15, char __q14, char __q13, char __q12,
		  char __q11, char __q10, char __q09, char __q08,
		  char __q07, char __q06, char __q05, char __q04,
		  char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m256i)(__v32qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
    __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
    __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi64x (long long __A, long long __B, long long __C,
		   long long __D)
{
  return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_pd (double __A)
{
  return __extension__ (__m256d){ __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_ps (float __A)
{
  return __extension__ (__m256){ __A, __A, __A, __A,
				 __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi32 (int __A)
{
  return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
					  __A, __A, __A, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi16 (short __A)
{
  return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
			   __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi8 (char __A)
{
  return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A,
			  __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi64x (long long __A)
{
  return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
}

/* Create vectors of elements in the reversed order from the
   _mm256_set_XXX functions.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_pd (double __A, double __B, double __C, double __D)
{
  return _mm256_set_pd (__D, __C, __B, __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_ps (float __A, float __B, float __C, float __D,
		float __E, float __F, float __G, float __H)
{
  return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi32 (int __A, int __B, int __C, int __D,
		   int __E, int __F, int __G, int __H)
{
  return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
		   short __q11, short __q10, short __q09, short __q08,
		   short __q07, short __q06, short __q05, short __q04,
		   short __q03, short __q02, short __q01, short __q00)
{
  return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
			   __q04, __q05, __q06, __q07,
			   __q08, __q09, __q10, __q11,
			   __q12, __q13, __q14, __q15);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi8  (char __q31, char __q30, char __q29, char __q28,
		   char __q27, char __q26, char __q25, char __q24,
		   char __q23, char __q22, char __q21, char __q20,
		   char __q19, char __q18, char __q17, char __q16,
		   char __q15, char __q14, char __q13, char __q12,
		   char __q11, char __q10, char __q09, char __q08,
		   char __q07, char __q06, char __q05, char __q04,
		   char __q03, char __q02, char __q01, char __q00)
{
  return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
			  __q04, __q05, __q06, __q07,
			  __q08, __q09, __q10, __q11,
			  __q12, __q13, __q14, __q15,
			  __q16, __q17, __q18, __q19,
			  __q20, __q21, __q22, __q23,
			  __q24, __q25, __q26, __q27,
			  __q28, __q29, __q30, __q31);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi64x (long long __A, long long __B, long long __C,
		    long long __D)
{
  return _mm256_set_epi64x (__D, __C, __B, __A);
}

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_ps (__m256d __A)
{
  return (__m256) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_si256 (__m256d __A)
{
  return (__m256i) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_pd (__m256 __A)
{
  return (__m256d) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_si256 (__m256 __A)
{
  return (__m256i) __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_ps (__m256i __A)
{
  return (__m256) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_pd (__m256i __A)
{
  return (__m256d) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd256_pd128 (__m256d __A)
{
  return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps256_ps128 (__m256 __A)
{
  return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_si128 (__m256i __A)
{
  return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
}

/* When a cast is done from a 128 to a 256-bit type, the low 128 bits of
   the 256-bit result contain the source parameter value and the upper
   128 bits of the result are undefined.  Those intrinsics shouldn't
   generate any extra moves.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd128_pd256 (__m128d __A)
{
  return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps128_ps256 (__m128 __A)
{
  return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi128_si256 (__m128i __A)
{
  return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
}
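
/* Illustrative pattern: because the upper half is undefined after a
   128-to-256-bit cast, code that needs a defined upper half should
   insert it explicitly, which is exactly what the _mm256_set_m128*
   helpers below do, e.g.

     __m256 both = _mm256_insertf128_ps (_mm256_castps128_ps256 (lo), hi, 1);

   where lo and hi are caller-supplied __m128 values.  */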

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_m128 (__m128 __H, __m128 __L)
{
  return _mm256_insertf128_ps (_mm256_castps128_ps256 (__L), __H, 1);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_m128d (__m128d __H, __m128d __L)
{
  return _mm256_insertf128_pd (_mm256_castpd128_pd256 (__L), __H, 1);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_m128i (__m128i __H, __m128i __L)
{
  return _mm256_insertf128_si256 (_mm256_castsi128_si256 (__L), __H, 1);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_m128 (__m128 __L, __m128 __H)
{
  return _mm256_set_m128 (__H, __L);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_m128d (__m128d __L, __m128d __H)
{
  return _mm256_set_m128d (__H, __L);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_m128i (__m128i __L, __m128i __H)
{
  return _mm256_set_m128i (__H, __L);
}

#ifdef __DISABLE_AVX__
#undef __DISABLE_AVX__
#pragma GCC pop_options
#endif /* __DISABLE_AVX__ */

#endif /* _AVXINTRIN_H_INCLUDED */