/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
23f4a2713aSLionel Sambuc 
24f4a2713aSLionel Sambuc #ifndef __IMMINTRIN_H
25f4a2713aSLionel Sambuc #error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
26f4a2713aSLionel Sambuc #endif
27f4a2713aSLionel Sambuc 
28f4a2713aSLionel Sambuc #ifndef __AVXINTRIN_H
29f4a2713aSLionel Sambuc #define __AVXINTRIN_H
30f4a2713aSLionel Sambuc 
/* Internal 32-byte vector views, one per element type.  They are used as cast
 * targets when calling builtins or touching individual lanes. */
typedef double    __v4df  __attribute__((__vector_size__(32)));
typedef float     __v8sf  __attribute__((__vector_size__(32)));
typedef long long __v4di  __attribute__((__vector_size__(32)));
typedef int       __v8si  __attribute__((__vector_size__(32)));
typedef short     __v16hi __attribute__((__vector_size__(32)));
typedef char      __v32qi __attribute__((__vector_size__(32)));

/* Public 32-byte vector types: eight floats, four doubles, and an integer
 * vector declared as four long longs. */
typedef float     __m256  __attribute__((__vector_size__(32)));
typedef double    __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));
41f4a2713aSLionel Sambuc 
/* Arithmetic */
43f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_add_pd(__m256d __a,__m256d __b)44f4a2713aSLionel Sambuc _mm256_add_pd(__m256d __a, __m256d __b)
45f4a2713aSLionel Sambuc {
46f4a2713aSLionel Sambuc   return __a+__b;
47f4a2713aSLionel Sambuc }
48f4a2713aSLionel Sambuc 
49f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_add_ps(__m256 __a,__m256 __b)50f4a2713aSLionel Sambuc _mm256_add_ps(__m256 __a, __m256 __b)
51f4a2713aSLionel Sambuc {
52f4a2713aSLionel Sambuc   return __a+__b;
53f4a2713aSLionel Sambuc }
54f4a2713aSLionel Sambuc 
55f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_sub_pd(__m256d __a,__m256d __b)56f4a2713aSLionel Sambuc _mm256_sub_pd(__m256d __a, __m256d __b)
57f4a2713aSLionel Sambuc {
58f4a2713aSLionel Sambuc   return __a-__b;
59f4a2713aSLionel Sambuc }
60f4a2713aSLionel Sambuc 
61f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_sub_ps(__m256 __a,__m256 __b)62f4a2713aSLionel Sambuc _mm256_sub_ps(__m256 __a, __m256 __b)
63f4a2713aSLionel Sambuc {
64f4a2713aSLionel Sambuc   return __a-__b;
65f4a2713aSLionel Sambuc }
66f4a2713aSLionel Sambuc 
67f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_addsub_pd(__m256d __a,__m256d __b)68f4a2713aSLionel Sambuc _mm256_addsub_pd(__m256d __a, __m256d __b)
69f4a2713aSLionel Sambuc {
70f4a2713aSLionel Sambuc   return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
71f4a2713aSLionel Sambuc }
72f4a2713aSLionel Sambuc 
73f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_addsub_ps(__m256 __a,__m256 __b)74f4a2713aSLionel Sambuc _mm256_addsub_ps(__m256 __a, __m256 __b)
75f4a2713aSLionel Sambuc {
76f4a2713aSLionel Sambuc   return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
77f4a2713aSLionel Sambuc }
78f4a2713aSLionel Sambuc 
79f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_div_pd(__m256d __a,__m256d __b)80f4a2713aSLionel Sambuc _mm256_div_pd(__m256d __a, __m256d __b)
81f4a2713aSLionel Sambuc {
82f4a2713aSLionel Sambuc   return __a / __b;
83f4a2713aSLionel Sambuc }
84f4a2713aSLionel Sambuc 
85f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_div_ps(__m256 __a,__m256 __b)86f4a2713aSLionel Sambuc _mm256_div_ps(__m256 __a, __m256 __b)
87f4a2713aSLionel Sambuc {
88f4a2713aSLionel Sambuc   return __a / __b;
89f4a2713aSLionel Sambuc }
90f4a2713aSLionel Sambuc 
91f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_max_pd(__m256d __a,__m256d __b)92f4a2713aSLionel Sambuc _mm256_max_pd(__m256d __a, __m256d __b)
93f4a2713aSLionel Sambuc {
94f4a2713aSLionel Sambuc   return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
95f4a2713aSLionel Sambuc }
96f4a2713aSLionel Sambuc 
97f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_max_ps(__m256 __a,__m256 __b)98f4a2713aSLionel Sambuc _mm256_max_ps(__m256 __a, __m256 __b)
99f4a2713aSLionel Sambuc {
100f4a2713aSLionel Sambuc   return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
101f4a2713aSLionel Sambuc }
102f4a2713aSLionel Sambuc 
103f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_min_pd(__m256d __a,__m256d __b)104f4a2713aSLionel Sambuc _mm256_min_pd(__m256d __a, __m256d __b)
105f4a2713aSLionel Sambuc {
106f4a2713aSLionel Sambuc   return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
107f4a2713aSLionel Sambuc }
108f4a2713aSLionel Sambuc 
109f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_min_ps(__m256 __a,__m256 __b)110f4a2713aSLionel Sambuc _mm256_min_ps(__m256 __a, __m256 __b)
111f4a2713aSLionel Sambuc {
112f4a2713aSLionel Sambuc   return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
113f4a2713aSLionel Sambuc }
114f4a2713aSLionel Sambuc 
115f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_mul_pd(__m256d __a,__m256d __b)116f4a2713aSLionel Sambuc _mm256_mul_pd(__m256d __a, __m256d __b)
117f4a2713aSLionel Sambuc {
118f4a2713aSLionel Sambuc   return __a * __b;
119f4a2713aSLionel Sambuc }
120f4a2713aSLionel Sambuc 
121f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_mul_ps(__m256 __a,__m256 __b)122f4a2713aSLionel Sambuc _mm256_mul_ps(__m256 __a, __m256 __b)
123f4a2713aSLionel Sambuc {
124f4a2713aSLionel Sambuc   return __a * __b;
125f4a2713aSLionel Sambuc }
126f4a2713aSLionel Sambuc 
127f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_sqrt_pd(__m256d __a)128f4a2713aSLionel Sambuc _mm256_sqrt_pd(__m256d __a)
129f4a2713aSLionel Sambuc {
130f4a2713aSLionel Sambuc   return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
131f4a2713aSLionel Sambuc }
132f4a2713aSLionel Sambuc 
133f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_sqrt_ps(__m256 __a)134f4a2713aSLionel Sambuc _mm256_sqrt_ps(__m256 __a)
135f4a2713aSLionel Sambuc {
136f4a2713aSLionel Sambuc   return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
137f4a2713aSLionel Sambuc }
138f4a2713aSLionel Sambuc 
139f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_rsqrt_ps(__m256 __a)140f4a2713aSLionel Sambuc _mm256_rsqrt_ps(__m256 __a)
141f4a2713aSLionel Sambuc {
142f4a2713aSLionel Sambuc   return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
143f4a2713aSLionel Sambuc }
144f4a2713aSLionel Sambuc 
145f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_rcp_ps(__m256 __a)146f4a2713aSLionel Sambuc _mm256_rcp_ps(__m256 __a)
147f4a2713aSLionel Sambuc {
148f4a2713aSLionel Sambuc   return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
149f4a2713aSLionel Sambuc }
150f4a2713aSLionel Sambuc 
/* Calls __builtin_ia32_roundpd256 on V (converted to __m256d, viewed as
 * __v4df) with M forwarded unchanged as the builtin's second operand.
 * V is cast in place rather than copied into a block-scoped temporary, so an
 * identifier used in the argument can never be shadowed by the expansion. */
#define _mm256_round_pd(V, M) \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
154f4a2713aSLionel Sambuc 
/* Calls __builtin_ia32_roundps256 on V (converted to __m256, viewed as
 * __v8sf) with M forwarded unchanged as the builtin's second operand.
 * V is cast in place rather than copied into a block-scoped temporary, so an
 * identifier used in the argument can never be shadowed by the expansion. */
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
158f4a2713aSLionel Sambuc 
/* Convenience forms of _mm256_round_pd / _mm256_round_ps with the mode fixed
 * to _MM_FROUND_CEIL or _MM_FROUND_FLOOR (both defined outside this header). */
#define _mm256_ceil_pd(V) \
  _mm256_round_pd((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) \
  _mm256_round_pd((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V) \
  _mm256_round_ps((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) \
  _mm256_round_ps((V), _MM_FROUND_FLOOR)
163f4a2713aSLionel Sambuc 
/* Logical */
165f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_and_pd(__m256d __a,__m256d __b)166f4a2713aSLionel Sambuc _mm256_and_pd(__m256d __a, __m256d __b)
167f4a2713aSLionel Sambuc {
168f4a2713aSLionel Sambuc   return (__m256d)((__v4di)__a & (__v4di)__b);
169f4a2713aSLionel Sambuc }
170f4a2713aSLionel Sambuc 
171f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_and_ps(__m256 __a,__m256 __b)172f4a2713aSLionel Sambuc _mm256_and_ps(__m256 __a, __m256 __b)
173f4a2713aSLionel Sambuc {
174f4a2713aSLionel Sambuc   return (__m256)((__v8si)__a & (__v8si)__b);
175f4a2713aSLionel Sambuc }
176f4a2713aSLionel Sambuc 
177f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_andnot_pd(__m256d __a,__m256d __b)178f4a2713aSLionel Sambuc _mm256_andnot_pd(__m256d __a, __m256d __b)
179f4a2713aSLionel Sambuc {
180f4a2713aSLionel Sambuc   return (__m256d)(~(__v4di)__a & (__v4di)__b);
181f4a2713aSLionel Sambuc }
182f4a2713aSLionel Sambuc 
183f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_andnot_ps(__m256 __a,__m256 __b)184f4a2713aSLionel Sambuc _mm256_andnot_ps(__m256 __a, __m256 __b)
185f4a2713aSLionel Sambuc {
186f4a2713aSLionel Sambuc   return (__m256)(~(__v8si)__a & (__v8si)__b);
187f4a2713aSLionel Sambuc }
188f4a2713aSLionel Sambuc 
189f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_or_pd(__m256d __a,__m256d __b)190f4a2713aSLionel Sambuc _mm256_or_pd(__m256d __a, __m256d __b)
191f4a2713aSLionel Sambuc {
192f4a2713aSLionel Sambuc   return (__m256d)((__v4di)__a | (__v4di)__b);
193f4a2713aSLionel Sambuc }
194f4a2713aSLionel Sambuc 
195f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_or_ps(__m256 __a,__m256 __b)196f4a2713aSLionel Sambuc _mm256_or_ps(__m256 __a, __m256 __b)
197f4a2713aSLionel Sambuc {
198f4a2713aSLionel Sambuc   return (__m256)((__v8si)__a | (__v8si)__b);
199f4a2713aSLionel Sambuc }
200f4a2713aSLionel Sambuc 
201f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_xor_pd(__m256d __a,__m256d __b)202f4a2713aSLionel Sambuc _mm256_xor_pd(__m256d __a, __m256d __b)
203f4a2713aSLionel Sambuc {
204f4a2713aSLionel Sambuc   return (__m256d)((__v4di)__a ^ (__v4di)__b);
205f4a2713aSLionel Sambuc }
206f4a2713aSLionel Sambuc 
207f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_xor_ps(__m256 __a,__m256 __b)208f4a2713aSLionel Sambuc _mm256_xor_ps(__m256 __a, __m256 __b)
209f4a2713aSLionel Sambuc {
210f4a2713aSLionel Sambuc   return (__m256)((__v8si)__a ^ (__v8si)__b);
211f4a2713aSLionel Sambuc }
212f4a2713aSLionel Sambuc 
/* Horizontal arithmetic */
214f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_hadd_pd(__m256d __a,__m256d __b)215f4a2713aSLionel Sambuc _mm256_hadd_pd(__m256d __a, __m256d __b)
216f4a2713aSLionel Sambuc {
217f4a2713aSLionel Sambuc   return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
218f4a2713aSLionel Sambuc }
219f4a2713aSLionel Sambuc 
220f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_hadd_ps(__m256 __a,__m256 __b)221f4a2713aSLionel Sambuc _mm256_hadd_ps(__m256 __a, __m256 __b)
222f4a2713aSLionel Sambuc {
223f4a2713aSLionel Sambuc   return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
224f4a2713aSLionel Sambuc }
225f4a2713aSLionel Sambuc 
226f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_hsub_pd(__m256d __a,__m256d __b)227f4a2713aSLionel Sambuc _mm256_hsub_pd(__m256d __a, __m256d __b)
228f4a2713aSLionel Sambuc {
229f4a2713aSLionel Sambuc   return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
230f4a2713aSLionel Sambuc }
231f4a2713aSLionel Sambuc 
232f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_hsub_ps(__m256 __a,__m256 __b)233f4a2713aSLionel Sambuc _mm256_hsub_ps(__m256 __a, __m256 __b)
234f4a2713aSLionel Sambuc {
235f4a2713aSLionel Sambuc   return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
236f4a2713aSLionel Sambuc }
237f4a2713aSLionel Sambuc 
/* Vector permutations */
239f4a2713aSLionel Sambuc static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_pd(__m128d __a,__m128i __c)240f4a2713aSLionel Sambuc _mm_permutevar_pd(__m128d __a, __m128i __c)
241f4a2713aSLionel Sambuc {
242f4a2713aSLionel Sambuc   return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
243f4a2713aSLionel Sambuc }
244f4a2713aSLionel Sambuc 
245f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_pd(__m256d __a,__m256i __c)246f4a2713aSLionel Sambuc _mm256_permutevar_pd(__m256d __a, __m256i __c)
247f4a2713aSLionel Sambuc {
248f4a2713aSLionel Sambuc   return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
249f4a2713aSLionel Sambuc }
250f4a2713aSLionel Sambuc 
251f4a2713aSLionel Sambuc static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_ps(__m128 __a,__m128i __c)252f4a2713aSLionel Sambuc _mm_permutevar_ps(__m128 __a, __m128i __c)
253f4a2713aSLionel Sambuc {
254f4a2713aSLionel Sambuc   return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
255f4a2713aSLionel Sambuc }
256f4a2713aSLionel Sambuc 
257f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_ps(__m256 __a,__m256i __c)258f4a2713aSLionel Sambuc _mm256_permutevar_ps(__m256 __a, __m256i __c)
259f4a2713aSLionel Sambuc {
260f4a2713aSLionel Sambuc   return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a,
261f4a2713aSLionel Sambuc 						  (__v8si)__c);
262f4a2713aSLionel Sambuc }
263f4a2713aSLionel Sambuc 
/* Permutes the two doubles of A under the constant C: result lane 0 is
 * A[bit 0 of C], result lane 1 is A[bit 1 of C].  Both shuffle indices are
 * below 2, so the zero vector given as second shuffle operand is never
 * selected.
 * A is cast in place rather than copied into a block-scoped temporary, so an
 * identifier used in the argument can never be shadowed by the expansion. */
#define _mm_permute_pd(A, C) \
  ((__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \
                                    (__v2df)_mm_setzero_pd(), \
                                    (C) & 0x1, ((C) & 0x2) >> 1))
268f4a2713aSLionel Sambuc 
/* Permutes the four doubles of A under the constant C, independently within
 * each pair of lanes: bits 0 and 1 of C choose among A[0..1] for result lanes
 * 0 and 1; bits 2 and 3 choose among A[2..3] for result lanes 2 and 3.  All
 * indices are below 4, so the zero vector operand is never selected.
 * A is cast in place rather than copied into a block-scoped temporary, so an
 * identifier used in the argument can never be shadowed by the expansion. */
#define _mm256_permute_pd(A, C) \
  ((__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
                                    (__v4df)_mm256_setzero_pd(), \
                                    (C) & 0x1, ((C) & 0x2) >> 1, \
                                    2 + (((C) & 0x4) >> 2), \
                                    2 + (((C) & 0x8) >> 3)))
275f4a2713aSLionel Sambuc 
/* Permutes the four floats of A under the constant C: result lane i is
 * A[(C >> 2*i) & 3].  All indices are below 4, so the zero vector given as
 * second shuffle operand is never selected.
 * A is cast in place rather than copied into a block-scoped temporary, so an
 * identifier used in the argument can never be shadowed by the expansion. */
#define _mm_permute_ps(A, C) \
  ((__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \
                                   (__v4sf)_mm_setzero_ps(), \
                                   (C) & 0x3, ((C) & 0xc) >> 2, \
                                   ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6))
281f4a2713aSLionel Sambuc 
/* Permutes the eight floats of A under the constant C.  The same four 2-bit
 * selectors of C are applied to each group of four lanes: result lanes 0-3
 * pick from A[0..3], and result lanes 4-7 pick from A[4..7] (hence "4 +").
 * All indices are below 8, so the zero vector operand is never selected.
 * A is cast in place rather than copied into a block-scoped temporary, so an
 * identifier used in the argument can never be shadowed by the expansion. */
#define _mm256_permute_ps(A, C) \
  ((__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
                                   (__v8sf)_mm256_setzero_ps(), \
                                   (C) & 0x3, ((C) & 0xc) >> 2, \
                                   ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6, \
                                   4 + (((C) & 0x03) >> 0), \
                                   4 + (((C) & 0x0c) >> 2), \
                                   4 + (((C) & 0x30) >> 4), \
                                   4 + (((C) & 0xc0) >> 6)))
291f4a2713aSLionel Sambuc 
/* Calls __builtin_ia32_vperm2f128_pd256 on V1 and V2 (viewed as __v4df) with
 * M forwarded unchanged as the control operand.
 * The vectors are cast in place rather than copied into block-scoped
 * temporaries, so identifiers in the arguments are never shadowed. */
#define _mm256_permute2f128_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
                                            (__v4df)(__m256d)(V2), (M)))
296f4a2713aSLionel Sambuc 
/* Calls __builtin_ia32_vperm2f128_ps256 on V1 and V2 (viewed as __v8sf) with
 * M forwarded unchanged as the control operand.
 * The vectors are cast in place rather than copied into block-scoped
 * temporaries, so identifiers in the arguments are never shadowed. */
#define _mm256_permute2f128_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
                                           (__v8sf)(__m256)(V2), (M)))
301f4a2713aSLionel Sambuc 
/* Calls __builtin_ia32_vperm2f128_si256 on V1 and V2 (viewed as __v8si) with
 * M forwarded unchanged as the control operand.
 * The vectors are cast in place rather than copied into block-scoped
 * temporaries, so identifiers in the arguments are never shadowed. */
#define _mm256_permute2f128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
                                            (__v8si)(__m256i)(V2), (M)))
306f4a2713aSLionel Sambuc 
/* Vector Blend */
/* Blends two vectors of four doubles under the constant mask M: result lane i
 * is V2[i] when bit i of M is set (shuffle index 4 + i) and V1[i] otherwise.
 * The vectors are cast in place rather than copied into block-scoped
 * temporaries, so identifiers in the arguments are never shadowed. */
#define _mm256_blend_pd(V1, V2, M) \
  ((__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \
                                    (__v4df)(__m256d)(V2), \
                                    (((M) & 0x01) ? 4 : 0), \
                                    (((M) & 0x02) ? 5 : 1), \
                                    (((M) & 0x04) ? 6 : 2), \
                                    (((M) & 0x08) ? 7 : 3)))
316f4a2713aSLionel Sambuc 
/* Blends two vectors of eight floats under the constant mask M: result lane i
 * is V2[i] when bit i of M is set (shuffle index 8 + i) and V1[i] otherwise.
 * The vectors are cast in place rather than copied into block-scoped
 * temporaries, so identifiers in the arguments are never shadowed. */
#define _mm256_blend_ps(V1, V2, M) \
  ((__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \
                                   (__v8sf)(__m256)(V2), \
                                   (((M) & 0x01) ?  8 : 0), \
                                   (((M) & 0x02) ?  9 : 1), \
                                   (((M) & 0x04) ? 10 : 2), \
                                   (((M) & 0x08) ? 11 : 3), \
                                   (((M) & 0x10) ? 12 : 4), \
                                   (((M) & 0x20) ? 13 : 5), \
                                   (((M) & 0x40) ? 14 : 6), \
                                   (((M) & 0x80) ? 15 : 7)))
329f4a2713aSLionel Sambuc 
330f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_blendv_pd(__m256d __a,__m256d __b,__m256d __c)331f4a2713aSLionel Sambuc _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
332f4a2713aSLionel Sambuc {
333f4a2713aSLionel Sambuc   return (__m256d)__builtin_ia32_blendvpd256(
334f4a2713aSLionel Sambuc     (__v4df)__a, (__v4df)__b, (__v4df)__c);
335f4a2713aSLionel Sambuc }
336f4a2713aSLionel Sambuc 
337f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_blendv_ps(__m256 __a,__m256 __b,__m256 __c)338f4a2713aSLionel Sambuc _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
339f4a2713aSLionel Sambuc {
340f4a2713aSLionel Sambuc   return (__m256)__builtin_ia32_blendvps256(
341f4a2713aSLionel Sambuc     (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
342f4a2713aSLionel Sambuc }
343f4a2713aSLionel Sambuc 
/* Vector Dot Product */
/* Calls __builtin_ia32_dpps256 on V1 and V2 (viewed as __v8sf) with M
 * forwarded unchanged as the builtin's third operand.
 * The vectors are cast in place rather than copied into block-scoped
 * temporaries, so identifiers in the arguments are never shadowed. */
#define _mm256_dp_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
                                  (__v8sf)(__m256)(V2), (M)))
349f4a2713aSLionel Sambuc 
/* Vector shuffle */
/* Shuffles floats from a and b under the constant mask, using the same four
 * 2-bit selectors for each group of four lanes.  Indices 0-7 address a and
 * 8-15 address b, so:
 *   result lanes 0,1 come from a[0..3],  lanes 2,3 from b[0..3] (+8),
 *   result lanes 4,5 come from a[4..7] (+4), lanes 6,7 from b[4..7] (+12).
 * The operands are cast in place instead of being copied into temporaries
 * named __a/__b: those are the parameter names of the inline functions in this
 * header, and an argument spelled __a would otherwise expand to the
 * self-initialisation "__m256 __a = (__a);". */
#define _mm256_shuffle_ps(a, b, mask) \
  ((__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \
                                   (__v8sf)(__m256)(b), \
        (mask) & 0x3,                ((mask) & 0xc) >> 2, \
        (((mask) & 0x30) >> 4) + 8,  (((mask) & 0xc0) >> 6) + 8, \
        ((mask) & 0x3) + 4,          (((mask) & 0xc) >> 2) + 4, \
        (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12))
359f4a2713aSLionel Sambuc 
/* Shuffles doubles from a and b under the constant mask.  Indices 0-3 address
 * a and 4-7 address b, so:
 *   lane 0 = a[bit 0],      lane 1 = b[bit 1]      (+4),
 *   lane 2 = a[2 + bit 2],  lane 3 = b[2 + bit 3]  (+6).
 * The operands are cast in place instead of being copied into temporaries
 * named __a/__b: those are the parameter names of the inline functions in this
 * header, and an argument spelled __a would otherwise expand to the
 * self-initialisation "__m256d __a = (__a);". */
#define _mm256_shuffle_pd(a, b, mask) \
  ((__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \
                                    (__v4df)(__m256d)(b), \
        (mask) & 0x1, \
        (((mask) & 0x2) >> 1) + 4, \
        (((mask) & 0x4) >> 2) + 2, \
        (((mask) & 0x8) >> 3) + 6))
368f4a2713aSLionel Sambuc 
/* Compare */
/* Comparison predicate encodings, 0x00 through 0x1f.  Each comment gives the
 * relation, whether it is ordered or unordered, and whether it is signaling
 * or non-signaling. */

/* 0x00 - 0x07 */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling) */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling) */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling) */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling) */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling) */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling) */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling) */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling) */

/* 0x08 - 0x0f */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling) */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling) */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling) */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling) */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling) */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling) */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling) */

/* 0x10 - 0x17 */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling) */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling) */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling) */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling) */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling) */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling) */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
#define _CMP_ORD_S    0x17 /* Ordered (signaling) */

/* 0x18 - 0x1f */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling) */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling) */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling) */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling) */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling) */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling) */
402f4a2713aSLionel Sambuc 
/* Calls __builtin_ia32_cmppd on a and b (viewed as __v2df) with c forwarded
 * unchanged as the predicate operand (presumably one of the _CMP_* values).
 * The operands are cast in place instead of being copied into temporaries
 * named __a/__b, so an argument that is itself spelled __a or __b is not
 * shadowed by the expansion. */
#define _mm_cmp_pd(a, b, c) \
  ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
                                 (__v2df)(__m128d)(b), (c)))
407f4a2713aSLionel Sambuc 
/* Calls __builtin_ia32_cmpps on a and b (viewed as __v4sf) with c forwarded
 * unchanged as the predicate operand (presumably one of the _CMP_* values).
 * The operands are cast in place instead of being copied into temporaries
 * named __a/__b, so an argument that is itself spelled __a or __b is not
 * shadowed by the expansion. */
#define _mm_cmp_ps(a, b, c) \
  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
                                (__v4sf)(__m128)(b), (c)))
412f4a2713aSLionel Sambuc 
/* Calls __builtin_ia32_cmppd256 on a and b (viewed as __v4df) with c forwarded
 * unchanged as the predicate operand (presumably one of the _CMP_* values).
 * The operands are cast in place instead of being copied into temporaries
 * named __a/__b, so an argument that is itself spelled __a or __b is not
 * shadowed by the expansion. */
#define _mm256_cmp_pd(a, b, c) \
  ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                    (__v4df)(__m256d)(b), (c)))
417f4a2713aSLionel Sambuc 
/* Calls __builtin_ia32_cmpps256 on a and b (viewed as __v8sf) with c forwarded
 * unchanged as the predicate operand (presumably one of the _CMP_* values).
 * The operands are cast in place instead of being copied into temporaries
 * named __a/__b, so an argument that is itself spelled __a or __b is not
 * shadowed by the expansion. */
#define _mm256_cmp_ps(a, b, c) \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                   (__v8sf)(__m256)(b), (c)))
422f4a2713aSLionel Sambuc 
/* Calls __builtin_ia32_cmpsd on a and b (viewed as __v2df) with c forwarded
 * unchanged as the predicate operand (presumably one of the _CMP_* values).
 * The operands are cast in place instead of being copied into temporaries
 * named __a/__b, so an argument that is itself spelled __a or __b is not
 * shadowed by the expansion. */
#define _mm_cmp_sd(a, b, c) \
  ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
                                 (__v2df)(__m128d)(b), (c)))
427f4a2713aSLionel Sambuc 
/* Calls __builtin_ia32_cmpss on a and b (viewed as __v4sf) with c forwarded
 * unchanged as the predicate operand (presumably one of the _CMP_* values).
 * The operands are cast in place instead of being copied into temporaries
 * named __a/__b, so an argument that is itself spelled __a or __b is not
 * shadowed by the expansion. */
#define _mm_cmp_ss(a, b, c) \
  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
                                (__v4sf)(__m128)(b), (c)))
432f4a2713aSLionel Sambuc 
/* Vector extract */
/* Calls __builtin_ia32_vextractf128_pd256 on A (viewed as __v4df) with O
 * forwarded unchanged, yielding a __m128d.
 * A is cast in place rather than copied into a block-scoped temporary, so an
 * identifier used in the argument can never be shadowed by the expansion. */
#define _mm256_extractf128_pd(A, O) \
  ((__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(A), (O)))
437f4a2713aSLionel Sambuc 
/* Calls __builtin_ia32_vextractf128_ps256 on A (viewed as __v8sf) with O
 * forwarded unchanged, yielding a __m128.
 * A is cast in place rather than copied into a block-scoped temporary, so an
 * identifier used in the argument can never be shadowed by the expansion. */
#define _mm256_extractf128_ps(A, O) \
  ((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(A), (O)))
441f4a2713aSLionel Sambuc 
/* Calls __builtin_ia32_vextractf128_si256 on A (viewed as __v8si) with O
 * forwarded unchanged, yielding a __m128i.
 * A is cast in place rather than copied into a block-scoped temporary, so an
 * identifier used in the argument can never be shadowed by the expansion. */
#define _mm256_extractf128_si256(A, O) \
  ((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(A), (O)))
445f4a2713aSLionel Sambuc 
446f4a2713aSLionel Sambuc static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi32(__m256i __a,int const __imm)447f4a2713aSLionel Sambuc _mm256_extract_epi32(__m256i __a, int const __imm)
448f4a2713aSLionel Sambuc {
449f4a2713aSLionel Sambuc   __v8si __b = (__v8si)__a;
450f4a2713aSLionel Sambuc   return __b[__imm & 7];
451f4a2713aSLionel Sambuc }
452f4a2713aSLionel Sambuc 
453f4a2713aSLionel Sambuc static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi16(__m256i __a,int const __imm)454f4a2713aSLionel Sambuc _mm256_extract_epi16(__m256i __a, int const __imm)
455f4a2713aSLionel Sambuc {
456f4a2713aSLionel Sambuc   __v16hi __b = (__v16hi)__a;
457f4a2713aSLionel Sambuc   return __b[__imm & 15];
458f4a2713aSLionel Sambuc }
459f4a2713aSLionel Sambuc 
460f4a2713aSLionel Sambuc static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi8(__m256i __a,int const __imm)461f4a2713aSLionel Sambuc _mm256_extract_epi8(__m256i __a, int const __imm)
462f4a2713aSLionel Sambuc {
463f4a2713aSLionel Sambuc   __v32qi __b = (__v32qi)__a;
464f4a2713aSLionel Sambuc   return __b[__imm & 31];
465f4a2713aSLionel Sambuc }
466f4a2713aSLionel Sambuc 
467f4a2713aSLionel Sambuc #ifdef __x86_64__
468f4a2713aSLionel Sambuc static __inline long long  __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi64(__m256i __a,const int __imm)469f4a2713aSLionel Sambuc _mm256_extract_epi64(__m256i __a, const int __imm)
470f4a2713aSLionel Sambuc {
471f4a2713aSLionel Sambuc   __v4di __b = (__v4di)__a;
472f4a2713aSLionel Sambuc   return __b[__imm & 3];
473f4a2713aSLionel Sambuc }
474f4a2713aSLionel Sambuc #endif
475f4a2713aSLionel Sambuc 
/* Vector insert */
/* Calls __builtin_ia32_vinsertf128_pd256 with the 256-bit vector V1 (as
 * __v4df), the 128-bit vector V2 (as __v2df), and O forwarded unchanged.
 * The vectors are cast in place rather than copied into block-scoped
 * temporaries, so identifiers in the arguments are never shadowed. */
#define _mm256_insertf128_pd(V1, V2, O) \
  ((__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
                                             (__v2df)(__m128d)(V2), (O)))
481f4a2713aSLionel Sambuc 
/* Calls __builtin_ia32_vinsertf128_ps256 with the 256-bit vector V1 (as
 * __v8sf), the 128-bit vector V2 (as __v4sf), and O forwarded unchanged.
 * The vectors are cast in place rather than copied into block-scoped
 * temporaries, so identifiers in the arguments are never shadowed. */
#define _mm256_insertf128_ps(V1, V2, O) \
  ((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
                                            (__v4sf)(__m128)(V2), (O)))
486f4a2713aSLionel Sambuc 
/* Calls __builtin_ia32_vinsertf128_si256 with the 256-bit vector V1 (as
 * __v8si), the 128-bit vector V2 (as __v4si), and O forwarded unchanged.
 * The vectors are cast in place rather than copied into block-scoped
 * temporaries, so identifiers in the arguments are never shadowed. */
#define _mm256_insertf128_si256(V1, V2, O) \
  ((__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
                                             (__v4si)(__m128i)(V2), (O)))
491f4a2713aSLionel Sambuc 
492f4a2713aSLionel Sambuc static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi32(__m256i __a,int __b,int const __imm)493f4a2713aSLionel Sambuc _mm256_insert_epi32(__m256i __a, int __b, int const __imm)
494f4a2713aSLionel Sambuc {
495f4a2713aSLionel Sambuc   __v8si __c = (__v8si)__a;
496f4a2713aSLionel Sambuc   __c[__imm & 7] = __b;
497f4a2713aSLionel Sambuc   return (__m256i)__c;
498f4a2713aSLionel Sambuc }
499f4a2713aSLionel Sambuc 
500f4a2713aSLionel Sambuc static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi16(__m256i __a,int __b,int const __imm)501f4a2713aSLionel Sambuc _mm256_insert_epi16(__m256i __a, int __b, int const __imm)
502f4a2713aSLionel Sambuc {
503f4a2713aSLionel Sambuc   __v16hi __c = (__v16hi)__a;
504f4a2713aSLionel Sambuc   __c[__imm & 15] = __b;
505f4a2713aSLionel Sambuc   return (__m256i)__c;
506f4a2713aSLionel Sambuc }
507f4a2713aSLionel Sambuc 
508f4a2713aSLionel Sambuc static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi8(__m256i __a,int __b,int const __imm)509f4a2713aSLionel Sambuc _mm256_insert_epi8(__m256i __a, int __b, int const __imm)
510f4a2713aSLionel Sambuc {
511f4a2713aSLionel Sambuc   __v32qi __c = (__v32qi)__a;
512f4a2713aSLionel Sambuc   __c[__imm & 31] = __b;
513f4a2713aSLionel Sambuc   return (__m256i)__c;
514f4a2713aSLionel Sambuc }
515f4a2713aSLionel Sambuc 
516f4a2713aSLionel Sambuc #ifdef __x86_64__
517f4a2713aSLionel Sambuc static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi64(__m256i __a,int __b,int const __imm)518f4a2713aSLionel Sambuc _mm256_insert_epi64(__m256i __a, int __b, int const __imm)
519f4a2713aSLionel Sambuc {
520f4a2713aSLionel Sambuc   __v4di __c = (__v4di)__a;
521f4a2713aSLionel Sambuc   __c[__imm & 3] = __b;
522f4a2713aSLionel Sambuc   return (__m256i)__c;
523f4a2713aSLionel Sambuc }
524f4a2713aSLionel Sambuc #endif
525f4a2713aSLionel Sambuc 
526f4a2713aSLionel Sambuc /* Conversion */
527f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_pd(__m128i __a)528f4a2713aSLionel Sambuc _mm256_cvtepi32_pd(__m128i __a)
529f4a2713aSLionel Sambuc {
530f4a2713aSLionel Sambuc   return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) __a);
531f4a2713aSLionel Sambuc }
532f4a2713aSLionel Sambuc 
533f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_ps(__m256i __a)534f4a2713aSLionel Sambuc _mm256_cvtepi32_ps(__m256i __a)
535f4a2713aSLionel Sambuc {
536f4a2713aSLionel Sambuc   return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
537f4a2713aSLionel Sambuc }
538f4a2713aSLionel Sambuc 
539f4a2713aSLionel Sambuc static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_ps(__m256d __a)540f4a2713aSLionel Sambuc _mm256_cvtpd_ps(__m256d __a)
541f4a2713aSLionel Sambuc {
542f4a2713aSLionel Sambuc   return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
543f4a2713aSLionel Sambuc }
544f4a2713aSLionel Sambuc 
545f4a2713aSLionel Sambuc static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_epi32(__m256 __a)546f4a2713aSLionel Sambuc _mm256_cvtps_epi32(__m256 __a)
547f4a2713aSLionel Sambuc {
548f4a2713aSLionel Sambuc   return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
549f4a2713aSLionel Sambuc }
550f4a2713aSLionel Sambuc 
551f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_pd(__m128 __a)552f4a2713aSLionel Sambuc _mm256_cvtps_pd(__m128 __a)
553f4a2713aSLionel Sambuc {
554f4a2713aSLionel Sambuc   return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) __a);
555f4a2713aSLionel Sambuc }
556f4a2713aSLionel Sambuc 
557f4a2713aSLionel Sambuc static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttpd_epi32(__m256d __a)558f4a2713aSLionel Sambuc _mm256_cvttpd_epi32(__m256d __a)
559f4a2713aSLionel Sambuc {
560f4a2713aSLionel Sambuc   return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
561f4a2713aSLionel Sambuc }
562f4a2713aSLionel Sambuc 
563f4a2713aSLionel Sambuc static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_epi32(__m256d __a)564f4a2713aSLionel Sambuc _mm256_cvtpd_epi32(__m256d __a)
565f4a2713aSLionel Sambuc {
566f4a2713aSLionel Sambuc   return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
567f4a2713aSLionel Sambuc }
568f4a2713aSLionel Sambuc 
569f4a2713aSLionel Sambuc static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttps_epi32(__m256 __a)570f4a2713aSLionel Sambuc _mm256_cvttps_epi32(__m256 __a)
571f4a2713aSLionel Sambuc {
572f4a2713aSLionel Sambuc   return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
573f4a2713aSLionel Sambuc }
574f4a2713aSLionel Sambuc 
575f4a2713aSLionel Sambuc /* Vector replicate */
576f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_movehdup_ps(__m256 __a)577f4a2713aSLionel Sambuc _mm256_movehdup_ps(__m256 __a)
578f4a2713aSLionel Sambuc {
579f4a2713aSLionel Sambuc   return __builtin_shufflevector(__a, __a, 1, 1, 3, 3, 5, 5, 7, 7);
580f4a2713aSLionel Sambuc }
581f4a2713aSLionel Sambuc 
582f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_moveldup_ps(__m256 __a)583f4a2713aSLionel Sambuc _mm256_moveldup_ps(__m256 __a)
584f4a2713aSLionel Sambuc {
585f4a2713aSLionel Sambuc   return __builtin_shufflevector(__a, __a, 0, 0, 2, 2, 4, 4, 6, 6);
586f4a2713aSLionel Sambuc }
587f4a2713aSLionel Sambuc 
588f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_movedup_pd(__m256d __a)589f4a2713aSLionel Sambuc _mm256_movedup_pd(__m256d __a)
590f4a2713aSLionel Sambuc {
591f4a2713aSLionel Sambuc   return __builtin_shufflevector(__a, __a, 0, 0, 2, 2);
592f4a2713aSLionel Sambuc }
593f4a2713aSLionel Sambuc 
594f4a2713aSLionel Sambuc /* Unpack and Interleave */
595f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_pd(__m256d __a,__m256d __b)596f4a2713aSLionel Sambuc _mm256_unpackhi_pd(__m256d __a, __m256d __b)
597f4a2713aSLionel Sambuc {
598f4a2713aSLionel Sambuc   return __builtin_shufflevector(__a, __b, 1, 5, 1+2, 5+2);
599f4a2713aSLionel Sambuc }
600f4a2713aSLionel Sambuc 
601f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_pd(__m256d __a,__m256d __b)602f4a2713aSLionel Sambuc _mm256_unpacklo_pd(__m256d __a, __m256d __b)
603f4a2713aSLionel Sambuc {
604f4a2713aSLionel Sambuc   return __builtin_shufflevector(__a, __b, 0, 4, 0+2, 4+2);
605f4a2713aSLionel Sambuc }
606f4a2713aSLionel Sambuc 
607f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_ps(__m256 __a,__m256 __b)608f4a2713aSLionel Sambuc _mm256_unpackhi_ps(__m256 __a, __m256 __b)
609f4a2713aSLionel Sambuc {
610f4a2713aSLionel Sambuc   return __builtin_shufflevector(__a, __b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
611f4a2713aSLionel Sambuc }
612f4a2713aSLionel Sambuc 
613f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_ps(__m256 __a,__m256 __b)614f4a2713aSLionel Sambuc _mm256_unpacklo_ps(__m256 __a, __m256 __b)
615f4a2713aSLionel Sambuc {
616f4a2713aSLionel Sambuc   return __builtin_shufflevector(__a, __b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
617f4a2713aSLionel Sambuc }
618f4a2713aSLionel Sambuc 
619f4a2713aSLionel Sambuc /* Bit Test */
620f4a2713aSLionel Sambuc static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_pd(__m128d __a,__m128d __b)621f4a2713aSLionel Sambuc _mm_testz_pd(__m128d __a, __m128d __b)
622f4a2713aSLionel Sambuc {
623f4a2713aSLionel Sambuc   return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
624f4a2713aSLionel Sambuc }
625f4a2713aSLionel Sambuc 
626f4a2713aSLionel Sambuc static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_pd(__m128d __a,__m128d __b)627f4a2713aSLionel Sambuc _mm_testc_pd(__m128d __a, __m128d __b)
628f4a2713aSLionel Sambuc {
629f4a2713aSLionel Sambuc   return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
630f4a2713aSLionel Sambuc }
631f4a2713aSLionel Sambuc 
632f4a2713aSLionel Sambuc static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_pd(__m128d __a,__m128d __b)633f4a2713aSLionel Sambuc _mm_testnzc_pd(__m128d __a, __m128d __b)
634f4a2713aSLionel Sambuc {
635f4a2713aSLionel Sambuc   return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
636f4a2713aSLionel Sambuc }
637f4a2713aSLionel Sambuc 
638f4a2713aSLionel Sambuc static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_ps(__m128 __a,__m128 __b)639f4a2713aSLionel Sambuc _mm_testz_ps(__m128 __a, __m128 __b)
640f4a2713aSLionel Sambuc {
641f4a2713aSLionel Sambuc   return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
642f4a2713aSLionel Sambuc }
643f4a2713aSLionel Sambuc 
644f4a2713aSLionel Sambuc static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_ps(__m128 __a,__m128 __b)645f4a2713aSLionel Sambuc _mm_testc_ps(__m128 __a, __m128 __b)
646f4a2713aSLionel Sambuc {
647f4a2713aSLionel Sambuc   return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
648f4a2713aSLionel Sambuc }
649f4a2713aSLionel Sambuc 
650f4a2713aSLionel Sambuc static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_ps(__m128 __a,__m128 __b)651f4a2713aSLionel Sambuc _mm_testnzc_ps(__m128 __a, __m128 __b)
652f4a2713aSLionel Sambuc {
653f4a2713aSLionel Sambuc   return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
654f4a2713aSLionel Sambuc }
655f4a2713aSLionel Sambuc 
656f4a2713aSLionel Sambuc static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_pd(__m256d __a,__m256d __b)657f4a2713aSLionel Sambuc _mm256_testz_pd(__m256d __a, __m256d __b)
658f4a2713aSLionel Sambuc {
659f4a2713aSLionel Sambuc   return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
660f4a2713aSLionel Sambuc }
661f4a2713aSLionel Sambuc 
662f4a2713aSLionel Sambuc static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_pd(__m256d __a,__m256d __b)663f4a2713aSLionel Sambuc _mm256_testc_pd(__m256d __a, __m256d __b)
664f4a2713aSLionel Sambuc {
665f4a2713aSLionel Sambuc   return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
666f4a2713aSLionel Sambuc }
667f4a2713aSLionel Sambuc 
668f4a2713aSLionel Sambuc static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_pd(__m256d __a,__m256d __b)669f4a2713aSLionel Sambuc _mm256_testnzc_pd(__m256d __a, __m256d __b)
670f4a2713aSLionel Sambuc {
671f4a2713aSLionel Sambuc   return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
672f4a2713aSLionel Sambuc }
673f4a2713aSLionel Sambuc 
674f4a2713aSLionel Sambuc static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_ps(__m256 __a,__m256 __b)675f4a2713aSLionel Sambuc _mm256_testz_ps(__m256 __a, __m256 __b)
676f4a2713aSLionel Sambuc {
677f4a2713aSLionel Sambuc   return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
678f4a2713aSLionel Sambuc }
679f4a2713aSLionel Sambuc 
680f4a2713aSLionel Sambuc static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_ps(__m256 __a,__m256 __b)681f4a2713aSLionel Sambuc _mm256_testc_ps(__m256 __a, __m256 __b)
682f4a2713aSLionel Sambuc {
683f4a2713aSLionel Sambuc   return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
684f4a2713aSLionel Sambuc }
685f4a2713aSLionel Sambuc 
686f4a2713aSLionel Sambuc static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_ps(__m256 __a,__m256 __b)687f4a2713aSLionel Sambuc _mm256_testnzc_ps(__m256 __a, __m256 __b)
688f4a2713aSLionel Sambuc {
689f4a2713aSLionel Sambuc   return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
690f4a2713aSLionel Sambuc }
691f4a2713aSLionel Sambuc 
692f4a2713aSLionel Sambuc static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_si256(__m256i __a,__m256i __b)693f4a2713aSLionel Sambuc _mm256_testz_si256(__m256i __a, __m256i __b)
694f4a2713aSLionel Sambuc {
695f4a2713aSLionel Sambuc   return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
696f4a2713aSLionel Sambuc }
697f4a2713aSLionel Sambuc 
698f4a2713aSLionel Sambuc static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_si256(__m256i __a,__m256i __b)699f4a2713aSLionel Sambuc _mm256_testc_si256(__m256i __a, __m256i __b)
700f4a2713aSLionel Sambuc {
701f4a2713aSLionel Sambuc   return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
702f4a2713aSLionel Sambuc }
703f4a2713aSLionel Sambuc 
704f4a2713aSLionel Sambuc static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_si256(__m256i __a,__m256i __b)705f4a2713aSLionel Sambuc _mm256_testnzc_si256(__m256i __a, __m256i __b)
706f4a2713aSLionel Sambuc {
707f4a2713aSLionel Sambuc   return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
708f4a2713aSLionel Sambuc }
709f4a2713aSLionel Sambuc 
710f4a2713aSLionel Sambuc /* Vector extract sign mask */
711f4a2713aSLionel Sambuc static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_pd(__m256d __a)712f4a2713aSLionel Sambuc _mm256_movemask_pd(__m256d __a)
713f4a2713aSLionel Sambuc {
714f4a2713aSLionel Sambuc   return __builtin_ia32_movmskpd256((__v4df)__a);
715f4a2713aSLionel Sambuc }
716f4a2713aSLionel Sambuc 
717f4a2713aSLionel Sambuc static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_ps(__m256 __a)718f4a2713aSLionel Sambuc _mm256_movemask_ps(__m256 __a)
719f4a2713aSLionel Sambuc {
720f4a2713aSLionel Sambuc   return __builtin_ia32_movmskps256((__v8sf)__a);
721f4a2713aSLionel Sambuc }
722f4a2713aSLionel Sambuc 
/* Vector zero */
/* Issues the vzeroall builtin.  Takes no arguments and returns nothing;
 * the effect is entirely on processor register state. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
  return;
}
729f4a2713aSLionel Sambuc 
/* Issues the vzeroupper builtin.  Takes no arguments and returns nothing;
 * the effect is entirely on processor register state. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
  return;
}
735f4a2713aSLionel Sambuc 
736f4a2713aSLionel Sambuc /* Vector load with broadcast */
737f4a2713aSLionel Sambuc static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_broadcast_ss(float const * __a)738f4a2713aSLionel Sambuc _mm_broadcast_ss(float const *__a)
739f4a2713aSLionel Sambuc {
740*0a6a1f1dSLionel Sambuc   float __f = *__a;
741*0a6a1f1dSLionel Sambuc   return (__m128)(__v4sf){ __f, __f, __f, __f };
742f4a2713aSLionel Sambuc }
743f4a2713aSLionel Sambuc 
744f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_sd(double const * __a)745f4a2713aSLionel Sambuc _mm256_broadcast_sd(double const *__a)
746f4a2713aSLionel Sambuc {
747*0a6a1f1dSLionel Sambuc   double __d = *__a;
748*0a6a1f1dSLionel Sambuc   return (__m256d)(__v4df){ __d, __d, __d, __d };
749f4a2713aSLionel Sambuc }
750f4a2713aSLionel Sambuc 
751f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ss(float const * __a)752f4a2713aSLionel Sambuc _mm256_broadcast_ss(float const *__a)
753f4a2713aSLionel Sambuc {
754*0a6a1f1dSLionel Sambuc   float __f = *__a;
755*0a6a1f1dSLionel Sambuc   return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
756f4a2713aSLionel Sambuc }
757f4a2713aSLionel Sambuc 
758f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_pd(__m128d const * __a)759f4a2713aSLionel Sambuc _mm256_broadcast_pd(__m128d const *__a)
760f4a2713aSLionel Sambuc {
761f4a2713aSLionel Sambuc   return (__m256d)__builtin_ia32_vbroadcastf128_pd256(__a);
762f4a2713aSLionel Sambuc }
763f4a2713aSLionel Sambuc 
764f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ps(__m128 const * __a)765f4a2713aSLionel Sambuc _mm256_broadcast_ps(__m128 const *__a)
766f4a2713aSLionel Sambuc {
767f4a2713aSLionel Sambuc   return (__m256)__builtin_ia32_vbroadcastf128_ps256(__a);
768f4a2713aSLionel Sambuc }
769f4a2713aSLionel Sambuc 
770f4a2713aSLionel Sambuc /* SIMD load ops */
771f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_load_pd(double const * __p)772f4a2713aSLionel Sambuc _mm256_load_pd(double const *__p)
773f4a2713aSLionel Sambuc {
774f4a2713aSLionel Sambuc   return *(__m256d *)__p;
775f4a2713aSLionel Sambuc }
776f4a2713aSLionel Sambuc 
777f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_load_ps(float const * __p)778f4a2713aSLionel Sambuc _mm256_load_ps(float const *__p)
779f4a2713aSLionel Sambuc {
780f4a2713aSLionel Sambuc   return *(__m256 *)__p;
781f4a2713aSLionel Sambuc }
782f4a2713aSLionel Sambuc 
783f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_pd(double const * __p)784f4a2713aSLionel Sambuc _mm256_loadu_pd(double const *__p)
785f4a2713aSLionel Sambuc {
786f4a2713aSLionel Sambuc   struct __loadu_pd {
787f4a2713aSLionel Sambuc     __m256d __v;
788f4a2713aSLionel Sambuc   } __attribute__((packed, may_alias));
789f4a2713aSLionel Sambuc   return ((struct __loadu_pd*)__p)->__v;
790f4a2713aSLionel Sambuc }
791f4a2713aSLionel Sambuc 
792f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_ps(float const * __p)793f4a2713aSLionel Sambuc _mm256_loadu_ps(float const *__p)
794f4a2713aSLionel Sambuc {
795f4a2713aSLionel Sambuc   struct __loadu_ps {
796f4a2713aSLionel Sambuc     __m256 __v;
797f4a2713aSLionel Sambuc   } __attribute__((packed, may_alias));
798f4a2713aSLionel Sambuc   return ((struct __loadu_ps*)__p)->__v;
799f4a2713aSLionel Sambuc }
800f4a2713aSLionel Sambuc 
801f4a2713aSLionel Sambuc static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_load_si256(__m256i const * __p)802f4a2713aSLionel Sambuc _mm256_load_si256(__m256i const *__p)
803f4a2713aSLionel Sambuc {
804f4a2713aSLionel Sambuc   return *__p;
805f4a2713aSLionel Sambuc }
806f4a2713aSLionel Sambuc 
807f4a2713aSLionel Sambuc static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_si256(__m256i const * __p)808f4a2713aSLionel Sambuc _mm256_loadu_si256(__m256i const *__p)
809f4a2713aSLionel Sambuc {
810f4a2713aSLionel Sambuc   struct __loadu_si256 {
811f4a2713aSLionel Sambuc     __m256i __v;
812f4a2713aSLionel Sambuc   } __attribute__((packed, may_alias));
813f4a2713aSLionel Sambuc   return ((struct __loadu_si256*)__p)->__v;
814f4a2713aSLionel Sambuc }
815f4a2713aSLionel Sambuc 
816f4a2713aSLionel Sambuc static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_lddqu_si256(__m256i const * __p)817f4a2713aSLionel Sambuc _mm256_lddqu_si256(__m256i const *__p)
818f4a2713aSLionel Sambuc {
819f4a2713aSLionel Sambuc   return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
820f4a2713aSLionel Sambuc }
821f4a2713aSLionel Sambuc 
822f4a2713aSLionel Sambuc /* SIMD store ops */
823f4a2713aSLionel Sambuc static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_pd(double * __p,__m256d __a)824f4a2713aSLionel Sambuc _mm256_store_pd(double *__p, __m256d __a)
825f4a2713aSLionel Sambuc {
826f4a2713aSLionel Sambuc   *(__m256d *)__p = __a;
827f4a2713aSLionel Sambuc }
828f4a2713aSLionel Sambuc 
829f4a2713aSLionel Sambuc static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_ps(float * __p,__m256 __a)830f4a2713aSLionel Sambuc _mm256_store_ps(float *__p, __m256 __a)
831f4a2713aSLionel Sambuc {
832f4a2713aSLionel Sambuc   *(__m256 *)__p = __a;
833f4a2713aSLionel Sambuc }
834f4a2713aSLionel Sambuc 
835f4a2713aSLionel Sambuc static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_pd(double * __p,__m256d __a)836f4a2713aSLionel Sambuc _mm256_storeu_pd(double *__p, __m256d __a)
837f4a2713aSLionel Sambuc {
838f4a2713aSLionel Sambuc   __builtin_ia32_storeupd256(__p, (__v4df)__a);
839f4a2713aSLionel Sambuc }
840f4a2713aSLionel Sambuc 
841f4a2713aSLionel Sambuc static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_ps(float * __p,__m256 __a)842f4a2713aSLionel Sambuc _mm256_storeu_ps(float *__p, __m256 __a)
843f4a2713aSLionel Sambuc {
844f4a2713aSLionel Sambuc   __builtin_ia32_storeups256(__p, (__v8sf)__a);
845f4a2713aSLionel Sambuc }
846f4a2713aSLionel Sambuc 
847f4a2713aSLionel Sambuc static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_si256(__m256i * __p,__m256i __a)848f4a2713aSLionel Sambuc _mm256_store_si256(__m256i *__p, __m256i __a)
849f4a2713aSLionel Sambuc {
850f4a2713aSLionel Sambuc   *__p = __a;
851f4a2713aSLionel Sambuc }
852f4a2713aSLionel Sambuc 
853f4a2713aSLionel Sambuc static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_si256(__m256i * __p,__m256i __a)854f4a2713aSLionel Sambuc _mm256_storeu_si256(__m256i *__p, __m256i __a)
855f4a2713aSLionel Sambuc {
856f4a2713aSLionel Sambuc   __builtin_ia32_storedqu256((char *)__p, (__v32qi)__a);
857f4a2713aSLionel Sambuc }
858f4a2713aSLionel Sambuc 
859f4a2713aSLionel Sambuc /* Conditional load ops */
860f4a2713aSLionel Sambuc static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_maskload_pd(double const * __p,__m128d __m)861f4a2713aSLionel Sambuc _mm_maskload_pd(double const *__p, __m128d __m)
862f4a2713aSLionel Sambuc {
863f4a2713aSLionel Sambuc   return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2df)__m);
864f4a2713aSLionel Sambuc }
865f4a2713aSLionel Sambuc 
866f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_pd(double const * __p,__m256d __m)867f4a2713aSLionel Sambuc _mm256_maskload_pd(double const *__p, __m256d __m)
868f4a2713aSLionel Sambuc {
869f4a2713aSLionel Sambuc   return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
870f4a2713aSLionel Sambuc                                                (__v4df)__m);
871f4a2713aSLionel Sambuc }
872f4a2713aSLionel Sambuc 
873f4a2713aSLionel Sambuc static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_maskload_ps(float const * __p,__m128 __m)874f4a2713aSLionel Sambuc _mm_maskload_ps(float const *__p, __m128 __m)
875f4a2713aSLionel Sambuc {
876f4a2713aSLionel Sambuc   return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4sf)__m);
877f4a2713aSLionel Sambuc }
878f4a2713aSLionel Sambuc 
879f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_ps(float const * __p,__m256 __m)880f4a2713aSLionel Sambuc _mm256_maskload_ps(float const *__p, __m256 __m)
881f4a2713aSLionel Sambuc {
882f4a2713aSLionel Sambuc   return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8sf)__m);
883f4a2713aSLionel Sambuc }
884f4a2713aSLionel Sambuc 
885f4a2713aSLionel Sambuc /* Conditional store ops */
886f4a2713aSLionel Sambuc static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_ps(float * __p,__m256 __m,__m256 __a)887f4a2713aSLionel Sambuc _mm256_maskstore_ps(float *__p, __m256 __m, __m256 __a)
888f4a2713aSLionel Sambuc {
889f4a2713aSLionel Sambuc   __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8sf)__m, (__v8sf)__a);
890f4a2713aSLionel Sambuc }
891f4a2713aSLionel Sambuc 
892f4a2713aSLionel Sambuc static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_pd(double * __p,__m128d __m,__m128d __a)893f4a2713aSLionel Sambuc _mm_maskstore_pd(double *__p, __m128d __m, __m128d __a)
894f4a2713aSLionel Sambuc {
895f4a2713aSLionel Sambuc   __builtin_ia32_maskstorepd((__v2df *)__p, (__v2df)__m, (__v2df)__a);
896f4a2713aSLionel Sambuc }
897f4a2713aSLionel Sambuc 
898f4a2713aSLionel Sambuc static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_pd(double * __p,__m256d __m,__m256d __a)899f4a2713aSLionel Sambuc _mm256_maskstore_pd(double *__p, __m256d __m, __m256d __a)
900f4a2713aSLionel Sambuc {
901f4a2713aSLionel Sambuc   __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4df)__m, (__v4df)__a);
902f4a2713aSLionel Sambuc }
903f4a2713aSLionel Sambuc 
904f4a2713aSLionel Sambuc static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_ps(float * __p,__m128 __m,__m128 __a)905f4a2713aSLionel Sambuc _mm_maskstore_ps(float *__p, __m128 __m, __m128 __a)
906f4a2713aSLionel Sambuc {
907f4a2713aSLionel Sambuc   __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4sf)__m, (__v4sf)__a);
908f4a2713aSLionel Sambuc }
909f4a2713aSLionel Sambuc 
910f4a2713aSLionel Sambuc /* Cacheability support ops */
911f4a2713aSLionel Sambuc static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_si256(__m256i * __a,__m256i __b)912f4a2713aSLionel Sambuc _mm256_stream_si256(__m256i *__a, __m256i __b)
913f4a2713aSLionel Sambuc {
914f4a2713aSLionel Sambuc   __builtin_ia32_movntdq256((__v4di *)__a, (__v4di)__b);
915f4a2713aSLionel Sambuc }
916f4a2713aSLionel Sambuc 
917f4a2713aSLionel Sambuc static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_pd(double * __a,__m256d __b)918f4a2713aSLionel Sambuc _mm256_stream_pd(double *__a, __m256d __b)
919f4a2713aSLionel Sambuc {
920f4a2713aSLionel Sambuc   __builtin_ia32_movntpd256(__a, (__v4df)__b);
921f4a2713aSLionel Sambuc }
922f4a2713aSLionel Sambuc 
923f4a2713aSLionel Sambuc static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_ps(float * __p,__m256 __a)924f4a2713aSLionel Sambuc _mm256_stream_ps(float *__p, __m256 __a)
925f4a2713aSLionel Sambuc {
926f4a2713aSLionel Sambuc   __builtin_ia32_movntps256(__p, (__v8sf)__a);
927f4a2713aSLionel Sambuc }
928f4a2713aSLionel Sambuc 
929f4a2713aSLionel Sambuc /* Create vectors */
930f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set_pd(double __a,double __b,double __c,double __d)931f4a2713aSLionel Sambuc _mm256_set_pd(double __a, double __b, double __c, double __d)
932f4a2713aSLionel Sambuc {
933f4a2713aSLionel Sambuc   return (__m256d){ __d, __c, __b, __a };
934f4a2713aSLionel Sambuc }
935f4a2713aSLionel Sambuc 
936f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set_ps(float __a,float __b,float __c,float __d,float __e,float __f,float __g,float __h)937f4a2713aSLionel Sambuc _mm256_set_ps(float __a, float __b, float __c, float __d,
938f4a2713aSLionel Sambuc 	            float __e, float __f, float __g, float __h)
939f4a2713aSLionel Sambuc {
940f4a2713aSLionel Sambuc   return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
941f4a2713aSLionel Sambuc }
942f4a2713aSLionel Sambuc 
943f4a2713aSLionel Sambuc static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi32(int __i0,int __i1,int __i2,int __i3,int __i4,int __i5,int __i6,int __i7)944f4a2713aSLionel Sambuc _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
945f4a2713aSLionel Sambuc 		             int __i4, int __i5, int __i6, int __i7)
946f4a2713aSLionel Sambuc {
947f4a2713aSLionel Sambuc   return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
948f4a2713aSLionel Sambuc }
949f4a2713aSLionel Sambuc 
950f4a2713aSLionel Sambuc static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi16(short __w15,short __w14,short __w13,short __w12,short __w11,short __w10,short __w09,short __w08,short __w07,short __w06,short __w05,short __w04,short __w03,short __w02,short __w01,short __w00)951f4a2713aSLionel Sambuc _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
952f4a2713aSLionel Sambuc 		             short __w11, short __w10, short __w09, short __w08,
953f4a2713aSLionel Sambuc 		             short __w07, short __w06, short __w05, short __w04,
954f4a2713aSLionel Sambuc 		             short __w03, short __w02, short __w01, short __w00)
955f4a2713aSLionel Sambuc {
956f4a2713aSLionel Sambuc   return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
957f4a2713aSLionel Sambuc     __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
958f4a2713aSLionel Sambuc }
959f4a2713aSLionel Sambuc 
960f4a2713aSLionel Sambuc static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi8(char __b31,char __b30,char __b29,char __b28,char __b27,char __b26,char __b25,char __b24,char __b23,char __b22,char __b21,char __b20,char __b19,char __b18,char __b17,char __b16,char __b15,char __b14,char __b13,char __b12,char __b11,char __b10,char __b09,char __b08,char __b07,char __b06,char __b05,char __b04,char __b03,char __b02,char __b01,char __b00)961f4a2713aSLionel Sambuc _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
962f4a2713aSLionel Sambuc 		            char __b27, char __b26, char __b25, char __b24,
963f4a2713aSLionel Sambuc 		            char __b23, char __b22, char __b21, char __b20,
964f4a2713aSLionel Sambuc 		            char __b19, char __b18, char __b17, char __b16,
965f4a2713aSLionel Sambuc 		            char __b15, char __b14, char __b13, char __b12,
966f4a2713aSLionel Sambuc 		            char __b11, char __b10, char __b09, char __b08,
967f4a2713aSLionel Sambuc 		            char __b07, char __b06, char __b05, char __b04,
968f4a2713aSLionel Sambuc 		            char __b03, char __b02, char __b01, char __b00)
969f4a2713aSLionel Sambuc {
970f4a2713aSLionel Sambuc   return (__m256i)(__v32qi){
971f4a2713aSLionel Sambuc     __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
972f4a2713aSLionel Sambuc     __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
973f4a2713aSLionel Sambuc     __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
974f4a2713aSLionel Sambuc     __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
975f4a2713aSLionel Sambuc   };
976f4a2713aSLionel Sambuc }
977f4a2713aSLionel Sambuc 
978f4a2713aSLionel Sambuc static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi64x(long long __a,long long __b,long long __c,long long __d)979f4a2713aSLionel Sambuc _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
980f4a2713aSLionel Sambuc {
981f4a2713aSLionel Sambuc   return (__m256i)(__v4di){ __d, __c, __b, __a };
982f4a2713aSLionel Sambuc }
983f4a2713aSLionel Sambuc 
984f4a2713aSLionel Sambuc /* Create vectors with elements in reverse order */
985f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setr_pd(double __a,double __b,double __c,double __d)986f4a2713aSLionel Sambuc _mm256_setr_pd(double __a, double __b, double __c, double __d)
987f4a2713aSLionel Sambuc {
988f4a2713aSLionel Sambuc   return (__m256d){ __a, __b, __c, __d };
989f4a2713aSLionel Sambuc }
990f4a2713aSLionel Sambuc 
991f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setr_ps(float __a,float __b,float __c,float __d,float __e,float __f,float __g,float __h)992f4a2713aSLionel Sambuc _mm256_setr_ps(float __a, float __b, float __c, float __d,
993f4a2713aSLionel Sambuc 		           float __e, float __f, float __g, float __h)
994f4a2713aSLionel Sambuc {
995f4a2713aSLionel Sambuc   return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
996f4a2713aSLionel Sambuc }
997f4a2713aSLionel Sambuc 
998f4a2713aSLionel Sambuc static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi32(int __i0,int __i1,int __i2,int __i3,int __i4,int __i5,int __i6,int __i7)999f4a2713aSLionel Sambuc _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
1000f4a2713aSLionel Sambuc 		              int __i4, int __i5, int __i6, int __i7)
1001f4a2713aSLionel Sambuc {
1002f4a2713aSLionel Sambuc   return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
1003f4a2713aSLionel Sambuc }
1004f4a2713aSLionel Sambuc 
1005f4a2713aSLionel Sambuc static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi16(short __w15,short __w14,short __w13,short __w12,short __w11,short __w10,short __w09,short __w08,short __w07,short __w06,short __w05,short __w04,short __w03,short __w02,short __w01,short __w00)1006f4a2713aSLionel Sambuc _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
1007f4a2713aSLionel Sambuc 		   short __w11, short __w10, short __w09, short __w08,
1008f4a2713aSLionel Sambuc 		   short __w07, short __w06, short __w05, short __w04,
1009f4a2713aSLionel Sambuc 		   short __w03, short __w02, short __w01, short __w00)
1010f4a2713aSLionel Sambuc {
1011f4a2713aSLionel Sambuc   return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09,
1012f4a2713aSLionel Sambuc     __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
1013f4a2713aSLionel Sambuc }
1014f4a2713aSLionel Sambuc 
1015f4a2713aSLionel Sambuc static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi8(char __b31,char __b30,char __b29,char __b28,char __b27,char __b26,char __b25,char __b24,char __b23,char __b22,char __b21,char __b20,char __b19,char __b18,char __b17,char __b16,char __b15,char __b14,char __b13,char __b12,char __b11,char __b10,char __b09,char __b08,char __b07,char __b06,char __b05,char __b04,char __b03,char __b02,char __b01,char __b00)1016f4a2713aSLionel Sambuc _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
1017f4a2713aSLionel Sambuc 		             char __b27, char __b26, char __b25, char __b24,
1018f4a2713aSLionel Sambuc 		             char __b23, char __b22, char __b21, char __b20,
1019f4a2713aSLionel Sambuc 		             char __b19, char __b18, char __b17, char __b16,
1020f4a2713aSLionel Sambuc 		             char __b15, char __b14, char __b13, char __b12,
1021f4a2713aSLionel Sambuc 		             char __b11, char __b10, char __b09, char __b08,
1022f4a2713aSLionel Sambuc 		             char __b07, char __b06, char __b05, char __b04,
1023f4a2713aSLionel Sambuc 		             char __b03, char __b02, char __b01, char __b00)
1024f4a2713aSLionel Sambuc {
1025f4a2713aSLionel Sambuc   return (__m256i)(__v32qi){
1026f4a2713aSLionel Sambuc     __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
1027f4a2713aSLionel Sambuc 		__b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
1028f4a2713aSLionel Sambuc 		__b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
1029f4a2713aSLionel Sambuc 		__b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
1030f4a2713aSLionel Sambuc }
1031f4a2713aSLionel Sambuc 
1032f4a2713aSLionel Sambuc static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi64x(long long __a,long long __b,long long __c,long long __d)1033f4a2713aSLionel Sambuc _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
1034f4a2713aSLionel Sambuc {
1035f4a2713aSLionel Sambuc   return (__m256i)(__v4di){ __a, __b, __c, __d };
1036f4a2713aSLionel Sambuc }
1037f4a2713aSLionel Sambuc 
1038f4a2713aSLionel Sambuc /* Create vectors with repeated elements */
1039f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set1_pd(double __w)1040f4a2713aSLionel Sambuc _mm256_set1_pd(double __w)
1041f4a2713aSLionel Sambuc {
1042f4a2713aSLionel Sambuc   return (__m256d){ __w, __w, __w, __w };
1043f4a2713aSLionel Sambuc }
1044f4a2713aSLionel Sambuc 
1045f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set1_ps(float __w)1046f4a2713aSLionel Sambuc _mm256_set1_ps(float __w)
1047f4a2713aSLionel Sambuc {
1048f4a2713aSLionel Sambuc   return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
1049f4a2713aSLionel Sambuc }
1050f4a2713aSLionel Sambuc 
1051f4a2713aSLionel Sambuc static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi32(int __i)1052f4a2713aSLionel Sambuc _mm256_set1_epi32(int __i)
1053f4a2713aSLionel Sambuc {
1054f4a2713aSLionel Sambuc   return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
1055f4a2713aSLionel Sambuc }
1056f4a2713aSLionel Sambuc 
1057f4a2713aSLionel Sambuc static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi16(short __w)1058f4a2713aSLionel Sambuc _mm256_set1_epi16(short __w)
1059f4a2713aSLionel Sambuc {
1060f4a2713aSLionel Sambuc   return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w,
1061f4a2713aSLionel Sambuc     __w, __w, __w, __w, __w, __w };
1062f4a2713aSLionel Sambuc }
1063f4a2713aSLionel Sambuc 
1064f4a2713aSLionel Sambuc static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi8(char __b)1065f4a2713aSLionel Sambuc _mm256_set1_epi8(char __b)
1066f4a2713aSLionel Sambuc {
1067f4a2713aSLionel Sambuc   return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
1068f4a2713aSLionel Sambuc     __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
1069f4a2713aSLionel Sambuc     __b, __b, __b, __b, __b, __b, __b };
1070f4a2713aSLionel Sambuc }
1071f4a2713aSLionel Sambuc 
1072f4a2713aSLionel Sambuc static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi64x(long long __q)1073f4a2713aSLionel Sambuc _mm256_set1_epi64x(long long __q)
1074f4a2713aSLionel Sambuc {
1075f4a2713aSLionel Sambuc   return (__m256i)(__v4di){ __q, __q, __q, __q };
1076f4a2713aSLionel Sambuc }
1077f4a2713aSLionel Sambuc 
/* Create zeroed vectors */
1079f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_pd(void)1080f4a2713aSLionel Sambuc _mm256_setzero_pd(void)
1081f4a2713aSLionel Sambuc {
1082f4a2713aSLionel Sambuc   return (__m256d){ 0, 0, 0, 0 };
1083f4a2713aSLionel Sambuc }
1084f4a2713aSLionel Sambuc 
1085f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_ps(void)1086f4a2713aSLionel Sambuc _mm256_setzero_ps(void)
1087f4a2713aSLionel Sambuc {
1088f4a2713aSLionel Sambuc   return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
1089f4a2713aSLionel Sambuc }
1090f4a2713aSLionel Sambuc 
1091f4a2713aSLionel Sambuc static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_si256(void)1092f4a2713aSLionel Sambuc _mm256_setzero_si256(void)
1093f4a2713aSLionel Sambuc {
1094f4a2713aSLionel Sambuc   return (__m256i){ 0LL, 0LL, 0LL, 0LL };
1095f4a2713aSLionel Sambuc }
1096f4a2713aSLionel Sambuc 
1097f4a2713aSLionel Sambuc /* Cast between vector types */
1098f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_ps(__m256d __a)1099f4a2713aSLionel Sambuc _mm256_castpd_ps(__m256d __a)
1100f4a2713aSLionel Sambuc {
1101f4a2713aSLionel Sambuc   return (__m256)__a;
1102f4a2713aSLionel Sambuc }
1103f4a2713aSLionel Sambuc 
1104f4a2713aSLionel Sambuc static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_si256(__m256d __a)1105f4a2713aSLionel Sambuc _mm256_castpd_si256(__m256d __a)
1106f4a2713aSLionel Sambuc {
1107f4a2713aSLionel Sambuc   return (__m256i)__a;
1108f4a2713aSLionel Sambuc }
1109f4a2713aSLionel Sambuc 
1110f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castps_pd(__m256 __a)1111f4a2713aSLionel Sambuc _mm256_castps_pd(__m256 __a)
1112f4a2713aSLionel Sambuc {
1113f4a2713aSLionel Sambuc   return (__m256d)__a;
1114f4a2713aSLionel Sambuc }
1115f4a2713aSLionel Sambuc 
1116f4a2713aSLionel Sambuc static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castps_si256(__m256 __a)1117f4a2713aSLionel Sambuc _mm256_castps_si256(__m256 __a)
1118f4a2713aSLionel Sambuc {
1119f4a2713aSLionel Sambuc   return (__m256i)__a;
1120f4a2713aSLionel Sambuc }
1121f4a2713aSLionel Sambuc 
1122f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_ps(__m256i __a)1123f4a2713aSLionel Sambuc _mm256_castsi256_ps(__m256i __a)
1124f4a2713aSLionel Sambuc {
1125f4a2713aSLionel Sambuc   return (__m256)__a;
1126f4a2713aSLionel Sambuc }
1127f4a2713aSLionel Sambuc 
1128f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_pd(__m256i __a)1129f4a2713aSLionel Sambuc _mm256_castsi256_pd(__m256i __a)
1130f4a2713aSLionel Sambuc {
1131f4a2713aSLionel Sambuc   return (__m256d)__a;
1132f4a2713aSLionel Sambuc }
1133f4a2713aSLionel Sambuc 
1134f4a2713aSLionel Sambuc static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd256_pd128(__m256d __a)1135f4a2713aSLionel Sambuc _mm256_castpd256_pd128(__m256d __a)
1136f4a2713aSLionel Sambuc {
1137f4a2713aSLionel Sambuc   return __builtin_shufflevector(__a, __a, 0, 1);
1138f4a2713aSLionel Sambuc }
1139f4a2713aSLionel Sambuc 
1140f4a2713aSLionel Sambuc static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_castps256_ps128(__m256 __a)1141f4a2713aSLionel Sambuc _mm256_castps256_ps128(__m256 __a)
1142f4a2713aSLionel Sambuc {
1143f4a2713aSLionel Sambuc   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
1144f4a2713aSLionel Sambuc }
1145f4a2713aSLionel Sambuc 
1146f4a2713aSLionel Sambuc static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_si128(__m256i __a)1147f4a2713aSLionel Sambuc _mm256_castsi256_si128(__m256i __a)
1148f4a2713aSLionel Sambuc {
1149f4a2713aSLionel Sambuc   return __builtin_shufflevector(__a, __a, 0, 1);
1150f4a2713aSLionel Sambuc }
1151f4a2713aSLionel Sambuc 
1152f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd128_pd256(__m128d __a)1153f4a2713aSLionel Sambuc _mm256_castpd128_pd256(__m128d __a)
1154f4a2713aSLionel Sambuc {
1155f4a2713aSLionel Sambuc   return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
1156f4a2713aSLionel Sambuc }
1157f4a2713aSLionel Sambuc 
1158f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castps128_ps256(__m128 __a)1159f4a2713aSLionel Sambuc _mm256_castps128_ps256(__m128 __a)
1160f4a2713aSLionel Sambuc {
1161f4a2713aSLionel Sambuc   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
1162f4a2713aSLionel Sambuc }
1163f4a2713aSLionel Sambuc 
1164f4a2713aSLionel Sambuc static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi128_si256(__m128i __a)1165f4a2713aSLionel Sambuc _mm256_castsi128_si256(__m128i __a)
1166f4a2713aSLionel Sambuc {
1167f4a2713aSLionel Sambuc   return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
1168f4a2713aSLionel Sambuc }
1169f4a2713aSLionel Sambuc 
1170f4a2713aSLionel Sambuc /* SIMD load ops (unaligned) */
1171f4a2713aSLionel Sambuc static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_loadu2_m128(float const * __addr_hi,float const * __addr_lo)1172f4a2713aSLionel Sambuc _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
1173f4a2713aSLionel Sambuc {
1174f4a2713aSLionel Sambuc   struct __loadu_ps {
1175f4a2713aSLionel Sambuc     __m128 __v;
1176f4a2713aSLionel Sambuc   } __attribute__((__packed__, __may_alias__));
1177f4a2713aSLionel Sambuc 
1178f4a2713aSLionel Sambuc   __m256 __v256 = _mm256_castps128_ps256(((struct __loadu_ps*)__addr_lo)->__v);
1179f4a2713aSLionel Sambuc   return _mm256_insertf128_ps(__v256, ((struct __loadu_ps*)__addr_hi)->__v, 1);
1180f4a2713aSLionel Sambuc }
1181f4a2713aSLionel Sambuc 
1182f4a2713aSLionel Sambuc static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_loadu2_m128d(double const * __addr_hi,double const * __addr_lo)1183f4a2713aSLionel Sambuc _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
1184f4a2713aSLionel Sambuc {
1185f4a2713aSLionel Sambuc   struct __loadu_pd {
1186f4a2713aSLionel Sambuc     __m128d __v;
1187f4a2713aSLionel Sambuc   } __attribute__((__packed__, __may_alias__));
1188f4a2713aSLionel Sambuc 
1189f4a2713aSLionel Sambuc   __m256d __v256 = _mm256_castpd128_pd256(((struct __loadu_pd*)__addr_lo)->__v);
1190f4a2713aSLionel Sambuc   return _mm256_insertf128_pd(__v256, ((struct __loadu_pd*)__addr_hi)->__v, 1);
1191f4a2713aSLionel Sambuc }
1192f4a2713aSLionel Sambuc 
1193f4a2713aSLionel Sambuc static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_loadu2_m128i(__m128i const * __addr_hi,__m128i const * __addr_lo)1194f4a2713aSLionel Sambuc _mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
1195f4a2713aSLionel Sambuc {
1196f4a2713aSLionel Sambuc   struct __loadu_si128 {
1197f4a2713aSLionel Sambuc     __m128i __v;
1198f4a2713aSLionel Sambuc   } __attribute__((packed, may_alias));
1199f4a2713aSLionel Sambuc   __m256i __v256 = _mm256_castsi128_si256(
1200f4a2713aSLionel Sambuc     ((struct __loadu_si128*)__addr_lo)->__v);
1201f4a2713aSLionel Sambuc   return _mm256_insertf128_si256(__v256,
1202f4a2713aSLionel Sambuc                                  ((struct __loadu_si128*)__addr_hi)->__v, 1);
1203f4a2713aSLionel Sambuc }
1204f4a2713aSLionel Sambuc 
1205f4a2713aSLionel Sambuc /* SIMD store ops (unaligned) */
1206f4a2713aSLionel Sambuc static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu2_m128(float * __addr_hi,float * __addr_lo,__m256 __a)1207f4a2713aSLionel Sambuc _mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
1208f4a2713aSLionel Sambuc {
1209f4a2713aSLionel Sambuc   __m128 __v128;
1210f4a2713aSLionel Sambuc 
1211f4a2713aSLionel Sambuc   __v128 = _mm256_castps256_ps128(__a);
1212f4a2713aSLionel Sambuc   __builtin_ia32_storeups(__addr_lo, __v128);
1213f4a2713aSLionel Sambuc   __v128 = _mm256_extractf128_ps(__a, 1);
1214f4a2713aSLionel Sambuc   __builtin_ia32_storeups(__addr_hi, __v128);
1215f4a2713aSLionel Sambuc }
1216f4a2713aSLionel Sambuc 
1217f4a2713aSLionel Sambuc static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu2_m128d(double * __addr_hi,double * __addr_lo,__m256d __a)1218f4a2713aSLionel Sambuc _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
1219f4a2713aSLionel Sambuc {
1220f4a2713aSLionel Sambuc   __m128d __v128;
1221f4a2713aSLionel Sambuc 
1222f4a2713aSLionel Sambuc   __v128 = _mm256_castpd256_pd128(__a);
1223f4a2713aSLionel Sambuc   __builtin_ia32_storeupd(__addr_lo, __v128);
1224f4a2713aSLionel Sambuc   __v128 = _mm256_extractf128_pd(__a, 1);
1225f4a2713aSLionel Sambuc   __builtin_ia32_storeupd(__addr_hi, __v128);
1226f4a2713aSLionel Sambuc }
1227f4a2713aSLionel Sambuc 
1228f4a2713aSLionel Sambuc static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu2_m128i(__m128i * __addr_hi,__m128i * __addr_lo,__m256i __a)1229f4a2713aSLionel Sambuc _mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
1230f4a2713aSLionel Sambuc {
1231f4a2713aSLionel Sambuc   __m128i __v128;
1232f4a2713aSLionel Sambuc 
1233f4a2713aSLionel Sambuc   __v128 = _mm256_castsi256_si128(__a);
1234f4a2713aSLionel Sambuc   __builtin_ia32_storedqu((char *)__addr_lo, (__v16qi)__v128);
1235f4a2713aSLionel Sambuc   __v128 = _mm256_extractf128_si256(__a, 1);
1236f4a2713aSLionel Sambuc   __builtin_ia32_storedqu((char *)__addr_hi, (__v16qi)__v128);
1237f4a2713aSLionel Sambuc }
1238f4a2713aSLionel Sambuc 
1239f4a2713aSLionel Sambuc #endif /* __AVXINTRIN_H */
1240