/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_AVX_INTRINSICS_H_
#define VC_AVX_INTRINSICS_H_

#include "../global.h"
#include "../traits/type_traits.h"

// see comment in sse/intrinsics.h
extern "C" {
// AVX
#include <immintrin.h>

#if (defined(Vc_IMPL_XOP) || defined(Vc_IMPL_FMA4)) && !defined(Vc_MSVC)
#include <x86intrin.h>
#endif
}

#include "../common/fix_clang_emmintrin.h"

#include "const_data.h"
#include "../common/types.h"
#include "macros.h"
#include <cstdlib>

#if (defined Vc_CLANG && Vc_CLANG >= 0x30900 && Vc_CLANG < 0x70000)
#ifdef _mm256_permute2f128_si256
#undef _mm256_permute2f128_si256
#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
    (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
                                             (__v8si)(__m256i)(V2), (char)(M)); })
#endif

#ifdef _mm256_permute2f128_ps
#undef _mm256_permute2f128_ps
#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
    (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
                                            (__v8sf)(__m256)(V2), (char)(M)); })
#endif

#ifdef _mm256_permute2x128_si256
#undef _mm256_permute2x128_si256
#define _mm256_permute2x128_si256(V1, V2, M) __extension__ ({ \
    (__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (char)(M)); })
#endif
#endif

namespace Vc_VERSIONED_NAMESPACE
{
namespace AvxIntrinsics
{
using AVX::c_general;
using AVX::_IndexesFromZero32;
using AVX::_IndexesFromZero16;
using AVX::_IndexesFromZero8;

typedef __m128 m128 ;
typedef __m128d m128d;
typedef __m128i m128i;
typedef __m256 m256 ;
typedef __m256d m256d;
typedef __m256i m256i;

#ifdef Vc_GCC
// Redefine the mul/add/sub intrinsics to use GCC-specific operators instead of builtin
// functions. This way the fp-contraction optimization step kicks in and creates FMAs! :)
static Vc_INTRINSIC Vc_CONST m256d _mm256_mul_pd(m256d a, m256d b) { return static_cast<m256d>(static_cast<__v4df>(a) * static_cast<__v4df>(b)); }
static Vc_INTRINSIC Vc_CONST m256d _mm256_add_pd(m256d a, m256d b) { return static_cast<m256d>(static_cast<__v4df>(a) + static_cast<__v4df>(b)); }
static Vc_INTRINSIC Vc_CONST m256d _mm256_sub_pd(m256d a, m256d b) { return static_cast<m256d>(static_cast<__v4df>(a) - static_cast<__v4df>(b)); }
static Vc_INTRINSIC Vc_CONST m256 _mm256_mul_ps(m256 a, m256 b) { return static_cast<m256>(static_cast<__v8sf>(a) * static_cast<__v8sf>(b)); }
static Vc_INTRINSIC Vc_CONST m256 _mm256_add_ps(m256 a, m256 b) { return static_cast<m256>(static_cast<__v8sf>(a) + static_cast<__v8sf>(b)); }
static Vc_INTRINSIC Vc_CONST m256 _mm256_sub_ps(m256 a, m256 b) { return static_cast<m256>(static_cast<__v8sf>(a) - static_cast<__v8sf>(b)); }
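// Illustrative note (added; a sketch of the intent, not from the original source): with the
// operator-based definitions above, GCC's fp-contraction pass may fuse a multiply feeding an
// add into a single FMA when FMA instructions are enabled, e.g.
//
//     m256d r = _mm256_add_pd(_mm256_mul_pd(a, b), c);  // may contract to a single vfmadd*pd
//
// whereas chaining the builtin intrinsics directly typically stays a separate vmulpd + vaddpd.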
#endif

static Vc_INTRINSIC m256d Vc_CONST set1_pd (double a) { return _mm256_set1_pd (a); }
static Vc_INTRINSIC m256i Vc_CONST set1_epi32(int a) { return _mm256_set1_epi32(a); }

static Vc_INTRINSIC Vc_CONST m128i _mm_setallone_si128() { return _mm_load_si128(reinterpret_cast<const __m128i *>(Common::AllBitsSet)); }
static Vc_INTRINSIC Vc_CONST m128 _mm_setallone_ps() { return _mm_load_ps(reinterpret_cast<const float *>(Common::AllBitsSet)); }
static Vc_INTRINSIC Vc_CONST m128d _mm_setallone_pd() { return _mm_load_pd(reinterpret_cast<const double *>(Common::AllBitsSet)); }

static Vc_INTRINSIC Vc_CONST m256i setallone_si256() { return _mm256_castps_si256(_mm256_load_ps(reinterpret_cast<const float *>(Common::AllBitsSet))); }
static Vc_INTRINSIC Vc_CONST m256d setallone_pd() { return _mm256_load_pd(reinterpret_cast<const double *>(Common::AllBitsSet)); }
static Vc_INTRINSIC Vc_CONST m256 setallone_ps() { return _mm256_load_ps(reinterpret_cast<const float *>(Common::AllBitsSet)); }

static Vc_INTRINSIC m256i Vc_CONST setone_epi8 () { return _mm256_set1_epi8(1); }
static Vc_INTRINSIC m256i Vc_CONST setone_epu8 () { return setone_epi8(); }
static Vc_INTRINSIC m256i Vc_CONST setone_epi16() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(c_general::one16))); }
static Vc_INTRINSIC m256i Vc_CONST setone_epu16() { return setone_epi16(); }
static Vc_INTRINSIC m256i Vc_CONST setone_epi32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(&_IndexesFromZero32[1]))); }
static Vc_INTRINSIC m256i Vc_CONST setone_epu32() { return setone_epi32(); }

static Vc_INTRINSIC m256 Vc_CONST setone_ps() { return _mm256_broadcast_ss(&c_general::oneFloat); }
static Vc_INTRINSIC m256d Vc_CONST setone_pd() { return _mm256_broadcast_sd(&c_general::oneDouble); }

static Vc_INTRINSIC m256d Vc_CONST setabsmask_pd() { return _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::absMaskFloat[0])); }
static Vc_INTRINSIC m256 Vc_CONST setabsmask_ps() { return _mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::absMaskFloat[1])); }
static Vc_INTRINSIC m256d Vc_CONST setsignmask_pd(){ return _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::signMaskFloat[0])); }
static Vc_INTRINSIC m256 Vc_CONST setsignmask_ps(){ return _mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1])); }

static Vc_INTRINSIC m256 Vc_CONST set2power31_ps() { return _mm256_broadcast_ss(&c_general::_2power31); }
static Vc_INTRINSIC m128 Vc_CONST _mm_set2power31_ps() { return _mm_broadcast_ss(&c_general::_2power31); }
static Vc_INTRINSIC m256i Vc_CONST set2power31_epu32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }
static Vc_INTRINSIC m128i Vc_CONST _mm_set2power31_epu32() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }

static Vc_INTRINSIC m256i Vc_CONST setmin_epi8 () { return _mm256_set1_epi8(-0x80); }
static Vc_INTRINSIC m128i Vc_CONST _mm_setmin_epi16() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast<const float *>(c_general::minShort))); }
static Vc_INTRINSIC m128i Vc_CONST _mm_setmin_epi32() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }
static Vc_INTRINSIC m256i Vc_CONST setmin_epi16() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(c_general::minShort))); }
static Vc_INTRINSIC m256i Vc_CONST setmin_epi32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }

template <int i>
static Vc_INTRINSIC Vc_CONST unsigned int extract_epu32(__m128i x)
{
    return _mm_extract_epi32(x, i);
}

template <int offset> Vc_INTRINSIC __m256 insert128(__m256 a, __m128 b) { return _mm256_insertf128_ps(a, b, offset); }
template <int offset> Vc_INTRINSIC __m256d insert128(__m256d a, __m128d b) { return _mm256_insertf128_pd(a, b, offset); }
template <int offset> Vc_INTRINSIC __m256i insert128(__m256i a, __m128i b) {
#ifdef Vc_IMPL_AVX2
    return _mm256_inserti128_si256(a, b, offset);
#else
    return _mm256_insertf128_si256(a, b, offset);
#endif
}

template <int offset> Vc_INTRINSIC __m128 extract128(__m256 a) { return _mm256_extractf128_ps(a, offset); }
template <int offset> Vc_INTRINSIC __m128d extract128(__m256d a) { return _mm256_extractf128_pd(a, offset); }
template <int offset> Vc_INTRINSIC __m128i extract128(__m256i a) {
#ifdef Vc_IMPL_AVX2
    return _mm256_extracti128_si256(a, offset);
#else
    return _mm256_extractf128_si256(a, offset);
#endif
}

/////////////////////// COMPARE OPS ///////////////////////
#ifdef Vc_GCC
// GCC needs builtin compare operators to enable constant folding
Vc_INTRINSIC __m256d cmpeq_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a == b); }
Vc_INTRINSIC __m256d cmpneq_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a != b); }
Vc_INTRINSIC __m256d cmplt_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a < b); }
Vc_INTRINSIC __m256d cmpge_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a >= b); }
Vc_INTRINSIC __m256d cmple_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a <= b); }
Vc_INTRINSIC __m256d cmpgt_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a > b); }

Vc_INTRINSIC __m256 cmpeq_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a == b); }
Vc_INTRINSIC __m256 cmpneq_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a != b); }
Vc_INTRINSIC __m256 cmplt_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a < b); }
Vc_INTRINSIC __m256 cmpge_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a >= b); }
Vc_INTRINSIC __m256 cmple_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a <= b); }
Vc_INTRINSIC __m256 cmpgt_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a > b); }
#else
Vc_INTRINSIC __m256d cmpeq_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); }
Vc_INTRINSIC __m256d cmpneq_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); }
Vc_INTRINSIC __m256d cmplt_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); }
Vc_INTRINSIC __m256d cmpge_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); }
Vc_INTRINSIC __m256d cmple_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); }
Vc_INTRINSIC __m256d cmpgt_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); }

Vc_INTRINSIC __m256 cmpeq_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); }
Vc_INTRINSIC __m256 cmpneq_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); }
Vc_INTRINSIC __m256 cmplt_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); }
Vc_INTRINSIC __m256 cmpge_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); }
Vc_INTRINSIC __m256 cmple_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS); }
Vc_INTRINSIC __m256 cmpgt_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); }
#endif
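// Illustrative note (added): the operator forms above let GCC constant-fold comparisons whose
// operands are known at compile time, e.g. something like
//
//     __m256 m = cmpgt_ps(_mm256_set1_ps(2.f), _mm256_set1_ps(1.f));
//
// may be evaluated to an all-ones mask without emitting a vcmpps, whereas the _mm256_cmp_ps
// builtins used in the #else branch are generally not folded.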
Vc_INTRINSIC __m256d cmpnlt_pd (__m256d a, __m256d b) { return cmpge_pd(a, b); }
Vc_INTRINSIC __m256d cmpnle_pd (__m256d a, __m256d b) { return cmpgt_pd(a, b); }
Vc_INTRINSIC __m256 cmpnlt_ps (__m256 a, __m256 b) { return cmpge_ps(a, b); }
Vc_INTRINSIC __m256 cmpnle_ps (__m256 a, __m256 b) { return cmpgt_ps(a, b); }

Vc_INTRINSIC __m256d cmpord_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_ORD_Q); }
Vc_INTRINSIC __m256d cmpunord_pd(__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_UNORD_Q); }
Vc_INTRINSIC __m256 cmpord_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_ORD_Q); }
Vc_INTRINSIC __m256 cmpunord_ps(__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_UNORD_Q); }

#if defined(Vc_IMPL_XOP)
static Vc_INTRINSIC m128i cmplt_epu16(__m128i a, __m128i b) {
    return _mm_comlt_epu16(a, b);
}
static Vc_INTRINSIC m128i cmpgt_epu16(__m128i a, __m128i b) {
    return _mm_comgt_epu16(a, b);
}
#else
static Vc_INTRINSIC m128i cmplt_epu16(__m128i a, __m128i b) {
    return _mm_cmplt_epi16(_mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16()));
}
static Vc_INTRINSIC m128i cmpgt_epu16(__m128i a, __m128i b) {
    return _mm_cmpgt_epi16(_mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16()));
}
#endif
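// Note (added comment): the non-XOP fallback above uses the standard bias trick. XOR-ing both
// operands with the minimum signed value (0x8000 per 16-bit lane, via _mm_setmin_epi16()) maps
// unsigned order onto signed order, so a signed compare yields the unsigned result. For example
// (illustrative): for a = 0xFFFF and b = 0x0001, the biased values are 0x7FFF and 0x8001
// (i.e. +32767 and -32767 as signed), and the signed greater-than compare correctly reports
// a > b with an all-ones lane.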

#ifdef Vc_IMPL_AVX2
template <int shift> Vc_INTRINSIC Vc_CONST m256i alignr(__m256i s1, __m256i s2)
{
    return _mm256_alignr_epi8(s1, s2, shift);
}
#else
template <int shift> Vc_INTRINSIC Vc_CONST m256i alignr(__m256i s1, __m256i s2)
{
    return insert128<1>(
        _mm256_castsi128_si256(_mm_alignr_epi8(_mm256_castsi256_si128(s1),
                                               _mm256_castsi256_si128(s2), shift)),
        _mm_alignr_epi8(extract128<1>(s1), extract128<1>(s2), shift));
}
#endif
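// Note (added comment): both branches implement the per-lane semantics of the AVX2
// _mm256_alignr_epi8 instruction, i.e. the byte-wise shift is performed independently on the low
// and the high 128-bit lane; the pre-AVX2 emulation therefore runs _mm_alignr_epi8 on each half
// and reassembles the result with insert128<1>.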

#ifdef Vc_IMPL_AVX2
#define Vc_AVX_TO_SSE_2_NEW(name) \
    Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m256i b0) \
    { \
        return _mm256_##name(a0, b0); \
    }
#define Vc_AVX_TO_SSE_256_128(name) \
    Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m128i b0) \
    { \
        return _mm256_##name(a0, b0); \
    }
#define Vc_AVX_TO_SSE_1i(name) \
    template <int i> Vc_INTRINSIC Vc_CONST m256i name(__m256i a0) \
    { \
        return _mm256_##name(a0, i); \
    }
#define Vc_AVX_TO_SSE_1(name) \
    Vc_INTRINSIC Vc_CONST __m256i name(__m256i a0) { return _mm256_##name(a0); }
#define Vc_AVX_TO_SSE_1_128(name, shift__) \
    Vc_INTRINSIC Vc_CONST __m256i name(__m128i a0) { return _mm256_##name(a0); }
#else
/**\internal
 * Defines the function \p name, which takes two __m256i arguments and calls `_mm_##name` on the
 * low and high 128-bit halves of the arguments.
 *
 * In case the AVX2 intrinsics are enabled, the arguments are directly passed to a single
 * `_mm256_##name` call.
 */
#define Vc_AVX_TO_SSE_1(name) \
    Vc_INTRINSIC Vc_CONST __m256i name(__m256i a0) \
    { \
        __m128i a1 = extract128<1>(a0); \
        __m128i r0 = _mm_##name(_mm256_castsi256_si128(a0)); \
        __m128i r1 = _mm_##name(a1); \
        return insert128<1>(_mm256_castsi128_si256(r0), r1); \
    }
#define Vc_AVX_TO_SSE_1_128(name, shift__) \
    Vc_INTRINSIC Vc_CONST __m256i name(__m128i a0) \
    { \
        __m128i r0 = _mm_##name(a0); \
        __m128i r1 = _mm_##name(_mm_srli_si128(a0, shift__)); \
        return insert128<1>(_mm256_castsi128_si256(r0), r1); \
    }
#define Vc_AVX_TO_SSE_2_NEW(name) \
    Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m256i b0) \
    { \
        m128i a1 = extract128<1>(a0); \
        m128i b1 = extract128<1>(b0); \
        m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0)); \
        m128i r1 = _mm_##name(a1, b1); \
        return insert128<1>(_mm256_castsi128_si256(r0), r1); \
    }
#define Vc_AVX_TO_SSE_256_128(name) \
    Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m128i b0) \
    { \
        m128i a1 = extract128<1>(a0); \
        m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), b0); \
        m128i r1 = _mm_##name(a1, b0); \
        return insert128<1>(_mm256_castsi128_si256(r0), r1); \
    }
#define Vc_AVX_TO_SSE_1i(name) \
    template <int i> Vc_INTRINSIC Vc_CONST m256i name(__m256i a0) \
    { \
        m128i a1 = extract128<1>(a0); \
        m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), i); \
        m128i r1 = _mm_##name(a1, i); \
        return insert128<1>(_mm256_castsi128_si256(r0), r1); \
    }
#endif
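// Illustrative sketch (added; for documentation only): without AVX2, an invocation like
// Vc_AVX_TO_SSE_2_NEW(add_epi16) expands to roughly
//
//     Vc_INTRINSIC Vc_CONST m256i add_epi16(__m256i a0, __m256i b0)
//     {
//         m128i a1 = extract128<1>(a0);                          // high halves
//         m128i b1 = extract128<1>(b0);
//         m128i r0 = _mm_add_epi16(_mm256_castsi256_si128(a0),
//                                  _mm256_castsi256_si128(b0));  // low halves
//         m128i r1 = _mm_add_epi16(a1, b1);
//         return insert128<1>(_mm256_castsi128_si256(r0), r1);   // recombine
//     }
//
// whereas with AVX2 the same macro simply forwards to _mm256_add_epi16(a0, b0).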
Vc_INTRINSIC Vc_CONST __m128i sll_epi16(__m128i a, __m128i b) { return _mm_sll_epi16(a, b); }
Vc_INTRINSIC Vc_CONST __m128i sll_epi32(__m128i a, __m128i b) { return _mm_sll_epi32(a, b); }
Vc_INTRINSIC Vc_CONST __m128i sll_epi64(__m128i a, __m128i b) { return _mm_sll_epi64(a, b); }
Vc_INTRINSIC Vc_CONST __m128i srl_epi16(__m128i a, __m128i b) { return _mm_srl_epi16(a, b); }
Vc_INTRINSIC Vc_CONST __m128i srl_epi32(__m128i a, __m128i b) { return _mm_srl_epi32(a, b); }
Vc_INTRINSIC Vc_CONST __m128i srl_epi64(__m128i a, __m128i b) { return _mm_srl_epi64(a, b); }
Vc_INTRINSIC Vc_CONST __m128i sra_epi16(__m128i a, __m128i b) { return _mm_sra_epi16(a, b); }
Vc_INTRINSIC Vc_CONST __m128i sra_epi32(__m128i a, __m128i b) { return _mm_sra_epi32(a, b); }

Vc_AVX_TO_SSE_1i(slli_epi16)
Vc_AVX_TO_SSE_1i(slli_epi32)
Vc_AVX_TO_SSE_1i(slli_epi64)
Vc_AVX_TO_SSE_1i(srai_epi16)
Vc_AVX_TO_SSE_1i(srai_epi32)
Vc_AVX_TO_SSE_1i(srli_epi16)
Vc_AVX_TO_SSE_1i(srli_epi32)
Vc_AVX_TO_SSE_1i(srli_epi64)

Vc_AVX_TO_SSE_256_128(sll_epi16)
Vc_AVX_TO_SSE_256_128(sll_epi32)
Vc_AVX_TO_SSE_256_128(sll_epi64)
Vc_AVX_TO_SSE_256_128(srl_epi16)
Vc_AVX_TO_SSE_256_128(srl_epi32)
Vc_AVX_TO_SSE_256_128(srl_epi64)
Vc_AVX_TO_SSE_256_128(sra_epi16)
Vc_AVX_TO_SSE_256_128(sra_epi32)

Vc_AVX_TO_SSE_2_NEW(cmpeq_epi8)
Vc_AVX_TO_SSE_2_NEW(cmpeq_epi16)
Vc_AVX_TO_SSE_2_NEW(cmpeq_epi32)
Vc_AVX_TO_SSE_2_NEW(cmpeq_epi64)
Vc_AVX_TO_SSE_2_NEW(cmpgt_epi8)
Vc_AVX_TO_SSE_2_NEW(cmpgt_epi16)
Vc_AVX_TO_SSE_2_NEW(cmpgt_epi32)
Vc_AVX_TO_SSE_2_NEW(cmpgt_epi64)
Vc_AVX_TO_SSE_2_NEW(unpackhi_epi16)
Vc_AVX_TO_SSE_2_NEW(unpacklo_epi16)
Vc_AVX_TO_SSE_2_NEW(add_epi16)
Vc_AVX_TO_SSE_2_NEW(add_epi32)
Vc_AVX_TO_SSE_2_NEW(add_epi64)
Vc_AVX_TO_SSE_2_NEW(sub_epi16)
Vc_AVX_TO_SSE_2_NEW(sub_epi32)
Vc_AVX_TO_SSE_2_NEW(mullo_epi16)
Vc_AVX_TO_SSE_2_NEW(sign_epi16)
Vc_AVX_TO_SSE_2_NEW(sign_epi32)
Vc_AVX_TO_SSE_2_NEW(min_epi8)
Vc_AVX_TO_SSE_2_NEW(max_epi8)
Vc_AVX_TO_SSE_2_NEW(min_epu16)
Vc_AVX_TO_SSE_2_NEW(max_epu16)
Vc_AVX_TO_SSE_2_NEW(min_epi32)
Vc_AVX_TO_SSE_2_NEW(max_epi32)
Vc_AVX_TO_SSE_2_NEW(min_epu32)
Vc_AVX_TO_SSE_2_NEW(max_epu32)
Vc_AVX_TO_SSE_2_NEW(mullo_epi32)

Vc_AVX_TO_SSE_1(abs_epi8)
Vc_AVX_TO_SSE_1(abs_epi16)
Vc_AVX_TO_SSE_1(abs_epi32)
Vc_AVX_TO_SSE_1_128(cvtepi8_epi16, 8)
Vc_AVX_TO_SSE_1_128(cvtepi8_epi32, 4)
Vc_AVX_TO_SSE_1_128(cvtepi8_epi64, 2)
Vc_AVX_TO_SSE_1_128(cvtepi16_epi32, 8)
Vc_AVX_TO_SSE_1_128(cvtepi16_epi64, 4)
Vc_AVX_TO_SSE_1_128(cvtepi32_epi64, 8)
Vc_AVX_TO_SSE_1_128(cvtepu8_epi16, 8)
Vc_AVX_TO_SSE_1_128(cvtepu8_epi32, 4)
Vc_AVX_TO_SSE_1_128(cvtepu8_epi64, 2)
Vc_AVX_TO_SSE_1_128(cvtepu16_epi32, 8)
Vc_AVX_TO_SSE_1_128(cvtepu16_epi64, 4)
Vc_AVX_TO_SSE_1_128(cvtepu32_epi64, 8)
#ifndef Vc_IMPL_AVX2

/////////////////////////////////////////////////////////////////////////
// implementation of the intrinsics missing in AVX
/////////////////////////////////////////////////////////////////////////

static Vc_INTRINSIC m256i Vc_CONST and_si256(__m256i x, __m256i y) {
    return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
}
static Vc_INTRINSIC m256i Vc_CONST andnot_si256(__m256i x, __m256i y) {
    return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
}
static Vc_INTRINSIC m256i Vc_CONST or_si256(__m256i x, __m256i y) {
    return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
}
static Vc_INTRINSIC m256i Vc_CONST xor_si256(__m256i x, __m256i y) {
    return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
}

Vc_INTRINSIC Vc_CONST int movemask_epi8(__m256i a0)
{
    m128i a1 = extract128<1>(a0);
    return (_mm_movemask_epi8(a1) << 16) | _mm_movemask_epi8(_mm256_castsi256_si128(a0));
}
template <int m> Vc_INTRINSIC Vc_CONST m256i blend_epi16(__m256i a0, __m256i b0)
{
    m128i a1 = extract128<1>(a0);
    m128i b1 = extract128<1>(b0);
    m128i r0 = _mm_blend_epi16(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), m & 0xff);
    m128i r1 = _mm_blend_epi16(a1, b1, m >> 8);
    return insert128<1>(_mm256_castsi128_si256(r0), r1);
}
Vc_INTRINSIC Vc_CONST m256i blendv_epi8(__m256i a0, __m256i b0, __m256i m0) {
    m128i a1 = extract128<1>(a0);
    m128i b1 = extract128<1>(b0);
    m128i m1 = extract128<1>(m0);
    m128i r0 = _mm_blendv_epi8(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), _mm256_castsi256_si128(m0));
    m128i r1 = _mm_blendv_epi8(a1, b1, m1);
    return insert128<1>(_mm256_castsi128_si256(r0), r1);
}
// mpsadbw_epu8 (__m128i __X, __m128i __Y, const int __M)

#else // Vc_IMPL_AVX2

static Vc_INTRINSIC Vc_CONST m256i xor_si256(__m256i x, __m256i y) { return _mm256_xor_si256(x, y); }
static Vc_INTRINSIC Vc_CONST m256i or_si256(__m256i x, __m256i y) { return _mm256_or_si256(x, y); }
static Vc_INTRINSIC Vc_CONST m256i and_si256(__m256i x, __m256i y) { return _mm256_and_si256(x, y); }
static Vc_INTRINSIC Vc_CONST m256i andnot_si256(__m256i x, __m256i y) { return _mm256_andnot_si256(x, y); }

/////////////////////////////////////////////////////////////////////////
// implementation of the intrinsics missing in AVX2
/////////////////////////////////////////////////////////////////////////
Vc_INTRINSIC Vc_CONST m256i blendv_epi8(__m256i a0, __m256i b0, __m256i m0)
{
    return _mm256_blendv_epi8(a0, b0, m0);
}
Vc_INTRINSIC Vc_CONST int movemask_epi8(__m256i a0)
{
    return _mm256_movemask_epi8(a0);
}

#endif // Vc_IMPL_AVX2

/////////////////////////////////////////////////////////////////////////
// implementation of intrinsics missing in AVX and AVX2
/////////////////////////////////////////////////////////////////////////

static Vc_INTRINSIC m256i cmplt_epi64(__m256i a, __m256i b) {
    return cmpgt_epi64(b, a);
}
static Vc_INTRINSIC m256i cmplt_epi32(__m256i a, __m256i b) {
    return cmpgt_epi32(b, a);
}
static Vc_INTRINSIC m256i cmplt_epi16(__m256i a, __m256i b) {
    return cmpgt_epi16(b, a);
}
static Vc_INTRINSIC m256i cmplt_epi8(__m256i a, __m256i b) {
    return cmpgt_epi8(b, a);
}

static Vc_INTRINSIC m256i cmpgt_epu8(__m256i a, __m256i b) {
    return cmpgt_epi8(xor_si256(a, setmin_epi8()), xor_si256(b, setmin_epi8()));
}
#if defined(Vc_IMPL_XOP)
Vc_AVX_TO_SSE_2_NEW(comlt_epu32)
Vc_AVX_TO_SSE_2_NEW(comgt_epu32)
Vc_AVX_TO_SSE_2_NEW(comlt_epu16)
Vc_AVX_TO_SSE_2_NEW(comgt_epu16)
static Vc_INTRINSIC m256i Vc_CONST cmplt_epu32(__m256i a, __m256i b) { return comlt_epu32(a, b); }
static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu32(__m256i a, __m256i b) { return comgt_epu32(a, b); }
static Vc_INTRINSIC m256i Vc_CONST cmplt_epu16(__m256i a, __m256i b) { return comlt_epu16(a, b); }
static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu16(__m256i a, __m256i b) { return comgt_epu16(a, b); }
#else
static Vc_INTRINSIC m256i Vc_CONST cmplt_epu32(__m256i _a, __m256i _b) {
    m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi32())));
    m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi32())));
    return cmplt_epi32(a, b);
}
static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu32(__m256i _a, __m256i _b) {
    m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi32())));
    m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi32())));
    return cmpgt_epi32(a, b);
}
static Vc_INTRINSIC m256i Vc_CONST cmplt_epu16(__m256i _a, __m256i _b) {
    m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi16())));
    m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi16())));
    return cmplt_epi16(a, b);
}
static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu16(__m256i _a, __m256i _b) {
    m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi16())));
    m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi16())));
    return cmpgt_epi16(a, b);
}
#endif

static Vc_INTRINSIC void _mm256_maskstore(float *mem, const __m256 mask, const __m256 v) {
    _mm256_maskstore_ps(mem, _mm256_castps_si256(mask), v);
}
static Vc_INTRINSIC void _mm256_maskstore(double *mem, const __m256d mask, const __m256d v) {
    _mm256_maskstore_pd(mem, _mm256_castpd_si256(mask), v);
}
static Vc_INTRINSIC void _mm256_maskstore(int *mem, const __m256i mask, const __m256i v) {
#ifdef Vc_IMPL_AVX2
    _mm256_maskstore_epi32(mem, mask, v);
#else
    _mm256_maskstore_ps(reinterpret_cast<float *>(mem), mask, _mm256_castsi256_ps(v));
#endif
}
static Vc_INTRINSIC void _mm256_maskstore(unsigned int *mem, const __m256i mask, const __m256i v) {
    _mm256_maskstore(reinterpret_cast<int *>(mem), mask, v);
}
static Vc_INTRINSIC void _mm256_maskstore(short *mem, const __m256i mask, const __m256i v) {
    using namespace AVX;
    _mm_maskmoveu_si128(_mm256_castsi256_si128(v), _mm256_castsi256_si128(mask), reinterpret_cast<char *>(&mem[0]));
    _mm_maskmoveu_si128(extract128<1>(v), extract128<1>(mask), reinterpret_cast<char *>(&mem[8]));
}
static Vc_INTRINSIC void _mm256_maskstore(unsigned short *mem, const __m256i mask, const __m256i v) {
    _mm256_maskstore(reinterpret_cast<short *>(mem), mask, v);
}
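// Usage sketch (added; illustrative only): each overload writes only the elements whose mask
// lane has its most significant bit set, e.g.
//
//     alignas(32) float out[8] = {};
//     __m256 v    = setone_ps();
//     __m256 mask = cmpgt_ps(v, _mm256_setzero_ps());  // selects all lanes here
//     _mm256_maskstore(out, mask, v);
//
// Note that the short/unsigned short overloads fall back to _mm_maskmoveu_si128, which issues
// byte-granular non-temporal stores.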

#undef Vc_AVX_TO_SSE_1
#undef Vc_AVX_TO_SSE_1_128
#undef Vc_AVX_TO_SSE_2_NEW
#undef Vc_AVX_TO_SSE_256_128
#undef Vc_AVX_TO_SSE_1i

template<typename R> Vc_INTRINSIC_L R stream_load(const float *mem) Vc_INTRINSIC_R;
template<> Vc_INTRINSIC m128 stream_load<m128>(const float *mem)
{
    return _mm_castsi128_ps(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<float *>(mem))));
}
template<> Vc_INTRINSIC m256 stream_load<m256>(const float *mem)
{
    return insert128<1>(_mm256_castps128_ps256(stream_load<m128>(mem)),
                        stream_load<m128>(mem + 4));
}

template<typename R> Vc_INTRINSIC_L R stream_load(const double *mem) Vc_INTRINSIC_R;
template<> Vc_INTRINSIC m128d stream_load<m128d>(const double *mem)
{
    return _mm_castsi128_pd(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<double *>(mem))));
}
template<> Vc_INTRINSIC m256d stream_load<m256d>(const double *mem)
{
    return insert128<1>(_mm256_castpd128_pd256(stream_load<m128d>(mem)),
                        stream_load<m128d>(mem + 2));
}

template<typename R> Vc_INTRINSIC_L R stream_load(const void *mem) Vc_INTRINSIC_R;
template<> Vc_INTRINSIC m128i stream_load<m128i>(const void *mem)
{
    return _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<void *>(mem)));
}
template<> Vc_INTRINSIC m256i stream_load<m256i>(const void *mem)
{
    return insert128<1>(_mm256_castsi128_si256(stream_load<m128i>(mem)),
                        stream_load<m128i>(static_cast<const __m128i *>(mem) + 1));
}
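// Usage sketch (added; illustrative only): these wrappers map to the MOVNTDQA streaming load, so
// mem must be at least 16-byte aligned, and the non-temporal hint only pays off for
// write-combining memory:
//
//     alignas(32) const float src[8] = {0, 1, 2, 3, 4, 5, 6, 7};
//     m256 v = stream_load<m256>(src);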

Vc_INTRINSIC void stream_store(float *mem, __m128 value, __m128 mask)
{
    _mm_maskmoveu_si128(_mm_castps_si128(value), _mm_castps_si128(mask), reinterpret_cast<char *>(mem));
}
Vc_INTRINSIC void stream_store(float *mem, __m256 value, __m256 mask)
{
    stream_store(mem, _mm256_castps256_ps128(value), _mm256_castps256_ps128(mask));
    stream_store(mem + 4, extract128<1>(value), extract128<1>(mask));
}
Vc_INTRINSIC void stream_store(double *mem, __m128d value, __m128d mask)
{
    _mm_maskmoveu_si128(_mm_castpd_si128(value), _mm_castpd_si128(mask), reinterpret_cast<char *>(mem));
}
Vc_INTRINSIC void stream_store(double *mem, __m256d value, __m256d mask)
{
    stream_store(mem, _mm256_castpd256_pd128(value), _mm256_castpd256_pd128(mask));
    stream_store(mem + 2, extract128<1>(value), extract128<1>(mask));
}
Vc_INTRINSIC void stream_store(void *mem, __m128i value, __m128i mask)
{
    _mm_maskmoveu_si128(value, mask, reinterpret_cast<char *>(mem));
}
Vc_INTRINSIC void stream_store(void *mem, __m256i value, __m256i mask)
{
    stream_store(mem, _mm256_castsi256_si128(value), _mm256_castsi256_si128(mask));
    stream_store(static_cast<__m128i *>(mem) + 1, extract128<1>(value), extract128<1>(mask));
}

#ifndef __x86_64__
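// Added comment: 32-bit targets do not provide the _mm_cvtsi64_si128 intrinsic (it needs the
// 64-bit GPR form of movq), so emulate it with a 64-bit memory load instead.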
Vc_INTRINSIC Vc_PURE __m128i _mm_cvtsi64_si128(int64_t x) {
    return _mm_castpd_si128(_mm_load_sd(reinterpret_cast<const double *>(&x)));
}
#endif

#ifdef Vc_IMPL_AVX2
template <int Scale> __m256 gather(const float *addr, __m256i idx)
{
    return _mm256_i32gather_ps(addr, idx, Scale);
}
template <int Scale> __m256d gather(const double *addr, __m128i idx)
{
    return _mm256_i32gather_pd(addr, idx, Scale);
}
template <int Scale> __m256i gather(const int *addr, __m256i idx)
{
    return _mm256_i32gather_epi32(addr, idx, Scale);
}
template <int Scale> __m256i gather(const unsigned *addr, __m256i idx)
{
    return _mm256_i32gather_epi32(aliasing_cast<int>(addr), idx, Scale);
}

template <int Scale> __m256 gather(__m256 src, __m256 k, const float *addr, __m256i idx)
{
    return _mm256_mask_i32gather_ps(src, addr, idx, k, Scale);
}
template <int Scale>
__m256d gather(__m256d src, __m256d k, const double *addr, __m128i idx)
{
    return _mm256_mask_i32gather_pd(src, addr, idx, k, Scale);
}
template <int Scale> __m256i gather(__m256i src, __m256i k, const int *addr, __m256i idx)
{
    return _mm256_mask_i32gather_epi32(src, addr, idx, k, Scale);
}
template <int Scale>
__m256i gather(__m256i src, __m256i k, const unsigned *addr, __m256i idx)
{
    return _mm256_mask_i32gather_epi32(src, aliasing_cast<int>(addr), idx, k, Scale);
}
#endif
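// Usage sketch (added; illustrative only, AVX2): Scale is the byte stride applied to each 32-bit
// index, so for densely indexed float data it is sizeof(float):
//
//     alignas(32) const float table[8] = {0, 1, 2, 3, 4, 5, 6, 7};
//     __m256i idx = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
//     __m256  v   = gather<sizeof(float)>(table, idx);  // v[i] == table[idx[i]]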

}  // namespace AvxIntrinsics
}  // namespace Vc

namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX
{
using namespace AvxIntrinsics;
}  // namespace AVX
namespace AVX2
{
using namespace AvxIntrinsics;
}  // namespace AVX2
namespace AVX
{
template<typename T> struct VectorTypeHelper;
template<> struct VectorTypeHelper<         char > { typedef __m256i Type; };
template<> struct VectorTypeHelper<  signed char > { typedef __m256i Type; };
template<> struct VectorTypeHelper<unsigned char > { typedef __m256i Type; };
template<> struct VectorTypeHelper<         short> { typedef __m256i Type; };
template<> struct VectorTypeHelper<unsigned short> { typedef __m256i Type; };
template<> struct VectorTypeHelper<         int  > { typedef __m256i Type; };
template<> struct VectorTypeHelper<unsigned int  > { typedef __m256i Type; };
template<> struct VectorTypeHelper<         long > { typedef __m256i Type; };
template<> struct VectorTypeHelper<unsigned long > { typedef __m256i Type; };
template<> struct VectorTypeHelper<         long long> { typedef __m256i Type; };
template<> struct VectorTypeHelper<unsigned long long> { typedef __m256i Type; };
template<> struct VectorTypeHelper<         float> { typedef __m256  Type; };
template<> struct VectorTypeHelper<        double> { typedef __m256d Type; };

template <typename T>
using IntegerVectorType =
    typename std::conditional<sizeof(T) == 16, __m128i, __m256i>::type;
template <typename T>
using DoubleVectorType =
    typename std::conditional<sizeof(T) == 16, __m128d, __m256d>::type;
template <typename T>
using FloatVectorType =
    typename std::conditional<sizeof(T) == 16, __m128, __m256>::type;

template<typename T> struct VectorHelper {};
template<typename T> struct VectorHelperSize;
}  // namespace AVX
}  // namespace Vc

#endif // VC_AVX_INTRINSICS_H_