1 /* Copyright (C) 2011-2017 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6 */
7
8 #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_EXTRACT_H
9 #define LIBSIMDPP_SIMDPP_DETAIL_INSN_EXTRACT_H
10
11 #ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13 #endif
14
15 #include <simdpp/types.h>
16 #include <simdpp/core/cast.h>
17 #include <simdpp/core/move_l.h>
18 #include <simdpp/core/i_shift_l.h>
19 #include <simdpp/core/i_sub.h>
20 #include <simdpp/core/make_int.h>
21 #include <simdpp/detail/insn/split.h>
22 #include <simdpp/detail/mem_block.h>
23
24 namespace simdpp {
25 namespace SIMDPP_ARCH_NAMESPACE {
26 namespace detail {
27 namespace insn {
28
29 template<unsigned id> SIMDPP_INL
i_extract(const uint8<16> & a)30 uint8_t i_extract(const uint8<16>& a)
31 {
32 #if SIMDPP_USE_NULL
33 return a.el(id);
34 #elif SIMDPP_USE_SSE4_1
35 // Explicit cast is needed due to bug in Clang headers (intrinsic
36 // implemented as a macro with no appropriate casts) and a bug in Clang
37 // (thinks explicit conversion operators have the same rank as the regular
38 // ones)
39 return _mm_extract_epi8(a.native(), id);
40 #elif SIMDPP_USE_SSE2
41 unsigned shift = (id % 2 == 1) ? 8 : 0;
42 return _mm_extract_epi16(a.native(), id/2) >> shift;
43 #elif SIMDPP_USE_NEON
44 return vgetq_lane_u8(a.native(), id);
45 #elif SIMDPP_USE_ALTIVEC
46 detail::mem_block<uint8x16> ax(a);
47 vec_ste(a.native(), 0, &ax[id]);
48 return ax[id];
49 #elif SIMDPP_USE_MSA
50 return __msa_copy_u_b((v16i8) a.native(), id);
51 #endif
52 }
53
#if SIMDPP_USE_AVX2
// Returns the 8-bit element at index `id` of a 32-element vector.
template<unsigned id> SIMDPP_INL
uint8_t i_extract(const uint8<32>& a)
{
    // id / 16 selects the 128-bit half, id % 16 the byte inside that half.
    const __m128i half = _mm256_extracti128_si256(a.native(), id / 16);
    return _mm_extract_epi8(half, id % 16);
}
#endif
62
#if SIMDPP_USE_AVX512BW
// Returns the 8-bit element at index `id` of a 64-element vector.
template<unsigned id> SIMDPP_INL
uint8_t i_extract(const uint8<64>& a)
{
    // id / 16 selects the 128-bit lane, id % 16 the byte inside that lane.
    const __m128i lane = _mm512_extracti32x4_epi32(a.native(), id / 16);
    return _mm_extract_epi8(lane, id % 16);
}
#endif
71
72 // -----------------------------------------------------------------------------
73
74 template<unsigned id> SIMDPP_INL
i_extract(const int8<16> & a)75 int8_t i_extract(const int8<16>& a)
76 {
77 #if SIMDPP_USE_MSA
78 return __msa_copy_s_b(a.native(), id);
79 #else
80 return i_extract<id>(uint8x16(a));
81 #endif
82 }
83
#if SIMDPP_USE_AVX2
// Returns the signed 8-bit element at index `id` of a 32-element vector.
template<unsigned id> SIMDPP_INL
int8_t i_extract(const int8<32>& a)
{
    // id / 16 selects the 128-bit half, id % 16 the byte inside that half.
    const __m128i half = _mm256_extracti128_si256(a.native(), id / 16);
    return _mm_extract_epi8(half, id % 16);
}
#endif
92
#if SIMDPP_USE_AVX512BW
// Returns the signed 8-bit element at index `id` of a 64-element vector.
template<unsigned id> SIMDPP_INL
int8_t i_extract(const int8<64>& a)
{
    // id / 16 selects the 128-bit lane, id % 16 the byte inside that lane.
    const __m128i lane = _mm512_extracti32x4_epi32(a.native(), id / 16);
    return _mm_extract_epi8(lane, id % 16);
}
#endif
101
102 // -----------------------------------------------------------------------------
103
104 template<unsigned id> SIMDPP_INL
i_extract(const uint16<8> & a)105 uint16_t i_extract(const uint16<8>& a)
106 {
107 #if SIMDPP_USE_NULL
108 return a.el(id);
109 #elif SIMDPP_USE_SSE2
110 return _mm_extract_epi16(a.native(), id);
111 #elif SIMDPP_USE_NEON
112 return vgetq_lane_u16(a.native(), id);
113 #elif SIMDPP_USE_ALTIVEC
114 detail::mem_block<uint16x8> ax(a);
115 vec_ste(a.native(), 0, &ax[id]);
116 return ax[id];
117 #elif SIMDPP_USE_MSA
118 return __msa_copy_u_h((v8i16) a.native(), id);
119 #endif
120 }
121
#if SIMDPP_USE_AVX2
// Returns the 16-bit element at index `id` of a 16-element vector.
template<unsigned id> SIMDPP_INL
uint16_t i_extract(const uint16<16>& a)
{
    // id / 8 selects the 128-bit half, id % 8 the word inside that half.
    const __m128i half = _mm256_extracti128_si256(a.native(), id / 8);
    return _mm_extract_epi16(half, id % 8);
}
#endif
130
#if SIMDPP_USE_AVX512BW
// Returns the 16-bit element at index `id` of a 32-element vector.
template<unsigned id> SIMDPP_INL
uint16_t i_extract(const uint16<32>& a)
{
    // id / 8 selects the 128-bit lane, id % 8 the word inside that lane.
    const __m128i lane = _mm512_extracti32x4_epi32(a.native(), id / 8);
    return _mm_extract_epi16(lane, id % 8);
}
#endif
139
140 // -----------------------------------------------------------------------------
141
142 template<unsigned id> SIMDPP_INL
i_extract(const int16<8> & a)143 int16_t i_extract(const int16<8>& a)
144 {
145 #if SIMDPP_USE_MSA
146 return __msa_copy_s_h(a.native(), id);
147 #else
148 return i_extract<id>(uint16x8(a));
149 #endif
150 }
151
#if SIMDPP_USE_AVX2
// Returns the signed 16-bit element at index `id` of a 16-element vector.
template<unsigned id> SIMDPP_INL
int16_t i_extract(const int16<16>& a)
{
    // id / 8 selects the 128-bit half, id % 8 the word inside that half.
    const __m128i half = _mm256_extracti128_si256(a.native(), id / 8);
    return _mm_extract_epi16(half, id % 8);
}
#endif
160
#if SIMDPP_USE_AVX512BW
// Returns the signed 16-bit element at index `id` of a 32-element vector.
template<unsigned id> SIMDPP_INL
int16_t i_extract(const int16<32>& a)
{
    // id / 8 selects the 128-bit lane, id % 8 the word inside that lane.
    const __m128i lane = _mm512_extracti32x4_epi32(a.native(), id / 8);
    return _mm_extract_epi16(lane, id % 8);
}
#endif
169
170 // -----------------------------------------------------------------------------
171
172 template<unsigned id> SIMDPP_INL
i_extract(const uint32<4> & a)173 uint32_t i_extract(const uint32<4>& a)
174 {
175 #if SIMDPP_USE_NULL
176 return a.el(id);
177 #elif SIMDPP_USE_SSE4_1
178 return _mm_extract_epi32(a.native(), id);
179 #elif SIMDPP_USE_SSE2
180 // when id==0, move_l is template-specialized and does nothing
181 return _mm_cvtsi128_si32(move4_l<id>(a).eval().native());
182 #elif SIMDPP_USE_NEON
183 return vgetq_lane_u32(a.native(), id);
184 #elif SIMDPP_USE_ALTIVEC
185 detail::mem_block<uint32x4> ax(a);
186 vec_ste(a.native(), 0, &ax[id]);
187 return ax[id];
188 #elif SIMDPP_USE_MSA
189 return __msa_copy_u_w((v4i32) a.native(), id);
190 #endif
191 }
192
#if SIMDPP_USE_AVX2
// Returns the 32-bit element at index `id` of an 8-element vector.
template<unsigned id> SIMDPP_INL
uint32_t i_extract(const uint32<8>& a)
{
    // id / 4 selects the 128-bit half, id % 4 the dword inside that half.
    const __m128i half = _mm256_extracti128_si256(a.native(), id / 4);
    return _mm_extract_epi32(half, id % 4);
}
#endif
201
#if SIMDPP_USE_AVX512F
// Returns the 32-bit element at index `id` of a 16-element vector.
template<unsigned id> SIMDPP_INL
uint32_t i_extract(const uint32<16>& a)
{
    // id / 4 selects the 128-bit lane, id % 4 the dword inside that lane.
    const __m128i lane = _mm512_extracti32x4_epi32(a.native(), id / 4);
    return _mm_extract_epi32(lane, id % 4);
}
#endif
210
211 // -----------------------------------------------------------------------------
212
213 template<unsigned id> SIMDPP_INL
i_extract(const int32<4> & a)214 int32_t i_extract(const int32<4>& a)
215 {
216 #if SIMDPP_USE_MSA
217 return __msa_copy_s_w(a.native(), id);
218 #else
219 return i_extract<id>(uint32x4(a));
220 #endif
221 }
222
#if SIMDPP_USE_AVX2
// Returns the signed 32-bit element at index `id` of an 8-element vector.
template<unsigned id> SIMDPP_INL
int32_t i_extract(const int32<8>& a)
{
    // id / 4 selects the 128-bit half, id % 4 the dword inside that half.
    const __m128i half = _mm256_extracti128_si256(a.native(), id / 4);
    return _mm_extract_epi32(half, id % 4);
}
#endif
231
#if SIMDPP_USE_AVX512F
// Returns the signed 32-bit element at index `id` of a 16-element vector.
template<unsigned id> SIMDPP_INL
int32_t i_extract(const int32<16>& a)
{
    // id / 4 selects the 128-bit lane, id % 4 the dword inside that lane.
    const __m128i lane = _mm512_extracti32x4_epi32(a.native(), id / 4);
    return _mm_extract_epi32(lane, id % 4);
}
#endif
240
241 // -----------------------------------------------------------------------------
242
243 template<unsigned id> SIMDPP_INL
i_extract(const uint64<2> & a)244 uint64_t i_extract(const uint64<2>& a)
245 {
246 #if SIMDPP_USE_NULL
247 return a.el(id);
248 #elif SIMDPP_USE_SSE4_1
249 #if SIMDPP_32_BITS
250 uint32x4 t = uint32x4(a);
251 uint64_t r = i_extract<id*2>(t);
252 r |= uint64_t(i_extract<id*2+1>(t)) << 32;
253 return r;
254 #else
255 return _mm_extract_epi64(a.native(), id);
256 #endif
257 #elif SIMDPP_USE_SSE2
258 #if SIMDPP_32_BITS
259 uint32x4 t = uint32x4(a);
260 uint64_t r = 0;
261 t = move4_l<id*2>(t); // when id==0, move_l is template-specialized and does nothing
262 r = i_extract<0>(t);
263 t = move4_l<1>(t);
264 r |= uint64_t(i_extract<0>(t)) << 32;
265 return r;
266 #else
267 uint64x2 t = a;
268 if (id != 0) {
269 t = move2_l<id>(t);
270 }
271 return _mm_cvtsi128_si64(t.native());
272 #endif
273 #elif SIMDPP_USE_NEON
274 return vgetq_lane_u64(a.native(), id);
275 #elif SIMDPP_USE_ALTIVEC
276 detail::mem_block<uint64x2> ax(a);
277 return ax[id];
278 #elif SIMDPP_USE_MSA
279 #if SIMDPP_64_BITS
280 return __msa_copy_u_d((v2i64) a.native(), id);
281 #else
282 v4i32 a32 = (v4i32) a.native();
283 uint64_t lo = __msa_copy_u_w(a32, id*2);
284 uint64_t hi = __msa_copy_u_w(a32, id*2+1);
285 return lo | (hi << 32);
286 #endif
287 #endif
288 }
289
#if SIMDPP_USE_AVX2
// Returns the 64-bit element at index `id` of a 4-element vector.
template<unsigned id> SIMDPP_INL
uint64_t i_extract(const uint64<4>& a)
{
    // id / 2 selects the 128-bit half; the 2-element overload then picks
    // element id % 2 from it.
    const uint64<2> half = _mm256_extracti128_si256(a.native(), id / 2);
    return i_extract<id % 2>(half);
}
#endif
298
#if SIMDPP_USE_AVX512F
// Returns the 64-bit element at index `id` of an 8-element vector.
template<unsigned id> SIMDPP_INL
uint64_t i_extract(const uint64<8>& a)
{
    // id / 2 selects the 128-bit lane; the 2-element overload then picks
    // element id % 2 from it.
    const uint64<2> lane = _mm512_extracti32x4_epi32(a.native(), id / 2);
    return i_extract<id % 2>(lane);
}
#endif
307
308 // -----------------------------------------------------------------------------
309
310 template<unsigned id> SIMDPP_INL
i_extract(const int64<2> & a)311 int64_t i_extract(const int64<2>& a)
312 {
313 #if SIMDPP_USE_MSA
314 #if SIMDPP_64_BITS
315 return __msa_copy_s_d(a, id);
316 #else
317 v4i32 a32 = (v4i32) a.native();
318 int64_t lo = __msa_copy_s_w(a32, id*2);
319 int64_t hi = __msa_copy_s_w(a32, id*2+1);
320 return lo | (hi << 32);
321 #endif
322 #else
323 return i_extract<id>(uint64x2(a));
324 #endif
325 }
326
#if SIMDPP_USE_AVX2
// Returns the signed 64-bit element at index `id` of a 4-element vector.
template<unsigned id> SIMDPP_INL
int64_t i_extract(const int64<4>& a)
{
    // id / 2 selects the 128-bit half; the unsigned 2-element overload then
    // picks element id % 2, and the result is converted to int64_t on return.
    const uint64<2> half = _mm256_extracti128_si256(a.native(), id / 2);
    return i_extract<id % 2>(half);
}
#endif
335
#if SIMDPP_USE_AVX512F
// Returns the signed 64-bit element at index `id` of an 8-element vector.
template<unsigned id> SIMDPP_INL
int64_t i_extract(const int64<8>& a)
{
    // id / 2 selects the 128-bit lane; the unsigned 2-element overload then
    // picks element id % 2, and the result is converted to int64_t on return.
    const uint64<2> lane = _mm512_extracti32x4_epi32(a.native(), id / 2);
    return i_extract<id % 2>(lane);
}
#endif
344
345 // -----------------------------------------------------------------------------
346
347 template<unsigned id> SIMDPP_INL
i_extract(const float32<4> & a)348 float i_extract(const float32<4>& a)
349 {
350 #if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
351 return a.el(id);
352 #elif SIMDPP_USE_SSE2
353 return bit_cast<float>(i_extract<id>(int32x4(a)));
354 #elif SIMDPP_USE_NEON
355 return vgetq_lane_f32(a.native(), id);
356 #elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
357 detail::mem_block<float32x4> ax(a);
358 return ax[id];
359 #endif
360 }
361
#if SIMDPP_USE_AVX
// Returns the single-precision element at index `id` of an 8-element vector.
template<unsigned id> SIMDPP_INL
float i_extract(const float32<8>& a)
{
    // id / 4 selects the 128-bit half; element id % 4 is read from it as an
    // integer and its bits are reinterpreted as float.
    const __m128 half = _mm256_extractf128_ps(a.native(), id / 4);
    const int bits = _mm_extract_epi32(_mm_castps_si128(half), id % 4);
    return bit_cast<float>(bits);
}
#endif
370
#if SIMDPP_USE_AVX512F
// Returns the single-precision element at index `id` of a 16-element vector.
template<unsigned id> SIMDPP_INL
float i_extract(const float32<16>& a)
{
    // id / 4 selects the 128-bit lane; element id % 4 is read from it as an
    // integer and its bits are reinterpreted as float.
    const __m128 lane = _mm512_extractf32x4_ps(a.native(), id / 4);
    const int bits = _mm_extract_epi32(_mm_castps_si128(lane), id % 4);
    return bit_cast<float>(bits);
}
#endif
379
380 // -----------------------------------------------------------------------------
381
382 template<unsigned id> SIMDPP_INL
i_extract(const float64<2> & a)383 double i_extract(const float64<2>& a)
384 {
385 #if SIMDPP_USE_NULL
386 return a.el(id);
387 #elif SIMDPP_USE_SSE2
388 return bit_cast<double>(i_extract<id>(int64x2(a)));
389 #elif SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
390 detail::mem_block<float64x2> ax(a);
391 return ax[id];
392 #elif SIMDPP_USE_NEON64
393 return vgetq_lane_f64(a.native(), id);
394 #endif
395 }
396
#if SIMDPP_USE_AVX
// Returns the double-precision element at index `id` of a 4-element vector.
template<unsigned id> SIMDPP_INL
double i_extract(const float64<4>& a)
{
    // id / 2 selects the 128-bit half; element id % 2 is read from it as a
    // 64-bit integer and its bits are reinterpreted as double.
    const __m128d half = _mm256_extractf128_pd(a.native(), id / 2);
    const uint64<2> bits = (uint64<2>) _mm_castpd_si128(half);
    return bit_cast<double>(i_extract<id % 2>(bits));
}
#endif
405
#if SIMDPP_USE_AVX512F
// Returns the double-precision element at index `id` of an 8-element vector.
template<unsigned id> SIMDPP_INL
double i_extract(const float64<8>& a)
{
    // The vector is viewed as floats to pull out 128-bit lane id / 2; element
    // id % 2 is then read from that lane as a 64-bit integer and its bits are
    // reinterpreted as double.
    const __m128 lane = _mm512_extractf32x4_ps(_mm512_castpd_ps(a.native()), id / 2);
    const uint64<2> bits = (uint64<2>) _mm_castps_si128(lane);
    return bit_cast<double>(i_extract<id % 2>(bits));
}
#endif
414
415 // -----------------------------------------------------------------------------
416
417 template<unsigned id, class V> SIMDPP_INL
i_extract(const V & a)418 typename V::element_type i_extract(const V& a)
419 {
420 typename V::base_vector_type base = a.vec(id / V::base_length);
421 return i_extract<id % V::base_length>(base);
422 }
423
424 } // namespace insn
425 } // namespace detail
426 } // namespace SIMDPP_ARCH_NAMESPACE
427 } // namespace simdpp
428
429 #endif
430