1 /*  Copyright (C) 2011-2017  Povilas Kanapickas <povilas@radix.lt>
2 
3     Distributed under the Boost Software License, Version 1.0.
4         (See accompanying file LICENSE_1_0.txt or copy at
5             http://www.boost.org/LICENSE_1_0.txt)
6 */
7 
8 #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_EXTRACT_H
9 #define LIBSIMDPP_SIMDPP_DETAIL_INSN_EXTRACT_H
10 
11 #ifndef LIBSIMDPP_SIMD_H
12     #error "This file must be included through simd.h"
13 #endif
14 
15 #include <simdpp/types.h>
16 #include <simdpp/core/cast.h>
17 #include <simdpp/core/move_l.h>
18 #include <simdpp/core/i_shift_l.h>
19 #include <simdpp/core/i_sub.h>
20 #include <simdpp/core/make_int.h>
21 #include <simdpp/detail/insn/split.h>
22 #include <simdpp/detail/mem_block.h>
23 
24 namespace simdpp {
25 namespace SIMDPP_ARCH_NAMESPACE {
26 namespace detail {
27 namespace insn {
28 
29 template<unsigned id> SIMDPP_INL
i_extract(const uint8<16> & a)30 uint8_t i_extract(const uint8<16>& a)
31 {
32 #if SIMDPP_USE_NULL
33     return a.el(id);
34 #elif SIMDPP_USE_SSE4_1
35     // Explicit cast is needed due to bug in Clang headers (intrinsic
36     // implemented as a macro with no appropriate casts) and a bug in Clang
37     // (thinks explicit conversion operators have the same rank as the regular
38     // ones)
39     return _mm_extract_epi8(a.native(), id);
40 #elif SIMDPP_USE_SSE2
41     unsigned shift = (id % 2 == 1) ? 8 : 0;
42     return _mm_extract_epi16(a.native(), id/2) >> shift;
43 #elif SIMDPP_USE_NEON
44     return vgetq_lane_u8(a.native(), id);
45 #elif SIMDPP_USE_ALTIVEC
46     detail::mem_block<uint8x16> ax(a);
47     vec_ste(a.native(), 0, &ax[id]);
48     return ax[id];
49 #elif SIMDPP_USE_MSA
50     return __msa_copy_u_b((v16i8) a.native(), id);
51 #endif
52 }
53 
#if SIMDPP_USE_AVX2
// Returns element number `id` of a 256-bit vector of unsigned 8-bit integers.
template<unsigned id> SIMDPP_INL
uint8_t i_extract(const uint8<32>& a)
{
    // Select the 128-bit half holding the element, then index inside it.
    const __m128i half = _mm256_extracti128_si256(a.native(), id / 16);
    return _mm_extract_epi8(half, id % 16);
}
#endif
62 
#if SIMDPP_USE_AVX512BW
// Returns element number `id` of a 512-bit vector of unsigned 8-bit integers.
template<unsigned id> SIMDPP_INL
uint8_t i_extract(const uint8<64>& a)
{
    // Select the 128-bit quarter holding the element, then index inside it.
    const __m128i quarter = _mm512_extracti32x4_epi32(a.native(), id / 16);
    return _mm_extract_epi8(quarter, id % 16);
}
#endif
71 
72 // -----------------------------------------------------------------------------
73 
74 template<unsigned id> SIMDPP_INL
i_extract(const int8<16> & a)75 int8_t i_extract(const int8<16>& a)
76 {
77 #if SIMDPP_USE_MSA
78     return __msa_copy_s_b(a.native(), id);
79 #else
80     return i_extract<id>(uint8x16(a));
81 #endif
82 }
83 
#if SIMDPP_USE_AVX2
// Returns element number `id` of a 256-bit vector of signed 8-bit integers.
template<unsigned id> SIMDPP_INL
int8_t i_extract(const int8<32>& a)
{
    // Select the 128-bit half holding the element, then index inside it.
    // The extracted value is converted to int8_t on return.
    const __m128i half = _mm256_extracti128_si256(a.native(), id / 16);
    return _mm_extract_epi8(half, id % 16);
}
#endif
92 
#if SIMDPP_USE_AVX512BW
// Returns element number `id` of a 512-bit vector of signed 8-bit integers.
template<unsigned id> SIMDPP_INL
int8_t i_extract(const int8<64>& a)
{
    // Select the 128-bit quarter holding the element, then index inside it.
    // The extracted value is converted to int8_t on return.
    const __m128i quarter = _mm512_extracti32x4_epi32(a.native(), id / 16);
    return _mm_extract_epi8(quarter, id % 16);
}
#endif
101 
102 // -----------------------------------------------------------------------------
103 
104 template<unsigned id> SIMDPP_INL
i_extract(const uint16<8> & a)105 uint16_t i_extract(const uint16<8>& a)
106 {
107 #if SIMDPP_USE_NULL
108     return a.el(id);
109 #elif SIMDPP_USE_SSE2
110     return _mm_extract_epi16(a.native(), id);
111 #elif SIMDPP_USE_NEON
112     return vgetq_lane_u16(a.native(), id);
113 #elif SIMDPP_USE_ALTIVEC
114     detail::mem_block<uint16x8> ax(a);
115     vec_ste(a.native(), 0, &ax[id]);
116     return ax[id];
117 #elif SIMDPP_USE_MSA
118     return __msa_copy_u_h((v8i16) a.native(), id);
119 #endif
120 }
121 
#if SIMDPP_USE_AVX2
// Returns element number `id` of a 256-bit vector of unsigned 16-bit integers.
template<unsigned id> SIMDPP_INL
uint16_t i_extract(const uint16<16>& a)
{
    // Select the 128-bit half holding the element, then index inside it.
    const __m128i half = _mm256_extracti128_si256(a.native(), id / 8);
    return _mm_extract_epi16(half, id % 8);
}
#endif
130 
#if SIMDPP_USE_AVX512BW
// Returns element number `id` of a 512-bit vector of unsigned 16-bit integers.
template<unsigned id> SIMDPP_INL
uint16_t i_extract(const uint16<32>& a)
{
    // Select the 128-bit quarter holding the element, then index inside it.
    const __m128i quarter = _mm512_extracti32x4_epi32(a.native(), id / 8);
    return _mm_extract_epi16(quarter, id % 8);
}
#endif
139 
140 // -----------------------------------------------------------------------------
141 
142 template<unsigned id> SIMDPP_INL
i_extract(const int16<8> & a)143 int16_t i_extract(const int16<8>& a)
144 {
145 #if SIMDPP_USE_MSA
146     return __msa_copy_s_h(a.native(), id);
147 #else
148     return i_extract<id>(uint16x8(a));
149 #endif
150 }
151 
#if SIMDPP_USE_AVX2
// Returns element number `id` of a 256-bit vector of signed 16-bit integers.
template<unsigned id> SIMDPP_INL
int16_t i_extract(const int16<16>& a)
{
    // Select the 128-bit half holding the element, then index inside it.
    // The extracted value is converted to int16_t on return.
    const __m128i half = _mm256_extracti128_si256(a.native(), id / 8);
    return _mm_extract_epi16(half, id % 8);
}
#endif
160 
#if SIMDPP_USE_AVX512BW
// Returns element number `id` of a 512-bit vector of signed 16-bit integers.
template<unsigned id> SIMDPP_INL
int16_t i_extract(const int16<32>& a)
{
    // Select the 128-bit quarter holding the element, then index inside it.
    // The extracted value is converted to int16_t on return.
    const __m128i quarter = _mm512_extracti32x4_epi32(a.native(), id / 8);
    return _mm_extract_epi16(quarter, id % 8);
}
#endif
169 
170 // -----------------------------------------------------------------------------
171 
172 template<unsigned id> SIMDPP_INL
i_extract(const uint32<4> & a)173 uint32_t i_extract(const uint32<4>& a)
174 {
175 #if SIMDPP_USE_NULL
176     return a.el(id);
177 #elif SIMDPP_USE_SSE4_1
178     return _mm_extract_epi32(a.native(), id);
179 #elif SIMDPP_USE_SSE2
180     // when id==0, move_l is template-specialized and does nothing
181     return _mm_cvtsi128_si32(move4_l<id>(a).eval().native());
182 #elif SIMDPP_USE_NEON
183     return vgetq_lane_u32(a.native(), id);
184 #elif SIMDPP_USE_ALTIVEC
185     detail::mem_block<uint32x4> ax(a);
186     vec_ste(a.native(), 0, &ax[id]);
187     return ax[id];
188 #elif SIMDPP_USE_MSA
189     return __msa_copy_u_w((v4i32) a.native(), id);
190 #endif
191 }
192 
#if SIMDPP_USE_AVX2
// Returns element number `id` of a 256-bit vector of unsigned 32-bit integers.
template<unsigned id> SIMDPP_INL
uint32_t i_extract(const uint32<8>& a)
{
    // Select the 128-bit half holding the element, then index inside it.
    const __m128i half = _mm256_extracti128_si256(a.native(), id / 4);
    return _mm_extract_epi32(half, id % 4);
}
#endif
201 
#if SIMDPP_USE_AVX512F
// Returns element number `id` of a 512-bit vector of unsigned 32-bit integers.
template<unsigned id> SIMDPP_INL
uint32_t i_extract(const uint32<16>& a)
{
    // Select the 128-bit quarter holding the element, then index inside it.
    const __m128i quarter = _mm512_extracti32x4_epi32(a.native(), id / 4);
    return _mm_extract_epi32(quarter, id % 4);
}
#endif
210 
211 // -----------------------------------------------------------------------------
212 
213 template<unsigned id> SIMDPP_INL
i_extract(const int32<4> & a)214 int32_t i_extract(const int32<4>& a)
215 {
216 #if SIMDPP_USE_MSA
217     return __msa_copy_s_w(a.native(), id);
218 #else
219     return i_extract<id>(uint32x4(a));
220 #endif
221 }
222 
#if SIMDPP_USE_AVX2
// Returns element number `id` of a 256-bit vector of signed 32-bit integers.
template<unsigned id> SIMDPP_INL
int32_t i_extract(const int32<8>& a)
{
    // Select the 128-bit half holding the element, then index inside it.
    const __m128i half = _mm256_extracti128_si256(a.native(), id / 4);
    return _mm_extract_epi32(half, id % 4);
}
#endif
231 
#if SIMDPP_USE_AVX512F
// Returns element number `id` of a 512-bit vector of signed 32-bit integers.
template<unsigned id> SIMDPP_INL
int32_t i_extract(const int32<16>& a)
{
    // Select the 128-bit quarter holding the element, then index inside it.
    const __m128i quarter = _mm512_extracti32x4_epi32(a.native(), id / 4);
    return _mm_extract_epi32(quarter, id % 4);
}
#endif
240 
241 // -----------------------------------------------------------------------------
242 
243 template<unsigned id> SIMDPP_INL
i_extract(const uint64<2> & a)244 uint64_t i_extract(const uint64<2>& a)
245 {
246 #if SIMDPP_USE_NULL
247     return a.el(id);
248 #elif SIMDPP_USE_SSE4_1
249 #if SIMDPP_32_BITS
250     uint32x4 t = uint32x4(a);
251     uint64_t r = i_extract<id*2>(t);
252     r |= uint64_t(i_extract<id*2+1>(t)) << 32;
253     return r;
254 #else
255     return _mm_extract_epi64(a.native(), id);
256 #endif
257 #elif SIMDPP_USE_SSE2
258 #if SIMDPP_32_BITS
259     uint32x4 t = uint32x4(a);
260     uint64_t r = 0;
261     t = move4_l<id*2>(t); // when id==0, move_l is template-specialized and does nothing
262     r = i_extract<0>(t);
263     t = move4_l<1>(t);
264     r |= uint64_t(i_extract<0>(t)) << 32;
265     return r;
266 #else
267     uint64x2 t = a;
268     if (id != 0) {
269         t = move2_l<id>(t);
270     }
271     return _mm_cvtsi128_si64(t.native());
272 #endif
273 #elif SIMDPP_USE_NEON
274     return vgetq_lane_u64(a.native(), id);
275 #elif SIMDPP_USE_ALTIVEC
276     detail::mem_block<uint64x2> ax(a);
277     return ax[id];
278 #elif SIMDPP_USE_MSA
279 #if SIMDPP_64_BITS
280     return __msa_copy_u_d((v2i64) a.native(), id);
281 #else
282     v4i32 a32 = (v4i32) a.native();
283     uint64_t lo = __msa_copy_u_w(a32, id*2);
284     uint64_t hi = __msa_copy_u_w(a32, id*2+1);
285     return lo | (hi << 32);
286 #endif
287 #endif
288 }
289 
#if SIMDPP_USE_AVX2
// Returns element number `id` of a 256-bit vector of unsigned 64-bit integers.
template<unsigned id> SIMDPP_INL
uint64_t i_extract(const uint64<4>& a)
{
    // Select the 128-bit half holding the element and delegate to the
    // two-element overload.
    const uint64<2> half = _mm256_extracti128_si256(a.native(), id / 2);
    return i_extract<id % 2>(half);
}
#endif
298 
#if SIMDPP_USE_AVX512F
// Returns element number `id` of a 512-bit vector of unsigned 64-bit integers.
template<unsigned id> SIMDPP_INL
uint64_t i_extract(const uint64<8>& a)
{
    // Select the 128-bit quarter holding the element and delegate to the
    // two-element overload.
    const uint64<2> quarter = _mm512_extracti32x4_epi32(a.native(), id / 2);
    return i_extract<id % 2>(quarter);
}
#endif
307 
308 // -----------------------------------------------------------------------------
309 
310 template<unsigned id> SIMDPP_INL
i_extract(const int64<2> & a)311 int64_t i_extract(const int64<2>& a)
312 {
313 #if SIMDPP_USE_MSA
314 #if SIMDPP_64_BITS
315     return __msa_copy_s_d(a, id);
316 #else
317     v4i32 a32 = (v4i32) a.native();
318     int64_t lo = __msa_copy_s_w(a32, id*2);
319     int64_t hi = __msa_copy_s_w(a32, id*2+1);
320     return lo | (hi << 32);
321 #endif
322 #else
323     return i_extract<id>(uint64x2(a));
324 #endif
325 }
326 
#if SIMDPP_USE_AVX2
// Returns element number `id` of a 256-bit vector of signed 64-bit integers.
template<unsigned id> SIMDPP_INL
int64_t i_extract(const int64<4>& a)
{
    // Select the 128-bit half holding the element and delegate to the
    // unsigned two-element overload; the value is converted to int64_t on
    // return.
    const uint64<2> half = _mm256_extracti128_si256(a.native(), id / 2);
    return i_extract<id % 2>(half);
}
#endif
335 
#if SIMDPP_USE_AVX512F
// Returns element number `id` of a 512-bit vector of signed 64-bit integers.
template<unsigned id> SIMDPP_INL
int64_t i_extract(const int64<8>& a)
{
    // Select the 128-bit quarter holding the element and delegate to the
    // unsigned two-element overload; the value is converted to int64_t on
    // return.
    const uint64<2> quarter = _mm512_extracti32x4_epi32(a.native(), id / 2);
    return i_extract<id % 2>(quarter);
}
#endif
344 
345 // -----------------------------------------------------------------------------
346 
347 template<unsigned id> SIMDPP_INL
i_extract(const float32<4> & a)348 float i_extract(const float32<4>& a)
349 {
350 #if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
351     return a.el(id);
352 #elif SIMDPP_USE_SSE2
353     return bit_cast<float>(i_extract<id>(int32x4(a)));
354 #elif SIMDPP_USE_NEON
355     return vgetq_lane_f32(a.native(), id);
356 #elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
357     detail::mem_block<float32x4> ax(a);
358     return ax[id];
359 #endif
360 }
361 
#if SIMDPP_USE_AVX
// Returns element number `id` of a 256-bit vector of 32-bit floats.
template<unsigned id> SIMDPP_INL
float i_extract(const float32<8>& a)
{
    // Select the 128-bit half holding the element, extract the element as an
    // integer and reinterpret its bits as a float.
    const __m128 half = _mm256_extractf128_ps(a.native(), id / 4);
    const int bits = _mm_extract_epi32(_mm_castps_si128(half), id % 4);
    return bit_cast<float>(bits);
}
#endif
370 
#if SIMDPP_USE_AVX512F
// Returns element number `id` of a 512-bit vector of 32-bit floats.
template<unsigned id> SIMDPP_INL
float i_extract(const float32<16>& a)
{
    // Select the 128-bit quarter holding the element, extract the element as
    // an integer and reinterpret its bits as a float.
    const __m128 quarter = _mm512_extractf32x4_ps(a.native(), id / 4);
    const int bits = _mm_extract_epi32(_mm_castps_si128(quarter), id % 4);
    return bit_cast<float>(bits);
}
#endif
379 
380 // -----------------------------------------------------------------------------
381 
382 template<unsigned id> SIMDPP_INL
i_extract(const float64<2> & a)383 double i_extract(const float64<2>& a)
384 {
385 #if SIMDPP_USE_NULL
386     return a.el(id);
387 #elif SIMDPP_USE_SSE2
388     return bit_cast<double>(i_extract<id>(int64x2(a)));
389 #elif SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
390     detail::mem_block<float64x2> ax(a);
391     return ax[id];
392 #elif SIMDPP_USE_NEON64
393     return vgetq_lane_f64(a.native(), id);
394 #endif
395 }
396 
#if SIMDPP_USE_AVX
// Returns element number `id` of a 256-bit vector of 64-bit floats.
template<unsigned id> SIMDPP_INL
double i_extract(const float64<4>& a)
{
    // Select the 128-bit half holding the element, extract the element as a
    // 64-bit integer and reinterpret its bits as a double.
    const __m128d half = _mm256_extractf128_pd(a.native(), id / 2);
    const uint64<2> as_int = uint64<2>(_mm_castpd_si128(half));
    const uint64_t bits = i_extract<id % 2>(as_int);
    return bit_cast<double>(bits);
}
#endif
405 
#if SIMDPP_USE_AVX512F
// Returns element number `id` of a 512-bit vector of 64-bit floats.
template<unsigned id> SIMDPP_INL
double i_extract(const float64<8>& a)
{
    // The vector is viewed as 32-bit floats to select the 128-bit quarter
    // holding the element; the element is then extracted as a 64-bit integer
    // and its bits are reinterpreted as a double.
    const __m128 quarter = _mm512_extractf32x4_ps(_mm512_castpd_ps(a.native()), id / 2);
    const uint64<2> as_int = uint64<2>(_mm_castps_si128(quarter));
    const uint64_t bits = i_extract<id % 2>(as_int);
    return bit_cast<double>(bits);
}
#endif
414 
415 // -----------------------------------------------------------------------------
416 
417 template<unsigned id, class V> SIMDPP_INL
i_extract(const V & a)418 typename V::element_type i_extract(const V& a)
419 {
420     typename V::base_vector_type base = a.vec(id / V::base_length);
421     return i_extract<id % V::base_length>(base);
422 }
423 
424 } // namespace insn
425 } // namespace detail
426 } // namespace SIMDPP_ARCH_NAMESPACE
427 } // namespace simdpp
428 
429 #endif
430