1 /* Copyright (C) 2016 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6 */
7
8 #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_REDUCE_ADD_H
9 #define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_REDUCE_ADD_H
10
11 #ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13 #endif
14
15 #include <simdpp/types.h>
16 #include <simdpp/core/extract.h>
17 #include <simdpp/core/move_l.h>
18 #include <simdpp/core/make_uint.h>
19 #include <simdpp/detail/extract128.h>
20
21 namespace simdpp {
22 namespace SIMDPP_ARCH_NAMESPACE {
23
// Forward declarations of the public reduce_add() front-ends. The
// i_reduce_add() implementations below call back into these to finish
// reductions of wider intermediate element types (e.g. the uint8 AVX-512
// path reduces a uint64<8> of partial sums).
template<unsigned N, class E> SIMDPP_INL
int16_t reduce_add(const int8<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
uint16_t reduce_add(const uint8<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
int32_t reduce_add(const int16<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
uint32_t reduce_add(const uint16<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
int32_t reduce_add(const int32<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
uint32_t reduce_add(const uint32<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
int64_t reduce_add(const int64<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
uint64_t reduce_add(const uint64<N,E>& a);
41
42 namespace detail {
43 namespace insn {
44
// Sums all 16 uint8 elements of 'a'. The maximum sum is 16*255 = 4080,
// so the result always fits into uint16_t.
static SIMDPP_INL
uint16_t i_reduce_add(const uint8x16& a)
{
#if SIMDPP_USE_NULL
    // Scalar reference implementation: plain element-wise accumulation.
    uint16_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r += a.el(i);
    }
    return r;
#elif SIMDPP_USE_XOP
    // Horizontal add of bytes into 64-bit lanes; the partial sums sit in
    // the low 16 bits of each lane (16-bit elements 0 and 4).
    uint16x8 sum = _mm_haddq_epu8(a.native());
    return extract<0>(sum) + extract<4>(sum);
#elif SIMDPP_USE_SSE2
    // SAD against zero sums each 8-byte half into the low 16 bits of the
    // corresponding 64-bit lane (16-bit elements 0 and 4).
    uint16x8 sum = _mm_sad_epu8(a.native(), _mm_setzero_si128());
    return extract<0>(sum) + extract<4>(sum);
#elif SIMDPP_USE_NEON
    // Successive pairwise widening additions (8->16->32->64 bit), then
    // add the two halves of the final pair.
    uint16x8 a2 = vpaddlq_u8(a.native());
    uint32x4 a3 = vpaddlq_u16(a2.native());
    uint64x2 a4 = vpaddlq_u32(a3.native());
    a3 = a4; // bitwise reinterpret back to 32-bit lanes; totals fit 32 bits
    uint32x2_t r = vadd_u32(vget_low_u32(a3.native()), vget_high_u32(a3.native()));
    return vget_lane_u32(r, 0);
#elif SIMDPP_USE_ALTIVEC
    // vec_sum4s accumulates groups of 4 bytes into 32-bit lanes; the
    // 4 lanes are then folded with shifted adds.
    uint32x4 sum = make_zero();
    sum = vec_sum4s(a.native(), sum.native());
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_MSA
    // Horizontal widening adds 8->16->32->64 bit, then fold the halves.
    uint16<8> s16 = __msa_hadd_u_h(a.native(), a.native());
    uint32<4> s32 = __msa_hadd_u_w(s16.native(), s16.native());
    s32 = (uint64<2>) __msa_hadd_u_d(s32.native(), s32.native());
    s32 = add(s32, move4_l<2>(s32));
    return extract<0>(s32);
#endif
}
81
#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint16_t i_reduce_add(const uint8x32& a)
{
    // SAD against zero: the per-8-byte sums land in 16-bit elements
    // 0, 2, 4 and 6 of the 256-bit register.
    uint16x16 sum2 = _mm256_sad_epu8(a.native(), _mm256_setzero_si256()); // results are in 0,2,4,6 elements
    // Fold the 128-bit halves, then add the two remaining partial sums.
    uint16x8 sum = add(detail::extract128<0>(sum2), detail::extract128<1>(sum2));
    return extract<0>(sum) + extract<4>(sum);
}
#endif
91
#if SIMDPP_USE_AVX512BW
// 64-byte variant: _mm512_sad_epu8 leaves eight 64-bit partial sums,
// which are folded by the uint64 reduce_add() overload declared above.
SIMDPP_INL uint16_t i_reduce_add(const uint8<64>& a)
{
    uint64<8> sum2 = _mm512_sad_epu8(a.native(), _mm512_setzero_si512());
    return reduce_add(sum2);
}
#endif
99
100 template<unsigned N>
i_reduce_add(const uint8<N> & a)101 SIMDPP_INL uint16_t i_reduce_add(const uint8<N>& a)
102 {
103 #if SIMDPP_USE_NULL
104 uint16_t r = 0;
105 for (unsigned j = 0; j < a.vec_length; ++j) {
106 for (unsigned i = 0; i < a.base_length; i++) {
107 r += a.vec(j).el(i);
108 }
109 }
110 return r;
111 #elif SIMDPP_USE_AVX512BW
112 uint64<8> sum2 = make_zero();
113 for (unsigned j = 0; j < a.vec_length; ++j) {
114 uint64<8> sum = _mm512_sad_epu8(a.native(), _mm512_setzero_si512());
115 sum2 = add(sum2, sum);
116 }
117 return reduce_add(sum2);
118 #elif SIMDPP_USE_AVX2
119 uint16x16 r = make_zero();
120 for (unsigned j = 0; j < a.vec_length; ++j) {
121 uint16x16 sum = _mm256_sad_epu8(a.vec(j).native(), _mm256_setzero_si256());
122 r = add(r, sum);
123 }
124 uint16x8 rl = add(detail::extract128<0>(r), detail::extract128<1>(r));
125 return extract<0>(rl) + extract<4>(rl);
126 #elif SIMDPP_USE_SSE2
127 uint16x8 r = make_zero();
128 for (unsigned j = 0; j < a.vec_length; ++j) {
129 #if SIMDPP_USE_XOP
130 uint16x8 sum = _mm_haddq_epu8(a.vec(j).native());
131 #else
132 uint16x8 sum = _mm_sad_epu8(a.vec(j).native(), _mm_setzero_si128());
133 #endif
134 r = add(r, sum);
135 }
136 return extract<0>(r) + extract<4>(r);
137 #elif SIMDPP_USE_NEON
138 uint16x8 r = make_zero();
139 for (unsigned j = 0; j < a.vec_length; ++j) {
140 uint16x8 sum = vpaddlq_u8(a.vec(j).native());
141 r = add(r, sum);
142 }
143 uint32x4 r2 = vpaddlq_u16(r.native());
144 uint64x2 r3 = vpaddlq_u32(r2.native());
145 r2 = r3;
146 uint32x2_t r4 = vadd_u32(vget_low_u32(r2.native()),
147 vget_high_u32(r2.native()));
148 return vget_lane_u32(r4, 0);
149 #elif SIMDPP_USE_ALTIVEC
150 uint32x4 sum = make_zero();
151 for (unsigned j = 0; j < a.vec_length; ++j) {
152 sum = vec_sum4s(a.vec(j).native(), sum.native());
153 }
154 sum = add(sum, move4_l<2>(sum));
155 sum = add(sum, move4_l<1>(sum));
156 return extract<0>(sum);
157 #elif SIMDPP_USE_MSA
158 uint16<8> r = make_zero();
159 for (unsigned j = 0; j < a.vec_length; ++j) {
160 uint16x8 sum = __msa_hadd_u_h(a.vec(j).native(), a.vec(j).native());
161 r = add(r, sum);
162 }
163 uint32<4> s32 = __msa_hadd_u_w(r.native(), r.native());
164 s32 = (v4u32) __msa_hadd_u_d(s32.native(), s32.native());
165 s32 = add(s32, move4_l<2>(s32));
166 return extract<0>(s32);
167 #endif
168 }
169
170 // -----------------------------------------------------------------------------
171
// Sums all 16 int8 elements of 'a'; the range [-2048, 2032] fits int16_t.
static SIMDPP_INL
int16_t i_reduce_add(const int8x16& a)
{
#if SIMDPP_USE_NULL
    // Scalar reference implementation.
    int16_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r += a.el(i);
    }
    return r;
#elif SIMDPP_USE_XOP
    // Signed horizontal add into 64-bit lanes; viewed as 16-bit elements,
    // the partial sums are in positions 0 and 4.
    uint16x8 sum = _mm_haddq_epi8(a.native());
    return extract<0>(sum) + extract<4>(sum);
#elif SIMDPP_USE_SSE2
    // Bias elements by 0x80 (xor flips the sign bit, equivalent to adding
    // 128), reduce as unsigned, then subtract the accumulated bias.
    return i_reduce_add(uint8x16(bit_xor(a, 0x80))) - a.length*0x80;
#elif SIMDPP_USE_NEON
    // Pairwise widening adds 8->16->32->64 bit, then add the two halves.
    int16x8 a2 = vpaddlq_s8(a.native());
    int32x4 a3 = vpaddlq_s16(a2.native());
    int64x2 a4 = vpaddlq_s32(a3.native());
    a3 = a4; // bitwise reinterpret; totals fit 32 bits
    int32x2_t r = vadd_s32(vget_low_s32(a3.native()),
                           vget_high_s32(a3.native()));
    return vget_lane_s32(r, 0);
#elif SIMDPP_USE_ALTIVEC
    // vec_sum4s accumulates signed bytes into 32-bit lanes; fold lanes.
    int32x4 sum = make_zero();
    sum = vec_sum4s(a.native(), sum.native());
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_MSA
    // Signed horizontal widening adds 8->16->32->64, then fold halves.
    int16<8> s16 = __msa_hadd_s_h(a.native(), a.native());
    int32<4> s32 = __msa_hadd_s_w(s16.native(), s16.native());
    s32 = (v4i32) __msa_hadd_s_d(s32.native(), s32.native());
    s32 = add(s32, move4_l<2>(s32));
    return extract<0>(s32);
#endif
}
208
#if SIMDPP_USE_AVX2
static SIMDPP_INL
int16_t i_reduce_add(const int8x32& a)
{
    // Bias each signed element by 0x80 (sign-bit flip == add 128), reduce
    // via the unsigned overload, then subtract the total bias.
    return i_reduce_add(uint8x32(bit_xor(a, 0x80))) - a.length*0x80;
}
#endif
216
#if SIMDPP_USE_AVX512BW
// Fixed return type: int16_t (was uint16_t), matching the int8x16/int8x32
// overloads and the reduce_add(int8) forward declaration — the sum of
// signed bytes may be negative.
SIMDPP_INL int16_t i_reduce_add(const int8<64>& a)
{
    // Bias elements by 0x80 (sign-bit flip == add 128), reduce as
    // unsigned, then subtract the accumulated bias.
    return i_reduce_add(uint8<64>(bit_xor(a, 0x80))) - a.length*0x80;
}
#endif
223
224 template<unsigned N>
i_reduce_add(const int8<N> & a)225 SIMDPP_INL uint16_t i_reduce_add(const int8<N>& a)
226 {
227 #if SIMDPP_USE_NULL
228 uint16_t r = 0;
229 for (unsigned j = 0; j < a.vec_length; ++j) {
230 for (unsigned i = 0; i < a.base_length; i++) {
231 r += a.vec(j).el(i);
232 }
233 }
234 return r;
235 #elif SIMDPP_USE_AVX512BW || SIMDPP_USE_AVX2
236 return i_reduce_add(uint8<N>(bit_xor(a, 0x80))) - a.length*0x80;
237 #elif SIMDPP_USE_XOP
238 int16x8 r = make_zero();
239 for (unsigned j = 0; j < a.vec_length; ++j) {
240 int16x8 sum = _mm_haddq_epi8(a.vec(j).native());
241 r = add(r, sum);
242 }
243 return extract<0>(r) + extract<4>(r);
244 #elif SIMDPP_USE_SSE2
245 return i_reduce_add(uint8<N>(bit_xor(a, 0x80))) - a.length*0x80;
246 #elif SIMDPP_USE_NEON
247 int16x8 r = make_zero();
248 for (unsigned j = 0; j < a.vec_length; ++j) {
249 int16x8 sum = vpaddlq_s8(a.vec(j).native());
250 r = add(r, sum);
251 }
252 int32x4 r2 = vpaddlq_s16(r.native());
253 int64x2 r3 = vpaddlq_s32(r2.native());
254 r2 = r3;
255 int32x2_t r4 = vadd_s32(vget_low_s32(r2.native()),
256 vget_high_s32(r2.native()));
257 return vget_lane_s32(r4, 0);
258 #elif SIMDPP_USE_ALTIVEC
259 int32x4 sum = make_zero();
260 for (unsigned j = 0; j < a.vec_length; ++j) {
261 sum = vec_sum4s(a.vec(j).native(), sum.native());
262 }
263 sum = add(sum, move4_l<2>(sum));
264 sum = add(sum, move4_l<1>(sum));
265 return extract<0>(sum);
266 #elif SIMDPP_USE_MSA
267 int16<8> r = make_zero();
268 for (unsigned j = 0; j < a.vec_length; ++j) {
269 int16x8 sum = __msa_hadd_s_h(a.vec(j).native(), a.vec(j).native());
270 r = add(r, sum);
271 }
272 int32<4> s32 = __msa_hadd_s_w(r.native(), r.native());
273 s32 = (v4i32) __msa_hadd_s_d(s32.native(), s32.native());
274 s32 = add(s32, move4_l<2>(s32));
275 return extract<0>(s32);
276 #endif
277 }
278
279 // -----------------------------------------------------------------------------
280
// Sums all 8 uint16 elements of 'a'. Max sum 8*65535 fits uint32_t.
static SIMDPP_INL
uint32_t i_reduce_add(const uint16x8& a)
{
#if SIMDPP_USE_NULL
    // Scalar reference implementation.
    uint32_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r += a.el(i);
    }
    return r;
#elif SIMDPP_USE_XOP
    uint32x4 sum = _mm_haddq_epu16(a.native()); // sum in the 0 and 2 elements
    sum = add(sum, move4_l<2>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_SSE2
    // madd is signed: bias every element by 0x8000 (sign-bit flip) so the
    // values fit the signed range, then undo the bias in scalar at the end.
    uint16x8 ones = make_uint(1);
    uint16x8 ca = bit_xor(a, 0x8000);
    uint32x4 sum = _mm_madd_epi16(ca.native(), ones.native());
    // phadd is slower option on intel processors
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum) + 0x8000 * a.length;
#elif SIMDPP_USE_NEON
    // Pairwise widening adds 16->32->64 bit, then add the two halves.
    uint32x4 a2 = vpaddlq_u16(a.native());
    uint64x2 a3 = vpaddlq_u32(a2.native());
    a2 = a3; // bitwise reinterpret; totals fit 32 bits
    uint32x2_t r = vadd_u32(vget_low_u32(a2.native()),
                            vget_high_u32(a2.native()));
    return vget_lane_u32(r, 0);
#elif SIMDPP_USE_ALTIVEC
    // vec_sum4s takes signed input: apply the same 0x8000 bias trick.
    int32x4 sum = make_zero();
    int16x8 ca = bit_xor(a, 0x8000);
    sum = vec_sum4s(ca.native(), sum.native());
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum) + 0x8000 * a.length;
#elif SIMDPP_USE_MSA
    // Unsigned horizontal widening adds 16->32->64, then fold halves.
    uint32<4> s32 = __msa_hadd_u_w(a.native(), a.native());
    s32 = (v4u32) __msa_hadd_u_d(s32.native(), s32.native());
    s32 = add(s32, move4_l<2>(s32));
    return extract<0>(s32);
#endif
}
323
#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32_t i_reduce_add(const uint16x16& a)
{
    // madd is signed: bias elements by 0x8000 via xor, multiply-accumulate
    // against 1s, reduce, then undo the accumulated bias.
    uint16x16 ones = make_uint(1);
    uint16x16 ca = bit_xor(a, 0x8000);
    uint32x8 sum = _mm256_madd_epi16(ca.native(), ones.native());
    return reduce_add(sum) + 0x8000 * a.length;
}
#endif
334
#if SIMDPP_USE_AVX512BW
// 512-bit variant: same signed-madd bias trick as the AVX2 version.
SIMDPP_INL uint32_t i_reduce_add(const uint16<32>& a)
{
    uint16<32> ones = make_uint(1);
    uint16<32> ca = bit_xor(a, 0x8000); // bias into signed range
    uint32<16> sum = _mm512_madd_epi16(ca.native(), ones.native());
    return reduce_add(sum) + 0x8000 * a.length; // undo the bias
}
#endif
344
// Sums all elements of a multi-vector of uint16 (wraps modulo 2^32).
template<unsigned N>
SIMDPP_INL uint32_t i_reduce_add(const uint16<N>& a)
{
#if SIMDPP_USE_NULL
    // Scalar reference path.
    uint32_t r = 0;
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r += a.vec(j).el(i);
        }
    }
    return r;
#elif SIMDPP_USE_AVX512BW
    // madd is signed: bias each element by 0x8000 (sign-bit flip), then
    // undo the accumulated bias after the reduction.
    uint32<16> sum = make_zero();
    uint16<32> ones = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint16<32> ca = bit_xor(a.vec(j), 0x8000);
        uint32<16> isum = _mm512_madd_epi16(ca.native(), ones.native());
        sum = add(sum, isum);
    }
    return reduce_add(sum) + 0x8000 * a.length;
#elif SIMDPP_USE_AVX2
    // Same bias trick as AVX-512, with 256-bit registers.
    uint32x8 sum = make_zero();
    uint16x16 ones = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint16x16 ca = bit_xor(a.vec(j), 0x8000);
        uint32x8 isum = _mm256_madd_epi16(ca.native(), ones.native());
        sum = add(sum, isum);
    }
    return reduce_add(sum) + 0x8000 * a.length;
#elif SIMDPP_USE_XOP
    // _mm_haddq_epu16 is unsigned — no bias needed.
    uint32x4 sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32x4 isum = _mm_haddq_epu16(a.vec(j).native());
        sum = add(sum, isum);
    }
    sum = add(sum, move4_l<2>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_SSE2
    uint32x4 sum = make_zero();
    uint16x8 ones = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint16x8 ca = bit_xor(a.vec(j), 0x8000); // bias into signed range
        uint32x4 isum = _mm_madd_epi16(ca.native(), ones.native());
        sum = add(sum, isum);
    }
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum) + 0x8000 * a.length; // undo the bias
#elif SIMDPP_USE_NEON
    // Widening pairwise add per sub-vector, then fold 32->64 and add halves.
    uint32x4 sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32x4 isum = vpaddlq_u16(a.vec(j).native());
        sum = add(sum, isum);
    }
    uint64x2 sum2 = vpaddlq_u32(sum.native());
    sum = sum2; // bitwise reinterpret; totals fit 32 bits
    uint32x2_t sum3 = vadd_u32(vget_low_u32(sum.native()),
                               vget_high_u32(sum.native()));
    return vget_lane_u32(sum3, 0);
#elif SIMDPP_USE_ALTIVEC
    // vec_sum4s takes signed input: apply the 0x8000 bias per sub-vector.
    int32x4 sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int16x8 ca = bit_xor(a.vec(j), 0x8000);
        sum = vec_sum4s(ca.native(), sum.native());
    }
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum) + 0x8000 * a.length;
#elif SIMDPP_USE_MSA
    uint32<4> r = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32<4> sum = __msa_hadd_u_w(a.vec(j).native(), a.vec(j).native());
        r = add(r, sum);
    }
    r = (uint64<2>) __msa_hadd_u_d(r.native(), r.native());
    r = add(r, move4_l<2>(r));
    return extract<0>(r);
#endif
}
424
425 // -----------------------------------------------------------------------------
426
// Sums all 8 int16 elements of 'a'; the result always fits int32_t.
static SIMDPP_INL
int32_t i_reduce_add(const int16x8& a)
{
#if SIMDPP_USE_NULL
    // Scalar reference implementation.
    int32_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r += a.el(i);
    }
    return r;
#elif SIMDPP_USE_XOP
    int32x4 sum = _mm_haddq_epi16(a.native()); // sum in the 0 and 2 elements
    sum = add(sum, move4_l<2>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_SSE2
    // Multiply-accumulate against a vector of 1s gives pairwise 32-bit
    // sums; the 32-bit reduce_add() folds them to a scalar.
    int16x8 ones = make_uint(1);
    int32x4 sum = _mm_madd_epi16(a.native(), ones.native());
    return reduce_add(sum);
#elif SIMDPP_USE_NEON
    // Pairwise widening adds 16->32->64 bit, then add the two halves.
    int32x4 a2 = vpaddlq_s16(a.native());
    int64x2 a3 = vpaddlq_s32(a2.native());
    a2 = a3; // bitwise reinterpret; totals fit 32 bits
    int32x2_t r = vadd_s32(vget_low_s32(a2.native()), vget_high_s32(a2.native()));
    return vget_lane_s32(r, 0);
#elif SIMDPP_USE_ALTIVEC
    // vec_sum4s accumulates signed halfwords into 32-bit lanes; fold lanes.
    int32x4 sum = make_zero();
    sum = vec_sum4s(a.native(), sum.native());
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_MSA
    // Signed horizontal widening adds 16->32->64, then fold halves.
    int32<4> s32 = __msa_hadd_s_w(a.native(), a.native());
    s32 = (int64<2>) __msa_hadd_s_d(s32.native(), s32.native());
    s32 = add(s32, move4_l<2>(s32));
    return extract<0>(s32);
#endif
}
463
#if SIMDPP_USE_AVX2
static SIMDPP_INL
int32_t i_reduce_add(const int16x16& a)
{
    // madd against a vector of 1s yields pairwise 32-bit sums, which
    // the 32-bit reduce_add() folds into a scalar.
    int16x16 ones = make_uint(1);
    int32x8 sum = _mm256_madd_epi16(a.native(), ones.native());
    return reduce_add(sum);
}
#endif
473
#if SIMDPP_USE_AVX512BW
// 512-bit variant of the madd-against-ones reduction.
SIMDPP_INL int32_t i_reduce_add(const int16<32>& a)
{
    int16<32> ones = make_uint(1);
    int32<16> sum = _mm512_madd_epi16(a.native(), ones.native());
    return reduce_add(sum);
}
#endif
482
// Sums all elements of a multi-vector of int16 (wraps modulo 2^32).
template<unsigned N>
SIMDPP_INL int32_t i_reduce_add(const int16<N>& a)
{
#if SIMDPP_USE_NULL
    // Scalar reference path.
    int32_t r = 0;
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r += a.vec(j).el(i);
        }
    }
    return r;
#elif SIMDPP_USE_AVX512BW
    // madd against 1s produces pairwise 32-bit sums per sub-vector.
    int32<16> sum = make_zero();
    int16<32> ones = make_int(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int32<16> isum = _mm512_madd_epi16(a.vec(j).native(), ones.native());
        sum = add(sum, isum);
    }
    return reduce_add(sum);
#elif SIMDPP_USE_AVX2
    int32x8 sum = make_zero();
    int16x16 ones = make_int(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int32x8 isum = _mm256_madd_epi16(a.vec(j).native(), ones.native());
        sum = add(sum, isum);
    }
    return reduce_add(sum);
#elif SIMDPP_USE_XOP
    int32x4 sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int32x4 isum = _mm_haddq_epi16(a.vec(j).native());
        sum = add(sum, isum);
    }
    // _mm_haddq_epi16 computes 64-bit results.
    // 1 and 3 32-bit elements may be nonzero
    sum = add(sum, move4_l<2>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_SSE2
    int32x4 sum = make_zero();
    int16x8 ones = make_int(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int32x4 isum = _mm_madd_epi16(a.vec(j).native(), ones.native());
        sum = add(sum, isum);
    }
    return reduce_add(sum);
#elif SIMDPP_USE_NEON
    // Widening pairwise add per sub-vector, then a 32-bit reduction.
    int32x4 sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int32x4 isum = vpaddlq_s16(a.vec(j).native());
        sum = add(sum, isum);
    }
    return reduce_add(sum);
#elif SIMDPP_USE_ALTIVEC
    // vec_sum4s accumulates directly into the 32-bit accumulator.
    int32x4 sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        sum = vec_sum4s(a.vec(j).native(), sum.native());
    }
    return reduce_add(sum);
#elif SIMDPP_USE_MSA
    int32<4> r = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int32<4> sum = __msa_hadd_s_w(a.vec(j).native(),
                                      a.vec(j).native());
        r = add(r, sum);
    }
    r = (int64<2>) __msa_hadd_s_d(r.native(), r.native());
    r = add(r, move4_l<2>(r));
    return extract<0>(r);
#endif
}
553
554 // -----------------------------------------------------------------------------
555
// Sums all 4 uint32 elements of 'a' (wraps modulo 2^32).
static SIMDPP_INL
uint32_t i_reduce_add(const uint32x4& a)
{
#if SIMDPP_USE_NULL
    // Scalar reference implementation.
    uint32_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r += a.el(i);
    }
    return r;
#elif SIMDPP_USE_MSA
    // Widening horizontal add 32->64, then fold the two 64-bit halves.
    uint32x4 sum = a;
    sum = (uint64<2>) __msa_hadd_u_d(sum.native(), sum.native());
    sum = add(sum, move4_l<2>(sum));
    return extract<0>(sum);
#else
    // Generic log2 reduction: shift halves on top of each other twice.
    uint32x4 sum = a;
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum);
#endif
}
577
#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32_t i_reduce_add(const uint32x8& a)
{
    // Fold the 128-bit halves, then log2-reduce the remaining 4 lanes.
    uint32x4 sum = add(detail::extract128<0>(a), detail::extract128<1>(a));
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum);
}
#endif
588
#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint32_t i_reduce_add(const uint32<16>& a)
{
    // Fold the two 256-bit halves into one vector and delegate to the
    // 256-bit reduction above.
    uint32<8> halves = add(extract256<0>(a), extract256<1>(a));
    return i_reduce_add(halves);
}
#endif
596
597 template<unsigned N>
i_reduce_add(const uint32<N> & a)598 SIMDPP_INL uint32_t i_reduce_add(const uint32<N>& a)
599 {
600 #if SIMDPP_USE_NULL
601 uint32_t r = 0;
602 for (unsigned j = 0; j < a.vec_length; ++j) {
603 for (unsigned i = 0; i < a.base_length; i++) {
604 r += a.vec(j).el(i);
605 }
606 }
607 return r;
608 #else
609 uint32v sum = make_zero();
610 for (unsigned j = 0; j < a.vec_length; ++j) {
611 sum = add(sum, a.vec(j));
612 }
613 return i_reduce_add(sum);
614 #endif
615 }
616
617 // -----------------------------------------------------------------------------
618
// Sums the two uint64 elements of 'a' (wraps modulo 2^64).
static SIMDPP_INL
uint64_t i_reduce_add(const uint64x2& a)
{
#if SIMDPP_USE_NULL
    // Scalar reference implementation.
    uint64_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r += a.el(i);
    }
    return r;
#elif SIMDPP_USE_SSE2
    // Shift the high lane down and add in-register.
    uint64x2 sum = a;
    sum = add(sum, move2_l<1>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_NEON
    // Add the low and high 64-bit halves directly.
    uint64x1_t r = vadd_u64(vget_low_u64(a.native()),
                            vget_high_u64(a.native()));
    return vget_lane_u64(r, 0);
#elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // Extract both lanes and add in scalar code.
    return extract<0>(a) + extract<1>(a);
#endif
}
640
#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint64_t i_reduce_add(const uint64x4& a)
{
    // Fold the two 128-bit halves, then add the remaining two lanes.
    uint64x2 sum = add(detail::extract128<0>(a), detail::extract128<1>(a));
    sum = add(sum, move2_l<1>(sum));
    return extract<0>(sum);
}
#endif
650
#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint64_t i_reduce_add(const uint64<8>& a)
{
    // Combine the upper and lower 256-bit halves and delegate to the
    // 256-bit reduction above.
    uint64<4> halves = add(extract256<0>(a), extract256<1>(a));
    return i_reduce_add(halves);
}
#endif
658
659 template<unsigned N>
i_reduce_add(const uint64<N> & a)660 SIMDPP_INL uint64_t i_reduce_add(const uint64<N>& a)
661 {
662 #if SIMDPP_USE_NULL || (SIMDPP_USE_ALTIVEC && !SIMDPP_USE_VSX_207)
663 uint64_t r = 0;
664 for (unsigned j = 0; j < a.vec_length; ++j) {
665 for (unsigned i = 0; i < a.base_length; i++) {
666 r += a.vec(j).el(i);
667 }
668 }
669 return r;
670 #else
671 uint64v sum = make_zero();
672 for (unsigned j = 0; j < a.vec_length; ++j) {
673 sum = add(sum, a.vec(j));
674 }
675 return i_reduce_add(sum);
676 #endif
677 }
678
679 // -----------------------------------------------------------------------------
680
681
682 } // namespace insn
683 } // namespace detail
684 } // namespace SIMDPP_ARCH_NAMESPACE
685 } // namespace simdpp
686
687 #endif
688
689