/*  Copyright (C) 2016  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_REDUCE_ADD_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_REDUCE_ADD_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/extract.h>
#include <simdpp/core/move_l.h>
#include <simdpp/core/make_uint.h>
#include <simdpp/detail/extract128.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {

// forward declarations
template<unsigned N, class E> SIMDPP_INL
int16_t reduce_add(const int8<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
uint16_t reduce_add(const uint8<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
int32_t reduce_add(const int16<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
uint32_t reduce_add(const uint16<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
int32_t reduce_add(const int32<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
uint32_t reduce_add(const uint32<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
int64_t reduce_add(const int64<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
uint64_t reduce_add(const uint64<N,E>& a);

namespace detail {
namespace insn {

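// Each i_reduce_add overload below computes the horizontal sum of all elements
// of its argument, widened to the return element type. The single-register
// overloads use ISA-specific idioms (SAD against zero on SSE2/AVX2/AVX-512,
// pairwise widening adds on NEON, vec_sum4s on Altivec, hadd on MSA); the
// templates for wider vectors accumulate per-register partial sums and reduce
// them once at the end.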
static SIMDPP_INL
uint16_t i_reduce_add(const uint8x16& a)
{
#if SIMDPP_USE_NULL
    uint16_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r += a.el(i);
    }
    return r;
#elif SIMDPP_USE_XOP
    uint16x8 sum = _mm_haddq_epu8(a.native());
    return extract<0>(sum) + extract<4>(sum);
#elif SIMDPP_USE_SSE2
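    // _mm_sad_epu8 against zero sums the absolute differences of the unsigned
    // bytes, i.e. the bytes themselves, within each 64-bit half; the two
    // partial sums land in the low 16 bits of each half (16-bit lanes 0 and 4).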
    uint16x8 sum = _mm_sad_epu8(a.native(), _mm_setzero_si128());
    return extract<0>(sum) + extract<4>(sum);
#elif SIMDPP_USE_NEON
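    // vpaddlq_* adds adjacent element pairs while widening, so the chain
    // u8 -> u16 -> u32 -> u64 reduces without overflow; the final two 64-bit
    // sums (which fit in 32 bits) are folded with a 32-bit add.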
    uint16x8 a2 = vpaddlq_u8(a.native());
    uint32x4 a3 = vpaddlq_u16(a2.native());
    uint64x2 a4 = vpaddlq_u32(a3.native());
    a3 = a4;
    uint32x2_t r = vadd_u32(vget_low_u32(a3.native()), vget_high_u32(a3.native()));
    return vget_lane_u32(r, 0);
#elif SIMDPP_USE_ALTIVEC
    uint32x4 sum = make_zero();
    sum = vec_sum4s(a.native(), sum.native());
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_MSA
    uint16<8> s16 = __msa_hadd_u_h(a.native(), a.native());
    uint32<4> s32 = __msa_hadd_u_w(s16.native(), s16.native());
    s32 = (uint64<2>) __msa_hadd_u_d(s32.native(), s32.native());
    s32 = add(s32, move4_l<2>(s32));
    return extract<0>(s32);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint16_t i_reduce_add(const uint8x32& a)
{
    // 64-bit sums; as 16-bit elements they occupy lanes 0, 4, 8 and 12
    uint16x16 sum2 = _mm256_sad_epu8(a.native(), _mm256_setzero_si256());
    uint16x8 sum = add(detail::extract128<0>(sum2), detail::extract128<1>(sum2));
    return extract<0>(sum) + extract<4>(sum);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint16_t i_reduce_add(const uint8<64>& a)
{
    uint64<8> sum2 = _mm512_sad_epu8(a.native(), _mm512_setzero_si512());
    return reduce_add(sum2);
}
#endif

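// The template handles vectors that span multiple native registers: each
// backend accumulates a per-register partial sum inside the loop and performs
// the horizontal reduction only once at the end.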
template<unsigned N>
SIMDPP_INL uint16_t i_reduce_add(const uint8<N>& a)
{
#if SIMDPP_USE_NULL
    uint16_t r = 0;
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r += a.vec(j).el(i);
        }
    }
    return r;
#elif SIMDPP_USE_AVX512BW
    uint64<8> sum2 = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint64<8> sum = _mm512_sad_epu8(a.vec(j).native(), _mm512_setzero_si512());
        sum2 = add(sum2, sum);
    }
    return reduce_add(sum2);
#elif SIMDPP_USE_AVX2
    uint16x16 r = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint16x16 sum = _mm256_sad_epu8(a.vec(j).native(), _mm256_setzero_si256());
        r = add(r, sum);
    }
    uint16x8 rl = add(detail::extract128<0>(r), detail::extract128<1>(r));
    return extract<0>(rl) + extract<4>(rl);
#elif SIMDPP_USE_SSE2
    uint16x8 r = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
#if SIMDPP_USE_XOP
        uint16x8 sum = _mm_haddq_epu8(a.vec(j).native());
#else
        uint16x8 sum = _mm_sad_epu8(a.vec(j).native(), _mm_setzero_si128());
#endif
        r = add(r, sum);
    }
    return extract<0>(r) + extract<4>(r);
#elif SIMDPP_USE_NEON
    uint16x8 r = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint16x8 sum = vpaddlq_u8(a.vec(j).native());
        r = add(r, sum);
    }
    uint32x4 r2 = vpaddlq_u16(r.native());
    uint64x2 r3 = vpaddlq_u32(r2.native());
    r2 = r3;
    uint32x2_t r4 = vadd_u32(vget_low_u32(r2.native()),
                             vget_high_u32(r2.native()));
    return vget_lane_u32(r4, 0);
#elif SIMDPP_USE_ALTIVEC
    uint32x4 sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        sum = vec_sum4s(a.vec(j).native(), sum.native());
    }
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_MSA
    uint16<8> r = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint16x8 sum = __msa_hadd_u_h(a.vec(j).native(), a.vec(j).native());
        r = add(r, sum);
    }
    uint32<4> s32 = __msa_hadd_u_w(r.native(), r.native());
    s32 = (v4u32) __msa_hadd_u_d(s32.native(), s32.native());
    s32 = add(s32, move4_l<2>(s32));
    return extract<0>(s32);
#endif
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
int16_t i_reduce_add(const int8x16& a)
{
#if SIMDPP_USE_NULL
    int16_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r += a.el(i);
    }
    return r;
#elif SIMDPP_USE_XOP
    uint16x8 sum = _mm_haddq_epi8(a.native());
    return extract<0>(sum) + extract<4>(sum);
#elif SIMDPP_USE_SSE2
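    // XOR with 0x80 rebases each signed byte x to the unsigned value x + 0x80,
    // so the unsigned reduction can be reused; the accumulated bias
    // (0x80 per element) is subtracted afterwards.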
    return i_reduce_add(uint8x16(bit_xor(a, 0x80))) - a.length*0x80;
#elif SIMDPP_USE_NEON
    int16x8 a2 = vpaddlq_s8(a.native());
    int32x4 a3 = vpaddlq_s16(a2.native());
    int64x2 a4 = vpaddlq_s32(a3.native());
    a3 = a4;
    int32x2_t r = vadd_s32(vget_low_s32(a3.native()),
                           vget_high_s32(a3.native()));
    return vget_lane_s32(r, 0);
#elif SIMDPP_USE_ALTIVEC
    int32x4 sum = make_zero();
    sum = vec_sum4s(a.native(), sum.native());
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_MSA
    int16<8> s16 = __msa_hadd_s_h(a.native(), a.native());
    int32<4> s32 = __msa_hadd_s_w(s16.native(), s16.native());
    s32 = (v4i32) __msa_hadd_s_d(s32.native(), s32.native());
    s32 = add(s32, move4_l<2>(s32));
    return extract<0>(s32);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int16_t i_reduce_add(const int8x32& a)
{
    return i_reduce_add(uint8x32(bit_xor(a, 0x80))) - a.length*0x80;
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL int16_t i_reduce_add(const int8<64>& a)
{
    return i_reduce_add(uint8<64>(bit_xor(a, 0x80))) - a.length*0x80;
}
#endif

template<unsigned N>
SIMDPP_INL int16_t i_reduce_add(const int8<N>& a)
{
#if SIMDPP_USE_NULL
    uint16_t r = 0;
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r += a.vec(j).el(i);
        }
    }
    return r;
#elif SIMDPP_USE_AVX512BW || SIMDPP_USE_AVX2
    return i_reduce_add(uint8<N>(bit_xor(a, 0x80))) - a.length*0x80;
#elif SIMDPP_USE_XOP
    int16x8 r = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int16x8 sum = _mm_haddq_epi8(a.vec(j).native());
        r = add(r, sum);
    }
    return extract<0>(r) + extract<4>(r);
#elif SIMDPP_USE_SSE2
    return i_reduce_add(uint8<N>(bit_xor(a, 0x80))) - a.length*0x80;
#elif SIMDPP_USE_NEON
    int16x8 r = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int16x8 sum = vpaddlq_s8(a.vec(j).native());
        r = add(r, sum);
    }
    int32x4 r2 = vpaddlq_s16(r.native());
    int64x2 r3 = vpaddlq_s32(r2.native());
    r2 = r3;
    int32x2_t r4 = vadd_s32(vget_low_s32(r2.native()),
                            vget_high_s32(r2.native()));
    return vget_lane_s32(r4, 0);
#elif SIMDPP_USE_ALTIVEC
    int32x4 sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        sum = vec_sum4s(a.vec(j).native(), sum.native());
    }
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_MSA
    int16<8> r = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int16x8 sum = __msa_hadd_s_h(a.vec(j).native(), a.vec(j).native());
        r = add(r, sum);
    }
    int32<4> s32 = __msa_hadd_s_w(r.native(), r.native());
    s32 = (v4i32) __msa_hadd_s_d(s32.native(), s32.native());
    s32 = add(s32, move4_l<2>(s32));
    return extract<0>(s32);
#endif
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint32_t i_reduce_add(const uint16x8& a)
{
#if SIMDPP_USE_NULL
    uint32_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r += a.el(i);
    }
    return r;
#elif SIMDPP_USE_XOP
    uint32x4 sum = _mm_haddq_epu16(a.native()); // sum in the 0 and 2 elements
    sum = add(sum, move4_l<2>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_SSE2
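    // _mm_madd_epi16 multiplies signed 16-bit pairs and adds adjacent products,
    // so multiplying by 1 sums adjacent elements into 32-bit lanes. XOR with
    // 0x8000 rebases each unsigned value u to the signed value u - 0x8000, and
    // the bias (0x8000 per element) is added back at the end.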
    uint16x8 ones = make_uint(1);
    uint16x8 ca = bit_xor(a, 0x8000);
    uint32x4 sum = _mm_madd_epi16(ca.native(), ones.native());
    // phadd is the slower option on Intel processors
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum) + 0x8000 * a.length;
#elif SIMDPP_USE_NEON
    uint32x4 a2 = vpaddlq_u16(a.native());
    uint64x2 a3 = vpaddlq_u32(a2.native());
    a2 = a3;
    uint32x2_t r = vadd_u32(vget_low_u32(a2.native()),
                            vget_high_u32(a2.native()));
    return vget_lane_u32(r, 0);
#elif SIMDPP_USE_ALTIVEC
    int32x4 sum = make_zero();
    int16x8 ca = bit_xor(a, 0x8000);
    sum = vec_sum4s(ca.native(), sum.native());
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum) + 0x8000 * a.length;
#elif SIMDPP_USE_MSA
    uint32<4> s32 = __msa_hadd_u_w(a.native(), a.native());
    s32 = (v4u32) __msa_hadd_u_d(s32.native(), s32.native());
    s32 = add(s32, move4_l<2>(s32));
    return extract<0>(s32);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32_t i_reduce_add(const uint16x16& a)
{
    uint16x16 ones = make_uint(1);
    uint16x16 ca = bit_xor(a, 0x8000);
    uint32x8 sum = _mm256_madd_epi16(ca.native(), ones.native());
    return reduce_add(sum) + 0x8000 * a.length;
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint32_t i_reduce_add(const uint16<32>& a)
{
    uint16<32> ones = make_uint(1);
    uint16<32> ca = bit_xor(a, 0x8000);
    uint32<16> sum = _mm512_madd_epi16(ca.native(), ones.native());
    return reduce_add(sum) + 0x8000 * a.length;
}
#endif

template<unsigned N>
SIMDPP_INL uint32_t i_reduce_add(const uint16<N>& a)
{
#if SIMDPP_USE_NULL
    uint32_t r = 0;
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r += a.vec(j).el(i);
        }
    }
    return r;
#elif SIMDPP_USE_AVX512BW
    uint32<16> sum = make_zero();
    uint16<32> ones = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint16<32> ca = bit_xor(a.vec(j), 0x8000);
        uint32<16> isum = _mm512_madd_epi16(ca.native(), ones.native());
        sum = add(sum, isum);
    }
    return reduce_add(sum) + 0x8000 * a.length;
#elif SIMDPP_USE_AVX2
    uint32x8 sum = make_zero();
    uint16x16 ones = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint16x16 ca = bit_xor(a.vec(j), 0x8000);
        uint32x8 isum = _mm256_madd_epi16(ca.native(), ones.native());
        sum = add(sum, isum);
    }
    return reduce_add(sum) + 0x8000 * a.length;
#elif SIMDPP_USE_XOP
    uint32x4 sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32x4 isum = _mm_haddq_epu16(a.vec(j).native());
        sum = add(sum, isum);
    }
    sum = add(sum, move4_l<2>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_SSE2
    uint32x4 sum = make_zero();
    uint16x8 ones = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint16x8 ca = bit_xor(a.vec(j), 0x8000);
        uint32x4 isum = _mm_madd_epi16(ca.native(), ones.native());
        sum = add(sum, isum);
    }
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum) + 0x8000 * a.length;
#elif SIMDPP_USE_NEON
    uint32x4 sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32x4 isum = vpaddlq_u16(a.vec(j).native());
        sum = add(sum, isum);
    }
    uint64x2 sum2 = vpaddlq_u32(sum.native());
    sum = sum2;
    uint32x2_t sum3 = vadd_u32(vget_low_u32(sum.native()),
                               vget_high_u32(sum.native()));
    return vget_lane_u32(sum3, 0);
#elif SIMDPP_USE_ALTIVEC
    int32x4 sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int16x8 ca = bit_xor(a.vec(j), 0x8000);
        sum = vec_sum4s(ca.native(), sum.native());
    }
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum) + 0x8000 * a.length;
#elif SIMDPP_USE_MSA
    uint32<4> r = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32<4> sum = __msa_hadd_u_w(a.vec(j).native(), a.vec(j).native());
        r = add(r, sum);
    }
    r = (uint64<2>) __msa_hadd_u_d(r.native(), r.native());
    r = add(r, move4_l<2>(r));
    return extract<0>(r);
#endif
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
int32_t i_reduce_add(const int16x8& a)
{
#if SIMDPP_USE_NULL
    int32_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r += a.el(i);
    }
    return r;
#elif SIMDPP_USE_XOP
    int32x4 sum = _mm_haddq_epi16(a.native()); // sum in the 0 and 2 elements
    sum = add(sum, move4_l<2>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_SSE2
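    // Multiplying by a vector of ones with _mm_madd_epi16 sums adjacent signed
    // 16-bit elements into 32-bit lanes; reduce_add on the 32-bit vector then
    // finishes the reduction.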
    int16x8 ones = make_uint(1);
    int32x4 sum = _mm_madd_epi16(a.native(), ones.native());
    return reduce_add(sum);
#elif SIMDPP_USE_NEON
    int32x4 a2 = vpaddlq_s16(a.native());
    int64x2 a3 = vpaddlq_s32(a2.native());
    a2 = a3;
    int32x2_t r = vadd_s32(vget_low_s32(a2.native()), vget_high_s32(a2.native()));
    return vget_lane_s32(r, 0);
#elif SIMDPP_USE_ALTIVEC
    int32x4 sum = make_zero();
    sum = vec_sum4s(a.native(), sum.native());
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_MSA
    int32<4> s32 = __msa_hadd_s_w(a.native(), a.native());
    s32 = (int64<2>) __msa_hadd_s_d(s32.native(), s32.native());
    s32 = add(s32, move4_l<2>(s32));
    return extract<0>(s32);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int32_t i_reduce_add(const int16x16& a)
{
    int16x16 ones = make_uint(1);
    int32x8 sum = _mm256_madd_epi16(a.native(), ones.native());
    return reduce_add(sum);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL int32_t i_reduce_add(const int16<32>& a)
{
    int16<32> ones = make_uint(1);
    int32<16> sum = _mm512_madd_epi16(a.native(), ones.native());
    return reduce_add(sum);
}
#endif

template<unsigned N>
SIMDPP_INL int32_t i_reduce_add(const int16<N>& a)
{
#if SIMDPP_USE_NULL
    int32_t r = 0;
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r += a.vec(j).el(i);
        }
    }
    return r;
#elif SIMDPP_USE_AVX512BW
    int32<16> sum = make_zero();
    int16<32> ones = make_int(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int32<16> isum = _mm512_madd_epi16(a.vec(j).native(), ones.native());
        sum = add(sum, isum);
    }
    return reduce_add(sum);
#elif SIMDPP_USE_AVX2
    int32x8 sum = make_zero();
    int16x16 ones = make_int(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int32x8 isum = _mm256_madd_epi16(a.vec(j).native(), ones.native());
        sum = add(sum, isum);
    }
    return reduce_add(sum);
#elif SIMDPP_USE_XOP
    int32x4 sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int32x4 isum = _mm_haddq_epi16(a.vec(j).native());
        sum = add(sum, isum);
    }
    // _mm_haddq_epi16 computes 64-bit results, so when the vector is viewed as
    // 32-bit elements, elements 1 and 3 may be nonzero
    sum = add(sum, move4_l<2>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_SSE2
    int32x4 sum = make_zero();
    int16x8 ones = make_int(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int32x4 isum = _mm_madd_epi16(a.vec(j).native(), ones.native());
        sum = add(sum, isum);
    }
    return reduce_add(sum);
#elif SIMDPP_USE_NEON
    int32x4 sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int32x4 isum = vpaddlq_s16(a.vec(j).native());
        sum = add(sum, isum);
    }
    return reduce_add(sum);
#elif SIMDPP_USE_ALTIVEC
    int32x4 sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        sum = vec_sum4s(a.vec(j).native(), sum.native());
    }
    return reduce_add(sum);
#elif SIMDPP_USE_MSA
    int32<4> r = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int32<4> sum = __msa_hadd_s_w(a.vec(j).native(),
                                      a.vec(j).native());
        r = add(r, sum);
    }
    r = (int64<2>) __msa_hadd_s_d(r.native(), r.native());
    r = add(r, move4_l<2>(r));
    return extract<0>(r);
#endif
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint32_t i_reduce_add(const uint32x4& a)
{
#if SIMDPP_USE_NULL
    uint32_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r += a.el(i);
    }
    return r;
#elif SIMDPP_USE_MSA
    uint32x4 sum = a;
    sum = (uint64<2>) __msa_hadd_u_d(sum.native(), sum.native());
    sum = add(sum, move4_l<2>(sum));
    return extract<0>(sum);
#else
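    // Generic reduction: shift the vector down by two elements and add, then
    // by one element and add, leaving the total in element 0.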
    uint32x4 sum = a;
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32_t i_reduce_add(const uint32x8& a)
{
    uint32x4 sum = add(detail::extract128<0>(a), detail::extract128<1>(a));
    sum = add(sum, move4_l<2>(sum));
    sum = add(sum, move4_l<1>(sum));
    return extract<0>(sum);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint32_t i_reduce_add(const uint32<16>& a)
{
    return i_reduce_add(add(extract256<0>(a), extract256<1>(a)));
}
#endif

template<unsigned N>
SIMDPP_INL uint32_t i_reduce_add(const uint32<N>& a)
{
#if SIMDPP_USE_NULL
    uint32_t r = 0;
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r += a.vec(j).el(i);
        }
    }
    return r;
#else
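    // Add the sub-vectors element-wise first, then do a single horizontal
    // reduction of the accumulated vector.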
    uint32v sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        sum = add(sum, a.vec(j));
    }
    return i_reduce_add(sum);
#endif
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint64_t i_reduce_add(const uint64x2& a)
{
#if SIMDPP_USE_NULL
    uint64_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r += a.el(i);
    }
    return r;
#elif SIMDPP_USE_SSE2
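    // Only two elements: add the high element to the low one and extract it.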
    uint64x2 sum = a;
    sum = add(sum, move2_l<1>(sum));
    return extract<0>(sum);
#elif SIMDPP_USE_NEON
    uint64x1_t r = vadd_u64(vget_low_u64(a.native()),
                            vget_high_u64(a.native()));
    return vget_lane_u64(r, 0);
#elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    return extract<0>(a) + extract<1>(a);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint64_t i_reduce_add(const uint64x4& a)
{
    uint64x2 sum = add(detail::extract128<0>(a), detail::extract128<1>(a));
    sum = add(sum, move2_l<1>(sum));
    return extract<0>(sum);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint64_t i_reduce_add(const uint64<8>& a)
{
    return i_reduce_add(add(extract256<0>(a), extract256<1>(a)));
}
#endif

template<unsigned N>
SIMDPP_INL uint64_t i_reduce_add(const uint64<N>& a)
{
#if SIMDPP_USE_NULL || (SIMDPP_USE_ALTIVEC && !SIMDPP_USE_VSX_207)
    uint64_t r = 0;
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r += a.vec(j).el(i);
        }
    }
    return r;
#else
    uint64v sum = make_zero();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        sum = add(sum, a.vec(j));
    }
    return i_reduce_add(sum);
#endif
}

// -----------------------------------------------------------------------------


} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif