/* need a new enough GCC (or Clang) for AVX512 support */
#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))

#define HAVE_DASUM_KERNEL 1

#include <immintrin.h>

#include <stdint.h>

#ifndef ABS_K
#define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
#endif

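/*
 * dasum: sum of absolute values of a contiguous double-precision vector.
 * Strategy: peel off leading elements until the pointer is 64-byte aligned,
 * sum the bulk with AVX-512, handle a medium tail with SSE, and finish the
 * last few elements with scalar code.
 */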
static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1)
{
    BLASLONG i = 0;
    FLOAT sumf = 0.0;

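    /* Align x1 to a 64-byte boundary so the AVX-512 loop below can use
     * aligned loads; only worth doing when the AVX-512 path will run. */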
    if (n >= 256) {
        /* number of doubles (0..7) up to the next 64-byte boundary */
        BLASLONG align_512 = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 3) & 0x7;

        for (i = 0; i < align_512; i++) {
            sumf += ABS_K(x1[i]);
        }

        n -= align_512;
        x1 += align_512;
    }

    BLASLONG tail_index_SSE = n & (~7);       /* largest multiple of   8 <= n */
    BLASLONG tail_index_AVX512 = n & (~255);  /* largest multiple of 256 <= n */

    /* AVX-512 stage: four independent accumulators, 32 doubles per iteration */
    if (n >= 256) {

        __m512d accum_0, accum_1, accum_2, accum_3;
        accum_0 = _mm512_setzero_pd();
        accum_1 = _mm512_setzero_pd();
        accum_2 = _mm512_setzero_pd();
        accum_3 = _mm512_setzero_pd();
        for (i = 0; i < tail_index_AVX512; i += 32) {
            accum_0 = _mm512_add_pd(accum_0, _mm512_abs_pd(_mm512_load_pd(&x1[i + 0])));
            accum_1 = _mm512_add_pd(accum_1, _mm512_abs_pd(_mm512_load_pd(&x1[i + 8])));
            accum_2 = _mm512_add_pd(accum_2, _mm512_abs_pd(_mm512_load_pd(&x1[i + 16])));
            accum_3 = _mm512_add_pd(accum_3, _mm512_abs_pd(_mm512_load_pd(&x1[i + 24])));
        }

        /* fold the four partial vectors, then reduce across the eight lanes */
        accum_0 = _mm512_add_pd(_mm512_add_pd(accum_0, accum_1), _mm512_add_pd(accum_2, accum_3));
        sumf += _mm512_reduce_add_pd(accum_0);
    }

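    /* SSE stage: absolute value via a sign-bit mask, 8 doubles per iteration
     * spread across four 128-bit accumulators */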
    if (n >= 8) {
        __m128d accum_20, accum_21, accum_22, accum_23;
        accum_20 = _mm_setzero_pd();
        accum_21 = _mm_setzero_pd();
        accum_22 = _mm_setzero_pd();
        accum_23 = _mm_setzero_pd();

        /* clearing the IEEE-754 sign bit yields fabs() */
        __m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff);
        for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) {
            accum_20 = _mm_add_pd(accum_20, _mm_castsi128_pd(_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2)));
            accum_21 = _mm_add_pd(accum_21, _mm_castsi128_pd(_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 2]), abs_mask2)));
            accum_22 = _mm_add_pd(accum_22, _mm_castsi128_pd(_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2)));
            accum_23 = _mm_add_pd(accum_23, _mm_castsi128_pd(_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 6]), abs_mask2)));
        }

        accum_20 = _mm_add_pd(_mm_add_pd(accum_20, accum_21), _mm_add_pd(accum_22, accum_23));
        __m128d half_accum20 = _mm_hadd_pd(accum_20, accum_20);

        sumf += _mm_cvtsd_f64(half_accum20);
    }

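    /* scalar cleanup for the remaining n % 8 elements */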
    for (i = tail_index_SSE; i < n; ++i) {
        sumf += ABS_K(x1[i]);
    }

    return sumf;
}
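
/*
 * Hypothetical sanity check (not part of the library build): assuming a
 * 64-bit configuration where FLOAT is double and BLASLONG is long,
 *
 *   double x[1000];
 *   for (int k = 0; k < 1000; k++) x[k] = (k % 2) ? -1.0 : 1.0;
 *   // dasum_kernel(1000, x) is expected to return 1000.0
 */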
#endif