1 /* Vectorized routines for x86 AVX-512 instructions.
2  *
3  * This header file, unusually, provides many complete function
4  * implementations so they can be inlined by the compiler.
5  *
6  * Contents:
7  *    1. Function declarations for esl_avx512.c
8  *    2. Inlined functions: horizontal max, sum
9  *    3. Inlined functions: left, right shift
10  */
11 #ifndef eslAVX512_INCLUDED
12 #define eslAVX512_INCLUDED
13 #include "esl_config.h"
14 #ifdef  eslENABLE_AVX512
15 
16 #include "easel.h"
17 
18 #include <stdio.h>
19 #include <x86intrin.h>
20 
21 /*****************************************************************
22  * 1. Function declarations for esl_avx512.c
23  *****************************************************************/
24 
25 extern void esl_avx512_dump_512i_hex8(__m512i v);
26 
27 
28 /*****************************************************************
29  * 2. Inlined functions: horizontal max, sum
30  *****************************************************************/
31 
/* Function:  esl_avx512_hmax_epu8()
 * Synopsis:  Return max of 64 unsigned uint8_t elements in epu8 vector.
 *
 * Purpose:   Horizontal maximum over the 64 unsigned bytes of <a>.
 *            The vector is folded in half repeatedly, taking the
 *            elementwise unsigned max at each step, until the answer
 *            sits in byte 0.
 *
 *            After the first fold (512 -> 256 bits) the work is done
 *            with 256-bit AVX2 operations, because AVX-512 offers no
 *            8-bit element extract. Intel has stated that mixing
 *            AVX-512 and AVX code carries no switching penalty.
 *
 * Returns:   the largest element of <a>.
 */
static inline uint8_t
esl_avx512_hmax_epu8(__m512i a)
{
  const __m256i lower = _mm512_extracti32x8_epi32(a, 0);
  const __m256i upper = _mm512_extracti32x8_epi32(a, 1);
  __m256i       vmax  = _mm256_max_epu8(lower, upper);                                   // 64 -> 32 candidates

  vmax = _mm256_max_epu8(vmax, _mm256_permute2x128_si256(vmax, vmax, 0x01));             // swap 128-bit lanes
  vmax = _mm256_max_epu8(vmax, _mm256_shuffle_epi32     (vmax, _MM_SHUFFLE(1,0,3,2)));   // swap 64-bit halves of each lane
  vmax = _mm256_max_epu8(vmax, _mm256_shuffle_epi32     (vmax, _MM_SHUFFLE(2,3,0,1)));   // swap adjacent 32-bit words
  vmax = _mm256_max_epu8(vmax, _mm256_shufflelo_epi16   (vmax, _MM_SHUFFLE(2,3,0,1)));   // swap adjacent 16-bit words (low 64 bits)
  vmax = _mm256_max_epu8(vmax, _mm256_srli_si256        (vmax, 1));                      // bring byte 1 down onto byte 0

  // The extract intrinsic yields an int; narrowing to uint8_t keeps the low byte.
  return (uint8_t) _mm256_extract_epi8(vmax, 0);
}
48 
/* Function:  esl_avx512_hmax_epi8()
 * Synopsis:  Return max of 64 signed int8_t elements in epi8 vector.
 * Incept:    SRE, Thu May 25 13:20:45 2017 [Old 97's, Oppenheimer]
 *
 * Purpose:   Horizontal maximum over the 64 signed bytes of <a>.
 *            The vector is folded in half repeatedly, taking the
 *            elementwise signed max at each step, until the answer
 *            sits in byte 0. After the first fold (512 -> 256 bits)
 *            all steps are 256-bit AVX2 operations.
 *
 * Returns:   the largest element of <a>.
 */
static inline int8_t
esl_avx512_hmax_epi8(__m512i a)
{
  const __m256i lower = _mm512_extracti32x8_epi32(a, 0);
  const __m256i upper = _mm512_extracti32x8_epi32(a, 1);
  __m256i       vmax  = _mm256_max_epi8(lower, upper);                                   // 64 -> 32 candidates

  vmax = _mm256_max_epi8(vmax, _mm256_permute2x128_si256(vmax, vmax, 0x01));             // swap 128-bit lanes
  vmax = _mm256_max_epi8(vmax, _mm256_shuffle_epi32     (vmax, _MM_SHUFFLE(1,0,3,2)));   // swap 64-bit halves of each lane
  vmax = _mm256_max_epi8(vmax, _mm256_shuffle_epi32     (vmax, _MM_SHUFFLE(2,3,0,1)));   // swap adjacent 32-bit words
  vmax = _mm256_max_epi8(vmax, _mm256_shufflelo_epi16   (vmax, _MM_SHUFFLE(2,3,0,1)));   // swap adjacent 16-bit words (low 64 bits)
  vmax = _mm256_max_epi8(vmax, _mm256_srli_si256        (vmax, 1));                      // bring byte 1 down onto byte 0

  // The extract intrinsic yields an int; narrowing to int8_t keeps the low byte.
  return (int8_t) _mm256_extract_epi8(vmax, 0);
}
64 
/* Function:  esl_avx512_hmax_epi16()
 * Synopsis:  Return max of 32 signed int16_t elements in epi16 vector.
 *
 * Purpose:   Horizontal maximum over the 32 signed 16-bit elements
 *            of <a>. The vector is folded in half repeatedly, taking
 *            the elementwise signed max at each step, until the
 *            answer sits in element 0. After the first fold
 *            (512 -> 256 bits) all steps are 256-bit AVX2 operations.
 *
 * Returns:   the largest element of <a>.
 */
static inline int16_t
esl_avx512_hmax_epi16(__m512i a)
{
  const __m256i lower = _mm512_extracti32x8_epi32(a, 0);
  const __m256i upper = _mm512_extracti32x8_epi32(a, 1);
  __m256i       vmax  = _mm256_max_epi16(lower, upper);                                   // 32 -> 16 candidates

  vmax = _mm256_max_epi16(vmax, _mm256_permute2x128_si256(vmax, vmax, 0x01));             // swap 128-bit lanes
  vmax = _mm256_max_epi16(vmax, _mm256_shuffle_epi32     (vmax, _MM_SHUFFLE(1,0,3,2)));   // swap 64-bit halves of each lane
  vmax = _mm256_max_epi16(vmax, _mm256_shuffle_epi32     (vmax, _MM_SHUFFLE(2,3,0,1)));   // swap adjacent 32-bit words
  vmax = _mm256_max_epi16(vmax, _mm256_shufflelo_epi16   (vmax, _MM_SHUFFLE(2,3,0,1)));   // swap adjacent 16-bit words (low 64 bits)

  // The extract intrinsic yields an int; narrowing to int16_t keeps the low 16 bits.
  return (int16_t) _mm256_extract_epi16(vmax, 0);
}
78 
79 
/* Function:  esl_avx512_hsum_ps()
 * Synopsis:  Sum the 16 float elements of an __m512 vector.
 *
 * Purpose:   Compute the horizontal sum of the sixteen 32-bit floats
 *            in <a> and store it in <*ret_sum>.
 *
 *            The sum is built by four swap-and-add steps, each of
 *            which halves the number of distinct partial sums; after
 *            the last step every element of the working vector holds
 *            the total, and element 0 is stored.
 *
 * Args:      a       - vector whose elements are summed
 *            ret_sum - RETURN: the sum. Dereferenced unconditionally;
 *                      must not be NULL.
 */
static inline void
esl_avx512_hsum_ps(__m512 a, float *ret_sum)
{
  __m512 swapped;
  __m512 sum;

  swapped = _mm512_shuffle_f32x4(a, a, 0x4e);       // swap the high and low 256-bit halves
  sum     = _mm512_add_ps(a, swapped);

  swapped = _mm512_shuffle_f32x4(sum, sum, 0xb1);   // swap adjacent 128-bit lanes within each half
  sum     = _mm512_add_ps(sum, swapped);

  swapped = _mm512_shuffle_ps(sum, sum, 0x4e);      // swap the 64-bit halves of each 128-bit lane
  sum     = _mm512_add_ps(sum, swapped);

  swapped = _mm512_shuffle_ps(sum, sum, 0xb1);      // swap adjacent floats
  sum     = _mm512_add_ps(sum, swapped);            // every element now holds the full sum

  // Read element 0 as a float directly. Storing it through an (int *)
  // alias of <ret_sum> would violate C's strict-aliasing rules.
  *ret_sum = _mm_cvtss_f32(_mm512_castps512_ps128(sum));
}
108 
109 
110 /*****************************************************************
111  * 3. Inlined functions: left and right shifts
112  *****************************************************************/
113 
/* Function:  esl_avx512_rightshift_int8()
 * Synopsis:  Shift int8 vector elements to the right, shifting -inf on.
 * Incept:    SRE, Sun Jun  4 17:43:14 2017
 * See:       esl_sse.h::esl_sse_rightshift_int8()
 *
 * Purpose:   Move every 8-bit element of <v> up by one position
 *            (element i goes to i+1; the top element is dropped),
 *            leaving zero in element 0, then OR the result with
 *            <neginfmask>.
 *
 * Args:      v          - vector to shift
 *            neginfmask - OR'ed onto the shifted vector. Presumably
 *                         holds the -inf value in element 0 and zeros
 *                         elsewhere (set by the caller; not visible here).
 *
 * Returns:   the shifted, masked vector.
 */
static inline __m512i
esl_avx512_rightshift_int8(__m512i v, __m512i neginfmask)
{
  // Similar to the AVX version, but there is no 512-bit permute2x128, so
  // build a copy of <v> with its 128-bit lanes moved up by one and lane 0
  // zeroed: [v0,v1,v2,v3] -> [0,v0,v1,v2].
  const __m512i prev_lanes = _mm512_maskz_shuffle_i32x4(0xfff0, v, v, _MM_SHUFFLE(2,1,0,0));

  // Per 128-bit lane: take the top byte of the previous lane followed by
  // the low 15 bytes of the current lane.
  const __m512i shifted = _mm512_alignr_epi8(v, prev_lanes, 15);

  return _mm512_or_si512(shifted, neginfmask);
}
126 
127 
/* Function:  esl_avx512_rightshift_int16()
 * Synopsis:  Shift int16 vector elements to the right, shifting -inf on.
 * Incept:    SRE, Sun Jun  4 17:49:02 2017
 * See:       esl_sse.h::esl_sse_rightshift_int16()
 *
 * Purpose:   Move every 16-bit element of <v> up by one position
 *            (element i goes to i+1; the top element is dropped),
 *            leaving zero in element 0, then OR the result with
 *            <neginfmask>.
 *
 * Args:      v          - vector to shift
 *            neginfmask - OR'ed onto the shifted vector. Presumably
 *                         holds the -inf value in element 0 and zeros
 *                         elsewhere (set by the caller; not visible here).
 *
 * Returns:   the shifted, masked vector.
 */
static inline __m512i
esl_avx512_rightshift_int16(__m512i v, __m512i neginfmask)
{
  // Copy of <v> with 128-bit lanes moved up by one and lane 0 zeroed:
  // [v0,v1,v2,v3] -> [0,v0,v1,v2].
  const __m512i prev_lanes = _mm512_maskz_shuffle_i32x4(0xfff0, v, v, _MM_SHUFFLE(2,1,0,0));

  // Per 128-bit lane: take the top 2 bytes of the previous lane followed
  // by the low 14 bytes of the current lane.
  const __m512i shifted = _mm512_alignr_epi8(v, prev_lanes, 14);

  return _mm512_or_si512(shifted, neginfmask);
}
139 
140 
141 
/* Function:  esl_avx512_rightshiftz_float()
 * Synopsis:  Shift float vector elements to the right, shifting zero on.
 * Incept:    SRE, Sun Jun  4 17:59:54 2017
 * See:       esl_sse.h::esl_sse_rightshiftz_float()
 *
 * Purpose:   Move every 32-bit float of <v> up by one position
 *            (element i goes to i+1; the top element is dropped) and
 *            place zero bits in element 0.
 *
 * Returns:   the shifted vector.
 */
static inline __m512
esl_avx512_rightshiftz_float(__m512 v)
{
  // The shuffle/align operations are integer instructions; reinterpret the bits.
  const __m512i bits = _mm512_castps_si512(v);

  // Copy with 128-bit lanes moved up by one and lane 0 zeroed:
  // [v0,v1,v2,v3] -> [0,v0,v1,v2].
  const __m512i prev_lanes = _mm512_maskz_shuffle_i32x4(0xfff0, bits, bits, _MM_SHUFFLE(2,1,0,0));

  // Per 128-bit lane: take the top float of the previous lane followed by
  // the low three floats (12 bytes) of the current lane.
  const __m512i shifted = _mm512_alignr_epi8(bits, prev_lanes, 12);

  return _mm512_castsi512_ps(shifted);
}
152 
/* Function:  esl_avx512_leftshiftz_float()
 * Synopsis:  Shift float vector elements to the left, shifting zero on.
 * Incept:    SRE, Sun Jun  4 18:04:34 2017
 * See:       esl_sse.h::esl_sse_leftshiftz_float()
 *
 * Purpose:   Move every 32-bit float of <v> down by one position
 *            (element i goes to i-1; element 0 is dropped) and place
 *            zero bits in the top element.
 *
 * Returns:   the shifted vector.
 */
static inline __m512
esl_avx512_leftshiftz_float(__m512 v)
{
  // The shuffle/align operations are integer instructions; reinterpret the bits.
  const __m512i bits = _mm512_castps_si512(v);

  // Copy with 128-bit lanes moved down by one and the top lane zeroed:
  // [v0,v1,v2,v3] -> [v1,v2,v3,0].
  const __m512i next_lanes = _mm512_maskz_shuffle_i32x4(0x0fff, bits, bits, _MM_SHUFFLE(0,3,2,1));

  // Per 128-bit lane: take the upper three floats of the current lane
  // followed by the lowest float of the next lane.
  const __m512i shifted = _mm512_alignr_epi8(next_lanes, bits, 4);

  return _mm512_castsi512_ps(shifted);
}
#endif //eslENABLE_AVX512
#endif //eslAVX512_INCLUDED
165