1 /* Vectorized routines for x86 AVX-512 instructions.
2 *
3 * This header file, unusually, provides many complete function
4 * implementations so they can be inlined by the compiler.
5 *
6 * Contents:
7 * 1. Function declarations for esl_avx512.c
8 * 2. Inlined functions: horizontal max, sum
9 * 3. Inlined functions: left, right shift
10 */
11 #ifndef eslAVX512_INCLUDED
12 #define eslAVX512_INCLUDED
13 #include "esl_config.h"
14 #ifdef eslENABLE_AVX512
15
16 #include "easel.h"
17
18 #include <stdio.h>
19 #include <x86intrin.h>
20
21 /*****************************************************************
22 * 1. Function declarations for esl_avx512.c
23 *****************************************************************/
24
/* Declared here; per the section heading above, the definition lives in
 * esl_avx512.c. Takes one 512-bit integer vector by value and returns nothing.
 * NOTE(review): judging by the name, this presumably prints <v> as hex bytes
 * for debugging -- confirm against the implementation.
 */
extern void esl_avx512_dump_512i_hex8(__m512i v);
26
27
28 /*****************************************************************
29 * 2. Inlined functions: horizontal max, sum
30 *****************************************************************/
31
/* Function:  esl_avx512_hmax_epu8()
 * Synopsis:  Return the max of the 64 uint8_t elements in an epu8 vector.
 *
 * Purpose:   Horizontal unsigned-byte maximum of <a>. The two 256-bit
 *            halves of <a> are combined first; the rest of the reduction
 *            uses 256-bit intrinsics, each step pairing every element
 *            with a partner it has not yet been compared against.
 *
 * Returns:   the largest unsigned byte found in <a>.
 *
 * Note:      Rationale recorded by the original author for dropping to
 *            256-bit operations: AVX-512 can't extract 8-bit quantities,
 *            and Intel has stated that there is no performance penalty
 *            for switching between AVX-512 and AVX.
 */
static inline uint8_t
esl_avx512_hmax_epu8(__m512i a)
{
  __m256i lo   = _mm512_extracti32x8_epi32(a, 0);
  __m256i hi   = _mm512_extracti32x8_epi32(a, 1);
  __m256i vmax = _mm256_max_epu8(lo, hi);               // low half vs. high half of <a>
  __m256i peer;

  peer = _mm256_permute2x128_si256(vmax, vmax, 0x01);   // swap the two 128-bit lanes
  vmax = _mm256_max_epu8(vmax, peer);
  peer = _mm256_shuffle_epi32(vmax, 0x4e);              // swap 64-bit halves within each lane
  vmax = _mm256_max_epu8(vmax, peer);
  peer = _mm256_shuffle_epi32(vmax, 0xb1);              // swap adjacent 32-bit elements
  vmax = _mm256_max_epu8(vmax, peer);
  peer = _mm256_shufflelo_epi16(vmax, 0xb1);            // swap adjacent 16-bit words in each lane's low 64 bits
  vmax = _mm256_max_epu8(vmax, peer);
  peer = _mm256_srli_si256(vmax, 1);                    // move byte 1 down onto byte 0
  vmax = _mm256_max_epu8(vmax, peer);

  // The extract intrinsic hands back an int; narrowing to uint8_t keeps the byte.
  return (uint8_t) _mm256_extract_epi8(vmax, 0);
}
48
/* Function:  esl_avx512_hmax_epi8()
 * Synopsis:  Return the max of the 64 signed int8_t elements in an epi8 vector.
 * Incept:    SRE, Thu May 25 13:20:45 2017 [Old 97's, Oppenheimer]
 *
 * Purpose:   Horizontal signed-byte maximum of <a>. The two 256-bit
 *            halves of <a> are combined first; the rest of the reduction
 *            uses 256-bit intrinsics, each step pairing every element
 *            with a partner it has not yet been compared against.
 *
 * Returns:   the largest signed byte found in <a>.
 */
static inline int8_t
esl_avx512_hmax_epi8(__m512i a)
{
  __m256i lo   = _mm512_extracti32x8_epi32(a, 0);
  __m256i hi   = _mm512_extracti32x8_epi32(a, 1);
  __m256i vmax = _mm256_max_epi8(lo, hi);               // low half vs. high half of <a>
  __m256i peer;

  peer = _mm256_permute2x128_si256(vmax, vmax, 0x01);   // swap the two 128-bit lanes
  vmax = _mm256_max_epi8(vmax, peer);
  peer = _mm256_shuffle_epi32(vmax, 0x4e);              // swap 64-bit halves within each lane
  vmax = _mm256_max_epi8(vmax, peer);
  peer = _mm256_shuffle_epi32(vmax, 0xb1);              // swap adjacent 32-bit elements
  vmax = _mm256_max_epi8(vmax, peer);
  peer = _mm256_shufflelo_epi16(vmax, 0xb1);            // swap adjacent 16-bit words in each lane's low 64 bits
  vmax = _mm256_max_epi8(vmax, peer);
  peer = _mm256_srli_si256(vmax, 1);                    // move byte 1 down onto byte 0
  vmax = _mm256_max_epi8(vmax, peer);

  // The extract intrinsic hands back an int; the conversion to int8_t happens here.
  return (int8_t) _mm256_extract_epi8(vmax, 0);
}
64
/* Function:  esl_avx512_hmax_epi16()
 * Synopsis:  Return the max of the 32 signed int16_t elements in an epi16 vector.
 *
 * Purpose:   Horizontal signed 16-bit maximum of <a>. The two 256-bit
 *            halves of <a> are combined first; the rest of the reduction
 *            uses 256-bit intrinsics, each step pairing every element
 *            with a partner it has not yet been compared against.
 *
 * Returns:   the largest signed 16-bit element found in <a>.
 */
static inline int16_t
esl_avx512_hmax_epi16(__m512i a)
{
  __m256i lo   = _mm512_extracti32x8_epi32(a, 0);
  __m256i hi   = _mm512_extracti32x8_epi32(a, 1);
  __m256i vmax = _mm256_max_epi16(lo, hi);              // low half vs. high half of <a>
  __m256i peer;

  peer = _mm256_permute2x128_si256(vmax, vmax, 0x01);   // swap the two 128-bit lanes
  vmax = _mm256_max_epi16(vmax, peer);
  peer = _mm256_shuffle_epi32(vmax, 0x4e);              // swap 64-bit halves within each lane
  vmax = _mm256_max_epi16(vmax, peer);
  peer = _mm256_shuffle_epi32(vmax, 0xb1);              // swap adjacent 32-bit elements
  vmax = _mm256_max_epi16(vmax, peer);
  peer = _mm256_shufflelo_epi16(vmax, 0xb1);            // swap adjacent 16-bit words in each lane's low 64 bits
  vmax = _mm256_max_epi16(vmax, peer);

  // The extract intrinsic hands back an int; the conversion to int16_t happens here.
  return (int16_t) _mm256_extract_epi16(vmax, 0);
}
78
79
/* Function:  esl_avx512_hsum_ps()
 * Synopsis:  Sum the 16 floats in an __m512 vector; result in <*ret_sum>.
 *
 * Purpose:   Compute the sum of the sixteen 32-bit float elements of <a>
 *            with four swap-and-add steps. After the last step every
 *            element of the working vector holds the full sum, and
 *            element 0 is written to <*ret_sum>.
 *
 * Args:      a       - vector of 16 floats to sum
 *            ret_sum - RETURN: the horizontal sum; must point to a valid float
 *
 * Note:      The result is read out as a float with
 *            _mm_cvtss_f32(_mm512_castps512_ps128()). An earlier version
 *            stored it through an <int *> aliasing <ret_sum>, which
 *            violates C's effective-type (strict aliasing) rules.
 */
static inline void
esl_avx512_hsum_ps(__m512 a, float *ret_sum)
{
  __m512 swapped;
  __m512 sum;

  swapped = _mm512_shuffle_f32x4(a, a, 0x4e);        // swap the high and low 256-bit halves of a
  sum     = _mm512_add_ps(a, swapped);               // add corresponding floats of the two halves

  swapped = _mm512_shuffle_f32x4(sum, sum, 0xb1);    // swap the two 128-bit quarters inside each half
  sum     = _mm512_add_ps(sum, swapped);

  swapped = _mm512_shuffle_ps(sum, sum, 0x4e);       // swap the two 64-bit eighths inside each quarter
  sum     = _mm512_add_ps(sum, swapped);

  swapped = _mm512_shuffle_ps(sum, sum, 0xb1);       // swap adjacent floats inside each eighth
  sum     = _mm512_add_ps(sum, swapped);             // every element now holds the total

  // The 512->128 cast only reinterprets the low lane; cvtss reads its element 0.
  *ret_sum = _mm_cvtss_f32(_mm512_castps512_ps128(sum));
}
108
109
110 /*****************************************************************
111 * 3. Inlined functions: left and right shifts
112 *****************************************************************/
113
/* Function:  esl_avx512_rightshift_int8()
 * Synopsis:  Shift int8 vector elements to the right, shifting -inf on.
 * Incept:    SRE, Sun Jun  4 17:43:14 2017
 * See:       esl_sse.h::esl_sse_rightshift_int8()
 *
 * Purpose:   Move every byte of <v> up by one element position
 *            (element i goes to i+1; the top byte is dropped and
 *            element 0 becomes zero), then OR the result with
 *            <neginfmask>.
 *
 * Args:      v          - vector of 64 int8 elements
 *            neginfmask - OR'ed onto the shifted vector. Presumably holds
 *                         the -inf byte in element 0 and zeros elsewhere;
 *                         confirm against callers.
 *
 * Returns:   the shifted vector OR'ed with <neginfmask>.
 */
static inline __m512i
esl_avx512_rightshift_int8(__m512i v, __m512i neginfmask)
{
  // Similar to AVX logic, but complicated by lack of permute2x128 instruction.
  // carry = [ 0, lane0, lane1, lane2 ] of v: each 128-bit lane sees its lower
  // neighbor, and the zero-mask clears lane 0 so that zeros are shifted in.
  __m512i carry   = _mm512_maskz_shuffle_i32x4(0xfff0, v, v, 0x90);

  // Per lane: take the top byte of carry followed by the low 15 bytes of v.
  __m512i shifted = _mm512_alignr_epi8(v, carry, 15);

  return _mm512_or_si512(shifted, neginfmask);
}
126
127
/* Function:  esl_avx512_rightshift_int16()
 * Synopsis:  Shift int16 vector elements to the right, shifting -inf on.
 * Incept:    SRE, Sun Jun  4 17:49:02 2017
 * See:       esl_sse.h::esl_sse_rightshift_int16()
 *
 * Purpose:   Move every 16-bit element of <v> up by one element position
 *            (element i goes to i+1; the top element is dropped and
 *            element 0 becomes zero), then OR the result with
 *            <neginfmask>.
 *
 * Args:      v          - vector of 32 int16 elements
 *            neginfmask - OR'ed onto the shifted vector. Presumably holds
 *                         the -inf value in element 0 and zeros elsewhere;
 *                         confirm against callers.
 *
 * Returns:   the shifted vector OR'ed with <neginfmask>.
 */
static inline __m512i
esl_avx512_rightshift_int16(__m512i v, __m512i neginfmask)
{
  // carry = [ 0, lane0, lane1, lane2 ] of v: each 128-bit lane sees its lower
  // neighbor, and the zero-mask clears lane 0 so that zeros are shifted in.
  __m512i carry   = _mm512_maskz_shuffle_i32x4(0xfff0, v, v, 0x90);

  // Per lane: take the top 2 bytes of carry followed by the low 14 bytes of v.
  __m512i shifted = _mm512_alignr_epi8(v, carry, 14);

  return _mm512_or_si512(shifted, neginfmask);
}
139
140
141
/* Function:  esl_avx512_rightshiftz_float()
 * Synopsis:  Shift float vector elements to the right, shifting zero on.
 * Incept:    SRE, Sun Jun  4 17:59:54 2017
 * See:       esl_sse.h::esl_sse_rightshiftz_float()
 *
 * Purpose:   Move every float of <v> up by one element position
 *            (element i goes to i+1; the top element is dropped) and
 *            put all-zero bits, i.e. +0.0f, in element 0.
 *
 * Args:      v - vector of 16 floats
 *
 * Returns:   the shifted vector.
 *
 * Note:      The float/integer reinterpretations use the cast intrinsics
 *            rather than C-style casts between vector types, which are a
 *            compiler extension. The casts generate no instructions.
 */
static inline __m512
esl_avx512_rightshiftz_float(__m512 v)
{
  __m512i vi    = _mm512_castps_si512(v);

  // carry = [ 0, lane0, lane1, lane2 ] of v: each 128-bit lane sees its lower
  // neighbor, and the zero-mask clears lane 0 so that zero is shifted in.
  __m512i carry = _mm512_maskz_shuffle_i32x4(0xfff0, vi, vi, 0x90);

  // Per lane: take the top 4 bytes of carry followed by the low 12 bytes of v.
  return _mm512_castsi512_ps(_mm512_alignr_epi8(vi, carry, 12));
}
152
/* Function:  esl_avx512_leftshiftz_float()
 * Synopsis:  Shift float vector elements to the left, shifting zero on.
 * Incept:    SRE, Sun Jun  4 18:04:34 2017
 * See:       esl_sse.h::esl_sse_leftshiftz_float()
 *
 * Purpose:   Move every float of <v> down by one element position
 *            (element i goes to i-1; element 0 is dropped) and put
 *            all-zero bits, i.e. +0.0f, in the top element.
 *
 * Args:      v - vector of 16 floats
 *
 * Returns:   the shifted vector.
 *
 * Note:      The float/integer reinterpretations use the cast intrinsics
 *            rather than C-style casts between vector types, which are a
 *            compiler extension. The casts generate no instructions.
 */
static inline __m512
esl_avx512_leftshiftz_float(__m512 v)
{
  __m512i vi    = _mm512_castps_si512(v);

  // carry = [ lane1, lane2, lane3, 0 ] of v: each 128-bit lane sees its upper
  // neighbor, and the zero-mask clears lane 3 so that zero is shifted in.
  __m512i carry = _mm512_maskz_shuffle_i32x4(0x0fff, vi, vi, 0x39);

  // Per lane: take the high 12 bytes of v followed by the low 4 bytes of carry.
  return _mm512_castsi512_ps(_mm512_alignr_epi8(carry, vi, 4));
}
#endif // eslENABLE_AVX512
#endif // eslAVX512_INCLUDED
165