/*
 * Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

#if ! defined(TARGET_X8664)
#error "mth_avx512helper.h must have TARGET_X8664 defined"
#endif

#ifndef	MTH_AVX512HELPER_H
#define	MTH_AVX512HELPER_H
/*
 * mth_avx512helper.h - helper macros for AVX512.
 *
 * Two objectives:
 * 1)	Provide AVX/AVX2 and SKYLAKE-AVX512 compatibility.
 * 	There are instructions in the AVX/AVX2 extensions that do not exist
 * 	with SKYLAKE-AVX512.  Create macros that provide functionality
 * 	identical to AVX/AVX2 with AVX512 - though using 512-bit registers.
 *
 * 	Example:
 * 	Vector compares in the AVX/AVX2 extensions set each element of the
 * 	resulting vector register to -1 (32 or 64-bit) where the comparison
 * 	is true.  AVX512 instead returns the result of the compare in a
 * 	K (mask) register.  So extend _mm256_cmpeq_epi32(a,b) to
 * 	_MM512_CMPEQ_EPI32 as:
 *	(__m512i) _mm512_maskz_set1_epi32(_mm512_cmpeq_epi32_mask(a, b), -1)
 *
 *
 * 2)	Provide KNL and SKYLAKE-AVX512 compatibility.
 *	Another complication is that we currently build a common source,
 *	and thus a common set of AVX512 instructions, for both KNL (CPUID
 *	flag AVX512F only) and SKYLAKE-AVX512 (AVX512F plus the
 *	AVX512DQ/VL/BW extensions).
 *
 * 	Example:
 * 	KNL (AVX512F without AVX512DQ) does not have a "floating point"
 * 	boolean AND() instruction.
 * 	Extend _mm512_and_ps(a,b) to _MM512_AND_PS as:
 *	(__m512) _mm512_and_si512(_mm512_castps_si512(a),
 *				  _mm512_castps_si512(b))
 *
 *	Macro FCN_AVX512(name) is used to create unique (entry point) names
 *	with the suffix "_knl" or "_512", depending on whether KNL is
 *	targeted or not.
 *
 *	Note: Not every possible AVX/AVX2 intrinsic is currently defined.
 *	The helper macros are created as needed while porting the FMA3
 *	version of the math intrinsics source code.
 */
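/*
 * Usage sketch (hypothetical variables x256/y256/x512/y512): under AVX2
 * the compare itself produces the -1/0 vector; with the helper the
 * K-register result is widened back into a vector, so downstream mask
 * arithmetic ports unchanged.
 *
 *	__m256i sel256 = _mm256_cmpeq_epi32(x256, y256);   // AVX2
 *	__m512i sel512 = _MM512_CMPEQ_EPI32(x512, y512);   // this header
 *
 * Both yield element-wise -1 where the operands compare equal and 0
 * elsewhere.
 */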

/*
 * The following macros are used to have a common source between KNL and
 * SKYLAKE-AVX512.
 */
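/*
 * For example, with a hypothetical entry point __fs_exp:
 *
 *	__m512 FCN_AVX512(__fs_exp)(__m512 x);
 *
 * declares __fs_exp_knl() when targeting KNL and __fs_exp_512() when
 * targeting SKYLAKE-AVX512.
 */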

#if	defined(__knl) || defined(__knl__)
/*
 * 	KNL implementations.
 */
#define	FCN_AVX512(a)                                                         \
	a##_knl
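/*
 * AVX512F only provides the integer forms of the 512-bit boolean
 * operations (VPANDD/VPANDQ and friends); the packed floating-point
 * forms are AVX512DQ, which KNL lacks.  Emulate them by casting
 * through __m512i.
 */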
#define	_MM512_AND_PD(a, b)                                                   \
	(__m512d) _mm512_and_si512(_mm512_castpd_si512(a),                    \
				   _mm512_castpd_si512(b))

#define	_MM512_AND_PS(a, b)                                                   \
	(__m512) _mm512_and_si512(_mm512_castps_si512(a),                     \
				  _mm512_castps_si512(b))

#define	_MM512_ANDNOT_PD(a, b)                                                \
	(__m512d) _mm512_andnot_si512(_mm512_castpd_si512(a),                 \
				      _mm512_castpd_si512(b))

#define	_MM512_ANDNOT_PS(a, b)                                                \
	(__m512) _mm512_andnot_si512(_mm512_castps_si512(a),                  \
				     _mm512_castps_si512(b))

#define	_MM512_OR_PD(a, b)                                                    \
	(__m512d) _mm512_or_si512(_mm512_castpd_si512(a),                     \
				  _mm512_castpd_si512(b))

#define	_MM512_OR_PS(a, b)                                                    \
	(__m512) _mm512_or_si512(_mm512_castps_si512(a),                      \
				 _mm512_castps_si512(b))

#define	_MM512_XOR_PD(a, b)                                                   \
	(__m512d) _mm512_xor_si512(_mm512_castpd_si512(a),                    \
				   _mm512_castpd_si512(b))

#define	_MM512_XOR_PS(a, b)                                                   \
	(__m512) _mm512_xor_si512(_mm512_castps_si512(a),                     \
				  _mm512_castps_si512(b))

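/*
 * The 256-bit extract/insert forms with 32-bit granularity
 * (VEXTRACTF32X8 etc.) are AVX512DQ; on KNL use the AVX512F 64x4
 * forms with casts to move 256-bit halves of a 512-bit register.
 */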
#define	_MM512_EXTRACTF256_PS(a,b)                                            \
	(__m256) _mm512_extractf64x4_pd(_mm512_castps_pd(a),b)

#define	_MM512_INSERTF256_PS(a,b,c)                                           \
	(__m512) _mm512_insertf64x4(_mm512_castps_pd(a),                      \
				    _mm256_castps_pd(b),c)

#define	_MM512_EXTRACTI256_SI512(a,b)                                         \
	_mm512_extracti64x4_epi64(a,b)

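/*
 * VPMOVM2D/VPMOVM2Q (K register to -1/0 vector) are also AVX512DQ;
 * on KNL expand the mask with a zero-masked set1 instead.
 */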
#define	_MM512_MOVM_EPI32(a)                                                  \
	_mm512_maskz_set1_epi32(a,-1)

#define	_MM512_MOVM_EPI64(a)                                                  \
	_mm512_maskz_set1_epi64(a,-1)

#else		// #if	defined(__knl) || defined(__knl__)
/*
 * 	SKYLAKE-AVX512 implementations.
 */
#define	FCN_AVX512(a)                                                         \
	a##_512

#define	_MM512_AND_PS(a, b)                                                   \
	_mm512_and_ps(a, b)

#define	_MM512_AND_PD(a, b)                                                   \
	_mm512_and_pd(a, b)

#define	_MM512_ANDNOT_PS(a, b)                                                \
	_mm512_andnot_ps(a, b)

#define	_MM512_ANDNOT_PD(a, b)                                                \
	_mm512_andnot_pd(a, b)

#define	_MM512_OR_PS(a, b)                                                    \
	_mm512_or_ps(a, b)

#define	_MM512_OR_PD(a, b)                                                    \
	_mm512_or_pd(a, b)

#define	_MM512_XOR_PS(a, b)                                                   \
	_mm512_xor_ps(a, b)

#define	_MM512_XOR_PD(a, b)                                                   \
	_mm512_xor_pd(a, b)

#define	_MM512_EXTRACTF256_PS(a,b)                                            \
	(__m256) _mm512_extractf32x8_ps(a,b)

#define	_MM512_INSERTF256_PS(a,b,c)                                           \
	_mm512_insertf32x8(a,b,c)

#define	_MM512_EXTRACTI256_SI512(a,b)                                         \
	_mm512_extracti32x8_epi32(a,b)

#define	_MM512_MOVM_EPI32(a)                                                  \
	_mm512_movm_epi32(a)

#define	_MM512_MOVM_EPI64(a)                                                  \
	_mm512_movm_epi64(a)
#endif		// #if	defined(__knl) || defined(__knl__)


/*
 * The following macros are used to provide 512-bit compatibility with
 * intrinsics that only exist with AVX/AVX2.
 */
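/*
 * Porting sketch (hypothetical snippet): an FMA3/AVX2 select such as
 *
 *	__m256 t = _mm256_cmp_ps(x, y, _CMP_LT_OS);
 *	r = _mm256_blendv_ps(a, b, t);
 *
 * becomes, by substituting the 512-bit helpers below,
 *
 *	__m512 t = _MM512_CMP_PS(x, y, _CMP_LT_OS);
 *	r = _MM512_BLENDV_PS(a, b, t);
 */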

#define	_MM512_CMPEQ_EPI32(a, b)                                              \
	_MM512_MOVM_EPI32(_mm512_cmpeq_epi32_mask(a, b))

#define	_MM512_CMPEQ_PD(a, b)                                                 \
	_MM512_CMP_PD(a, b, _CMP_EQ_OQ)

#define	_MM512_CMPGT_EPI32(a, b)                                              \
	_MM512_MOVM_EPI32(_mm512_cmpgt_epi32_mask(a, b))

#define	_MM512_CMPEQ_EPI64(a, b)                                              \
	_MM512_MOVM_EPI64(_mm512_cmpeq_epi64_mask(a, b))

#define	_MM512_CMP_PS(a, b, c)                                                \
	(__m512) _MM512_MOVM_EPI32(_mm512_cmp_ps_mask(a, b, c))

#define	_MM512_CMP_PD(a, b, c)                                                \
	(__m512d) _MM512_MOVM_EPI64(_mm512_cmp_pd_mask(a, b, c))

#define	_MM512_BLEND_EPI32(a,b,m)                                             \
	_mm512_mask_blend_epi32(m,a,b)

#define	_MM512_BLEND_EPI64(a,b,m)                                             \
	_mm512_mask_blend_epi64(m,a,b)

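/*
 * BLENDV selects on the sign bit of each element of m, as the AVX
 * blendv instructions do.  The arithmetic right shift broadcasts the
 * sign bit across its element, and ternary-logic immediate 0xd8
 * computes (a & ~m) | (b & m), i.e. bitwise m ? b : a.
 */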
#define	_MM512_BLENDV_PS(a,b,m)                                               \
	(__m512) _mm512_ternarylogic_epi32(                                   \
		_mm512_castps_si512(a),                                       \
		_mm512_castps_si512(b),                                       \
		_mm512_srai_epi32(_mm512_castps_si512(m), 31),                \
		0xd8)

#define	_MM512_BLENDV_PD(a,b,m)                                               \
	(__m512d) _mm512_ternarylogic_epi64(                                  \
		_mm512_castpd_si512(a),                                       \
		_mm512_castpd_si512(b),                                       \
		_mm512_srai_epi64(_mm512_castpd_si512(m), 63),                \
		0xd8)

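/*
 * MOVEMASK gathers the sign bit of every element into the low bits of
 * an int (16 bits for 32-bit elements, 8 bits for 64-bit elements),
 * matching the AVX/AVX2 movemask semantics.
 */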
#define	_MM512_MOVEMASK_EPI32(a)                                              \
	(int) _mm512_cmpneq_epi32_mask(_mm512_setzero_si512(),                \
		_mm512_and_si512(_mm512_set1_epi32(0x80000000U), a))

#define	_MM512_MOVEMASK_EPI64(a)                                              \
	(int) _mm512_cmpneq_epi64_mask(_mm512_setzero_si512(),                \
		_mm512_and_si512(_mm512_set1_epi64(0x8000000000000000ULL), a))

#define	_MM512_MOVEMASK_PS(a)                                                 \
	_MM512_MOVEMASK_EPI32(_mm512_castps_si512(a))

#define	_MM512_MOVEMASK_PD(a)                                                 \
	_MM512_MOVEMASK_EPI64(_mm512_castpd_si512(a))

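/*
 * ROUND_PD emulates _mm256_round_pd with VRNDSCALEPD: imm8 bits 7:4
 * hold the scale (0 => round to an integral value), bits 1:0 the
 * rounding mode b, and bit 3 (_MM_FROUND_NO_EXC here) suppresses the
 * precision exception.
 */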
#define	_MM512_ROUND_PD(a,b)                                                  \
	_mm512_roundscale_pd(a,((0<<4)|b|_MM_FROUND_NO_EXC))

#endif		// #ifndef	MTH_AVX512HELPER_H