1 2 /* 3 * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 */ 18 19 #if ! defined(TARGET_X8664) 20 #error "mth_avx512helper.h must have TARGET_X8664 defined" 21 #endif 22 23 #ifndef MTH_AVX512HELPER_H 24 #define MTH_AVX512HELPER_H 25 26 /* 27 * mth_avx512helper.h - helper macros for AVX512. 28 * 29 * Two objectives: 30 * 1) Provide AVX/AVX2 and SKYLAKE-AVX512 compatibility. 31 * There are instructions in the AVX/AVX2 extensions that do not exist 32 * with SKYLAKE-AVX512. Create macros that provide identical functionality 33 * to AVX/AVX2 with AVX512 - though using 512-bit registers. 34 * 35 * Example: 36 * Vector compare in the AVX/AVX2 extensions set a resulting 37 * vector register with a -1 (32 or 64-bit) where the results of the 38 * comparison match. AVX512 uses the K registers for the result of the 39 * compare. So extend _mm256_cmpeq_epi32(a,b) to _MM512_CMPEQ_EPI32 as: 40 * (__m512i) _mm512_maskz_set1_epi32(_mm512_cmpeq_epi32_mask(a, b), -1)) 41 * 42 * 43 * 2) Provide KNC and SKYLAKE-AVX512 compatibility. 44 * Another complication is that we currently build to have a common object 45 * between KNL and AVX512F (CPUID flags AVX512F for AVX-512, KNCNI for KNC) 46 * thus AVX512 instructions. 47 * 48 * Example: 49 * The KNC extensions do not have a "floating point" boolean AND() 50 * instruciton. 51 * Extend _mm512_and_ps(a,b) to _MM512_AND_PS as: 52 * (__m512) _mm512_and_si512(a, _mm512_castps_si512(b)) 53 * 54 * Macro FCN_AVX512(name) is used to create unique (entry point) names 55 * based upon the extensions "_knl" or "_512" depending whether KNL is 56 * targetted or not. 57 * 58 * Note: Not every possible AVX/AVX2 intrinsic is currently defined. 59 * They helper macros are created as needed in porting the FMA3 version 60 * of the math intrinsics source code. 61 */ 62 63 /* 64 * The following macros are used to have a common source between KNL and 65 * SKYLAKE-AVX512. 66 */ 67 68 #if defined(__knl) || defined (__knl__) 69 /* 70 * KNL implementations. 71 */ 72 #define FCN_AVX512(a) \ 73 a##_knl 74 75 #define _MM512_AND_PD(a, b) \ 76 (__m512d) _mm512_and_si512(_mm512_castpd_si512(a), \ 77 _mm512_castpd_si512(b)) 78 79 #define _MM512_AND_PS(a, b) \ 80 (__m512) _mm512_and_si512(_mm512_castps_si512(a), \ 81 _mm512_castps_si512(b)) 82 83 #define _MM512_ANDNOT_PD(a, b) \ 84 (__m512d) _mm512_andnot_si512(_mm512_castpd_si512(a), \ 85 _mm512_castpd_si512(b)) 86 87 #define _MM512_ANDNOT_PS(a, b) \ 88 (__m512) _mm512_andnot_si512(_mm512_castps_si512(a), \ 89 _mm512_castps_si512(b)) 90 91 #define _MM512_OR_PD(a, b) \ 92 (__m512d) _mm512_or_si512(_mm512_castpd_si512(a), \ 93 _mm512_castpd_si512(b)) 94 95 #define _MM512_OR_PS(a, b) \ 96 (__m512) _mm512_or_si512(_mm512_castps_si512(a), \ 97 _mm512_castps_si512(b)) 98 99 #define _MM512_XOR_PD(a, b) \ 100 (__m512d) _mm512_xor_si512(_mm512_castpd_si512(a), \ 101 _mm512_castpd_si512(b)) 102 103 #define _MM512_XOR_PS(a, b) \ 104 (__m512) _mm512_xor_si512(_mm512_castps_si512(a), \ 105 _mm512_castps_si512(b)) 106 107 #define _MM512_EXTRACTF256_PS(a,b) \ 108 (__m256) _mm512_extractf64x4_pd(_mm512_castps_pd(a),b) 109 110 #define _MM512_INSERTF256_PS(a,b,c) \ 111 (__m512) _mm512_insertf64x4(_mm512_castps_pd(a), \ 112 _mm256_castps_pd(b),c) 113 114 #define _MM512_EXTRACTI256_SI512(a,b) \ 115 _mm512_extracti64x4_epi64(a,b) 116 117 #define _MM512_MOVM_EPI32(a) \ 118 _mm512_maskz_set1_epi32(a,-1) 119 120 #define _MM512_MOVM_EPI64(a) \ 121 _mm512_maskz_set1_epi64(a,-1) 122 123 #else // #if defined(__knl) || defined (__knl__) 124 /* 125 * SKYLAKE-AVX512 implementations. 126 */ 127 #define FCN_AVX512(a) \ 128 a##_512 129 130 #define _MM512_AND_PS(a, b) \ 131 _mm512_and_ps(a, b) 132 133 #define _MM512_AND_PD(a, b) \ 134 _mm512_and_pd(a, b) 135 136 #define _MM512_ANDNOT_PS(a, b) \ 137 _mm512_andnot_ps(a, b) 138 139 #define _MM512_ANDNOT_PD(a, b) \ 140 _mm512_andnot_pd(a, b) 141 142 #define _MM512_OR_PS(a, b) \ 143 _mm512_or_ps(a, b) 144 145 #define _MM512_OR_PD(a, b) \ 146 _mm512_or_pd(a, b) 147 148 #define _MM512_XOR_PS(a, b) \ 149 _mm512_xor_ps(a, b) 150 151 #define _MM512_XOR_PD(a, b) \ 152 _mm512_xor_pd(a, b) 153 154 #define _MM512_EXTRACTF256_PS(a,b) \ 155 (__m256) _mm512_extractf32x8_ps(a,b) 156 157 #define _MM512_INSERTF256_PS(a,b,c) \ 158 _mm512_insertf32x8(a,b,c) 159 160 #define _MM512_EXTRACTI256_SI512(a,b) \ 161 _mm512_extracti32x8_epi32(a,b) 162 163 #define _MM512_MOVM_EPI32(a) \ 164 _mm512_movm_epi32(a) 165 166 #define _MM512_MOVM_EPI64(a) \ 167 _mm512_movm_epi64(a) 168 #endif // #if defined(__knl) || defined (__knl__) 169 170 171 /* 172 * The following macros are used to provide 512-bit compatibility with 173 * intrinsics that only exist with AVX/AVX2. 174 */ 175 176 #define _MM512_CMPEQ_EPI32(a, b) \ 177 _MM512_MOVM_EPI32(_mm512_cmpeq_epi32_mask(a, b)) 178 179 #define _MM512_CMPEQ_PD(a, b) \ 180 _MM512_CMP_PD(a, b, _CMP_EQ_OQ) 181 182 #define _MM512_CMPGT_EPI32(a, b) \ 183 _MM512_MOVM_EPI32(_mm512_cmpgt_epi32_mask(a, b)) 184 185 #define _MM512_CMPEQ_EPI64(a, b) \ 186 _MM512_MOVM_EPI64(_mm512_cmpeq_epi64_mask(a, b)) 187 188 #define _MM512_CMP_PS(a, b, c) \ 189 (__m512) _MM512_MOVM_EPI32(_mm512_cmp_ps_mask(a, b, c)) 190 191 #define _MM512_CMP_PD(a, b, c) \ 192 (__m512d) _MM512_MOVM_EPI64(_mm512_cmp_pd_mask(a, b, c)) 193 194 #define _MM512_BLEND_EPI32(a,b,m) \ 195 _mm512_mask_blend_epi32(m,a,b) 196 197 #define _MM512_BLEND_EPI64(a,b,m) \ 198 _mm512_mask_blend_epi64(m,a,b) 199 200 #define _MM512_BLENDV_PS(a,b,m) \ 201 (__m512) _mm512_ternarylogic_epi32( \ 202 _mm512_castps_si512(a), \ 203 _mm512_castps_si512(b), \ 204 _mm512_srai_epi32(_mm512_castps_si512(m), 31), \ 205 0xd8) 206 207 #define _MM512_BLENDV_PD(a,b,m) \ 208 (__m512d) _mm512_ternarylogic_epi64( \ 209 _mm512_castpd_si512(a), \ 210 _mm512_castpd_si512(b), \ 211 _mm512_srai_epi64(_mm512_castpd_si512(m), 63), \ 212 0xd8) 213 214 #define _MM512_MOVEMASK_EPI32(a) \ 215 (int) _mm512_cmpneq_epi32_mask(_mm512_setzero_si512(), \ 216 _mm512_and_si512(_mm512_set1_epi32(0x80000000U), a)) 217 218 #define _MM512_MOVEMASK_EPI64(a) \ 219 (int) _mm512_cmpneq_epi64_mask(_mm512_setzero_si512(), \ 220 _mm512_and_si512(_mm512_set1_epi64(0x8000000000000000ULL), a)) 221 222 #define _MM512_MOVEMASK_PS(a) \ 223 _MM512_MOVEMASK_EPI32(_mm512_castps_si512(a)) 224 225 #define _MM512_MOVEMASK_PD(a) \ 226 _MM512_MOVEMASK_EPI64(_mm512_castpd_si512(a)) 227 228 #define _MM512_ROUND_PD(a,b) \ 229 _mm512_roundscale_pd(a,((0<<4)|b|_MM_FROUND_NO_EXC)) 230 231 #endif // #ifndef MTH_AVX512HELPER_H 232