#
# Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Awk generator for the "masked" vector math entry points
# __{f,r,p}{s,d}_<name>_<VL>_mn().  It is driven by two externally
# supplied variables (awk -v):
#   TARGET        - "POWER", "X8664", "ARM64", or anything else (generic)
#   MAX_VREG_SIZE - vector register width in bits: 128, 256, or 512
# The generated C translation unit is written to stdout.
#
# Convention used throughout for function "locals": awk makes every
# variable global unless it is declared as an extra, unpassed parameter,
# so helper temporaries appear after the real parameters in the
# function signatures below.

# Emit the fixed C prologue: proprietary banner, the intrinsics headers,
# the all-ones "safe value" constant vectors (C{s,d}p1_<VL>) used to fill
# masked-off lanes, and - for targets with no native "whole mask is zero"
# test - a portable is_zero() fallback built on POSIX lfind() scanning the
# mask bytes.
function print_hdrs()
{
  print "\
/*\n\
 * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.\n\
 *\n\
 * NVIDIA CORPORATION and its licensors retain all intellectual property\n\
 * and proprietary rights in and to this software, related documentation\n\
 * and any modifications thereto. Any use, reproduction, disclosure or\n\
 * distribution of this software and related documentation without an express\n\
 * license agreement from NVIDIA CORPORATION is strictly prohibited.\n\
 *\n\
 */\n\
\n\n\
#ifdef __cplusplus\n\
extern \"C\" {\n\
#endif\n\n\
#include \"mth_intrinsics.h\" \n\
#include \"mth_tbldefs.h\" \n\
\n\n\
static const vrs4_t Csp1_4={1.0, 1.0, 1.0, 1.0}; \n\
static const vrd2_t Cdp1_2={1.0, 1.0}; \n\
static const vrs8_t Csp1_8={1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; \n\
static const vrd4_t Cdp1_4={1.0, 1.0, 1.0, 1.0}; \n\
static const vrs16_t Csp1_16={1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, \n\
                              1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; \n\
static const vrd8_t Cdp1_8={1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; \n\
\n\n\
#ifdef __cplusplus\n\
}\n\
#endif\n\
#if defined (TARGET_X8664) \n\
#include \"immintrin.h\" \n\
#elif defined (TARGET_LINUX_POWER) \n\
#include \"altivec.h\" \n\
#elif defined(TARGET_LINUX_ARM64) \n\
#include \"arm64intrin.h\" \n\
#include <search.h> \n\
#else \n\
#include <stddef.h> \n\
#include <stdint.h> \n\
#include <search.h> \n\
#include <assert.h> \n\
#endif\n\
\n\
#if !defined(TARGET_X8664) && !defined(TARGET_LINUX_POWER)\n\
static int u8nonzero(const void *a, const void *b) \n\
{ \n\
  assert(!a); \n\
  assert(b); \n\
  return !(*((uint8_t *)b)); \n\
} \n\
\n\
static inline int is_zero(const void *val, size_t nmemb) \n\
{ \n\
  return !lfind(NULL, val, &nmemb, sizeof(uint8_t), u8nonzero); \n\
} \n\
\n\
#endif \n\
\n\
"
  # NOTE(review): the ARM64 branch emits u8nonzero()/is_zero() (which use
  # assert and uint8_t) but only includes <search.h>; presumably
  # arm64intrin.h pulls in <assert.h>/<stdint.h> - verify on that target.
}

# Open an extern "C" region in the generated source (ARM64 only; the
# other targets keep the original file's layout).
function extern_c_begin()
{
  if (TARGET == "ARM64") {
    print "\n#ifdef __cplusplus"
    print "extern \"C\" {"
    print "#endif"
  }
}

# Close the extern "C" region opened by extern_c_begin().
function extern_c_end()
{
  if (TARGET == "ARM64") {
    print "\n#ifdef __cplusplus"
    print "}"
    print "#endif"
  }
}

# Populate the per-target code snippet tables:
#   divsd[frp sd]  - C expression for vector divide (x / y)
#   sqrtsd[frp sd] - C expression for vector square root
#   mask_all_zero  - C expression that is true when the whole mask is 0
# and the keys-only sets frps/sds/iks used to enumerate the variants.
function init_target_arrays()
{
  if (TARGET == "POWER") {
    divsd["fs"] = "vec_div(x, y)"
    divsd["fd"] = "vec_div(x, y)"
    # For some unexplained reason, the native and llvm compilers implement
    # relaxed divide on POWER using a reciprocal and a multiply.
    divsd["rs"] = "vec_mul(x, vec_div(Csp1_" VL("s") ", y))"
    divsd["rd"] = "vec_mul(x, vec_div(Cdp1_" VL("d") ", y))"
    divsd["ps"] = "vec_div(x, y)"
    divsd["pd"] = "vec_div(x, y)"

    sqrtsd["fs"] = "vec_sqrt(x)"
    sqrtsd["fd"] = "vec_sqrt(x)"
    sqrtsd["rs"] = "vec_sqrt(x)"
    sqrtsd["rd"] = "vec_sqrt(x)"
    sqrtsd["ps"] = "vec_sqrt(x)"
    sqrtsd["pd"] = "vec_sqrt(x)"
    mask_all_zero = "(vec_all_eq(mask, vec_xor(mask,mask)) == 1)"
  } else if (TARGET == "X8664") {
    # Intrinsic name prefixes/types scale with the single precision
    # vector length (4 -> SSE, 8 -> AVX, 16 -> AVX-512).
    if (VLS == 4) {
      _mm = "_mm"
      __m = "__m128"
      _si = "_si128"
    } else if (VLS == 8) {
      _mm = "_mm256"
      __m = "__m256"
      _si = "_si256"
    } else {
      _mm = "_mm512"
      __m = "__m512"
      _si = "_si512"
    }

    divsd["fs"] = _mm "_div_ps((" __m ")x, (" __m ")y)"
    divsd["fd"] = _mm "_div_pd((" __m "d)x, (" __m "d)y)"
    # For some unexplained reason, the native and llvm compilers implement
    # relaxed divide on X86-64 using a reciprocal and a multiply.
    divsd["rs"] = _mm "_mul_ps((" __m ")x, " _mm "_div_ps(Csp1_" VL("s") ", ( " __m ")y))"
    divsd["rd"] = _mm "_mul_pd((" __m "d)x, " _mm "_div_pd(Cdp1_" VL("d") ", ( " __m "d)y))"
    divsd["ps"] = _mm "_div_ps((" __m ")x, (" __m ")y)"
    divsd["pd"] = _mm "_div_pd((" __m "d)x, (" __m "d)y)"

    sqrtsd["fs"] = _mm "_sqrt_ps((" __m ")x)"
    sqrtsd["fd"] = _mm "_sqrt_pd((" __m "d)x)"
    sqrtsd["rs"] = _mm "_sqrt_ps((" __m ")x)"
    sqrtsd["rd"] = _mm "_sqrt_pd((" __m "d)x)"
    sqrtsd["ps"] = _mm "_sqrt_ps((" __m ")x)"
    sqrtsd["pd"] = _mm "_sqrt_pd((" __m "d)x)"

    # For vector register size == 128, it would be faster to use
    # (_mm_testz_si128((__m128i)mask, _mm_set1_epi32(-1)) == 1), but we
    # compile mth_128mask.c for core2 processors (gcc -march=core2),
    # and the ptest instruction (_mm_testz_si128()) is not available
    # until SSE4.1.
    if (VLS == 4) {
      mask_all_zero = \
        "(_mm_movemask_ps((__m128) _mm_cmpeq_epi32((__m128i)mask, " \
        "_mm_xor_si128((__m128i)mask,(__m128i)mask))) == 15)"
    } else if (VLS == 8) {
      mask_all_zero = \
        "(_mm256_testz_si256((__m256i)mask, _mm256_set1_epi32(-1)) == 1)"
    } else {
      mask_all_zero = \
        "(_mm512_test_epi32_mask((__m512i)mask, _mm512_set1_epi32(-1)) == 0)"
    }
  } else {
    # Generic target: plain C division; sqrt is not expected to be
    # reached, so the generated code aborts via assert.
    divsd["fs"] = "((x) / (y))"
    divsd["fd"] = "((x) / (y))"
    divsd["rs"] = "((x) / (y))"
    divsd["rd"] = "((x) / (y))"
    divsd["ps"] = "((x) / (y))"
    divsd["pd"] = "((x) / (y))"

    sqrtsd["fs"] = "(assert(!\"vsqrt\"), x)"
    sqrtsd["fd"] = "(assert(!\"vsqrt\"), x)"
    sqrtsd["rs"] = "(assert(!\"vsqrt\"), x)"
    sqrtsd["rd"] = "(assert(!\"vsqrt\"), x)"
    sqrtsd["ps"] = "(assert(!\"vsqrt\"), x)"
    sqrtsd["pd"] = "(assert(!\"vsqrt\"), x)"
    mask_all_zero = "(is_zero(&mask, sizeof mask))"
  }

  # Keys-only sets enumerating the generated variants:
  # f=fast, r=relaxed, p=precise; s=single, d=double; i=int32, k=int64.
  frps["f"] = ""
  frps["r"] = ""
  frps["p"] = ""
  sds["s"] = ""
  sds["d"] = ""
  iks["i"] = ""
  iks["k"] = ""
}

# Vector lane count for precision sd ("s" -> VLS, anything else -> VLD).
function VL(sd)
{
  return sd == "s" ? VLS : VLD
}

# C typedef of the real-valued vector type, e.g. "vrs8_t".
function VR_T(sd)
{
  return "vr" sd (sd == "s" ? VLS : VLD) "_t"
}

# C typedef of the integer (mask) vector type, e.g. "vis8_t".
function VI_T(sd)
{
  return "vi" sd (sd == "s" ? VLS : VLD) "_t"
}

# Conditional-text helper: a when yarg is nonzero, else b.
function arg_ne_0(yarg, a, b)
{
  return yarg != 0 ? a : b
}

# Print the C function header for a real->real masked wrapper,
# __<frp><sd>_<name>_<VL>_mn(x[, y], mask).
function func_r_decl(name, frp, sd, yarg)
{
  extern_c_begin()
  print "\n" VR_T(sd)
  print "__" frp sd "_" name "_" VL(sd) "_mn" \
    "(" VR_T(sd) " x" \
    arg_ne_0(yarg, ", " VR_T(sd) " y", "") \
    ", " VI_T(sd) " mask)"
}

# Emit the full definition of a real->real masked wrapper.  Masked-off
# lanes are replaced by 1.0 when safeval is nonzero (so log/div/mod do
# not fault on garbage lanes); div and sqrt are expanded inline from the
# per-target tables, everything else dispatches through
# MTH_DISPATCH_TBL.
function func_rr_def(name, frp, sd, safeval, yarg)
{
  func_r_decl(name, frp, sd, yarg)
  print "{"
  print "  " \
    VR_T(sd) " (*fptr) (" VR_T(sd) \
    arg_ne_0(yarg, ", " VR_T(sd), "") \
    ");"
  print "  (void) fptr;"

  # X86-64 tests assume the input vector is returned (not the mask)
  # when the mask is all zero.
  print "  if(" mask_all_zero ") return x;"
  print "  x = (" VR_T(sd) ")((((" VI_T(sd) ")x & mask))" \
    arg_ne_0(safeval, " | ((" VI_T(sd) ")C" sd "p1_" VL(sd) " & ~mask)", "") \
    ");"
  if (yarg != 0) {
    print "  y = (" VR_T(sd) ")((((" VI_T(sd) ")y & mask))" \
      arg_ne_0(safeval, " | ((" VI_T(sd) ")C" sd "p1_" VL(sd) " & ~mask)", "") \
      ");"
  }
  if (name != "div" && name != "sqrt") {
    print "  fptr = (" VR_T(sd) "(*) (" VR_T(sd), \
      (yarg != 0) ? ", " VR_T(sd) : "", \
      ")) MTH_DISPATCH_TBL[func_" name "][sv_" sd "v" VL(sd) "][frp_" frp "];"
    print "  return (fptr(x", (yarg != 0) ? ", y" : "", "));"
  } else {
    print "  return (", (name == "div") ? divsd[frp sd] : sqrtsd[frp sd], ");"
  }

  print "}\n"
  extern_c_end()
}

# Build the argument list (sans mask) for the R**I pow variants.
# is_scalar: exponent is a scalar int32/int64; otherwise it is a vector.
# Single precision with 64-bit exponents needs two vectors (iyu/iyl)
# because the exponent lanes are twice as wide as the x lanes.
# with_vars: nonzero -> include parameter names (declaration form).
function func_pow_args_nomask(sd, is_scalar, ik, with_vars,    ll)
{
  ll = VR_T(sd) arg_ne_0(with_vars, " x", "") ", "
  if (is_scalar) {
    ll = ll ((ik == "i") ? "int32_t" : "int64_t") arg_ne_0(with_vars, " iy", "")
  } else {
    if (sd == "s" && ik == "k") {
      ll = ll VI_T("d") arg_ne_0(with_vars, " iyu", "") ", " \
           VI_T("d") arg_ne_0(with_vars, " iyl", "")
    } else {
      ll = ll VI_T(ik == "i" ? "s" : "d") arg_ne_0(with_vars, " iy", "")
    }
  }

  return ll
}

# Print the C function header for a masked R**I pow wrapper,
# __<frp><sd>_pow<i|k|i1|k1>_<VL>_mn(...).
function func_pow_decl(name, frp, sd, is_scalar, ik,    l)
{
  extern_c_begin()
  print "\n" VR_T(sd)
  l = "__" frp sd "_" name arg_ne_0(is_scalar, ik "1", ik) "_" VL(sd) "_mn" "("
  l = l func_pow_args_nomask(sd, is_scalar, ik, 1)
  l = l ", " VI_T(sd) " mask)"
  print l
}

# Emit the full definition of a masked R**I pow wrapper: zero out the
# masked-off lanes of x and of the exponent vector(s), then dispatch
# through MTH_DISPATCH_TBL.
function func_pow_def(name, frp, sd, is_scalar, ik)
{
  func_pow_decl(name, frp, sd, is_scalar, ik)
  print "{"
  print "  " \
    VR_T(sd) " (*fptr) (" func_pow_args_nomask(sd, is_scalar, ik, 0) ");"
  # X86-64 tests assume the input vector is returned when the mask is
  # all zero.
  print "  if(" mask_all_zero ") return x;"
  print "  x = (" VR_T(sd) ")((" VI_T(sd) ")x & mask);"
  if (is_scalar == 0) {
    if ((sd == "s" && ik == "i") || (sd == "d" && ik == "k")) {
      # Exponent vector has the same lane layout as the mask: AND it.
      print "  iy = iy & mask;"
    } else {
      # Lane widths differ; zero the exponent lanes element by element.
      print "  {\n" \
        "    int i;\n" \
        "    for (i = 0 ; i < " VL(sd) "; i++) {\n" \
        "      if (mask[i] == 0) {"
      if (sd == "s") {
        # Single precision with 64-bit exponents: the first VLD lanes
        # live in iyu, the rest in iyl.
        # Fix: was VL(d) - "d" unquoted is an (empty) uninitialized awk
        # variable that only coincidentally behaved like VL("d").
        print "        if(i < " VL("d") ") {\n" \
          "          iyu[i] = 0;\n" \
          "        } else {\n" \
          "          iyl[i-" VL("d") "] = 0;\n" \
          "        }"
      } else {
        print "        iy[i] = 0;"
      }
      print "      }\n    }\n  }"
    }
  }
  print "  fptr = (" VR_T(sd) "(*) (" \
    func_pow_args_nomask(sd, is_scalar, ik, 0) \
    ")) MTH_DISPATCH_TBL[func_" name arg_ne_0(is_scalar, ik "1", ik) \
    "][sv_" sd "v" VL(sd) "][frp_" frp "];"
  print "  return (fptr(x, ", \
    arg_ne_0(is_scalar == 0 && sd == "s" && ik == "k", "iyu, iyl", "iy") \
    "));"

  print "}"
  extern_c_end()
}

# Generate one real->real wrapper per (fast/relaxed/precise) x
# (single/double) combination.
function do_all_rr(name, safeval, yarg,    frp, sd)
{
  for (frp in frps) {
    for (sd in sds) {
      func_rr_def(name, frp, sd, safeval, yarg)
    }
  }
}

# Generate every R**I pow wrapper: scalar and vector exponent forms
# for each frp/sd/ik combination.
function do_all_pow_r2i(    frp, sd, ik)
{
  for (frp in frps) {
    for (sd in sds) {
      for (ik in iks) {
        func_pow_def("pow", frp, sd, 1, ik)
        func_pow_def("pow", frp, sd, 0, ik)
      }
    }
  }
}

BEGIN {
  # Validate the TARGET / MAX_VREG_SIZE combination.
  if (TARGET == "POWER") {
    if (MAX_VREG_SIZE != 128) {
      print "TARGET == POWER, MAX_VREG_SIZE must be 128"
      exit(1)
    }
  } else if (TARGET == "ARM64") {
    if (MAX_VREG_SIZE != 128) {
      print "TARGET == ARM64, MAX_VREG_SIZE must be 128"
      exit(1)
    }
  } else if (MAX_VREG_SIZE != 128 && MAX_VREG_SIZE != 256 && MAX_VREG_SIZE != 512) {
    print "TARGET == X8664, MAX_VREG_SIZE must be either 128, 256, or 512"
    exit(1)
  }

  # Derive lane counts from the register width.
  if (MAX_VREG_SIZE == 128) {
    VLS = 4
    VLD = 2
  } else if (MAX_VREG_SIZE == 256) {
    VLS = 8
    VLD = 4
  } else {
    VLS = 16
    VLD = 8
  }

  # Initialize the per-target associative arrays.
  init_target_arrays()

  print_hdrs()
  one_arg = 0
  two_args = 1

  # safeval (2nd argument) is nonzero for functions whose masked-off
  # lanes must be forced to 1.0 to avoid spurious faults (log, div, ...).
  do_all_rr("acos", 0, one_arg)
  do_all_rr("asin", 0, one_arg)
  do_all_rr("atan", 0, one_arg)
  do_all_rr("atan2", 1, two_args)
  do_all_rr("cos", 0, one_arg)
  do_all_rr("sin", 0, one_arg)
  do_all_rr("tan", 0, one_arg)
  do_all_rr("sincos", 0, one_arg)
  do_all_rr("cosh", 0, one_arg)
  do_all_rr("sinh", 0, one_arg)
  do_all_rr("tanh", 0, one_arg)
  do_all_rr("exp", 0, one_arg)
  do_all_rr("log", 1, one_arg)
  do_all_rr("log10", 1, one_arg)
  do_all_rr("pow", 0, two_args)
  do_all_rr("div", 1, two_args)
  do_all_rr("sqrt", 0, one_arg)
  do_all_rr("mod", 1, two_args)
  do_all_rr("aint", 0, one_arg)
  do_all_rr("ceil", 0, one_arg)
  do_all_rr("floor", 0, one_arg)

  do_all_pow_r2i()
}