/*
 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

/*
 * Type, table and macro definitions for the math intrinsic dispatch tables.
 *
 * The enums below are the indices (function, scalar/vector format,
 * fast/relaxed/precise class) of the __mth_rt_vi_ptrs tables declared
 * further down.
 */

#ifndef MTH_TBLDEFS_H_INCLUDED
#define MTH_TBLDEFS_H_INCLUDED

#include <stdint.h> /* uint64_t, used by __mth_rt_stats */

/*
 * Define supported architectures.
 *
 * Which enumerators exist depends on the TARGET_* macro selected at
 * compile time; arch_any and arch_size are always present.
 */
typedef enum {
    /*
     * arch_any is only intended to be used in defining the dispatch
     * table definitions.  Its purpose is to simplify having to define
     * the same set of jump table entries that are common to all processor
     * platforms.
     */
    arch_any=0,
#if defined(TARGET_LINUX_X8664) || defined(TARGET_OSX_X8664) || defined(TARGET_WIN_X8664)
    arch_em64t,     // em64t/opteron
    arch_sse4,      // SSE4A/SSE4.1
                    // greyhound, barcelona, core2,
                    // istanbul, nehalem, penryn, shanghai
    arch_avx,       // AVX 128 Intel sandybridge
    arch_avxfma4,   // AVX 128 AMD bulldozer, piledriver
    arch_avx2,      // AVX2 256 haswell
    arch_avx512knl, // AVX512, knights landing
    arch_avx512,    // AVX512, skylake
#elif defined(TARGET_LINUX_POWER)
    arch_p8,        // Power8
    arch_p9,        // Power9
#elif defined(TARGET_LINUX_ARM64)
    arch_armv8,     // ARM V8
    arch_armv81a,   // ARM V8.1-A
    arch_armv82,    // ARM V8.2
#else
    arch_generic,   // Generic CPU
#endif
    arch_size=16,   // *** Always last (fixed value, same on every target)
} arch_e;

/*
 * Define scalar and vector formats.
 */

typedef enum {
    sv_ss=0,    // single scalar
    sv_ds,      // double scalar
    sv_cs,      // single complex - C ABI
    sv_zs,      // double complex - C ABI
    sv_cv1,     // single complex - vector

    // 128-bit
    sv_sv4,     // single vector
    sv_dv2,     // double vector
    sv_cv2,     // single complex vector
    sv_zv1,     // double complex vector

    // 256-bit
    sv_sv8,     // single vector
    sv_dv4,     // double vector
    sv_cv4,     // single complex vector
    sv_zv2,     // double complex vector

    // 512-bit
    sv_sv16,    // single vector
    sv_dv8,     // double vector
    sv_cv8,     // single complex vector
    sv_zv4,     // double complex vector

    // 128-bit - Masked
    sv_sv4m,    // single vector
    sv_dv2m,    // double vector
    sv_cv2m,    // single complex vector
    sv_zv1m,    // double complex vector

    // 256-bit - Masked
    sv_sv8m,    // single vector
    sv_dv4m,    // double vector
    sv_cv4m,    // single complex vector
    sv_zv2m,    // double complex vector

    // 512-bit - Masked
    sv_sv16m,   // single vector
    sv_dv8m,    // double vector
    sv_cv8m,    // single complex vector
    sv_zv4m,    // double complex vector

    sv_size=64, // *** Always last (fixed value; second table dimension)
} sv_e;


/*
 * Define fast/relaxed/precise classes.
 */
typedef enum {
    frp_f=0,    // fast
    frp_r,      // relaxed
    frp_p,      // precise
    frp_s,      // Sleef
    frp_size,   // *** Always last
} frp_e;

/*
 * Define intrinsic functions.
 */
typedef enum {
    func_acos=0,
    func_asin,
    func_atan,
    func_atan2,
    func_cos,
    func_sin,
    func_tan,
    func_cosh,
    func_sinh,
    func_tanh,
    func_exp,
    func_log,
    func_log10,
    func_pow,
    func_powi1,     // R{4,8}*I4
    func_powi,      // R{4,8}*I4(:)
    func_powk1,     // R{4,8}*I8
    func_powk,      // R{4,8}*I8(:)
    func_sincos,    // Returns pair of values
    func_div,       // division
    func_sqrt,      // square root
    func_mod,       // mod(R{4,8},R{4,8})
    func_aint,
    func_ceil,
    func_floor,
    func_size,      // Number of functions; first table dimension
} func_e;

/*
 * Element sizes.
 */
typedef enum {
    elmtsz_32 = 0,  // ss, cs
    elmtsz_64,      // ds, zs, cv1
    elmtsz_128,     // sv4, dv2, cv2, zv1
    elmtsz_256,     // sv8, dv4, cv4, zv2
    elmtsz_512,     // sv16, dv8, cv8, zv4
    elmtsz_size
} elmtsz_e;

/*
 * Generic pointer to function.  The parameter list is deliberately left
 * unspecified so that one pointer type can be stored for every table entry.
 */
typedef void(*p2f)();

/*
 * One dispatch table definition: the (architecture, function, format)
 * triple plus one function pointer per fast/relaxed/precise/Sleef class.
 */
typedef struct {
    arch_e  arch;   // Architecture
    func_e  func;   // Function
    sv_e    sv;     // Scalar/vector type
    p2f     pf;     // Pointer to fast
    p2f     pr;     // Pointer to relaxed
    p2f     pp;     // Pointer to precise
    p2f     ps;     // Pointer to Sleef
} mth_intrins_defs_t;

/* Dispatch tables, indexed [function][scalar/vector format][frp class]. */
extern p2f __mth_rt_vi_ptrs[func_size][sv_size][frp_size];
extern p2f __mth_rt_vi_ptrs_stat[func_size][sv_size][frp_size];
//extern p2f __mth_rt_intrins_ptrs[func_size][sv_size][frp_size];

/*
 * MTHINTRIN - declare the four implementations of one table entry.
 *
 * _func:   function
 * _sv:     scalar/vector types
 * _a:      architecture
 * _f:      name for fast
 * _r:      name for relaxed
 * _p:      name for precise
 * _s:      name for Sleef
 *
 * This definition only uses _f, _r, _p and _s: each is declared as
 * "extern void name(void)".  _func, _sv and _a are ignored here.
 * The expansion already ends with ';'.
 */
#define MTHINTRIN(_func, _sv, _a, _f, _r, _p, _s) \
    extern void _f (void); \
    extern void _r (void); \
    extern void _p (void); \
    extern void _s (void);

/*
 * Defaults: dispatch function names are used unchanged, the dispatch table
 * is __mth_rt_vi_ptrs, and _MTH_I_INIT() expands to nothing.
 */
#define MTH_DISPATCH_FUNC(f) f
#define MTH_DISPATCH_TBL __mth_rt_vi_ptrs

#define _MTH_I_INIT()

/*
 * MTH_I_INTRIN_INIT: dispatch function names get an "_init" suffix and
 * _MTH_I_INIT() becomes a call to __math_dispatch_init().
 */
#ifdef MTH_I_INTRIN_INIT
#undef MTH_DISPATCH_FUNC
#define MTH_DISPATCH_FUNC(f) f##_init
#undef _MTH_I_INIT
#define _MTH_I_INIT() (void) __math_dispatch_init()
extern void __math_dispatch_init(void);
#endif

/*
 * MTH_I_INTRIN_STATS: dispatch function names get a "_prof" suffix and the
 * dispatch table becomes __mth_rt_vi_ptrs_stat.  Because this section comes
 * after the MTH_I_INTRIN_INIT one, its MTH_DISPATCH_FUNC definition wins
 * when both macros are defined.
 */
#ifdef MTH_I_INTRIN_STATS
#undef MTH_DISPATCH_FUNC
#define MTH_DISPATCH_FUNC(f) f##_prof
#undef MTH_DISPATCH_TBL
#define MTH_DISPATCH_TBL __mth_rt_vi_ptrs_stat
/*
 * Unsigned integers of number of calls.
 *
 * XXX - Layout of the table is *different* than __mth_rt_vi_ptrs.
 * TBD which layout is better.
 */
extern uint64_t __mth_rt_stats[frp_size][func_size][sv_size];
/*
 * Add 1 to the call counter of (_func, _sv, _frp) with the
 * __sync_fetch_and_add builtin.  The expansion already ends with ';'.
 */
#define _MTH_I_STATS_INC(_func,_sv,_frp) \
    (void)__sync_fetch_and_add(&__mth_rt_stats[_frp][_func][_sv], 1);
#else   // MTH_I_INTRIN_STATS
/* Statistics disabled: the counter increment expands to nothing. */
#define _MTH_I_STATS_INC(_func,_sv,_frp)
#endif  // MTH_I_INTRIN_STATS

#endif  /* MTH_TBLDEFS_H_INCLUDED */