1 /*
2  * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  *
16  */
17 
18 
/*
 * Supported processor architectures.
 *
 * Only one target-specific group of enumerators is compiled, selected by
 * whichever TARGET_* macro is defined.  If none of the recognised macros
 * is defined, arch_generic is the only architecture besides arch_any.
 */
typedef enum {
	/*
	 * arch_any is only intended to be used when defining the dispatch
	 * tables.  It avoids having to repeat the jump table entries that
	 * are common to all processor platforms.
	 */
	arch_any = 0,

#if defined(TARGET_LINUX_X8664) || defined(TARGET_OSX_X8664) || defined(TARGET_WIN_X8664)
	arch_em64t     = 1,	/* em64t/opteron */
	arch_sse4      = 2,	/* SSE4A/SSE4.1: greyhound, barcelona, core2,
				 * istanbul, nehalem, penryn, shanghai */
	arch_avx       = 3,	/* AVX 128 Intel sandybridge */
	arch_avxfma4   = 4,	/* AVX 128 AMD bulldozer, piledriver */
	arch_avx2      = 5,	/* AVX2 256 haswell */
	arch_avx512knl = 6,	/* AVX512, knights landing */
	arch_avx512    = 7,	/* AVX512, skylake */
#elif defined(TARGET_LINUX_POWER)
	arch_p8        = 1,	/* Power8 */
	arch_p9        = 2,	/* Power9 */
#elif defined(TARGET_LINUX_ARM64)
	arch_armv8     = 1,	/* ARM V8 */
	arch_armv81a   = 2,	/* ARM V8.1-A */
	arch_armv82    = 3,	/* ARM V8.2 */
#else
	arch_generic   = 1,	/* Generic CPU */
#endif

	/* Always last.  Fixed at 16 whichever target group is compiled. */
	arch_size = 16,
} arch_e;
52 
/*
 * Scalar and vector formats.
 *
 * Prefix letters: s = single, d = double, c = single complex,
 * z = double complex.  A trailing 'm' marks the masked variant of a
 * vector format.
 *
 * sv_size is pinned at 64, which is larger than the number of formats
 * listed; it is the middle dimension of the __mth_rt_vi_ptrs tables
 * declared further down in this file.
 */
typedef enum {
	/* Scalars */
	sv_ss    = 0,	/* single scalar */
	sv_ds    = 1,	/* double scalar */
	sv_cs    = 2,	/* single complex - C ABI */
	sv_zs    = 3,	/* double complex - C ABI */
	sv_cv1   = 4,	/* single complex - vector */

	/* 128-bit */
	sv_sv4   = 5,	/* single vector */
	sv_dv2   = 6,	/* double vector */
	sv_cv2   = 7,	/* single complex vector */
	sv_zv1   = 8,	/* double complex vector */

	/* 256-bit */
	sv_sv8   = 9,	/* single vector */
	sv_dv4   = 10,	/* double vector */
	sv_cv4   = 11,	/* single complex vector */
	sv_zv2   = 12,	/* double complex vector */

	/* 512-bit */
	sv_sv16  = 13,	/* single vector */
	sv_dv8   = 14,	/* double vector */
	sv_cv8   = 15,	/* single complex vector */
	sv_zv4   = 16,	/* double complex vector */

	/* 128-bit - Masked */
	sv_sv4m  = 17,	/* single vector */
	sv_dv2m  = 18,	/* double vector */
	sv_cv2m  = 19,	/* single complex vector */
	sv_zv1m  = 20,	/* double complex vector */

	/* 256-bit - Masked */
	sv_sv8m  = 21,	/* single vector */
	sv_dv4m  = 22,	/* double vector */
	sv_cv4m  = 23,	/* single complex vector */
	sv_zv2m  = 24,	/* double complex vector */

	/* 512-bit - Masked */
	sv_sv16m = 25,	/* single vector */
	sv_dv8m  = 26,	/* double vector */
	sv_cv8m  = 27,	/* single complex vector */
	sv_zv4m  = 28,	/* double complex vector */

	sv_size  = 64,	/* Always last */
} sv_e;
102 
103 
/*
 * Fast/relaxed/precise classes, plus Sleef.
 *
 * frp_size takes the value following the last class, i.e. the number of
 * classes; it is the last dimension of the __mth_rt_vi_ptrs tables
 * declared further down in this file.
 */
typedef enum {
	frp_f = 0,	/* fast */
	frp_r = 1,	/* relaxed */
	frp_p = 2,	/* precise */
	frp_s = 3,	/* Sleef */
	frp_size,	/* Always last */
} frp_e;
114 
/*
 * Intrinsic functions.
 *
 * func_size takes the value following the last function, i.e. the number
 * of functions; it is the first dimension of the __mth_rt_vi_ptrs tables
 * declared further down in this file.
 */
typedef enum {
	func_acos   = 0,
	func_asin   = 1,
	func_atan   = 2,
	func_atan2  = 3,
	func_cos    = 4,
	func_sin    = 5,
	func_tan    = 6,
	func_cosh   = 7,
	func_sinh   = 8,
	func_tanh   = 9,
	func_exp    = 10,
	func_log    = 11,
	func_log10  = 12,
	func_pow    = 13,
	func_powi1  = 14,	/* R{4,8}*I4 */
	func_powi   = 15,	/* R{4,8}*I4(:) */
	func_powk1  = 16,	/* R{4,8}*I8 */
	func_powk   = 17,	/* R{4,8}*I8(:) */
	func_sincos = 18,	/* Returns pair of values */
	func_div    = 19,	/* division */
	func_sqrt   = 20,	/* square root */
	func_mod    = 21,	/* mod(R{4,8},R{4,8}) */
	func_aint   = 22,
	func_ceil   = 23,
	func_floor  = 24,
	func_size,		/* Always last */
} func_e;
146 
/*
 * Element sizes.  The comment on each enumerator lists the sv_e formats
 * (without the sv_ prefix) that belong to that size.
 */
typedef enum {
	elmtsz_32  = 0,	/* ss,   cs */
	elmtsz_64  = 1,	/* ds,   zs,  cv1 */
	elmtsz_128 = 2,	/* sv4,  dv2, cv2, zv1 */
	elmtsz_256 = 3,	/* sv8,  dv4, cv4, zv2 */
	elmtsz_512 = 4,	/* sv16, dv8, cv8, zv4 */
	elmtsz_size	/* Always last: number of element sizes */
} elmtsz_e;
158 
/*
 * Generic pointer-to-function type used by mth_intrins_defs_t and the
 * __mth_rt_vi_ptrs tables.  The empty parameter list is not a prototype
 * (before C23), so the argument types are left unspecified.
 * NOTE(review): presumably cast to the routine's real signature before
 * being called - confirm at the call sites.
 */
typedef	void(*p2f)();
160 
161 typedef	struct	{
162 	arch_e	arch;	// Architecture
163 	func_e	func;	// Function
164 	sv_e	sv;	// Scalar/vector type
165 	p2f	pf;	// Pointer to fast
166 	p2f	pr;	// Pointer to relaxed
167 	p2f	pp;	// Pointer to precise
168 	p2f	ps;	// Pointer to Sleef
169 } mth_intrins_defs_t;
170 
/*
 * Tables of routine pointers, dimensioned [func_size][sv_size][frp_size].
 * They are only declared here; the definitions live elsewhere.
 * MTH_DISPATCH_TBL (below) names __mth_rt_vi_ptrs by default and
 * __mth_rt_vi_ptrs_stat when MTH_I_INTRIN_STATS is defined.
 */
extern	p2f	__mth_rt_vi_ptrs[func_size][sv_size][frp_size];
extern	p2f	__mth_rt_vi_ptrs_stat[func_size][sv_size][frp_size];
// NOTE(review): disabled declaration kept as found - remove if the table no longer exists.
//extern	p2f	__mth_rt_intrins_ptrs[func_size][sv_size][frp_size];
174 
/*
 * MTHINTRIN(_func, _sv, _a, _f, _r, _p, _s)
 *
 *   _func  function
 *   _sv    scalar/vector types
 *   _a     architecture
 *   _f     name for fast
 *   _r     name for relaxed
 *   _p     name for precise
 *   _s     name for Sleef
 *
 * As defined here the macro only declares the four named routines as
 * "extern void name(void)"; _func, _sv and _a are accepted but do not
 * appear in the expansion.  The expansion supplies its own terminating
 * semicolon.
 */
#define	MTHINTRIN(_func, _sv, _a, _f, _r, _p, _s) \
	extern void _f (void); extern void _r (void); \
	extern void _p (void); extern void _s (void);
187 
/*
 * Dispatch naming.
 *
 *   MTH_DISPATCH_FUNC(f)  expands to f, or to f with "_init" pasted on
 *                         when MTH_I_INTRIN_INIT is defined.
 *   _MTH_I_INIT()         expands to nothing, or to a call of
 *                         __math_dispatch_init() (result discarded) when
 *                         MTH_I_INTRIN_INIT is defined.
 *   MTH_DISPATCH_TBL      expands to __mth_rt_vi_ptrs.
 */
#ifdef	MTH_I_INTRIN_INIT
extern	void __math_dispatch_init(void);
#define	MTH_DISPATCH_FUNC(f)	f##_init
#define	_MTH_I_INIT()	(void) __math_dispatch_init()
#else	// MTH_I_INTRIN_INIT
#define MTH_DISPATCH_FUNC(f)    f
#define	_MTH_I_INIT()
#endif	// MTH_I_INTRIN_INIT

#define	MTH_DISPATCH_TBL	__mth_rt_vi_ptrs
199 
/*
 * Call statistics.
 *
 * Without MTH_I_INTRIN_STATS, _MTH_I_STATS_INC() expands to nothing.
 *
 * With MTH_I_INTRIN_STATS:
 *   - MTH_DISPATCH_FUNC(f) is redefined to f with "_prof" pasted on and
 *     MTH_DISPATCH_TBL to __mth_rt_vi_ptrs_stat, replacing any earlier
 *     definitions;
 *   - _MTH_I_STATS_INC() atomically adds 1 to the matching counter in
 *     __mth_rt_stats using the __sync_fetch_and_add builtin.
 *
 * NOTE(review): the counting expansion ends with its own semicolon, so
 * "if (c) _MTH_I_STATS_INC(...); else ..." would not compile.  It is kept
 * because existing callers may invoke the macro without a semicolon.
 * NOTE(review): uint64_t needs <stdint.h>, which is not included in the
 * part of this header visible here - presumably the includer provides it.
 */
#ifndef	MTH_I_INTRIN_STATS
#define	_MTH_I_STATS_INC(_func,_sv,_frp)
#else	// MTH_I_INTRIN_STATS
#undef	MTH_DISPATCH_FUNC
#define	MTH_DISPATCH_FUNC(f)	f##_prof
#undef	MTH_DISPATCH_TBL
#define	MTH_DISPATCH_TBL	__mth_rt_vi_ptrs_stat
/*
 * Unsigned integers of number of calls.
 *
 * XXX - Layout of the table is *different* than __mth_rt_vi_ptrs:
 * the class index comes first here ([frp][func][sv]) instead of last.
 * TBD which layout is better.
 */
extern	uint64_t	__mth_rt_stats[frp_size][func_size][sv_size];
#define	_MTH_I_STATS_INC(_func,_sv,_frp) \
	(void)__sync_fetch_and_add(&__mth_rt_stats[_frp][_func][_sv], 1);
#endif	// MTH_I_INTRIN_STATS
217