1
2 /*
3 * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 */
18
19
20 /*
21 * Real.
22 */
23
/* Double precision: vrdN_t holds N doubles (vrd1_t is a plain scalar). */
typedef double vrd1_t;
typedef double vrd2_t __attribute__((vector_size(sizeof(double) * 2)));
typedef double vrd4_t __attribute__((vector_size(sizeof(double) * 4)));
typedef double vrd8_t __attribute__((vector_size(sizeof(double) * 8)));

/* Single precision: vrsN_t holds N floats (vrs1_t is a plain scalar). */
typedef float vrs1_t;
typedef float vrs4_t __attribute__((vector_size(sizeof(float) * 4)));
typedef float vrs8_t __attribute__((vector_size(sizeof(float) * 8)));
typedef float vrs16_t __attribute__((vector_size(sizeof(float) * 16)));
32
33
/*
 * Complex.
 *
 * Note:
 * Vector structures cannot be made up of structures containing real and
 * imaginary components.
 * As such, the complex vector types are complex in name only: they are
 * simply vectors of the underlying real type. To extract the real and
 * imaginary parts, other macros or C constructs must be used.
 */
44
/* Double-precision complex: vcdN_t is a vector of 2*N doubles. */
typedef double vcd1_t __attribute__((vector_size(sizeof(double) * 2)));
typedef double vcd2_t __attribute__((vector_size(sizeof(double) * 4)));
typedef double vcd4_t __attribute__((vector_size(sizeof(double) * 8)));

/* Single-precision complex: vcsN_t is a vector of 2*N floats. */
typedef float vcs1_t __attribute__((vector_size(sizeof(float) * 2)));
typedef float vcs2_t __attribute__((vector_size(sizeof(float) * 4)));
typedef float vcs4_t __attribute__((vector_size(sizeof(float) * 8)));
typedef float vcs8_t __attribute__((vector_size(sizeof(float) * 16)));
52
53
54 /*
55 * Integer.
56 */
57
/* 32-bit integers: visN_t holds N int32_t lanes (vis1_t is a scalar). */
typedef int32_t vis1_t;
typedef int32_t vis2_t __attribute__((vector_size(sizeof(int32_t) * 2)));
typedef int32_t vis4_t __attribute__((vector_size(sizeof(int32_t) * 4)));
typedef int32_t vis8_t __attribute__((vector_size(sizeof(int32_t) * 8)));
typedef int32_t vis16_t __attribute__((vector_size(sizeof(int32_t) * 16)));

/* 64-bit integers: vidN_t holds N int64_t lanes (vid1_t is a scalar). */
typedef int64_t vid1_t;
typedef int64_t vid2_t __attribute__((vector_size(sizeof(int64_t) * 2)));
typedef int64_t vid4_t __attribute__((vector_size(sizeof(int64_t) * 4)));
typedef int64_t vid8_t __attribute__((vector_size(sizeof(int64_t) * 8)));
67
/*
 * Token-pasting helpers.
 *
 * CONCATn(...) forwards to _CONCATn(...), which applies the ## operator.
 * The extra level of indirection makes the preprocessor fully expand the
 * arguments (for example a macro that stands for a number) before they
 * are pasted together.  An argument may be left empty.
 */
#define _CONCAT2(a,b) a##b
#define CONCAT2(a,b) _CONCAT2(a,b)
#define _CONCAT3(a,b,c) a##b##c
#define CONCAT3(a,b,c) _CONCAT3(a,b,c)
#define _CONCAT4(a,b,c,d) a##b##c##d
#define CONCAT4(a,b,c,d) _CONCAT4(a,b,c,d)
#define _CONCAT5(a,b,c,d,e) a##b##c##d##e
#define CONCAT5(a,b,c,d,e) _CONCAT5(a,b,c,d,e)
#define _CONCAT6(a,b,c,d,e,f) a##b##c##d##e##f
#define CONCAT6(a,b,c,d,e,f) _CONCAT6(a,b,c,d,e,f)
#define _CONCAT7(a,b,c,d,e,f,g) a##b##c##d##e##f##g
#define CONCAT7(a,b,c,d,e,f,g) _CONCAT7(a,b,c,d,e,f,g)
#define _CONCAT8(a,b,c,d,e,f,g,h) a##b##c##d##e##f##g##h
#define CONCAT8(a,b,c,d,e,f,g,h) _CONCAT8(a,b,c,d,e,f,g,h)

/*
 * STRINGIFY(x) expands x first and then turns the result into a string
 * literal; _STRINGIFY(x) stringifies its argument exactly as written.
 */
#define _STRINGIFY(_n) #_n
#define STRINGIFY(_n) _STRINGIFY(_n)
85
86
87 #if ! defined(MAX_VREG_SIZE)
88 #error MAX_VREG_SIZE must be defined.
89 #endif
90
91
92 #if MAX_VREG_SIZE == 64
93 #define VLS 1
94 #define VLD 1
95 #define VIS_T vis1_t
96 #define VID_T vid1_t
97 #define VRS_T vrs1_t
98 #define VRD_T vrd1_t
99 #define FMIN 1.0f
100 #define DMIN 1.0d
101 #define VRET(subscript) vret
102 #define ROUT(subscript) rout
103 #define ROUTM(subscript) routm
104 #define RES(subscript) res
105 #define EXP(subscript) exp
106 #define VVMASK(subscript) vvmask
107 #elif MAX_VREG_SIZE == 128
108 #define VLS 4
109 #define VLD 2
110 #define VIS_T vis4_t
111 #define VID_T vid2_t
112 #define VRS_T vrs4_t
113 #define VRD_T vrd2_t
114 #define FMIN 2.0f
115 #define DMIN 2.0d
116 #define VRET(subscript) vret[subscript]
117 #define ROUT(subscript) rout[subscript]
118 #define ROUTM(subscript) routm[subscript]
119 #define RES(subscript) res[subscript]
120 #define EXP(subscript) exp[subscript]
121 #define VVMASK(subscript) vvmask[subscript]
122 #elif MAX_VREG_SIZE == 256
123 #define VLS 8
124 #define VLD 4
125 #define VIS_T vis8_t
126 #define VID_T vid4_t
127 #define VRS_T vrs8_t
128 #define VRD_T vrd4_t
129 #define FMIN 6.0f
130 #define DMIN 6.0d
131 #define VRET(subscript) vret[subscript]
132 #define ROUT(subscript) rout[subscript]
133 #define ROUTM(subscript) routm[subscript]
134 #define RES(subscript) res[subscript]
135 #define EXP(subscript) exp[subscript]
136 #define VVMASK(subscript) vvmask[subscript]
137 #elif MAX_VREG_SIZE == 512
138 #define VLS 16
139 #define VLD 8
140 #define VIS_T vis16_t
141 #define VID_T vid8_t
142 #define VRS_T vrs16_t
143 #define VRD_T vrd8_t
144 #define FMIN 14.0f
145 #define DMIN 14.0d
146 #define VRET(subscript) vret[subscript]
147 #define ROUT(subscript) rout[subscript]
148 #define ROUTM(subscript) routm[subscript]
149 #define RES(subscript) res[subscript]
150 #define EXP(subscript) exp[subscript]
151 #define VVMASK(subscript) vvmask[subscript]
152 #else
153 #error MAX_VREG_SIZE must be one of 64, 128, 256, or 512
154 #endif
155
/*
 * Float / double constants.  They are not used in this section (presumably
 * the "const" arguments of the *_set_arg helpers -- confirm against the
 * callers).  The double constants carry no suffix: an unsuffixed floating
 * constant already has type double, and the 'd' suffix is not standard C.
 */
#define FCONST1 0.0f
#define FCONST2 31.0f
#define DCONST1 0.0
#define DCONST2 31.0
160
/*
 * Generators for the extern declarations of the vector math entry points.
 *
 * For a function <name> each EXTERN_EFUNC* macro declares twelve symbols:
 *
 *   __{f,r,p}s_<name>_<VLS>[m]   returning VRS_T
 *   __{f,r,p}d_<name>_<VLD>[m]   returning VRD_T
 *
 * The f/r/p letters are three variants of the same routine (presumably
 * different accuracy levels -- confirm against the library).  The "m"
 * form takes one extra trailing mask argument (VIS_T for single
 * precision, VID_T for double precision).
 *
 * The expansion deliberately ends without a semicolon; the caller
 * supplies it.
 */

/* Unmasked and masked declarator for one symbol prefix. */
#define EFUNC_PAIR(prefix, name, vl, args, margs) \
  CONCAT5(prefix,name,_,vl,) args, \
  CONCAT5(prefix,name,_,vl,m) margs

/* All single-precision declarators; args/margs are parenthesized lists. */
#define EFUNC_DECLS_SP(name, args, margs) \
  extern VRS_T \
  EFUNC_PAIR(__fs_, name, VLS, args, margs), \
  EFUNC_PAIR(__rs_, name, VLS, args, margs), \
  EFUNC_PAIR(__ps_, name, VLS, args, margs)

/* All double-precision declarators; args/margs are parenthesized lists. */
#define EFUNC_DECLS_DP(name, args, margs) \
  extern VRD_T \
  EFUNC_PAIR(__fd_, name, VLD, args, margs), \
  EFUNC_PAIR(__rd_, name, VLD, args, margs), \
  EFUNC_PAIR(__pd_, name, VLD, args, margs)

/* One floating-point vector argument. */
#define EXTERN_EFUNC(name) \
  EFUNC_DECLS_SP(name, (VRS_T), (VRS_T,VIS_T)); \
  EFUNC_DECLS_DP(name, (VRD_T), (VRD_T,VID_T))

/* Two floating-point vector arguments. */
#define EXTERN_EFUNC2(name) \
  EFUNC_DECLS_SP(name, (VRS_T,VRS_T), (VRS_T,VRS_T,VIS_T)); \
  EFUNC_DECLS_DP(name, (VRD_T,VRD_T), (VRD_T,VRD_T,VID_T))

/*
 * Floating-point vector plus a vector of 32-bit integers.  Note that the
 * double-precision variants also take VIS_T as the integer operand.
 */
#define EXTERN_EFUNC2i(name) \
  EFUNC_DECLS_SP(name, (VRS_T,VIS_T), (VRS_T,VIS_T,VIS_T)); \
  EFUNC_DECLS_DP(name, (VRD_T,VIS_T), (VRD_T,VIS_T,VID_T))

/* Floating-point vector plus one scalar 32-bit integer. */
#define EXTERN_EFUNC2i1(name) \
  EFUNC_DECLS_SP(name, (VRS_T,int32_t), (VRS_T,int32_t,VIS_T)); \
  EFUNC_DECLS_DP(name, (VRD_T,int32_t), (VRD_T,int32_t,VID_T))

/*
 * Floating-point vector plus a vector of 64-bit integers.  Note that the
 * single-precision variants also take VID_T as the integer operand.
 */
#define EXTERN_EFUNC2k(name) \
  EFUNC_DECLS_SP(name, (VRS_T,VID_T), (VRS_T,VID_T,VIS_T)); \
  EFUNC_DECLS_DP(name, (VRD_T,VID_T), (VRD_T,VID_T,VID_T))

/* Floating-point vector plus one scalar 64-bit integer. */
#define EXTERN_EFUNC2k1(name) \
  EFUNC_DECLS_SP(name, (VRS_T,int64_t), (VRS_T,int64_t,VIS_T)); \
  EFUNC_DECLS_DP(name, (VRD_T,int64_t), (VRD_T,int64_t,VID_T))
224
225
226
/*
 * Declarations of the vector math entry points, grouped by signature.
 */

/* One floating-point argument: trigonometric and inverse trigonometric. */
EXTERN_EFUNC(sin);
EXTERN_EFUNC(cos);
EXTERN_EFUNC(tan);
EXTERN_EFUNC(asin);
EXTERN_EFUNC(acos);
EXTERN_EFUNC(atan);

/* One floating-point argument: hyperbolic. */
EXTERN_EFUNC(sinh);
EXTERN_EFUNC(cosh);
EXTERN_EFUNC(tanh);

/* One floating-point argument: exponential and logarithms. */
EXTERN_EFUNC(exp);
EXTERN_EFUNC(log);
EXTERN_EFUNC(log10);

/* Two floating-point arguments. */
EXTERN_EFUNC2(atan2);
EXTERN_EFUNC2(mod);
EXTERN_EFUNC2(pow);

/* Floating-point argument with a 32-bit integer vector / scalar. */
EXTERN_EFUNC2i(powi);
EXTERN_EFUNC2i1(powi1);

/* Floating-point argument with a 64-bit integer vector / scalar. */
EXTERN_EFUNC2k(powk);
EXTERN_EFUNC2k1(powk1);
249
250
251 int32_t mask_sp[1<<VLS][VLS] __attribute__((aligned(64)));
252 int64_t mask_dp[1<<VLD][VLD] __attribute__((aligned(64)));
253 #if !defined(TARGET_WIN_X8664)
254 int32_t verbose = 0;
255 #else
256 #if VERBOSE == 0
257 int32_t verbose = 0;
258 #else
259 int32_t verbose = 1;
260 #endif
261 #endif
262
263
264 #if !defined(TARGET_WIN_X8664)
265 static void
parseargs(int argc,char * argv[])266 parseargs(int argc, char *argv[])
267 {
268 int opt;
269
270 while ((opt = getopt(argc, argv, "v")) != -1) {
271 switch(opt) {
272 case 'v':
273 verbose = 1;
274 break;
275 default:
276 fprintf(stderr, "Usage %s [-v]\n",argv[0]);
277 }
278 }
279 }
280 #endif
281
282
283 static VRS_T
vrs_set_arg(float fmin,float fconst)284 vrs_set_arg(float fmin, float fconst )
285 {
286 VRS_T vret __attribute__((aligned(64)));
287 float fdelta;
288 int i;
289
290 fdelta = fconst + fmin;
291 for (i = 0; i < VLS; i++) {
292 VRET(i) = (1.0f / (fdelta + (float) i));
293 }
294
295 return vret;
296 }
297
298 static VRD_T
vrd_set_arg(double dmin,double dconst)299 vrd_set_arg(double dmin, double dconst )
300 {
301 VRD_T vret __attribute__((aligned(64)));
302 double ddelta;
303 int i;
304
305 ddelta = dconst + dmin;
306 for (i = 0; i < VLS; i++) {
307 VRET(i) = (1.0 / (ddelta + (double) i));
308 }
309
310 return vret;
311 }
312
313
314 static void
build_masks(bool gray_code)315 build_masks(bool gray_code)
316 {
317 int32_t i;
318 int32_t j;
319 int32_t k;
320
321 if (verbose) {
322 printf("%s: %s mask vectors\n",
323 __func__, gray_code ? "Gray code" : "binary");
324 }
325
326 memset(mask_sp, 0, sizeof mask_sp);
327 memset(mask_dp, 0, sizeof mask_dp);
328
329
330 for (j = 0; j < 1<<VLD; j++) {
331 k = gray_code ? j ^ (j>>1) : j;
332 for (i = 0; i < VLD; i++) {
333 mask_dp[j][i] = (k&0x1) * -1;
334 k = k>>1;
335 }
336 }
337
338 for (j = 0; j < 1<<VLS; j++) {
339 k = gray_code ? j ^ (j>>1) : j;
340 for (i = 0; i < VLS; i++) {
341 mask_sp[j][i] = (k&0x1) * -1;
342 k = k>>1;
343 }
344 }
345
346 if (verbose) {
347 for (j = 0; j < 1<<VLD; j++) {
348 for (i = 0; i < VLD; i++) {
349 printf(" %2lld", mask_dp[j][i]);
350 }
351 puts("");
352 }
353 for (j = 0; j < 1<<VLS; j++) {
354 for (i = 0; i < VLS; i++) {
355 printf(" %2d", mask_sp[j][i]);
356 }
357 puts("");
358 }
359 }
360
361 return;
362
363 }
364
365
366 static int
checkfltol1(float res,float exp,float ltol)367 checkfltol1(float res, float exp, float ltol)
368 {
369 int tests_passed = 0;
370 int tests_failed = 0;
371
372 if (exp == res) {
373 tests_passed ++;
374 }else if( exp != 0.0 && (fabsf((exp-res)/exp)) <= ltol ){
375 tests_passed ++;
376 }else if( exp == 0.0 && exp <= ltol ){
377 tests_passed ++;
378 } else {
379 tests_failed ++;
380 if (verbose) {
381 printf("test FAILED. res %f exp %f\n", res, exp);
382 }
383 }
384
385 if (verbose) {
386 if (tests_failed == 0) {
387 printf("1 test completed. %d tests PASSED. %d tests failed.\n",
388 tests_passed, tests_failed);
389 } else {
390 printf("1 test completed. %d tests passed. %d tests FAILED.\n",
391 tests_passed, tests_failed);
392 }
393 }
394
395 return(tests_failed);
396 }
397
398
399 static int
checkfltol(VRS_T res,VRS_T exp,VIS_T vvmask,int n,float ltol)400 checkfltol(VRS_T res, VRS_T exp, VIS_T vvmask, int n, float ltol)
401 {
402 int i;
403 int tests_failed = 0;
404
405 for (i = 0; i < n; i++) {
406 if (VVMASK(i) != 0) {
407 tests_failed += checkfltol1(RES(i), EXP(i), ltol);
408 }
409 }
410
411 return(tests_failed);
412 }
413
414