1 
2 /*
3  * Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  *     http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  */
18 
19 
20 /*
21  * Real.
22  */
23 
/* Double precision: a plain scalar plus GNU vector types of 2, 4 and 8 lanes. */
typedef double vrd1_t;
typedef double vrd2_t __attribute__((vector_size(2 * sizeof(double))));
typedef double vrd4_t __attribute__((vector_size(4 * sizeof(double))));
typedef double vrd8_t __attribute__((vector_size(8 * sizeof(double))));

/* Single precision: a plain scalar plus GNU vector types of 4, 8 and 16 lanes. */
typedef float vrs1_t;
typedef float vrs4_t __attribute__((vector_size(4 * sizeof(float))));
typedef float vrs8_t __attribute__((vector_size(8 * sizeof(float))));
typedef float vrs16_t __attribute__((vector_size(16 * sizeof(float))));
32 
33 
34 /*
35  * Complex.
36  *
37  * Note:
 * Vector structures cannot be made up of structures containing real and
 * imaginary components.
 * As such, complex vector structures are complex in name only and are simply
 * overlaid on the REAL vector types.  To extract the real and imaginary
 * parts, other macros or C constructs must be used.
43  */
44 
/*
 * vcdN_t: vector of doubles with 2*N lanes, i.e. room for N values of two
 * doubles each.
 */
typedef double vcd1_t __attribute__((vector_size(1 * 2 * sizeof(double))));
typedef double vcd2_t __attribute__((vector_size(2 * 2 * sizeof(double))));
typedef double vcd4_t __attribute__((vector_size(4 * 2 * sizeof(double))));

/*
 * vcsN_t: vector of floats with 2*N lanes, i.e. room for N values of two
 * floats each.
 */
typedef float vcs1_t __attribute__((vector_size(1 * 2 * sizeof(float))));
typedef float vcs2_t __attribute__((vector_size(2 * 2 * sizeof(float))));
typedef float vcs4_t __attribute__((vector_size(4 * 2 * sizeof(float))));
typedef float vcs8_t __attribute__((vector_size(8 * 2 * sizeof(float))));
52 
53 
54 /*
55  * Integer.
56  */
57 
/* 32-bit integers: a plain scalar plus GNU vector types of 2, 4, 8 and 16 lanes. */
typedef int32_t vis1_t;
typedef int32_t vis2_t __attribute__((vector_size(2 * sizeof(int32_t))));
typedef int32_t vis4_t __attribute__((vector_size(4 * sizeof(int32_t))));
typedef int32_t vis8_t __attribute__((vector_size(8 * sizeof(int32_t))));
typedef int32_t vis16_t __attribute__((vector_size(16 * sizeof(int32_t))));

/* 64-bit integers: a plain scalar plus GNU vector types of 2, 4 and 8 lanes. */
typedef int64_t vid1_t;
typedef int64_t vid2_t __attribute__((vector_size(2 * sizeof(int64_t))));
typedef int64_t vid4_t __attribute__((vector_size(4 * sizeof(int64_t))));
typedef int64_t vid8_t __attribute__((vector_size(8 * sizeof(int64_t))));
67 
/*
 * Token pasting helpers.
 *
 * Each CONCATn(...) forwards to _CONCATn(...) so that its arguments are
 * macro-expanded before being pasted; _CONCATn pastes them verbatim.
 * An argument may be left empty, in which case it contributes nothing.
 *
 * CONCAT2 previously forwarded to the non-existent _CONCAT and therefore
 * could not be used; it now forwards to _CONCAT2 like its siblings.
 *
 * NOTE(review): the _CONCATn / _STRINGIFY spellings are in the namespace
 * reserved for the implementation; they are kept as-is because other files
 * may refer to them.
 */
#define _CONCAT2(a,b)    a##b
#define CONCAT2(a,b) _CONCAT2(a,b)
#define _CONCAT3(a,b,c)    a##b##c
#define CONCAT3(a,b,c) _CONCAT3(a,b,c)
#define _CONCAT4(a,b,c,d)    a##b##c##d
#define CONCAT4(a,b,c,d) _CONCAT4(a,b,c,d)
#define _CONCAT5(a,b,c,d,e)    a##b##c##d##e
#define CONCAT5(a,b,c,d,e) _CONCAT5(a,b,c,d,e)
#define _CONCAT6(a,b,c,d,e,f)    a##b##c##d##e##f
#define CONCAT6(a,b,c,d,e,f) _CONCAT6(a,b,c,d,e,f)
#define _CONCAT7(a,b,c,d,e,f,g)    a##b##c##d##e##f##g
#define CONCAT7(a,b,c,d,e,f,g) _CONCAT7(a,b,c,d,e,f,g)
#define _CONCAT8(a,b,c,d,e,f,g,h)    a##b##c##d##e##f##g##h
#define CONCAT8(a,b,c,d,e,f,g,h) _CONCAT8(a,b,c,d,e,f,g,h)

/*
 * STRINGIFY(x) macro-expands x and then turns the result into a string
 * literal; _STRINGIFY(x) stringifies x without expanding it.
 */
#define _STRINGIFY(_n) #_n
#define STRINGIFY(_n) _STRINGIFY(_n)
85 
86 
87 #if ! defined(MAX_VREG_SIZE)
88 #error  MAX_VREG_SIZE must be defined.
89 #endif
90 
91 
92 #if MAX_VREG_SIZE == 64
93 #define VLS     1
94 #define VLD     1
95 #define VIS_T   vis1_t
96 #define VID_T   vid1_t
97 #define VRS_T   vrs1_t
98 #define VRD_T   vrd1_t
99 #define FMIN	1.0f
100 #define DMIN	1.0d
101 #define VRET(subscript) vret
102 #define ROUT(subscript) rout
103 #define ROUTM(subscript) routm
104 #define RES(subscript) res
105 #define EXP(subscript) exp
106 #define VVMASK(subscript) vvmask
107 #elif MAX_VREG_SIZE == 128
108 #define VLS     4
109 #define VLD     2
110 #define VIS_T   vis4_t
111 #define VID_T   vid2_t
112 #define VRS_T   vrs4_t
113 #define VRD_T   vrd2_t
114 #define FMIN	2.0f
115 #define DMIN	2.0d
116 #define VRET(subscript) vret[subscript]
117 #define ROUT(subscript) rout[subscript]
118 #define ROUTM(subscript) routm[subscript]
119 #define RES(subscript) res[subscript]
120 #define EXP(subscript) exp[subscript]
121 #define VVMASK(subscript) vvmask[subscript]
122 #elif   MAX_VREG_SIZE == 256
123 #define VLS     8
124 #define VLD     4
125 #define VIS_T   vis8_t
126 #define VID_T   vid4_t
127 #define VRS_T   vrs8_t
128 #define VRD_T   vrd4_t
129 #define FMIN	6.0f
130 #define DMIN	6.0d
131 #define VRET(subscript) vret[subscript]
132 #define ROUT(subscript) rout[subscript]
133 #define ROUTM(subscript) routm[subscript]
134 #define RES(subscript) res[subscript]
135 #define EXP(subscript) exp[subscript]
136 #define VVMASK(subscript) vvmask[subscript]
137 #elif   MAX_VREG_SIZE == 512
138 #define VLS     16
139 #define VLD     8
140 #define VIS_T   vis16_t
141 #define VID_T   vid8_t
142 #define VRS_T   vrs16_t
143 #define VRD_T   vrd8_t
144 #define FMIN	14.0f
145 #define DMIN	14.0d
146 #define VRET(subscript) vret[subscript]
147 #define ROUT(subscript) rout[subscript]
148 #define ROUTM(subscript) routm[subscript]
149 #define RES(subscript) res[subscript]
150 #define EXP(subscript) exp[subscript]
151 #define VVMASK(subscript) vvmask[subscript]
152 #else
153 #error  MAX_VREG_SIZE must be one of 64, 128, 256, or 512
154 #endif
155 
/*
 * Float (F*) and double (D*) constants.
 * The double constants carry no suffix: an unsuffixed floating constant
 * has type double in ISO C, whereas the "d" suffix used previously is a
 * compiler extension that not every compiler accepts.
 */
#define FCONST1 0.0f
#define FCONST2 31.0f
#define DCONST1 0.0
#define DCONST2 31.0
160 
/*
 * EXTERN_EFUNC(name) -- declare the entry points of a one-argument function.
 *
 * Expands to extern declarations of
 *   __fs_<name>_<VLS>, __rs_<name>_<VLS>, __ps_<name>_<VLS>   VRS_T f(VRS_T)
 *   __fd_<name>_<VLD>, __rd_<name>_<VLD>, __pd_<name>_<VLD>   VRD_T f(VRD_T)
 * and, for each of them, a second function whose name ends in "m" and which
 * takes one additional trailing VIS_T (float) or VID_T (double) argument.
 * The expansion does not end in a semicolon; the user supplies it.
 */
#define EXTERN_EFUNC(name) \
    extern VRS_T \
        CONCAT5(__fs_, name, _, VLS, )(VRS_T), \
        CONCAT5(__fs_, name, _, VLS, m)(VRS_T, VIS_T), \
        CONCAT5(__rs_, name, _, VLS, )(VRS_T), \
        CONCAT5(__rs_, name, _, VLS, m)(VRS_T, VIS_T), \
        CONCAT5(__ps_, name, _, VLS, )(VRS_T), \
        CONCAT5(__ps_, name, _, VLS, m)(VRS_T, VIS_T); \
    extern VRD_T \
        CONCAT5(__fd_, name, _, VLD, )(VRD_T), \
        CONCAT5(__fd_, name, _, VLD, m)(VRD_T, VID_T), \
        CONCAT5(__rd_, name, _, VLD, )(VRD_T), \
        CONCAT5(__rd_, name, _, VLD, m)(VRD_T, VID_T), \
        CONCAT5(__pd_, name, _, VLD, )(VRD_T), \
        CONCAT5(__pd_, name, _, VLD, m)(VRD_T, VID_T)
170 
/*
 * EXTERN_EFUNC2(name) -- declare the entry points of a two-argument function
 * whose arguments both have the type of the result.
 *
 * Expands to extern declarations of
 *   __fs_/__rs_/__ps_<name>_<VLS>   VRS_T f(VRS_T, VRS_T)
 *   __fd_/__rd_/__pd_<name>_<VLD>   VRD_T f(VRD_T, VRD_T)
 * and, for each of them, a second function whose name ends in "m" and which
 * takes one additional trailing VIS_T (float) or VID_T (double) argument.
 * The expansion does not end in a semicolon; the user supplies it.
 */
#define EXTERN_EFUNC2(name) \
    extern VRS_T \
        CONCAT5(__fs_, name, _, VLS, )(VRS_T, VRS_T), \
        CONCAT5(__fs_, name, _, VLS, m)(VRS_T, VRS_T, VIS_T), \
        CONCAT5(__rs_, name, _, VLS, )(VRS_T, VRS_T), \
        CONCAT5(__rs_, name, _, VLS, m)(VRS_T, VRS_T, VIS_T), \
        CONCAT5(__ps_, name, _, VLS, )(VRS_T, VRS_T), \
        CONCAT5(__ps_, name, _, VLS, m)(VRS_T, VRS_T, VIS_T); \
    extern VRD_T \
        CONCAT5(__fd_, name, _, VLD, )(VRD_T, VRD_T), \
        CONCAT5(__fd_, name, _, VLD, m)(VRD_T, VRD_T, VID_T), \
        CONCAT5(__rd_, name, _, VLD, )(VRD_T, VRD_T), \
        CONCAT5(__rd_, name, _, VLD, m)(VRD_T, VRD_T, VID_T), \
        CONCAT5(__pd_, name, _, VLD, )(VRD_T, VRD_T), \
        CONCAT5(__pd_, name, _, VLD, m)(VRD_T, VRD_T, VID_T)
180 
181 
/*
 * EXTERN_EFUNC2i(name) -- declare the entry points of a two-argument function
 * whose second argument is a VIS_T.
 *
 * Expands to extern declarations of
 *   __fs_/__rs_/__ps_<name>_<VLS>   VRS_T f(VRS_T, VIS_T)
 *   __fd_/__rd_/__pd_<name>_<VLD>   VRD_T f(VRD_T, VIS_T)
 * and, for each of them, a second function whose name ends in "m" and which
 * takes one additional trailing VIS_T (float) or VID_T (double) argument.
 * The expansion does not end in a semicolon; the user supplies it.
 */
#define EXTERN_EFUNC2i(name) \
    extern VRS_T \
        CONCAT5(__fs_, name, _, VLS, )(VRS_T, VIS_T), \
        CONCAT5(__fs_, name, _, VLS, m)(VRS_T, VIS_T, VIS_T), \
        CONCAT5(__rs_, name, _, VLS, )(VRS_T, VIS_T), \
        CONCAT5(__rs_, name, _, VLS, m)(VRS_T, VIS_T, VIS_T), \
        CONCAT5(__ps_, name, _, VLS, )(VRS_T, VIS_T), \
        CONCAT5(__ps_, name, _, VLS, m)(VRS_T, VIS_T, VIS_T); \
    extern VRD_T \
        CONCAT5(__fd_, name, _, VLD, )(VRD_T, VIS_T), \
        CONCAT5(__fd_, name, _, VLD, m)(VRD_T, VIS_T, VID_T), \
        CONCAT5(__rd_, name, _, VLD, )(VRD_T, VIS_T), \
        CONCAT5(__rd_, name, _, VLD, m)(VRD_T, VIS_T, VID_T), \
        CONCAT5(__pd_, name, _, VLD, )(VRD_T, VIS_T), \
        CONCAT5(__pd_, name, _, VLD, m)(VRD_T, VIS_T, VID_T)
191 
192 
/*
 * EXTERN_EFUNC2i1(name) -- declare the entry points of a two-argument
 * function whose second argument is a single int32_t.
 *
 * Expands to extern declarations of
 *   __fs_/__rs_/__ps_<name>_<VLS>   VRS_T f(VRS_T, int32_t)
 *   __fd_/__rd_/__pd_<name>_<VLD>   VRD_T f(VRD_T, int32_t)
 * and, for each of them, a second function whose name ends in "m" and which
 * takes one additional trailing VIS_T (float) or VID_T (double) argument.
 * The expansion does not end in a semicolon; the user supplies it.
 */
#define EXTERN_EFUNC2i1(name) \
    extern VRS_T \
        CONCAT5(__fs_, name, _, VLS, )(VRS_T, int32_t), \
        CONCAT5(__fs_, name, _, VLS, m)(VRS_T, int32_t, VIS_T), \
        CONCAT5(__rs_, name, _, VLS, )(VRS_T, int32_t), \
        CONCAT5(__rs_, name, _, VLS, m)(VRS_T, int32_t, VIS_T), \
        CONCAT5(__ps_, name, _, VLS, )(VRS_T, int32_t), \
        CONCAT5(__ps_, name, _, VLS, m)(VRS_T, int32_t, VIS_T); \
    extern VRD_T \
        CONCAT5(__fd_, name, _, VLD, )(VRD_T, int32_t), \
        CONCAT5(__fd_, name, _, VLD, m)(VRD_T, int32_t, VID_T), \
        CONCAT5(__rd_, name, _, VLD, )(VRD_T, int32_t), \
        CONCAT5(__rd_, name, _, VLD, m)(VRD_T, int32_t, VID_T), \
        CONCAT5(__pd_, name, _, VLD, )(VRD_T, int32_t), \
        CONCAT5(__pd_, name, _, VLD, m)(VRD_T, int32_t, VID_T)
202 
203 
/*
 * EXTERN_EFUNC2k(name) -- declare the entry points of a two-argument function
 * whose second argument is a VID_T.
 *
 * Expands to extern declarations of
 *   __fs_/__rs_/__ps_<name>_<VLS>   VRS_T f(VRS_T, VID_T)
 *   __fd_/__rd_/__pd_<name>_<VLD>   VRD_T f(VRD_T, VID_T)
 * and, for each of them, a second function whose name ends in "m" and which
 * takes one additional trailing VIS_T (float) or VID_T (double) argument.
 * The expansion does not end in a semicolon; the user supplies it.
 */
#define EXTERN_EFUNC2k(name) \
    extern VRS_T \
        CONCAT5(__fs_, name, _, VLS, )(VRS_T, VID_T), \
        CONCAT5(__fs_, name, _, VLS, m)(VRS_T, VID_T, VIS_T), \
        CONCAT5(__rs_, name, _, VLS, )(VRS_T, VID_T), \
        CONCAT5(__rs_, name, _, VLS, m)(VRS_T, VID_T, VIS_T), \
        CONCAT5(__ps_, name, _, VLS, )(VRS_T, VID_T), \
        CONCAT5(__ps_, name, _, VLS, m)(VRS_T, VID_T, VIS_T); \
    extern VRD_T \
        CONCAT5(__fd_, name, _, VLD, )(VRD_T, VID_T), \
        CONCAT5(__fd_, name, _, VLD, m)(VRD_T, VID_T, VID_T), \
        CONCAT5(__rd_, name, _, VLD, )(VRD_T, VID_T), \
        CONCAT5(__rd_, name, _, VLD, m)(VRD_T, VID_T, VID_T), \
        CONCAT5(__pd_, name, _, VLD, )(VRD_T, VID_T), \
        CONCAT5(__pd_, name, _, VLD, m)(VRD_T, VID_T, VID_T)
213 
214 
/*
 * EXTERN_EFUNC2k1(name) -- declare the entry points of a two-argument
 * function whose second argument is a single int64_t.
 *
 * Expands to extern declarations of
 *   __fs_/__rs_/__ps_<name>_<VLS>   VRS_T f(VRS_T, int64_t)
 *   __fd_/__rd_/__pd_<name>_<VLD>   VRD_T f(VRD_T, int64_t)
 * and, for each of them, a second function whose name ends in "m" and which
 * takes one additional trailing VIS_T (float) or VID_T (double) argument.
 * The expansion does not end in a semicolon; the user supplies it.
 */
#define EXTERN_EFUNC2k1(name) \
    extern VRS_T \
        CONCAT5(__fs_, name, _, VLS, )(VRS_T, int64_t), \
        CONCAT5(__fs_, name, _, VLS, m)(VRS_T, int64_t, VIS_T), \
        CONCAT5(__rs_, name, _, VLS, )(VRS_T, int64_t), \
        CONCAT5(__rs_, name, _, VLS, m)(VRS_T, int64_t, VIS_T), \
        CONCAT5(__ps_, name, _, VLS, )(VRS_T, int64_t), \
        CONCAT5(__ps_, name, _, VLS, m)(VRS_T, int64_t, VIS_T); \
    extern VRD_T \
        CONCAT5(__fd_, name, _, VLD, )(VRD_T, int64_t), \
        CONCAT5(__fd_, name, _, VLD, m)(VRD_T, int64_t, VID_T), \
        CONCAT5(__rd_, name, _, VLD, )(VRD_T, int64_t), \
        CONCAT5(__rd_, name, _, VLD, m)(VRD_T, int64_t, VID_T), \
        CONCAT5(__pd_, name, _, VLD, )(VRD_T, int64_t), \
        CONCAT5(__pd_, name, _, VLD, m)(VRD_T, int64_t, VID_T)
224 
225 
226 
/* Functions of one argument. */
EXTERN_EFUNC(acos);
EXTERN_EFUNC(asin);
EXTERN_EFUNC(atan);
EXTERN_EFUNC(cos);
EXTERN_EFUNC(cosh);
EXTERN_EFUNC(exp);
EXTERN_EFUNC(log10);
EXTERN_EFUNC(log);
EXTERN_EFUNC(sin);
EXTERN_EFUNC(sinh);
EXTERN_EFUNC(tan);
EXTERN_EFUNC(tanh);

/* Functions of two arguments of the same type. */
EXTERN_EFUNC2(atan2);
EXTERN_EFUNC2(mod);
EXTERN_EFUNC2(pow);

/* Second argument is a VIS_T (powi) or a single int32_t (powi1). */
EXTERN_EFUNC2i(powi);
EXTERN_EFUNC2i1(powi1);

/* Second argument is a VID_T (powk) or a single int64_t (powk1). */
EXTERN_EFUNC2k(powk);
EXTERN_EFUNC2k1(powk1);
249 
250 
251 int32_t mask_sp[1<<VLS][VLS] __attribute__((aligned(64)));
252 int64_t mask_dp[1<<VLD][VLD] __attribute__((aligned(64)));
/*
 * Verbosity flag read by the helpers in this file.
 * When TARGET_WIN_X8664 is defined the initial value is fixed at build time
 * from VERBOSE (0 if VERBOSE is 0 or undefined, 1 otherwise); on every other
 * target it starts at 0.
 */
#if defined(TARGET_WIN_X8664)
#if VERBOSE != 0
int32_t verbose = 1;
#else
int32_t verbose = 0;
#endif
#else
int32_t verbose = 0;
#endif
262 
263 
264 #if !defined(TARGET_WIN_X8664)
265 static void
parseargs(int argc,char * argv[])266 parseargs(int argc, char *argv[])
267 {
268 	int opt;
269 
270 	while ((opt = getopt(argc, argv, "v")) != -1) {
271 	    switch(opt) {
272 	    case 'v':
273 	        verbose = 1;
274 	        break;
275 	    default:
276 	        fprintf(stderr, "Usage %s [-v]\n",argv[0]);
277 	    }
278 	}
279 }
280 #endif
281 
282 
283 static VRS_T
vrs_set_arg(float fmin,float fconst)284 vrs_set_arg(float fmin, float fconst )
285 {
286     VRS_T   vret __attribute__((aligned(64)));
287     float   fdelta;
288     int     i;
289 
290     fdelta = fconst + fmin;
291     for (i = 0; i < VLS; i++) {
292        VRET(i) = (1.0f / (fdelta + (float) i));
293     }
294 
295     return vret;
296 }
297 
298 static VRD_T
vrd_set_arg(double dmin,double dconst)299 vrd_set_arg(double dmin, double dconst )
300 {
301     VRD_T   vret __attribute__((aligned(64)));
302     double  ddelta;
303     int     i;
304 
305     ddelta = dconst + dmin;
306     for (i = 0; i < VLS; i++) {
307        VRET(i) = (1.0 / (ddelta + (double) i));
308     }
309 
310     return vret;
311 }
312 
313 
314 static void
build_masks(bool gray_code)315 build_masks(bool gray_code)
316 {
317     int32_t    i;
318     int32_t    j;
319     int32_t    k;
320 
321     if (verbose) {
322         printf("%s: %s mask vectors\n",
323             __func__, gray_code ? "Gray code" : "binary");
324     }
325 
326     memset(mask_sp, 0, sizeof mask_sp);
327     memset(mask_dp, 0, sizeof mask_dp);
328 
329 
330     for (j = 0; j < 1<<VLD; j++) {
331         k = gray_code ? j ^ (j>>1) : j;
332         for (i = 0; i < VLD; i++) {
333             mask_dp[j][i] = (k&0x1) * -1;
334             k = k>>1;
335         }
336     }
337 
338     for (j = 0; j < 1<<VLS; j++) {
339         k = gray_code ? j ^ (j>>1) : j;
340         for (i = 0; i < VLS; i++) {
341             mask_sp[j][i] = (k&0x1) * -1;
342             k = k>>1;
343         }
344     }
345 
346     if (verbose) {
347     	for (j = 0; j < 1<<VLD; j++) {
348         	for (i = 0; i < VLD; i++) {
349             	printf(" %2lld", mask_dp[j][i]);
350         	}
351         	puts("");
352     	}
353     	for (j = 0; j < 1<<VLS; j++) {
354         	for (i = 0; i < VLS; i++) {
355             	printf(" %2d", mask_sp[j][i]);
356         	}
357         	puts("");
358     	}
359     }
360 
361     return;
362 
363 }
364 
365 
366 static int
checkfltol1(float res,float exp,float ltol)367 checkfltol1(float res, float exp, float ltol)
368 {
369     int tests_passed = 0;
370     int tests_failed = 0;
371 
372     if (exp == res) {
373         tests_passed ++;
374     }else if( exp != 0.0 && (fabsf((exp-res)/exp)) <= ltol ){
375         tests_passed ++;
376     }else if( exp == 0.0 && exp <= ltol ){
377         tests_passed ++;
378     } else {
379         tests_failed ++;
380 	if (verbose) {
381 	    printf("test FAILED. res %f  exp %f\n", res, exp);
382 	}
383     }
384 
385     if (verbose) {
386 	    if (tests_failed == 0) {
387 	        printf("1 test completed. %d tests PASSED. %d tests failed.\n",
388 	                      tests_passed, tests_failed);
389 	    } else {
390 	        printf("1 test completed. %d tests passed. %d tests FAILED.\n",
391 	                      tests_passed, tests_failed);
392 	    }
393     }
394 
395     return(tests_failed);
396 }
397 
398 
399 static int
checkfltol(VRS_T res,VRS_T exp,VIS_T vvmask,int n,float ltol)400 checkfltol(VRS_T res, VRS_T exp, VIS_T vvmask, int n, float ltol)
401 {
402     int i;
403     int tests_failed = 0;
404 
405     for (i = 0; i < n; i++) {
406 	if (VVMASK(i) != 0) {
407 	    tests_failed += checkfltol1(RES(i), EXP(i), ltol);
408 	}
409     }
410 
411     return(tests_failed);
412 }
413 
414