1 /*
2  * Microbenchmark for math functions.
3  *
4  * Copyright (c) 2018-2023, Arm Limited.
5  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6  */
7 
8 #undef _GNU_SOURCE
9 #define _GNU_SOURCE 1
10 #include <stdint.h>
11 #include <stdlib.h>
12 #include <stdio.h>
13 #include <string.h>
14 #include <time.h>
15 #include <math.h>
16 #include "mathlib.h"
17 
18 /* Number of measurements, best result is reported.  */
19 #define MEASURE 60
20 /* Array size.  */
21 #define N 8000
22 /* Iterations over the array.  */
23 #define ITER 125
24 
25 static double *Trace;
26 static size_t trace_size;
27 static double A[N];
28 static float Af[N];
29 static long measurecount = MEASURE;
30 static long itercount = ITER;
31 
32 #ifdef __vpcs
33 #include <arm_neon.h>
34 typedef float64x2_t v_double;
35 
36 #define v_double_len() 2
37 
38 static inline v_double
v_double_load(const double * p)39 v_double_load (const double *p)
40 {
41   return (v_double){p[0], p[1]};
42 }
43 
44 static inline v_double
v_double_dup(double x)45 v_double_dup (double x)
46 {
47   return (v_double){x, x};
48 }
49 
50 typedef float32x4_t v_float;
51 
52 #define v_float_len() 4
53 
54 static inline v_float
v_float_load(const float * p)55 v_float_load (const float *p)
56 {
57   return (v_float){p[0], p[1], p[2], p[3]};
58 }
59 
60 static inline v_float
v_float_dup(float x)61 v_float_dup (float x)
62 {
63   return (v_float){x, x, x, x};
64 }
65 #else
66 /* dummy definitions to make things compile.  */
67 typedef double v_double;
68 typedef float v_float;
69 #define v_double_len(x) 1
70 #define v_double_load(x) (x)[0]
71 #define v_double_dup(x) (x)
72 #define v_float_len(x) 1
73 #define v_float_load(x) (x)[0]
74 #define v_float_dup(x) (x)
75 
76 #endif
77 
78 #if WANT_SVE_MATH
79 #include <arm_sve.h>
80 typedef svbool_t sv_bool;
81 typedef svfloat64_t sv_double;
82 
83 #define sv_double_len() svcntd()
84 
85 static inline sv_double
sv_double_load(const double * p)86 sv_double_load (const double *p)
87 {
88   svbool_t pg = svptrue_b64();
89   return svld1(pg, p);
90 }
91 
92 static inline sv_double
sv_double_dup(double x)93 sv_double_dup (double x)
94 {
95   return svdup_n_f64(x);
96 }
97 
98 typedef svfloat32_t sv_float;
99 
100 #define sv_float_len() svcntw()
101 
102 static inline sv_float
sv_float_load(const float * p)103 sv_float_load (const float *p)
104 {
105   svbool_t pg = svptrue_b32();
106   return svld1(pg, p);
107 }
108 
109 static inline sv_float
sv_float_dup(float x)110 sv_float_dup (float x)
111 {
112   return svdup_n_f32(x);
113 }
114 #else
115 /* dummy definitions to make things compile.  */
116 #define sv_double_len(x) 1
117 #define sv_float_len(x) 1
118 #endif
119 
120 static double
dummy(double x)121 dummy (double x)
122 {
123   return x;
124 }
125 
126 static float
dummyf(float x)127 dummyf (float x)
128 {
129   return x;
130 }
131 #ifdef __vpcs
132 __vpcs static v_double
__vn_dummy(v_double x)133 __vn_dummy (v_double x)
134 {
135   return x;
136 }
137 
138 __vpcs static v_float
__vn_dummyf(v_float x)139 __vn_dummyf (v_float x)
140 {
141   return x;
142 }
143 #endif
144 #if WANT_SVE_MATH
145 static sv_double
__sv_dummy(sv_double x,sv_bool pg)146 __sv_dummy (sv_double x, sv_bool pg)
147 {
148   return x;
149 }
150 
151 static sv_float
__sv_dummyf(sv_float x,sv_bool pg)152 __sv_dummyf (sv_float x, sv_bool pg)
153 {
154   return x;
155 }
156 
157 #endif
158 
159 #include "test/mathbench_wrappers.h"
160 
161 static const struct fun
162 {
163   const char *name;
164   int prec;
165   int vec;
166   double lo;
167   double hi;
168   union
169   {
170     double (*d) (double);
171     float (*f) (float);
172 #ifdef __vpcs
173     __vpcs v_double (*vnd) (v_double);
174     __vpcs v_float (*vnf) (v_float);
175 #endif
176 #if WANT_SVE_MATH
177     sv_double (*svd) (sv_double, sv_bool);
178     sv_float (*svf) (sv_float, sv_bool);
179 #endif
180   } fun;
181 } funtab[] = {
182 #define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}},
183 #define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}},
184 #define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}},
185 #define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}},
186 #define SVD(func, lo, hi) {#func, 'd', 's', lo, hi, {.svd = func}},
187 #define SVF(func, lo, hi) {#func, 'f', 's', lo, hi, {.svf = func}},
188 D (dummy, 1.0, 2.0)
189 F (dummyf, 1.0, 2.0)
190 #ifdef __vpcs
191 VND (__vn_dummy, 1.0, 2.0)
192 VNF (__vn_dummyf, 1.0, 2.0)
193 #endif
194 #if WANT_SVE_MATH
195 SVD (__sv_dummy, 1.0, 2.0)
196 SVF (__sv_dummyf, 1.0, 2.0)
197 #endif
198 #include "test/mathbench_funcs.h"
199 {0},
200 #undef F
201 #undef D
202 #undef VNF
203 #undef VND
204 #undef SVF
205 #undef SVD
206 };
207 
208 static void
gen_linear(double lo,double hi)209 gen_linear (double lo, double hi)
210 {
211   for (int i = 0; i < N; i++)
212     A[i] = (lo * (N - i) + hi * i) / N;
213 }
214 
215 static void
genf_linear(double lo,double hi)216 genf_linear (double lo, double hi)
217 {
218   for (int i = 0; i < N; i++)
219     Af[i] = (float)(lo * (N - i) + hi * i) / N;
220 }
221 
222 static inline double
asdouble(uint64_t i)223 asdouble (uint64_t i)
224 {
225   union
226   {
227     uint64_t i;
228     double f;
229   } u = {i};
230   return u.f;
231 }
232 
233 static uint64_t seed = 0x0123456789abcdef;
234 
235 static double
frand(double lo,double hi)236 frand (double lo, double hi)
237 {
238   seed = 6364136223846793005ULL * seed + 1;
239   return lo + (hi - lo) * (asdouble (seed >> 12 | 0x3ffULL << 52) - 1.0);
240 }
241 
242 static void
gen_rand(double lo,double hi)243 gen_rand (double lo, double hi)
244 {
245   for (int i = 0; i < N; i++)
246     A[i] = frand (lo, hi);
247 }
248 
249 static void
genf_rand(double lo,double hi)250 genf_rand (double lo, double hi)
251 {
252   for (int i = 0; i < N; i++)
253     Af[i] = (float)frand (lo, hi);
254 }
255 
256 static void
gen_trace(int index)257 gen_trace (int index)
258 {
259   for (int i = 0; i < N; i++)
260     A[i] = Trace[index + i];
261 }
262 
263 static void
genf_trace(int index)264 genf_trace (int index)
265 {
266   for (int i = 0; i < N; i++)
267     Af[i] = (float)Trace[index + i];
268 }
269 
270 static void
run_thruput(double f (double))271 run_thruput (double f (double))
272 {
273   for (int i = 0; i < N; i++)
274     f (A[i]);
275 }
276 
277 static void
runf_thruput(float f (float))278 runf_thruput (float f (float))
279 {
280   for (int i = 0; i < N; i++)
281     f (Af[i]);
282 }
283 
284 volatile double zero = 0;
285 
286 static void
run_latency(double f (double))287 run_latency (double f (double))
288 {
289   double z = zero;
290   double prev = z;
291   for (int i = 0; i < N; i++)
292     prev = f (A[i] + prev * z);
293 }
294 
295 static void
runf_latency(float f (float))296 runf_latency (float f (float))
297 {
298   float z = (float)zero;
299   float prev = z;
300   for (int i = 0; i < N; i++)
301     prev = f (Af[i] + prev * z);
302 }
303 
304 #ifdef __vpcs
305 static void
run_vn_thruput(__vpcs v_double f (v_double))306 run_vn_thruput (__vpcs v_double f (v_double))
307 {
308   for (int i = 0; i < N; i += v_double_len ())
309     f (v_double_load (A+i));
310 }
311 
312 static void
runf_vn_thruput(__vpcs v_float f (v_float))313 runf_vn_thruput (__vpcs v_float f (v_float))
314 {
315   for (int i = 0; i < N; i += v_float_len ())
316     f (v_float_load (Af+i));
317 }
318 
319 static void
run_vn_latency(__vpcs v_double f (v_double))320 run_vn_latency (__vpcs v_double f (v_double))
321 {
322   volatile uint64x2_t vsel = (uint64x2_t) { 0, 0 };
323   uint64x2_t sel = vsel;
324   v_double prev = v_double_dup (0);
325   for (int i = 0; i < N; i += v_double_len ())
326     prev = f (vbslq_f64 (sel, prev, v_double_load (A+i)));
327 }
328 
329 static void
runf_vn_latency(__vpcs v_float f (v_float))330 runf_vn_latency (__vpcs v_float f (v_float))
331 {
332   volatile uint32x4_t vsel = (uint32x4_t) { 0, 0, 0, 0 };
333   uint32x4_t sel = vsel;
334   v_float prev = v_float_dup (0);
335   for (int i = 0; i < N; i += v_float_len ())
336     prev = f (vbslq_f32 (sel, prev, v_float_load (Af+i)));
337 }
338 #endif
339 
340 #if WANT_SVE_MATH
341 static void
run_sv_thruput(sv_double f (sv_double,sv_bool))342 run_sv_thruput (sv_double f (sv_double, sv_bool))
343 {
344   for (int i = 0; i < N; i += sv_double_len ())
345     f (sv_double_load (A+i), svptrue_b64 ());
346 }
347 
348 static void
runf_sv_thruput(sv_float f (sv_float,sv_bool))349 runf_sv_thruput (sv_float f (sv_float, sv_bool))
350 {
351   for (int i = 0; i < N; i += sv_float_len ())
352     f (sv_float_load (Af+i), svptrue_b32 ());
353 }
354 
355 static void
run_sv_latency(sv_double f (sv_double,sv_bool))356 run_sv_latency (sv_double f (sv_double, sv_bool))
357 {
358   volatile sv_bool vsel = svptrue_b64 ();
359   sv_bool sel = vsel;
360   sv_double prev = sv_double_dup (0);
361   for (int i = 0; i < N; i += sv_double_len ())
362     prev = f (svsel_f64 (sel, sv_double_load (A+i), prev), svptrue_b64 ());
363 }
364 
365 static void
runf_sv_latency(sv_float f (sv_float,sv_bool))366 runf_sv_latency (sv_float f (sv_float, sv_bool))
367 {
368   volatile sv_bool vsel = svptrue_b32 ();
369   sv_bool sel = vsel;
370   sv_float prev = sv_float_dup (0);
371   for (int i = 0; i < N; i += sv_float_len ())
372     prev = f (svsel_f32 (sel, sv_float_load (Af+i), prev), svptrue_b32 ());
373 }
374 #endif
375 
376 static uint64_t
tic(void)377 tic (void)
378 {
379   struct timespec ts;
380   if (clock_gettime (CLOCK_REALTIME, &ts))
381     abort ();
382   return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
383 }
384 
385 #define TIMEIT(run, f) do { \
386   dt = -1; \
387   run (f); /* Warm up.  */ \
388   for (int j = 0; j < measurecount; j++) \
389     { \
390       uint64_t t0 = tic (); \
391       for (int i = 0; i < itercount; i++) \
392 	run (f); \
393       uint64_t t1 = tic (); \
394       if (t1 - t0 < dt) \
395 	dt = t1 - t0; \
396     } \
397 } while (0)
398 
399 static void
bench1(const struct fun * f,int type,double lo,double hi)400 bench1 (const struct fun *f, int type, double lo, double hi)
401 {
402   uint64_t dt = 0;
403   uint64_t ns100;
404   const char *s = type == 't' ? "rthruput" : "latency";
405   int vlen = 1;
406 
407   if (f->vec == 'n')
408     vlen = f->prec == 'd' ? v_double_len() : v_float_len();
409   else if (f->vec == 's')
410     vlen = f->prec == 'd' ? sv_double_len() : sv_float_len();
411 
412   if (f->prec == 'd' && type == 't' && f->vec == 0)
413     TIMEIT (run_thruput, f->fun.d);
414   else if (f->prec == 'd' && type == 'l' && f->vec == 0)
415     TIMEIT (run_latency, f->fun.d);
416   else if (f->prec == 'f' && type == 't' && f->vec == 0)
417     TIMEIT (runf_thruput, f->fun.f);
418   else if (f->prec == 'f' && type == 'l' && f->vec == 0)
419     TIMEIT (runf_latency, f->fun.f);
420 #ifdef __vpcs
421   else if (f->prec == 'd' && type == 't' && f->vec == 'n')
422     TIMEIT (run_vn_thruput, f->fun.vnd);
423   else if (f->prec == 'd' && type == 'l' && f->vec == 'n')
424     TIMEIT (run_vn_latency, f->fun.vnd);
425   else if (f->prec == 'f' && type == 't' && f->vec == 'n')
426     TIMEIT (runf_vn_thruput, f->fun.vnf);
427   else if (f->prec == 'f' && type == 'l' && f->vec == 'n')
428     TIMEIT (runf_vn_latency, f->fun.vnf);
429 #endif
430 #if WANT_SVE_MATH
431   else if (f->prec == 'd' && type == 't' && f->vec == 's')
432     TIMEIT (run_sv_thruput, f->fun.svd);
433   else if (f->prec == 'd' && type == 'l' && f->vec == 's')
434     TIMEIT (run_sv_latency, f->fun.svd);
435   else if (f->prec == 'f' && type == 't' && f->vec == 's')
436     TIMEIT (runf_sv_thruput, f->fun.svf);
437   else if (f->prec == 'f' && type == 'l' && f->vec == 's')
438     TIMEIT (runf_sv_latency, f->fun.svf);
439 #endif
440 
441   if (type == 't')
442     {
443       ns100 = (100 * dt + itercount * N / 2) / (itercount * N);
444       printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g] vlen %d\n",
445 	      f->name, s,
446 	      (unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
447 	      (unsigned long long) dt, lo, hi, vlen);
448     }
449   else if (type == 'l')
450     {
451       ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen);
452       printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g] vlen %d\n",
453 	      f->name, s,
454 	      (unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
455 	      (unsigned long long) dt, lo, hi, vlen);
456     }
457   fflush (stdout);
458 }
459 
460 static void
bench(const struct fun * f,double lo,double hi,int type,int gen)461 bench (const struct fun *f, double lo, double hi, int type, int gen)
462 {
463   if (f->prec == 'd' && gen == 'r')
464     gen_rand (lo, hi);
465   else if (f->prec == 'd' && gen == 'l')
466     gen_linear (lo, hi);
467   else if (f->prec == 'd' && gen == 't')
468     gen_trace (0);
469   else if (f->prec == 'f' && gen == 'r')
470     genf_rand (lo, hi);
471   else if (f->prec == 'f' && gen == 'l')
472     genf_linear (lo, hi);
473   else if (f->prec == 'f' && gen == 't')
474     genf_trace (0);
475 
476   if (gen == 't')
477     hi = trace_size / N;
478 
479   if (type == 'b' || type == 't')
480     bench1 (f, 't', lo, hi);
481 
482   if (type == 'b' || type == 'l')
483     bench1 (f, 'l', lo, hi);
484 
485   for (int i = N; i < trace_size; i += N)
486     {
487       if (f->prec == 'd')
488 	gen_trace (i);
489       else
490 	genf_trace (i);
491 
492       lo = i / N;
493       if (type == 'b' || type == 't')
494 	bench1 (f, 't', lo, hi);
495 
496       if (type == 'b' || type == 'l')
497 	bench1 (f, 'l', lo, hi);
498     }
499 }
500 
501 static void
readtrace(const char * name)502 readtrace (const char *name)
503 {
504 	int n = 0;
505 	FILE *f = strcmp (name, "-") == 0 ? stdin : fopen (name, "r");
506 	if (!f)
507 	  {
508 	    printf ("openning \"%s\" failed: %m\n", name);
509 	    exit (1);
510 	  }
511 	for (;;)
512 	  {
513 	    if (n >= trace_size)
514 	      {
515 		trace_size += N;
516 		Trace = realloc (Trace, trace_size * sizeof (Trace[0]));
517 		if (Trace == NULL)
518 		  {
519 		    printf ("out of memory\n");
520 		    exit (1);
521 		  }
522 	      }
523 	    if (fscanf (f, "%lf", Trace + n) != 1)
524 	      break;
525 	    n++;
526 	  }
527 	if (ferror (f) || n == 0)
528 	  {
529 	    printf ("reading \"%s\" failed: %m\n", name);
530 	    exit (1);
531 	  }
532 	fclose (f);
533 	if (n % N == 0)
534 	  trace_size = n;
535 	for (int i = 0; n < trace_size; n++, i++)
536 	  Trace[n] = Trace[i];
537 }
538 
539 static void
usage(void)540 usage (void)
541 {
542   printf ("usage: ./mathbench [-g rand|linear|trace] [-t latency|thruput|both] "
543 	  "[-i low high] [-f tracefile] [-m measurements] [-c iterations] func "
544 	  "[func2 ..]\n");
545   printf ("func:\n");
546   printf ("%7s [run all benchmarks]\n", "all");
547   for (const struct fun *f = funtab; f->name; f++)
548     printf ("%7s [low: %g high: %g]\n", f->name, f->lo, f->hi);
549   exit (1);
550 }
551 
552 int
main(int argc,char * argv[])553 main (int argc, char *argv[])
554 {
555   int usergen = 0, gen = 'r', type = 'b', all = 0;
556   double lo = 0, hi = 0;
557   const char *tracefile = "-";
558 
559   argv++;
560   argc--;
561   for (;;)
562     {
563       if (argc <= 0)
564 	usage ();
565       if (argv[0][0] != '-')
566 	break;
567       else if (argc >= 3 && strcmp (argv[0], "-i") == 0)
568 	{
569 	  usergen = 1;
570 	  lo = strtod (argv[1], 0);
571 	  hi = strtod (argv[2], 0);
572 	  argv += 3;
573 	  argc -= 3;
574 	}
575       else if (argc >= 2 && strcmp (argv[0], "-m") == 0)
576 	{
577 	  measurecount = strtol (argv[1], 0, 0);
578 	  argv += 2;
579 	  argc -= 2;
580 	}
581       else if (argc >= 2 && strcmp (argv[0], "-c") == 0)
582 	{
583 	  itercount = strtol (argv[1], 0, 0);
584 	  argv += 2;
585 	  argc -= 2;
586 	}
587       else if (argc >= 2 && strcmp (argv[0], "-g") == 0)
588 	{
589 	  gen = argv[1][0];
590 	  if (strchr ("rlt", gen) == 0)
591 	    usage ();
592 	  argv += 2;
593 	  argc -= 2;
594 	}
595       else if (argc >= 2 && strcmp (argv[0], "-f") == 0)
596 	{
597 	  gen = 't';  /* -f implies -g trace.  */
598 	  tracefile = argv[1];
599 	  argv += 2;
600 	  argc -= 2;
601 	}
602       else if (argc >= 2 && strcmp (argv[0], "-t") == 0)
603 	{
604 	  type = argv[1][0];
605 	  if (strchr ("ltb", type) == 0)
606 	    usage ();
607 	  argv += 2;
608 	  argc -= 2;
609 	}
610       else
611 	usage ();
612     }
613   if (gen == 't')
614     {
615       readtrace (tracefile);
616       lo = hi = 0;
617       usergen = 1;
618     }
619   while (argc > 0)
620     {
621       int found = 0;
622       all = strcmp (argv[0], "all") == 0;
623       for (const struct fun *f = funtab; f->name; f++)
624 	if (all || strcmp (argv[0], f->name) == 0)
625 	  {
626 	    found = 1;
627 	    if (!usergen)
628 	      {
629 		lo = f->lo;
630 		hi = f->hi;
631 	      }
632 	    bench (f, lo, hi, type, gen);
633 	    if (usergen && !all)
634 	      break;
635 	  }
636       if (!found)
637 	printf ("unknown function: %s\n", argv[0]);
638       argv++;
639       argc--;
640     }
641   return 0;
642 }
643