1 /*
2  * Microbenchmark for math functions.
3  *
4  * Copyright (c) 2018-2022, Arm Limited.
5  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6  */
7 
8 #undef _GNU_SOURCE
9 #define _GNU_SOURCE 1
10 #include <stdint.h>
11 #include <stdlib.h>
12 #include <stdio.h>
13 #include <string.h>
14 #include <time.h>
15 #include <math.h>
16 #include "mathlib.h"
17 
18 #ifndef WANT_VMATH
19 /* Enable the build of vector math code.  */
20 # define WANT_VMATH 1
21 #endif
22 
23 /* Number of measurements, best result is reported.  */
24 #define MEASURE 60
25 /* Array size.  */
26 #define N 8000
27 /* Iterations over the array.  */
28 #define ITER 125
29 
30 static double *Trace;
31 static size_t trace_size;
32 static double A[N];
33 static float Af[N];
34 static long measurecount = MEASURE;
35 static long itercount = ITER;
36 
37 #if __aarch64__ && WANT_VMATH
38 typedef __f64x2_t v_double;
39 
40 #define v_double_len() 2
41 
42 static inline v_double
43 v_double_load (const double *p)
44 {
45   return (v_double){p[0], p[1]};
46 }
47 
48 static inline v_double
49 v_double_dup (double x)
50 {
51   return (v_double){x, x};
52 }
53 
54 typedef __f32x4_t v_float;
55 
56 #define v_float_len() 4
57 
58 static inline v_float
59 v_float_load (const float *p)
60 {
61   return (v_float){p[0], p[1], p[2], p[3]};
62 }
63 
64 static inline v_float
65 v_float_dup (float x)
66 {
67   return (v_float){x, x, x, x};
68 }
69 #if WANT_SVE_MATH
70 #include <arm_sve.h>
71 typedef svbool_t sv_bool;
72 typedef svfloat64_t sv_double;
73 
74 #define sv_double_len() svcntd()
75 
76 static inline sv_double
77 sv_double_load (const double *p)
78 {
79   svbool_t pg = svptrue_b64();
80   return svld1(pg, p);
81 }
82 
83 static inline sv_double
84 sv_double_dup (double x)
85 {
86   return svdup_n_f64(x);
87 }
88 
89 typedef svfloat32_t sv_float;
90 
91 #define sv_float_len() svcntw()
92 
93 static inline sv_float
94 sv_float_load (const float *p)
95 {
96   svbool_t pg = svptrue_b32();
97   return svld1(pg, p);
98 }
99 
100 static inline sv_float
101 sv_float_dup (float x)
102 {
103   return svdup_n_f32(x);
104 }
105 #endif
106 #else
107 /* dummy definitions to make things compile.  */
108 typedef double v_double;
109 typedef float v_float;
110 #define v_double_len(x) 1
111 #define v_double_load(x) (x)[0]
112 #define v_double_dup(x) (x)
113 #define v_float_len(x) 1
114 #define v_float_load(x) (x)[0]
115 #define v_float_dup(x) (x)
116 #endif
117 
118 static double
119 dummy (double x)
120 {
121   return x;
122 }
123 
124 static float
125 dummyf (float x)
126 {
127   return x;
128 }
129 #if WANT_VMATH
130 #if __aarch64__
131 static v_double
132 __v_dummy (v_double x)
133 {
134   return x;
135 }
136 
137 static v_float
138 __v_dummyf (v_float x)
139 {
140   return x;
141 }
142 
143 #ifdef __vpcs
144 __vpcs static v_double
145 __vn_dummy (v_double x)
146 {
147   return x;
148 }
149 
150 __vpcs static v_float
151 __vn_dummyf (v_float x)
152 {
153   return x;
154 }
155 #endif
156 #if WANT_SVE_MATH
157 static sv_double
158 __sv_dummy (sv_double x, sv_bool pg)
159 {
160   return x;
161 }
162 
163 static sv_float
164 __sv_dummyf (sv_float x, sv_bool pg)
165 {
166   return x;
167 }
168 
169 #endif
170 #endif
171 #endif
172 
173 #include "test/mathbench_wrappers.h"
174 
175 static const struct fun
176 {
177   const char *name;
178   int prec;
179   int vec;
180   double lo;
181   double hi;
182   union
183   {
184     double (*d) (double);
185     float (*f) (float);
186     v_double (*vd) (v_double);
187     v_float (*vf) (v_float);
188 #ifdef __vpcs
189     __vpcs v_double (*vnd) (v_double);
190     __vpcs v_float (*vnf) (v_float);
191 #endif
192 #if WANT_SVE_MATH
193     sv_double (*svd) (sv_double, sv_bool);
194     sv_float (*svf) (sv_float, sv_bool);
195 #endif
196   } fun;
197 } funtab[] = {
198 #define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}},
199 #define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}},
200 #define VD(func, lo, hi) {#func, 'd', 'v', lo, hi, {.vd = func}},
201 #define VF(func, lo, hi) {#func, 'f', 'v', lo, hi, {.vf = func}},
202 #define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}},
203 #define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}},
204 #define SVD(func, lo, hi) {#func, 'd', 's', lo, hi, {.svd = func}},
205 #define SVF(func, lo, hi) {#func, 'f', 's', lo, hi, {.svf = func}},
206 D (dummy, 1.0, 2.0)
207 F (dummyf, 1.0, 2.0)
208 #if WANT_VMATH
209 #if __aarch64__
210 VD (__v_dummy, 1.0, 2.0)
211 VF (__v_dummyf, 1.0, 2.0)
212 #ifdef __vpcs
213 VND (__vn_dummy, 1.0, 2.0)
214 VNF (__vn_dummyf, 1.0, 2.0)
215 #endif
216 #if WANT_SVE_MATH
217 SVD (__sv_dummy, 1.0, 2.0)
218 SVF (__sv_dummyf, 1.0, 2.0)
219 #endif
220 #endif
221 #endif
222 #include "test/mathbench_funcs.h"
223 {0},
224 #undef F
225 #undef D
226 #undef VF
227 #undef VD
228 #undef VNF
229 #undef VND
230 #undef SVF
231 #undef SVD
232 };
233 
234 static void
235 gen_linear (double lo, double hi)
236 {
237   for (int i = 0; i < N; i++)
238     A[i] = (lo * (N - i) + hi * i) / N;
239 }
240 
241 static void
242 genf_linear (double lo, double hi)
243 {
244   for (int i = 0; i < N; i++)
245     Af[i] = (float)(lo * (N - i) + hi * i) / N;
246 }
247 
248 static inline double
249 asdouble (uint64_t i)
250 {
251   union
252   {
253     uint64_t i;
254     double f;
255   } u = {i};
256   return u.f;
257 }
258 
259 static uint64_t seed = 0x0123456789abcdef;
260 
261 static double
262 frand (double lo, double hi)
263 {
264   seed = 6364136223846793005ULL * seed + 1;
265   return lo + (hi - lo) * (asdouble (seed >> 12 | 0x3ffULL << 52) - 1.0);
266 }
267 
268 static void
269 gen_rand (double lo, double hi)
270 {
271   for (int i = 0; i < N; i++)
272     A[i] = frand (lo, hi);
273 }
274 
275 static void
276 genf_rand (double lo, double hi)
277 {
278   for (int i = 0; i < N; i++)
279     Af[i] = (float)frand (lo, hi);
280 }
281 
282 static void
283 gen_trace (int index)
284 {
285   for (int i = 0; i < N; i++)
286     A[i] = Trace[index + i];
287 }
288 
289 static void
290 genf_trace (int index)
291 {
292   for (int i = 0; i < N; i++)
293     Af[i] = (float)Trace[index + i];
294 }
295 
296 static void
297 run_thruput (double f (double))
298 {
299   for (int i = 0; i < N; i++)
300     f (A[i]);
301 }
302 
303 static void
304 runf_thruput (float f (float))
305 {
306   for (int i = 0; i < N; i++)
307     f (Af[i]);
308 }
309 
310 volatile double zero = 0;
311 
312 static void
313 run_latency (double f (double))
314 {
315   double z = zero;
316   double prev = z;
317   for (int i = 0; i < N; i++)
318     prev = f (A[i] + prev * z);
319 }
320 
321 static void
322 runf_latency (float f (float))
323 {
324   float z = (float)zero;
325   float prev = z;
326   for (int i = 0; i < N; i++)
327     prev = f (Af[i] + prev * z);
328 }
329 
330 static void
331 run_v_thruput (v_double f (v_double))
332 {
333   for (int i = 0; i < N; i += v_double_len ())
334     f (v_double_load (A+i));
335 }
336 
337 static void
338 runf_v_thruput (v_float f (v_float))
339 {
340   for (int i = 0; i < N; i += v_float_len ())
341     f (v_float_load (Af+i));
342 }
343 
344 static void
345 run_v_latency (v_double f (v_double))
346 {
347   v_double z = v_double_dup (zero);
348   v_double prev = z;
349   for (int i = 0; i < N; i += v_double_len ())
350     prev = f (v_double_load (A+i) + prev * z);
351 }
352 
353 static void
354 runf_v_latency (v_float f (v_float))
355 {
356   v_float z = v_float_dup (zero);
357   v_float prev = z;
358   for (int i = 0; i < N; i += v_float_len ())
359     prev = f (v_float_load (Af+i) + prev * z);
360 }
361 
362 #ifdef __vpcs
363 static void
364 run_vn_thruput (__vpcs v_double f (v_double))
365 {
366   for (int i = 0; i < N; i += v_double_len ())
367     f (v_double_load (A+i));
368 }
369 
370 static void
371 runf_vn_thruput (__vpcs v_float f (v_float))
372 {
373   for (int i = 0; i < N; i += v_float_len ())
374     f (v_float_load (Af+i));
375 }
376 
377 static void
378 run_vn_latency (__vpcs v_double f (v_double))
379 {
380   v_double z = v_double_dup (zero);
381   v_double prev = z;
382   for (int i = 0; i < N; i += v_double_len ())
383     prev = f (v_double_load (A+i) + prev * z);
384 }
385 
386 static void
387 runf_vn_latency (__vpcs v_float f (v_float))
388 {
389   v_float z = v_float_dup (zero);
390   v_float prev = z;
391   for (int i = 0; i < N; i += v_float_len ())
392     prev = f (v_float_load (Af+i) + prev * z);
393 }
394 #endif
395 
396 #if WANT_SVE_MATH
397 static void
398 run_sv_thruput (sv_double f (sv_double, sv_bool))
399 {
400   for (int i = 0; i < N; i += sv_double_len ())
401     f (sv_double_load (A+i), svptrue_b64 ());
402 }
403 
404 static void
405 runf_sv_thruput (sv_float f (sv_float, sv_bool))
406 {
407   for (int i = 0; i < N; i += sv_float_len ())
408     f (sv_float_load (Af+i), svptrue_b32 ());
409 }
410 
411 static void
412 run_sv_latency (sv_double f (sv_double, sv_bool))
413 {
414   sv_double z = sv_double_dup (zero);
415   sv_double prev = z;
416   for (int i = 0; i < N; i += sv_double_len ())
417     prev = f (svmad_f64_x (svptrue_b64 (), prev, z, sv_double_load (A+i)), svptrue_b64 ());
418 }
419 
420 static void
421 runf_sv_latency (sv_float f (sv_float, sv_bool))
422 {
423   sv_float z = sv_float_dup (zero);
424   sv_float prev = z;
425   for (int i = 0; i < N; i += sv_float_len ())
426     prev = f (svmad_f32_x (svptrue_b32 (), prev, z, sv_float_load (Af+i)), svptrue_b32 ());
427 }
428 #endif
429 
430 static uint64_t
431 tic (void)
432 {
433   struct timespec ts;
434   if (clock_gettime (CLOCK_REALTIME, &ts))
435     abort ();
436   return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
437 }
438 
439 #define TIMEIT(run, f) do { \
440   dt = -1; \
441   run (f); /* Warm up.  */ \
442   for (int j = 0; j < measurecount; j++) \
443     { \
444       uint64_t t0 = tic (); \
445       for (int i = 0; i < itercount; i++) \
446 	run (f); \
447       uint64_t t1 = tic (); \
448       if (t1 - t0 < dt) \
449 	dt = t1 - t0; \
450     } \
451 } while (0)
452 
453 static void
454 bench1 (const struct fun *f, int type, double lo, double hi)
455 {
456   uint64_t dt = 0;
457   uint64_t ns100;
458   const char *s = type == 't' ? "rthruput" : "latency";
459   int vlen = 1;
460 
461   if (f->vec && f->prec == 'd')
462     vlen = v_double_len();
463   else if (f->vec && f->prec == 'f')
464     vlen = v_float_len();
465 
466   if (f->prec == 'd' && type == 't' && f->vec == 0)
467     TIMEIT (run_thruput, f->fun.d);
468   else if (f->prec == 'd' && type == 'l' && f->vec == 0)
469     TIMEIT (run_latency, f->fun.d);
470   else if (f->prec == 'f' && type == 't' && f->vec == 0)
471     TIMEIT (runf_thruput, f->fun.f);
472   else if (f->prec == 'f' && type == 'l' && f->vec == 0)
473     TIMEIT (runf_latency, f->fun.f);
474   else if (f->prec == 'd' && type == 't' && f->vec == 'v')
475     TIMEIT (run_v_thruput, f->fun.vd);
476   else if (f->prec == 'd' && type == 'l' && f->vec == 'v')
477     TIMEIT (run_v_latency, f->fun.vd);
478   else if (f->prec == 'f' && type == 't' && f->vec == 'v')
479     TIMEIT (runf_v_thruput, f->fun.vf);
480   else if (f->prec == 'f' && type == 'l' && f->vec == 'v')
481     TIMEIT (runf_v_latency, f->fun.vf);
482 #ifdef __vpcs
483   else if (f->prec == 'd' && type == 't' && f->vec == 'n')
484     TIMEIT (run_vn_thruput, f->fun.vnd);
485   else if (f->prec == 'd' && type == 'l' && f->vec == 'n')
486     TIMEIT (run_vn_latency, f->fun.vnd);
487   else if (f->prec == 'f' && type == 't' && f->vec == 'n')
488     TIMEIT (runf_vn_thruput, f->fun.vnf);
489   else if (f->prec == 'f' && type == 'l' && f->vec == 'n')
490     TIMEIT (runf_vn_latency, f->fun.vnf);
491 #endif
492 #if WANT_SVE_MATH
493   else if (f->prec == 'd' && type == 't' && f->vec == 's')
494     TIMEIT (run_sv_thruput, f->fun.svd);
495   else if (f->prec == 'd' && type == 'l' && f->vec == 's')
496     TIMEIT (run_sv_latency, f->fun.svd);
497   else if (f->prec == 'f' && type == 't' && f->vec == 's')
498     TIMEIT (runf_sv_thruput, f->fun.svf);
499   else if (f->prec == 'f' && type == 'l' && f->vec == 's')
500     TIMEIT (runf_sv_latency, f->fun.svf);
501 #endif
502 
503   if (type == 't')
504     {
505       ns100 = (100 * dt + itercount * N / 2) / (itercount * N);
506       printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g]\n", f->name, s,
507 	      (unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
508 	      (unsigned long long) dt, lo, hi);
509     }
510   else if (type == 'l')
511     {
512       ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen);
513       printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g]\n", f->name, s,
514 	      (unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
515 	      (unsigned long long) dt, lo, hi);
516     }
517   fflush (stdout);
518 }
519 
520 static void
521 bench (const struct fun *f, double lo, double hi, int type, int gen)
522 {
523   if (f->prec == 'd' && gen == 'r')
524     gen_rand (lo, hi);
525   else if (f->prec == 'd' && gen == 'l')
526     gen_linear (lo, hi);
527   else if (f->prec == 'd' && gen == 't')
528     gen_trace (0);
529   else if (f->prec == 'f' && gen == 'r')
530     genf_rand (lo, hi);
531   else if (f->prec == 'f' && gen == 'l')
532     genf_linear (lo, hi);
533   else if (f->prec == 'f' && gen == 't')
534     genf_trace (0);
535 
536   if (gen == 't')
537     hi = trace_size / N;
538 
539   if (type == 'b' || type == 't')
540     bench1 (f, 't', lo, hi);
541 
542   if (type == 'b' || type == 'l')
543     bench1 (f, 'l', lo, hi);
544 
545   for (int i = N; i < trace_size; i += N)
546     {
547       if (f->prec == 'd')
548 	gen_trace (i);
549       else
550 	genf_trace (i);
551 
552       lo = i / N;
553       if (type == 'b' || type == 't')
554 	bench1 (f, 't', lo, hi);
555 
556       if (type == 'b' || type == 'l')
557 	bench1 (f, 'l', lo, hi);
558     }
559 }
560 
561 static void
562 readtrace (const char *name)
563 {
564 	int n = 0;
565 	FILE *f = strcmp (name, "-") == 0 ? stdin : fopen (name, "r");
566 	if (!f)
567 	  {
568 	    printf ("openning \"%s\" failed: %m\n", name);
569 	    exit (1);
570 	  }
571 	for (;;)
572 	  {
573 	    if (n >= trace_size)
574 	      {
575 		trace_size += N;
576 		Trace = realloc (Trace, trace_size * sizeof (Trace[0]));
577 		if (Trace == NULL)
578 		  {
579 		    printf ("out of memory\n");
580 		    exit (1);
581 		  }
582 	      }
583 	    if (fscanf (f, "%lf", Trace + n) != 1)
584 	      break;
585 	    n++;
586 	  }
587 	if (ferror (f) || n == 0)
588 	  {
589 	    printf ("reading \"%s\" failed: %m\n", name);
590 	    exit (1);
591 	  }
592 	fclose (f);
593 	if (n % N == 0)
594 	  trace_size = n;
595 	for (int i = 0; n < trace_size; n++, i++)
596 	  Trace[n] = Trace[i];
597 }
598 
599 static void
600 usage (void)
601 {
602   printf ("usage: ./mathbench [-g rand|linear|trace] [-t latency|thruput|both] "
603 	  "[-i low high] [-f tracefile] [-m measurements] [-c iterations] func "
604 	  "[func2 ..]\n");
605   printf ("func:\n");
606   printf ("%7s [run all benchmarks]\n", "all");
607   for (const struct fun *f = funtab; f->name; f++)
608     printf ("%7s [low: %g high: %g]\n", f->name, f->lo, f->hi);
609   exit (1);
610 }
611 
612 int
613 main (int argc, char *argv[])
614 {
615   int usergen = 0, gen = 'r', type = 'b', all = 0;
616   double lo = 0, hi = 0;
617   const char *tracefile = "-";
618 
619   argv++;
620   argc--;
621   for (;;)
622     {
623       if (argc <= 0)
624 	usage ();
625       if (argv[0][0] != '-')
626 	break;
627       else if (argc >= 3 && strcmp (argv[0], "-i") == 0)
628 	{
629 	  usergen = 1;
630 	  lo = strtod (argv[1], 0);
631 	  hi = strtod (argv[2], 0);
632 	  argv += 3;
633 	  argc -= 3;
634 	}
635       else if (argc >= 2 && strcmp (argv[0], "-m") == 0)
636 	{
637 	  measurecount = strtol (argv[1], 0, 0);
638 	  argv += 2;
639 	  argc -= 2;
640 	}
641       else if (argc >= 2 && strcmp (argv[0], "-c") == 0)
642 	{
643 	  itercount = strtol (argv[1], 0, 0);
644 	  argv += 2;
645 	  argc -= 2;
646 	}
647       else if (argc >= 2 && strcmp (argv[0], "-g") == 0)
648 	{
649 	  gen = argv[1][0];
650 	  if (strchr ("rlt", gen) == 0)
651 	    usage ();
652 	  argv += 2;
653 	  argc -= 2;
654 	}
655       else if (argc >= 2 && strcmp (argv[0], "-f") == 0)
656 	{
657 	  gen = 't';  /* -f implies -g trace.  */
658 	  tracefile = argv[1];
659 	  argv += 2;
660 	  argc -= 2;
661 	}
662       else if (argc >= 2 && strcmp (argv[0], "-t") == 0)
663 	{
664 	  type = argv[1][0];
665 	  if (strchr ("ltb", type) == 0)
666 	    usage ();
667 	  argv += 2;
668 	  argc -= 2;
669 	}
670       else
671 	usage ();
672     }
673   if (gen == 't')
674     {
675       readtrace (tracefile);
676       lo = hi = 0;
677       usergen = 1;
678     }
679   while (argc > 0)
680     {
681       int found = 0;
682       all = strcmp (argv[0], "all") == 0;
683       for (const struct fun *f = funtab; f->name; f++)
684 	if (all || strcmp (argv[0], f->name) == 0)
685 	  {
686 	    found = 1;
687 	    if (!usergen)
688 	      {
689 		lo = f->lo;
690 		hi = f->hi;
691 	      }
692 	    bench (f, lo, hi, type, gen);
693 	    if (usergen && !all)
694 	      break;
695 	  }
696       if (!found)
697 	printf ("unknown function: %s\n", argv[0]);
698       argv++;
699       argc--;
700     }
701   return 0;
702 }
703