1 /*
2 * fp-bench.c - A collection of simple floating point microbenchmarks.
3 *
4 * Copyright (C) 2018, Emilio G. Cota <cota@braap.org>
5 *
6 * License: GNU GPL, version 2 or later.
7 * See the COPYING file in the top-level directory.
8 */
9 #ifndef HW_POISON_H
10 #error Must define HW_POISON_H to work around TARGET_* poisoning
11 #endif
12
13 #include "qemu/osdep.h"
14 #include <math.h>
15 #include <fenv.h>
16 #include "qemu/timer.h"
17 #include "qemu/int128.h"
18 #include "fpu/softfloat.h"
19
20 /* amortize the computation of random inputs */
21 #define OPS_PER_ITER 50000
22
23 #define MAX_OPERANDS 3
24
25 #define SEED_A 0xdeadfacedeadface
26 #define SEED_B 0xbadc0feebadc0fee
27 #define SEED_C 0xbeefdeadbeefdead
28
29 enum op {
30 OP_ADD,
31 OP_SUB,
32 OP_MUL,
33 OP_DIV,
34 OP_FMA,
35 OP_SQRT,
36 OP_CMP,
37 OP_MAX_NR,
38 };
39
40 static const char * const op_names[] = {
41 [OP_ADD] = "add",
42 [OP_SUB] = "sub",
43 [OP_MUL] = "mul",
44 [OP_DIV] = "div",
45 [OP_FMA] = "mulAdd",
46 [OP_SQRT] = "sqrt",
47 [OP_CMP] = "cmp",
48 [OP_MAX_NR] = NULL,
49 };
50
51 enum precision {
52 PREC_SINGLE,
53 PREC_DOUBLE,
54 PREC_QUAD,
55 PREC_FLOAT32,
56 PREC_FLOAT64,
57 PREC_FLOAT128,
58 PREC_MAX_NR,
59 };
60
61 enum rounding {
62 ROUND_EVEN,
63 ROUND_ZERO,
64 ROUND_DOWN,
65 ROUND_UP,
66 ROUND_TIEAWAY,
67 N_ROUND_MODES,
68 };
69
70 static const char * const round_names[] = {
71 [ROUND_EVEN] = "even",
72 [ROUND_ZERO] = "zero",
73 [ROUND_DOWN] = "down",
74 [ROUND_UP] = "up",
75 [ROUND_TIEAWAY] = "tieaway",
76 };
77
78 enum tester {
79 TESTER_SOFT,
80 TESTER_HOST,
81 TESTER_MAX_NR,
82 };
83
84 static const char * const tester_names[] = {
85 [TESTER_SOFT] = "soft",
86 [TESTER_HOST] = "host",
87 [TESTER_MAX_NR] = NULL,
88 };
89
90 union fp {
91 float f;
92 double d;
93 float32 f32;
94 float64 f64;
95 float128 f128;
96 uint64_t u64;
97 };
98
99 struct op_state;
100
101 typedef float (*float_func_t)(const struct op_state *s);
102 typedef double (*double_func_t)(const struct op_state *s);
103
104 union fp_func {
105 float_func_t float_func;
106 double_func_t double_func;
107 };
108
109 typedef void (*bench_func_t)(void);
110
111 struct op_desc {
112 const char * const name;
113 };
114
115 #define DEFAULT_DURATION_SECS 1
116
117 static uint64_t random_ops[MAX_OPERANDS] = {
118 SEED_A, SEED_B, SEED_C,
119 };
120
121 static float128 random_quad_ops[MAX_OPERANDS] = {
122 {SEED_A, SEED_B}, {SEED_B, SEED_C}, {SEED_C, SEED_A},
123 };
124 static float_status soft_status;
125 static enum precision precision;
126 static enum op operation;
127 static enum tester tester;
128 static uint64_t n_completed_ops;
129 static unsigned int duration = DEFAULT_DURATION_SECS;
130 static int64_t ns_elapsed;
131 /* disable optimizations with volatile */
132 static volatile union fp res;
133
134 /*
135 * From: https://en.wikipedia.org/wiki/Xorshift
136 * This is faster than rand_r(), and gives us a wider range (RAND_MAX is only
137 * guaranteed to be >= INT_MAX).
138 */
xorshift64star(uint64_t x)139 static uint64_t xorshift64star(uint64_t x)
140 {
141 x ^= x >> 12; /* a */
142 x ^= x << 25; /* b */
143 x ^= x >> 27; /* c */
144 return x * UINT64_C(2685821657736338717);
145 }
146
update_random_ops(int n_ops,enum precision prec)147 static void update_random_ops(int n_ops, enum precision prec)
148 {
149 int i;
150
151 for (i = 0; i < n_ops; i++) {
152
153 switch (prec) {
154 case PREC_SINGLE:
155 case PREC_FLOAT32:
156 {
157 uint64_t r = random_ops[i];
158 do {
159 r = xorshift64star(r);
160 } while (!float32_is_normal(r));
161 random_ops[i] = r;
162 break;
163 }
164 case PREC_DOUBLE:
165 case PREC_FLOAT64:
166 {
167 uint64_t r = random_ops[i];
168 do {
169 r = xorshift64star(r);
170 } while (!float64_is_normal(r));
171 random_ops[i] = r;
172 break;
173 }
174 case PREC_QUAD:
175 case PREC_FLOAT128:
176 {
177 float128 r = random_quad_ops[i];
178 uint64_t hi = r.high;
179 uint64_t lo = r.low;
180 do {
181 hi = xorshift64star(hi);
182 lo = xorshift64star(lo);
183 r = make_float128(hi, lo);
184 } while (!float128_is_normal(r));
185 random_quad_ops[i] = r;
186 break;
187 }
188 default:
189 g_assert_not_reached();
190 }
191 }
192 }
193
fill_random(union fp * ops,int n_ops,enum precision prec,bool no_neg)194 static void fill_random(union fp *ops, int n_ops, enum precision prec,
195 bool no_neg)
196 {
197 int i;
198
199 for (i = 0; i < n_ops; i++) {
200 switch (prec) {
201 case PREC_SINGLE:
202 case PREC_FLOAT32:
203 ops[i].f32 = make_float32(random_ops[i]);
204 if (no_neg && float32_is_neg(ops[i].f32)) {
205 ops[i].f32 = float32_chs(ops[i].f32);
206 }
207 break;
208 case PREC_DOUBLE:
209 case PREC_FLOAT64:
210 ops[i].f64 = make_float64(random_ops[i]);
211 if (no_neg && float64_is_neg(ops[i].f64)) {
212 ops[i].f64 = float64_chs(ops[i].f64);
213 }
214 break;
215 case PREC_QUAD:
216 case PREC_FLOAT128:
217 ops[i].f128 = random_quad_ops[i];
218 if (no_neg && float128_is_neg(ops[i].f128)) {
219 ops[i].f128 = float128_chs(ops[i].f128);
220 }
221 break;
222 default:
223 g_assert_not_reached();
224 }
225 }
226 }
227
228 /*
229 * The main benchmark function. Instead of (ab)using macros, we rely
230 * on the compiler to unfold this at compile-time.
231 */
bench(enum precision prec,enum op op,int n_ops,bool no_neg)232 static void bench(enum precision prec, enum op op, int n_ops, bool no_neg)
233 {
234 int64_t tf = get_clock() + duration * 1000000000LL;
235
236 while (get_clock() < tf) {
237 union fp ops[MAX_OPERANDS];
238 int64_t t0;
239 int i;
240
241 update_random_ops(n_ops, prec);
242 switch (prec) {
243 case PREC_SINGLE:
244 fill_random(ops, n_ops, prec, no_neg);
245 t0 = get_clock();
246 for (i = 0; i < OPS_PER_ITER; i++) {
247 float a = ops[0].f;
248 float b = ops[1].f;
249 float c = ops[2].f;
250
251 switch (op) {
252 case OP_ADD:
253 res.f = a + b;
254 break;
255 case OP_SUB:
256 res.f = a - b;
257 break;
258 case OP_MUL:
259 res.f = a * b;
260 break;
261 case OP_DIV:
262 res.f = a / b;
263 break;
264 case OP_FMA:
265 res.f = fmaf(a, b, c);
266 break;
267 case OP_SQRT:
268 res.f = sqrtf(a);
269 break;
270 case OP_CMP:
271 res.u64 = isgreater(a, b);
272 break;
273 default:
274 g_assert_not_reached();
275 }
276 }
277 break;
278 case PREC_DOUBLE:
279 fill_random(ops, n_ops, prec, no_neg);
280 t0 = get_clock();
281 for (i = 0; i < OPS_PER_ITER; i++) {
282 double a = ops[0].d;
283 double b = ops[1].d;
284 double c = ops[2].d;
285
286 switch (op) {
287 case OP_ADD:
288 res.d = a + b;
289 break;
290 case OP_SUB:
291 res.d = a - b;
292 break;
293 case OP_MUL:
294 res.d = a * b;
295 break;
296 case OP_DIV:
297 res.d = a / b;
298 break;
299 case OP_FMA:
300 res.d = fma(a, b, c);
301 break;
302 case OP_SQRT:
303 res.d = sqrt(a);
304 break;
305 case OP_CMP:
306 res.u64 = isgreater(a, b);
307 break;
308 default:
309 g_assert_not_reached();
310 }
311 }
312 break;
313 case PREC_FLOAT32:
314 fill_random(ops, n_ops, prec, no_neg);
315 t0 = get_clock();
316 for (i = 0; i < OPS_PER_ITER; i++) {
317 float32 a = ops[0].f32;
318 float32 b = ops[1].f32;
319 float32 c = ops[2].f32;
320
321 switch (op) {
322 case OP_ADD:
323 res.f32 = float32_add(a, b, &soft_status);
324 break;
325 case OP_SUB:
326 res.f32 = float32_sub(a, b, &soft_status);
327 break;
328 case OP_MUL:
329 res.f = float32_mul(a, b, &soft_status);
330 break;
331 case OP_DIV:
332 res.f32 = float32_div(a, b, &soft_status);
333 break;
334 case OP_FMA:
335 res.f32 = float32_muladd(a, b, c, 0, &soft_status);
336 break;
337 case OP_SQRT:
338 res.f32 = float32_sqrt(a, &soft_status);
339 break;
340 case OP_CMP:
341 res.u64 = float32_compare_quiet(a, b, &soft_status);
342 break;
343 default:
344 g_assert_not_reached();
345 }
346 }
347 break;
348 case PREC_FLOAT64:
349 fill_random(ops, n_ops, prec, no_neg);
350 t0 = get_clock();
351 for (i = 0; i < OPS_PER_ITER; i++) {
352 float64 a = ops[0].f64;
353 float64 b = ops[1].f64;
354 float64 c = ops[2].f64;
355
356 switch (op) {
357 case OP_ADD:
358 res.f64 = float64_add(a, b, &soft_status);
359 break;
360 case OP_SUB:
361 res.f64 = float64_sub(a, b, &soft_status);
362 break;
363 case OP_MUL:
364 res.f = float64_mul(a, b, &soft_status);
365 break;
366 case OP_DIV:
367 res.f64 = float64_div(a, b, &soft_status);
368 break;
369 case OP_FMA:
370 res.f64 = float64_muladd(a, b, c, 0, &soft_status);
371 break;
372 case OP_SQRT:
373 res.f64 = float64_sqrt(a, &soft_status);
374 break;
375 case OP_CMP:
376 res.u64 = float64_compare_quiet(a, b, &soft_status);
377 break;
378 default:
379 g_assert_not_reached();
380 }
381 }
382 break;
383 case PREC_FLOAT128:
384 fill_random(ops, n_ops, prec, no_neg);
385 t0 = get_clock();
386 for (i = 0; i < OPS_PER_ITER; i++) {
387 float128 a = ops[0].f128;
388 float128 b = ops[1].f128;
389 float128 c = ops[2].f128;
390
391 switch (op) {
392 case OP_ADD:
393 res.f128 = float128_add(a, b, &soft_status);
394 break;
395 case OP_SUB:
396 res.f128 = float128_sub(a, b, &soft_status);
397 break;
398 case OP_MUL:
399 res.f128 = float128_mul(a, b, &soft_status);
400 break;
401 case OP_DIV:
402 res.f128 = float128_div(a, b, &soft_status);
403 break;
404 case OP_FMA:
405 res.f128 = float128_muladd(a, b, c, 0, &soft_status);
406 break;
407 case OP_SQRT:
408 res.f128 = float128_sqrt(a, &soft_status);
409 break;
410 case OP_CMP:
411 res.u64 = float128_compare_quiet(a, b, &soft_status);
412 break;
413 default:
414 g_assert_not_reached();
415 }
416 }
417 break;
418 default:
419 g_assert_not_reached();
420 }
421 ns_elapsed += get_clock() - t0;
422 n_completed_ops += OPS_PER_ITER;
423 }
424 }
425
426 #define GEN_BENCH(name, type, prec, op, n_ops) \
427 static void __attribute__((flatten)) name(void) \
428 { \
429 bench(prec, op, n_ops, false); \
430 }
431
432 #define GEN_BENCH_NO_NEG(name, type, prec, op, n_ops) \
433 static void __attribute__((flatten)) name(void) \
434 { \
435 bench(prec, op, n_ops, true); \
436 }
437
438 #define GEN_BENCH_ALL_TYPES(opname, op, n_ops) \
439 GEN_BENCH(bench_ ## opname ## _float, float, PREC_SINGLE, op, n_ops) \
440 GEN_BENCH(bench_ ## opname ## _double, double, PREC_DOUBLE, op, n_ops) \
441 GEN_BENCH(bench_ ## opname ## _float32, float32, PREC_FLOAT32, op, n_ops) \
442 GEN_BENCH(bench_ ## opname ## _float64, float64, PREC_FLOAT64, op, n_ops) \
443 GEN_BENCH(bench_ ## opname ## _float128, float128, PREC_FLOAT128, op, n_ops)
444
445 GEN_BENCH_ALL_TYPES(add, OP_ADD, 2)
446 GEN_BENCH_ALL_TYPES(sub, OP_SUB, 2)
447 GEN_BENCH_ALL_TYPES(mul, OP_MUL, 2)
448 GEN_BENCH_ALL_TYPES(div, OP_DIV, 2)
449 GEN_BENCH_ALL_TYPES(fma, OP_FMA, 3)
450 GEN_BENCH_ALL_TYPES(cmp, OP_CMP, 2)
451 #undef GEN_BENCH_ALL_TYPES
452
453 #define GEN_BENCH_ALL_TYPES_NO_NEG(name, op, n) \
454 GEN_BENCH_NO_NEG(bench_ ## name ## _float, float, PREC_SINGLE, op, n) \
455 GEN_BENCH_NO_NEG(bench_ ## name ## _double, double, PREC_DOUBLE, op, n) \
456 GEN_BENCH_NO_NEG(bench_ ## name ## _float32, float32, PREC_FLOAT32, op, n) \
457 GEN_BENCH_NO_NEG(bench_ ## name ## _float64, float64, PREC_FLOAT64, op, n) \
458 GEN_BENCH_NO_NEG(bench_ ## name ## _float128, float128, PREC_FLOAT128, op, n)
459
460 GEN_BENCH_ALL_TYPES_NO_NEG(sqrt, OP_SQRT, 1)
461 #undef GEN_BENCH_ALL_TYPES_NO_NEG
462
463 #undef GEN_BENCH_NO_NEG
464 #undef GEN_BENCH
465
466 #define GEN_BENCH_FUNCS(opname, op) \
467 [op] = { \
468 [PREC_SINGLE] = bench_ ## opname ## _float, \
469 [PREC_DOUBLE] = bench_ ## opname ## _double, \
470 [PREC_FLOAT32] = bench_ ## opname ## _float32, \
471 [PREC_FLOAT64] = bench_ ## opname ## _float64, \
472 [PREC_FLOAT128] = bench_ ## opname ## _float128, \
473 }
474
475 static const bench_func_t bench_funcs[OP_MAX_NR][PREC_MAX_NR] = {
476 GEN_BENCH_FUNCS(add, OP_ADD),
477 GEN_BENCH_FUNCS(sub, OP_SUB),
478 GEN_BENCH_FUNCS(mul, OP_MUL),
479 GEN_BENCH_FUNCS(div, OP_DIV),
480 GEN_BENCH_FUNCS(fma, OP_FMA),
481 GEN_BENCH_FUNCS(sqrt, OP_SQRT),
482 GEN_BENCH_FUNCS(cmp, OP_CMP),
483 };
484
485 #undef GEN_BENCH_FUNCS
486
run_bench(void)487 static void run_bench(void)
488 {
489 bench_func_t f;
490
491 set_float_2nan_prop_rule(float_2nan_prop_s_ab, &soft_status);
492
493 f = bench_funcs[operation][precision];
494 g_assert(f);
495 f();
496 }
497
498 /* @arr must be NULL-terminated */
find_name(const char * const * arr,const char * name)499 static int find_name(const char * const *arr, const char *name)
500 {
501 int i;
502
503 for (i = 0; arr[i] != NULL; i++) {
504 if (strcmp(name, arr[i]) == 0) {
505 return i;
506 }
507 }
508 return -1;
509 }
510
usage_complete(int argc,char * argv[])511 static void usage_complete(int argc, char *argv[])
512 {
513 gchar *op_list = g_strjoinv(", ", (gchar **)op_names);
514 gchar *tester_list = g_strjoinv(", ", (gchar **)tester_names);
515
516 fprintf(stderr, "Usage: %s [options]\n", argv[0]);
517 fprintf(stderr, "options:\n");
518 fprintf(stderr, " -d = duration, in seconds. Default: %d\n",
519 DEFAULT_DURATION_SECS);
520 fprintf(stderr, " -h = show this help message.\n");
521 fprintf(stderr, " -o = floating point operation (%s). Default: %s\n",
522 op_list, op_names[0]);
523 fprintf(stderr, " -p = floating point precision (single, double, quad[soft only]). "
524 "Default: single\n");
525 fprintf(stderr, " -r = rounding mode (even, zero, down, up, tieaway). "
526 "Default: even\n");
527 fprintf(stderr, " -t = tester (%s). Default: %s\n",
528 tester_list, tester_names[0]);
529 fprintf(stderr, " -z = flush inputs to zero (soft tester only). "
530 "Default: disabled\n");
531 fprintf(stderr, " -Z = flush output to zero (soft tester only). "
532 "Default: disabled\n");
533
534 g_free(tester_list);
535 g_free(op_list);
536 }
537
round_name_to_mode(const char * name)538 static int round_name_to_mode(const char *name)
539 {
540 int i;
541
542 for (i = 0; i < N_ROUND_MODES; i++) {
543 if (!strcmp(round_names[i], name)) {
544 return i;
545 }
546 }
547 return -1;
548 }
549
550 static G_NORETURN
die_host_rounding(enum rounding rounding)551 void die_host_rounding(enum rounding rounding)
552 {
553 fprintf(stderr, "fatal: '%s' rounding not supported on this host\n",
554 round_names[rounding]);
555 exit(EXIT_FAILURE);
556 }
557
set_host_precision(enum rounding rounding)558 static void set_host_precision(enum rounding rounding)
559 {
560 int rhost;
561
562 switch (rounding) {
563 case ROUND_EVEN:
564 rhost = FE_TONEAREST;
565 break;
566 case ROUND_ZERO:
567 rhost = FE_TOWARDZERO;
568 break;
569 case ROUND_DOWN:
570 rhost = FE_DOWNWARD;
571 break;
572 case ROUND_UP:
573 rhost = FE_UPWARD;
574 break;
575 case ROUND_TIEAWAY:
576 die_host_rounding(rounding);
577 return;
578 default:
579 g_assert_not_reached();
580 }
581
582 if (fesetround(rhost)) {
583 die_host_rounding(rounding);
584 }
585 }
586
set_soft_precision(enum rounding rounding)587 static void set_soft_precision(enum rounding rounding)
588 {
589 signed char mode;
590
591 switch (rounding) {
592 case ROUND_EVEN:
593 mode = float_round_nearest_even;
594 break;
595 case ROUND_ZERO:
596 mode = float_round_to_zero;
597 break;
598 case ROUND_DOWN:
599 mode = float_round_down;
600 break;
601 case ROUND_UP:
602 mode = float_round_up;
603 break;
604 case ROUND_TIEAWAY:
605 mode = float_round_ties_away;
606 break;
607 default:
608 g_assert_not_reached();
609 }
610 soft_status.float_rounding_mode = mode;
611 }
612
parse_args(int argc,char * argv[])613 static void parse_args(int argc, char *argv[])
614 {
615 int c;
616 int val;
617 int rounding = ROUND_EVEN;
618
619 for (;;) {
620 c = getopt(argc, argv, "d:ho:p:r:t:zZ");
621 if (c < 0) {
622 break;
623 }
624 switch (c) {
625 case 'd':
626 duration = atoi(optarg);
627 break;
628 case 'h':
629 usage_complete(argc, argv);
630 exit(EXIT_SUCCESS);
631 case 'o':
632 val = find_name(op_names, optarg);
633 if (val < 0) {
634 fprintf(stderr, "Unsupported op '%s'\n", optarg);
635 exit(EXIT_FAILURE);
636 }
637 operation = val;
638 break;
639 case 'p':
640 if (!strcmp(optarg, "single")) {
641 precision = PREC_SINGLE;
642 } else if (!strcmp(optarg, "double")) {
643 precision = PREC_DOUBLE;
644 } else if (!strcmp(optarg, "quad")) {
645 precision = PREC_QUAD;
646 } else {
647 fprintf(stderr, "Unsupported precision '%s'\n", optarg);
648 exit(EXIT_FAILURE);
649 }
650 break;
651 case 'r':
652 rounding = round_name_to_mode(optarg);
653 if (rounding < 0) {
654 fprintf(stderr, "fatal: invalid rounding mode '%s'\n", optarg);
655 exit(EXIT_FAILURE);
656 }
657 break;
658 case 't':
659 val = find_name(tester_names, optarg);
660 if (val < 0) {
661 fprintf(stderr, "Unsupported tester '%s'\n", optarg);
662 exit(EXIT_FAILURE);
663 }
664 tester = val;
665 break;
666 case 'z':
667 soft_status.flush_inputs_to_zero = 1;
668 break;
669 case 'Z':
670 soft_status.flush_to_zero = 1;
671 break;
672 }
673 }
674
675 /* set precision and rounding mode based on the tester */
676 switch (tester) {
677 case TESTER_HOST:
678 set_host_precision(rounding);
679 break;
680 case TESTER_SOFT:
681 set_soft_precision(rounding);
682 switch (precision) {
683 case PREC_SINGLE:
684 precision = PREC_FLOAT32;
685 break;
686 case PREC_DOUBLE:
687 precision = PREC_FLOAT64;
688 break;
689 case PREC_QUAD:
690 precision = PREC_FLOAT128;
691 break;
692 default:
693 g_assert_not_reached();
694 }
695 break;
696 default:
697 g_assert_not_reached();
698 }
699 }
700
pr_stats(void)701 static void pr_stats(void)
702 {
703 printf("%.2f MFlops\n", (double)n_completed_ops / ns_elapsed * 1e3);
704 }
705
main(int argc,char * argv[])706 int main(int argc, char *argv[])
707 {
708 parse_args(argc, argv);
709 run_bench();
710 pr_stats();
711 return 0;
712 }
713