1 /*
2  * Copyright (C) 2013-2021 Canonical, Ltd.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License
6  * as published by the Free Software Foundation; either version 2
7  * of the License, or (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write to the Free Software
16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17  *
18  * This code is a complete clean re-write of the stress tool by
19  * Colin Ian King <colin.king@canonical.com> and attempts to be
20  * backwardly compatible with the stress tool by Amos Waterland
21  * <apw@rossby.metr.ou.edu> but has more stress tests and more
22  * functionality.
23  *
24  */
25 #include "stress-ng.h"
26 
/* Euler-Mascheroni constant */
#define GAMMA 		(0.57721566490153286060651209008240243104215933593992L)
/* Omega constant, the real solution of x * exp(x) = 1 */
#define OMEGA		(0.56714329040978387299996866221035554975381578718651L)
/* Reciprocal Fibonacci constant */
#define PSI		(3.35988566624317755317201130291892717968890513373197L)
/* Ratio of a circle's circumference to its diameter */
#define PI		(3.14159265358979323846264338327950288419716939937511L)

#define STATS_MAX		(250)
/* Number of complex samples in each FFT buffer */
#define FFT_SIZE		(4096)
/* Dimensions of the pixels[] dither array */
#define STRESS_CPU_DITHER_X	(1024)
#define STRESS_CPU_DITHER_Y	(768)
#define MATRIX_PROD_SIZE 	(128)
#define CORRELATE_DATA_LEN	(8192)
/* One sixteenth of CORRELATE_DATA_LEN */
#define CORRELATE_LEN		(CORRELATE_DATA_LEN / 16)
#define SIEVE_SIZE              (104730)
40 
/*
 * Math shims for functions that some math libraries
 * do not (yet) implement.
 *
 * Each shim_*() macro selects, in order of preference,
 * the compiler builtin, then the lib math function and
 * finally a lower precision workaround.
 */

/* long double complex absolute value */
#if defined(HAVE_BUILTIN_CABSL)
#define shim_cabsl(x)	__builtin_cabsl(x)
#elif defined(HAVE_CABSL)
#define shim_cabsl(x)	cabsl(x)
#else
#define shim_cabsl(x)	cabs(x)
#endif

/* long double log gamma */
#if defined(HAVE_BUILTIN_LGAMMAL)
#define shim_lgammal(x)	__builtin_lgammal(x)
#elif defined(HAVE_LGAMMAL)
#define shim_lgammal(x)	lgammal(x)
#else
#define shim_lgammal(x)	lgamma(x)
#endif

/* complex power */
#if defined(HAVE_BUILTIN_CPOW)
#define shim_cpow(x, z)	__builtin_cpow(x, z)
#elif defined(HAVE_CPOW)
#define shim_cpow(x, z)	cpow(x, z)
#else
#define shim_cpow(x, z)	pow(x, z)
#endif

/* long double power */
#if defined(HAVE_BUILTIN_POWL)
#define shim_powl(x, y)	__builtin_powl(x, y)
#elif defined(HAVE_POWL)
#define shim_powl(x, y)	powl(x, y)
#else
#define shim_powl(x, y)	pow(x, y)
#endif

/* long double round to integer, falls back to shim_rint() */
#if defined(HAVE_BUILTIN_RINTL)
#define shim_rintl(x)	__builtin_rintl(x)
#elif defined(HAVE_RINTL)
#define shim_rintl(x)	rintl(x)
#else
#define shim_rintl(x)	shim_rint(x)
#endif

/* double natural logarithm */
#if defined(HAVE_BUILTIN_LOG)
#define shim_log(x)	__builtin_log(x)
#else
#define shim_log(x)	log(x)
#endif

/* long double natural logarithm, falls back to shim_log() */
#if defined(HAVE_BUILTIN_LOGL)
#define shim_logl(x)	__builtin_logl(x)
#elif defined(HAVE_LOGL)
#define shim_logl(x)	logl(x)
#else
#define shim_logl(x)	shim_log(x)
#endif

/* double exponential */
#if defined(HAVE_BUILTIN_EXP)
#define shim_exp(x)	__builtin_exp(x)
#else
#define shim_exp(x)	exp(x)
#endif

/* long double exponential, lib math expl() is never used on Haiku */
#if defined(HAVE_BUILTIN_EXPL)
#define shim_expl(x)	__builtin_expl(x)
#elif defined(HAVE_EXPL) && !defined(__HAIKU__)
#define shim_expl(x)	expl(x)
#else
#define shim_expl(x)	shim_exp(x)
#endif
129 
/*
 * Cosine shims: float, double and long double cosine, long
 * double hyperbolic cosine and the complex cosines. Each one
 * prefers the builtin, then lib math, then a lower precision
 * fallback cast back to the expected type.
 */
#if defined(HAVE_BUILTIN_COSF)
#define shim_cosf(x)	__builtin_cosf(x)
#else
#define shim_cosf(x)	cosf(x)
#endif

#if defined(HAVE_BUILTIN_COS)
#define shim_cos(x)	__builtin_cos(x)
#else
#define shim_cos(x)	cos(x)
#endif

#if defined(HAVE_BUILTIN_COSL)
#define shim_cosl(x)	__builtin_cosl(x)
#else
#if defined(HAVE_COSL)
#define shim_cosl(x)	cosl(x)
#else
#define shim_cosl(x)	((long double)shim_cos((double)(x)))
#endif
#endif

#if defined(HAVE_BUILTIN_COSHL)
#define shim_coshl(x)	__builtin_coshl(x)
#else
#if defined(HAVE_COSHL)
#define shim_coshl(x)	coshl(x)
#else
#define shim_coshl(x)	((long double)cosh((double)(x)))
#endif
#endif

#if defined(HAVE_BUILTIN_CCOS)
#define shim_ccos(x)	__builtin_ccos(x)
#else
#if defined(HAVE_CCOS)
#define	shim_ccos(x)	ccos(x)
#else
#define	shim_ccos(x)	shim_cos(x)
#endif
#endif

#if defined(HAVE_BUILTIN_CCOSF)
#define shim_ccosf(x)	__builtin_ccosf(x)
#else
#if defined(HAVE_CCOSF)
#define	shim_ccosf(x)	ccosf(x)
#else
#define	shim_ccosf(x)	shim_ccos(x)
#endif
#endif

#if defined(HAVE_BUILTIN_CCOSL)
#define shim_ccosl(x)	__builtin_ccosl(x)
#else
#if defined(HAVE_CCOSL)
#define	shim_ccosl(x)	ccosl(x)
#else
/* parentheses must balance, otherwise every use fails to compile */
#define	shim_ccosl(x)	((long double complex)shim_ccos((double complex)(x)))
#endif
#endif
191 
/*
 * Sine shims: float, double and long double sine, long
 * double hyperbolic sine and the complex sines. Each one
 * prefers the builtin, then lib math, then a lower precision
 * fallback cast back to the expected type.
 */
#if defined(HAVE_BUILTIN_SINF)
/* use the float builtin, not the double precision __builtin_sin */
#define shim_sinf(x)	__builtin_sinf(x)
#else
#define shim_sinf(x)	sinf(x)
#endif

#if defined(HAVE_BUILTIN_SIN)
#define shim_sin(x)	__builtin_sin(x)
#else
#define shim_sin(x)	sin(x)
#endif

#if defined(HAVE_BUILTIN_SINL)
#define shim_sinl(x)	__builtin_sinl(x)
#else
#if defined(HAVE_SINL)
#define shim_sinl(x)	sinl(x)
#else
#define shim_sinl(x)	((long double)shim_sin((double)(x)))
#endif
#endif

#if defined(HAVE_BUILTIN_SINHL)
#define shim_sinhl(x)	__builtin_sinhl(x)
#else
#if defined(HAVE_SINHL)
#define shim_sinhl(x)	sinhl(x)
#else
#define shim_sinhl(x)	((long double)sinh((double)(x)))
#endif
#endif

#if defined(HAVE_BUILTIN_CSIN)
#define shim_csin(x)	__builtin_csin(x)
#else
#if defined(HAVE_CSIN)
#define	shim_csin(x)	csin(x)
#else
#define	shim_csin(x)	shim_sin(x)
#endif
#endif

#if defined(HAVE_BUILTIN_CSINF)
#define shim_csinf(x)	__builtin_csinf(x)
#else
#if defined(HAVE_CSINF)
#define	shim_csinf(x)	csinf(x)
#else
#define	shim_csinf(x)	shim_csin(x)
#endif
#endif

#if defined(HAVE_BUILTIN_CSINL)
#define shim_csinl(x)	__builtin_csinl(x)
#else
#if defined(HAVE_CSINL)
#define	shim_csinl(x)	csinl(x)
#else
/* fully parenthesised so the cast cannot bind to surrounding operators */
#define	shim_csinl(x)	((long double complex)shim_csin((double complex)(x)))
#endif
#endif
253 
/* double square root */
#if defined(HAVE_BUILTIN_SQRT)
#define shim_sqrt(x)	__builtin_sqrt(x)
#else
#define shim_sqrt(x)	sqrt(x)
#endif

/* long double square root, falls back to shim_sqrt() */
#if defined(HAVE_BUILTIN_SQRTL)
#define shim_sqrtl(x)	__builtin_sqrtl(x)
#elif defined(HAVE_SQRTL)
#define shim_sqrtl(x)	sqrtl(x)
#else
#define shim_sqrtl(x)	shim_sqrt(x)
#endif

/* double absolute value */
#if defined(HAVE_BUILTIN_FABS)
#define shim_fabs(x)	__builtin_fabs(x)
#else
#define shim_fabs(x)	fabs(x)
#endif

/* long double absolute value */
#if defined(HAVE_BUILTIN_FABSL)
#define shim_fabsl(x)	__builtin_fabsl(x)
#else
#define shim_fabsl(x)	fabsl(x)
#endif

/* double round to integer */
#if defined(HAVE_BUILTIN_RINT)
#define shim_rint(x)	__builtin_rint(x)
#else
#define shim_rint(x)	rint(x)
#endif
287 
288 
/*
 *  the CPU stress test has different classes of cpu stressor
 */
/* Signature shared by the cpu methods; name is used in their pr_fail() messages */
typedef void (*stress_cpu_func)(const char *name);

/* Pairs the name of a cpu method with the function that implements it */
typedef struct {
	const char		*name;	/* human readable form of stressor */
	const stress_cpu_func	func;	/* the cpu method function */
} stress_cpu_method_info_t;
298 
/*
 *  Command line help for the cpu stressor. Each entry holds three
 *  strings: the short option (or NULL), the long option and the
 *  description; the table is terminated by an all-NULL entry.
 */
static const stress_help_t help[] = {
	{ "c N", "cpu N",		"start N workers that perform CPU only loading" },
	{ NULL,  "cpu-ops N",		"stop after N cpu bogo operations" },
	{ "l P", "cpu-load P",		"load CPU by P %, 0=sleep, 100=full load (see -c)" },
	{ NULL,	 "cpu-load-slice S",	"specify time slice during busy load" },
	{ NULL,  "cpu-method M",	"specify stress cpu method M, default is all" },
	{ NULL,	 NULL,			NULL }
};
307 
/*
 *  Forward declaration of the cpu method table; presumably the
 *  initialised definition appears later in this file (TODO confirm)
 */
static const stress_cpu_method_info_t cpu_methods[];

/* Don't make this static to ensure dithering does not get optimised out */
uint8_t pixels[STRESS_CPU_DITHER_X][STRESS_CPU_DITHER_Y];
312 
stress_set_cpu_load(const char * opt)313 static int stress_set_cpu_load(const char *opt) {
314 	int32_t cpu_load;
315 
316 	cpu_load = stress_get_int32(opt);
317 	stress_check_range("cpu-load", (uint64_t)cpu_load, 0, 100);
318 	return stress_set_setting("cpu-load", TYPE_ID_INT32, &cpu_load);
319 }
320 
321 /*
322  *  stress_set_cpu_load_slice()
323  *	< 0   - number of iterations per busy slice
324  *	= 0   - random duration between 0..0.5 seconds
325  *	> 0   - milliseconds per busy slice
326  */
stress_set_cpu_load_slice(const char * opt)327 static int stress_set_cpu_load_slice(const char *opt)
328 {
329 	int32_t cpu_load_slice;
330 
331 	cpu_load_slice = stress_get_int32(opt);
332 	if ((cpu_load_slice < -5000) || (cpu_load_slice > 5000)) {
333 		(void)fprintf(stderr, "cpu-load-slice must in the range -5000 to 5000.\n");
334 		_exit(EXIT_FAILURE);
335 	}
336 	return stress_set_setting("cpu-load-slice", TYPE_ID_INT32, &cpu_load_slice);
337 }
338 
339 /*
340  *  stress_cpu_sqrt()
341  *	stress CPU on square roots
342  */
stress_cpu_sqrt(const char * name)343 static void HOT TARGET_CLONES stress_cpu_sqrt(const char *name)
344 {
345 	int i;
346 
347 	for (i = 0; i < 16384; i++) {
348 		uint64_t rnd = stress_mwc32();
349 		double r_d = shim_sqrt((double)rnd) * shim_sqrt((double)rnd);
350 		long double r_ld = shim_sqrtl((long double)rnd) * shim_sqrtl((long double)rnd);
351 		register uint64_t tmp;
352 
353 		r_d = shim_rint(r_d);
354 		tmp = (uint64_t)r_d;
355 		if (UNLIKELY((g_opt_flags & OPT_FLAGS_VERIFY) && (tmp != rnd))) {
356 			pr_fail("%s: sqrt error detected on "
357 				"sqrt(%" PRIu64 ")\n", name, rnd);
358 			if (!keep_stressing_flag())
359 				break;
360 		}
361 
362 		r_ld = shim_rintl(r_ld);
363 		tmp = (uint64_t)r_ld;
364 		if (UNLIKELY((g_opt_flags & OPT_FLAGS_VERIFY) && (tmp != rnd))) {
365 			pr_fail("%s: sqrtf error detected on "
366 				"sqrt(%" PRIu64 ")\n", name, rnd);
367 			if (!keep_stressing_flag())
368 				break;
369 		}
370 	}
371 }
372 
/*
 *  stress_is_affinity_set()
 *	returns true if at least one online CPU is missing from
 *	this process's affinity mask, i.e. the stressor has been
 *	pinned to a subset of the CPUs, which can lead to load
 *	balancing difficulties. Returns false if the mask cannot
 *	be read or sched_getaffinity() is not available.
 */
static bool stress_is_affinity_set(void)
{
#if defined(HAVE_SCHED_GETAFFINITY)
	const int n_cpus = (int)stress_get_processors_online();
	cpu_set_t cpus;
	int cpu = 0;

	CPU_ZERO(&cpus);
	if (sched_getaffinity(0, sizeof(cpus), &cpus) < 0)
		return false;	/* Can't tell, so assume not */

	/* Any online CPU absent from the mask means affinity was set */
	while (cpu < n_cpus) {
		if (!CPU_ISSET(cpu, &cpus))
			return true;
		cpu++;
	}
#endif
	/* All CPUs usable, or no way of knowing, so assume not */
	return false;
}
399 
400 /*
401  *  stress_cpu_loop()
402  *	simple CPU busy loop
403  */
stress_cpu_loop(const char * name)404 static void OPTIMIZE0 stress_cpu_loop(const char *name)
405 {
406 	uint32_t i, i_sum = 0;
407 	const uint32_t sum = 134209536UL;
408 
409 	for (i = 0; i < 16384; i++) {
410 		i_sum += i;
411 		FORCE_DO_NOTHING();
412 	}
413 	if ((g_opt_flags & OPT_FLAGS_VERIFY) && (i_sum != sum))
414 		pr_fail("%s: cpu loop 0..16383 sum was %" PRIu32 " and "
415 			"did not match the expected value of %" PRIu32 "\n",
416 			name, i_sum, sum);
417 }
418 
419 /*
420  *  stress_cpu_gcd()
421  *	compute Greatest Common Divisor
422  */
stress_cpu_gcd(const char * name)423 static void HOT OPTIMIZE3 TARGET_CLONES stress_cpu_gcd(const char *name)
424 {
425 	uint32_t i, gcd_sum = 0;
426 	const uint32_t gcd_checksum = 63000868UL;
427 	uint64_t lcm_sum = 0;
428 	const uint64_t lcm_checksum = 41637399273ULL;
429 
430 	for (i = 0; i < 16384; i++) {
431 		register uint32_t a = i, b = i % (3 + (1997 ^ i));
432 		register uint64_t lcm = ((uint64_t)a * b);
433 
434 		while (b != 0) {
435 			register uint32_t r = b;
436 			b = a % b;
437 			a = r;
438 		}
439 		if (a)
440 			lcm_sum += (lcm / a);
441 		gcd_sum += a;
442 		FORCE_DO_NOTHING();
443 	}
444 	if ((g_opt_flags & OPT_FLAGS_VERIFY) &&
445 	    (gcd_sum != gcd_checksum) &&
446 	    (lcm_sum != lcm_checksum))
447 		pr_fail("%s: gcd error detected, failed modulo "
448 			"or assignment operations\n", name);
449 }
450 
451 /*
452  *  stress_cpu_bitops()
453  *	various bit manipulation hacks from bithacks
454  *	https://graphics.stanford.edu/~seander/bithacks.html
455  */
stress_cpu_bitops(const char * name)456 static void HOT OPTIMIZE3 TARGET_CLONES stress_cpu_bitops(const char *name)
457 {
458 	uint32_t i, i_sum = 0;
459 	const uint32_t sum = 0x8aac0aab;
460 
461 	for (i = 0; i < 16384; i++) {
462 		{
463 			register uint32_t r, v, s = (sizeof(v) * 8) - 1;
464 
465 			/* Reverse bits */
466 			r = v = i;
467 			for (v >>= 1; v; v >>= 1, s--) {
468 				r <<= 1;
469 				r |= v & 1;
470 			}
471 			r <<= s;
472 			i_sum += r;
473 		}
474 		{
475 			/* parity check */
476 			register uint32_t v = i;
477 
478 			v ^= v >> 16;
479 			v ^= v >> 8;
480 			v ^= v >> 4;
481 			v &= 0xf;
482 			i_sum += (0x6996 >> v) & 1;
483 		}
484 		{
485 			/* Brian Kernighan count bits */
486 			register uint32_t j, v = i;
487 
488 			for (j = 0; v; j++)
489 				v &= v - 1;
490 			i_sum += j;
491 		}
492 		{
493 			/* round up to nearest highest power of 2 */
494 			register uint32_t v = i - 1;
495 
496 			v |= v >> 1;
497 			v |= v >> 2;
498 			v |= v >> 4;
499 			v |= v >> 8;
500 			v |= v >> 16;
501 			i_sum += v;
502 		}
503 	}
504 	if ((g_opt_flags & OPT_FLAGS_VERIFY) && (i_sum != sum))
505 		pr_fail("%s: bitops error detected, failed "
506 			"bitops operations\n", name);
507 }
508 
509 /*
510  *  stress_cpu_trig()
511  *	simple sin, cos trig functions
512  */
stress_cpu_trig(const char * name)513 static void HOT stress_cpu_trig(const char *name)
514 {
515 	int i;
516 	long double d_sum = 0.0L;
517 
518 	(void)name;
519 
520 	for (i = 0; i < 1500; i++) {
521 		long double theta = (2.0L * PI * (long double)i)/1500.0L;
522 		{
523 			double thetad = (double)theta;
524 			float thetaf = (float)theta;
525 
526 			d_sum += (shim_cosl(theta) * shim_sinl(theta));
527 			d_sum += ((long double)shim_cos(thetad) * (long double)shim_sin(thetad));
528 			d_sum += ((long double)shim_cosf(thetaf) * (long double)shim_sinf(thetaf));
529 		}
530 		{
531 			long double thetal = theta * 2.0L;
532 			double thetad = (double)thetal;
533 			float thetaf = (float)thetal;
534 
535 			d_sum += shim_cosl(thetal);
536 			d_sum += (long double)shim_cos(thetad);
537 			d_sum += (long double)shim_cosf(thetaf);
538 		}
539 		{
540 			long double thetal = theta * 3.0L;
541 			double thetad = (double)thetal;
542 			float thetaf = (float)thetal;
543 
544 			d_sum += shim_sinl(thetal);
545 			d_sum += (long double)shim_sin(thetad);
546 			d_sum += (long double)shim_sinf(thetaf);
547 		}
548 	}
549 	stress_long_double_put(d_sum);
550 }
551 
552 /*
553  *  stress_cpu_hyperbolic()
554  *	simple hyperbolic sinh, cosh functions
555  */
stress_cpu_hyperbolic(const char * name)556 static void HOT stress_cpu_hyperbolic(const char *name)
557 {
558 	int i;
559 	long double d_sum = 0.0L;
560 
561 	(void)name;
562 
563 	for (i = 0; i < 1500; i++) {
564 		long double theta = (2.0L * PI * (long double)i)/1500.0L;
565 		{
566 			double thetad = (double)theta;
567 			float thetaf = (float)theta;
568 
569 			d_sum += (shim_coshl(theta) * shim_sinhl(theta));
570 			d_sum += ((long double)cosh(thetad) * (long double)sinh(thetad));
571 			d_sum += ((long double)coshf(thetaf) * (long double)sinhf(thetaf));
572 		}
573 		{
574 			long double thetal = theta * 2.0L;
575 			double thetad = (double)theta;
576 			float thetaf = (float)theta;
577 
578 			d_sum += shim_coshl(thetal);
579 			d_sum += (long double)cosh(thetad);
580 			d_sum += (long double)coshf(thetaf);
581 		}
582 		{
583 			long double thetal = theta * 3.0L;
584 			double thetad = (double)theta;
585 			float thetaf = (float)theta;
586 
587 			d_sum += shim_sinhl(thetal);
588 			d_sum += (long double)sinh(thetad);
589 			d_sum += (long double)sinhf(thetaf);
590 		}
591 	}
592 	stress_long_double_put(d_sum);
593 }
594 
595 /*
596  *  stress_cpu_rand()
597  *	generate lots of pseudo-random integers
598  */
stress_cpu_rand(const char * name)599 static void HOT OPTIMIZE3 stress_cpu_rand(const char *name)
600 {
601 	int i;
602 	uint32_t i_sum = 0;
603 	const uint32_t sum = 0xc253698c;
604 
605 	STRESS_MWC_SEED();
606 	for (i = 0; i < 16384; i++)
607 		i_sum += stress_mwc32();
608 
609 	if ((g_opt_flags & OPT_FLAGS_VERIFY) && (i_sum != sum))
610 		pr_fail("%s: rand error detected, failed sum of "
611 			"pseudo-random values\n", name);
612 }
613 
614 /*
615  *  stress_cpu_rand48()
616  *	generate random values using rand48 family of functions
617  */
stress_cpu_rand48(const char * name)618 static void HOT OPTIMIZE3 stress_cpu_rand48(const char *name)
619 {
620 	int i;
621 	double d = 0;
622 	long int l = 0;
623 
624 	(void)name;
625 
626 	srand48(0x0defaced);
627 	for (i = 0; i < 16384; i++) {
628 		d += drand48();
629 		l += lrand48();
630 	}
631 	stress_double_put(d);
632 	stress_uint64_put((uint64_t)l);
633 }
634 
635 /*
636  *  stress_cpu_lfsr32()
637  *	generate 16384 values from the Galois polynomial
638  *	x^32 + x^31 + x^29 + x + 1
639  */
stress_cpu_lfsr32(const char * name)640 static void HOT OPTIMIZE3 stress_cpu_lfsr32(const char *name)
641 {
642         static uint32_t lfsr = 0xf63acb01;
643 	register int i;
644 
645 	(void)name;
646 
647 	for (i = 0; i < 16384; i++) {
648 		lfsr = (lfsr >> 1) ^ (unsigned int)(-(lfsr & 1u) & 0xd0000001U);
649 	}
650 	stress_uint32_put(lfsr);
651 }
652 
653 /*
654  *  stress_cpu_nsqrt()
655  *	iterative Newton–Raphson square root
656  */
stress_cpu_nsqrt(const char * name)657 static void HOT OPTIMIZE3 TARGET_CLONES stress_cpu_nsqrt(const char *name)
658 {
659 	int i;
660 	const long double precision = 1.0e-12L;
661 	const int max_iter = 56;
662 
663 	for (i = 16300; i < 16384; i++) {
664 		long double n = (long double)i;
665 		long double lo = (n < 1.0L) ? n : 1.0L;
666 		long double hi = (n < 1.0L) ? 1.0L : n;
667 		long double rt;
668 		int j = 0;
669 
670 		while ((j++ < max_iter) && ((hi - lo) > precision)) {
671 			long double g = (lo + hi) / 2.0L;
672 			if ((g * g) > n)
673 				hi = g;
674 			else
675 				lo = g;
676 		}
677 		rt = (lo + hi) / 2.0L;
678 
679 		if (g_opt_flags & OPT_FLAGS_VERIFY) {
680 			const long double r2 = shim_rintl(rt * rt);
681 
682 			if (j >= max_iter)
683 				pr_fail("%s: Newton-Raphson sqrt "
684 					"computation took more iterations "
685 					"than expected\n", name);
686 			if ((int)r2 != i)
687 				pr_fail("%s: Newton-Raphson sqrt not "
688 					"accurate enough\n", name);
689 		}
690 	}
691 }
692 
693 /*
694  *  stress_cpu_phi()
695  *	compute the Golden Ratio
696  */
stress_cpu_phi(const char * name)697 static void HOT OPTIMIZE3 TARGET_CLONES stress_cpu_phi(const char *name)
698 {
699 	long double phi; /* Golden ratio */
700 	const long double precision = 1.0e-15L;
701 	const long double phi_ = (1.0L + shim_sqrtl(5.0L)) / 2.0L;
702 	register uint64_t a, b;
703 	const uint64_t mask = 1ULL << 63;
704 	int i;
705 
706 	/* Pick any two starting points */
707 	a = stress_mwc64() % 99;
708 	b = stress_mwc64() % 99;
709 
710 	/* Iterate until we approach overflow */
711 	for (i = 0; (i < 64) && !((a | b) & mask); i++) {
712 		/* Find nth term */
713 		register uint64_t c = a + b;
714 
715 		a = b;
716 		b = c;
717 	}
718 	/* And we have the golden ratio */
719 	phi = (long double)b / (long double)a;
720 
721 	if ((g_opt_flags & OPT_FLAGS_VERIFY) &&
722 	    (shim_fabsl(phi - phi_) > precision))
723 		pr_fail("%s: Golden Ratio phi not accurate enough\n",
724 			name);
725 }
726 
727 /*
728  *  stress_cpu_apery()
729  *      compute Apéry's constant
730  */
stress_cpu_apery(const char * name)731 static void HOT OPTIMIZE3 stress_cpu_apery(const char *name)
732 {
733 	uint32_t n;
734 	long double a = 0.0L, a_ = a;
735 	const long double precision = 1.0e-14L;
736 
737 	(void)name;
738 
739 	for (n = 1; n < 100000; n++) {
740 		long double n3 = (long double)n;
741 
742 		a_ = a;
743 		n3 = n3 * n3 * n3;
744 		a += (1.0L / n3);
745 		if (shim_fabsl(a - a_) < precision)
746 			break;
747 	}
748 	if (shim_fabsl(a - a_) > precision)
749 		pr_fail("%s: Apéry's const not accurate enough\n", name);
750 }
751 
752 
753 #if defined(HAVE_COMPLEX_H) &&		\
754     defined(HAVE_COMPLEX) &&		\
755     defined(__STDC_IEC_559_COMPLEX__) &&\
756     !defined(__UCLIBC__)
757 /*
758  *  fft_partial()
759  *  	partial Fast Fourier Transform
760  */
fft_partial(double complex * data,double complex * tmp,const int n,const int m)761 static void HOT OPTIMIZE3 fft_partial(
762 	double complex *data,
763 	double complex *tmp,
764 	const int n,
765 	const int m)
766 {
767 	if (m < n) {
768 		const int m2 = m * 2;
769 		int i;
770 
771 		fft_partial(tmp, data, n, m2);
772 		fft_partial(tmp + m, data + m, n, m2);
773 		for (i = 0; i < n; i += m2) {
774 			const double complex negI = -(double complex)I;
775 			double complex v = tmp[i];
776 			double complex t =
777 				cexp((negI * (double)PI * (double)i) /
778 				     (double)n) * tmp[i + m];
779 			data[i / 2] = v + t;
780 			data[(i + n) / 2] = v - t;
781 		}
782 	}
783 }
784 
785 /*
786  *  stress_cpu_fft()
787  *	Fast Fourier Transform
788  */
stress_cpu_fft(const char * name)789 static void HOT TARGET_CLONES stress_cpu_fft(const char *name)
790 {
791 	static double complex buf[FFT_SIZE], tmp[FFT_SIZE];
792 	int i;
793 
794 	(void)name;
795 
796 	for (i = 0; i < FFT_SIZE; i++)
797 		buf[i] = (double complex)(i % 63);
798 
799 	(void)memcpy(tmp, buf, sizeof(*tmp) * FFT_SIZE);
800 	fft_partial(buf, tmp, FFT_SIZE, 1);
801 }
802 #endif
803 
804 /*
805  *   stress_cpu_euler()
806  *	compute e using series
807  */
stress_cpu_euler(const char * name)808 static void HOT OPTIMIZE3 TARGET_CLONES stress_cpu_euler(const char *name)
809 {
810 	long double e = 1.0L, last_e;
811 	long double fact = 1.0L;
812 	long double precision = 1.0e-20L;
813 	int n = 1;
814 
815 	do {
816 		last_e = e;
817 		fact *= n;
818 		n++;
819 		e += (1.0L / fact);
820 	} while ((n < 25) && (shim_fabsl(e - last_e) > precision));
821 
822 	if ((g_opt_flags & OPT_FLAGS_VERIFY) && (n >= 25))
823 		pr_fail("%s: Euler computation took more iterations "
824 			"than expected\n", name);
825 }
826 
/*
 *  random_buffer()
 *	fill a uint8_t buffer with random data, 4 bytes at a time
 *	from each 32 bit random value, least significant byte first.
 *	buffer *must* be multiple of 4 bytes in size
 */
static void random_buffer(uint8_t *data, const size_t len)
{
	const size_t n_words = len / 4;
	size_t word;

	for (word = 0; word < n_words; word++) {
		uint32_t v = stress_mwc32();
		int byte;

		for (byte = 0; byte < 4; byte++) {
			*data++ = (uint8_t)v;
			v >>= 8;
		}
	}
}
848 
849 /*
850  *  stress_cpu_collatz()
851  *	stress test integer collatz conjecture
852  */
stress_cpu_collatz(const char * name)853 static void HOT OPTIMIZE3 TARGET_CLONES stress_cpu_collatz(const char *name)
854 {
855 	register uint64_t n = 989345275647ULL;	/* Has 1348 steps in cycle */
856 	register int i;
857 
858 	for (i = 0; n != 1; i++) {
859 		n = (n & 1) ? (3 * n) + 1 : n / 2;
860 	}
861 	if ((g_opt_flags & OPT_FLAGS_VERIFY) && (i != 1348))
862 		pr_fail("%s: error detected, failed collatz progression\n",
863 			name);
864 }
865 
866 /*
867  *  stress_cpu_hash_generic()
868  *	stress test generic string hash function
869  */
stress_cpu_hash_generic(const char * name,const char * hash_name,uint32_t (* hash_func)(const char * str),const uint32_t result)870 static void stress_cpu_hash_generic(
871 	const char *name,
872 	const char *hash_name,
873 	uint32_t (*hash_func)(const char *str),
874 	const uint32_t result)
875 {
876 	char buffer[128];
877 	size_t i;
878 	uint32_t i_sum = 0;
879 
880 	STRESS_MWC_SEED();
881 	random_buffer((uint8_t *)buffer, sizeof(buffer));
882 	/* Make it ASCII range ' '..'_' */
883 	for (i = 0; i < sizeof(buffer); i++)
884 		buffer[i] = (buffer[i] & 0x3f) + ' ';
885 
886 	for (i = sizeof(buffer) - 1; i; i--) {
887 		buffer[i] = '\0';
888 		i_sum += hash_func(buffer);
889 	}
890 	if ((g_opt_flags & OPT_FLAGS_VERIFY) && (i_sum != result))
891 		pr_fail("%s: %s error detected, failed hash %s sum\n",
892 			name, hash_name, hash_name);
893 }
894 
895 /*
896  *  stress_cpu_jenkin()
897  *	multiple iterations on jenkin hash
898  */
stress_cpu_jenkin(const char * name)899 static void stress_cpu_jenkin(const char *name)
900 {
901 	uint8_t buffer[128];
902 	size_t i;
903 	uint32_t i_sum = 0;
904 	const uint32_t sum = 0xc53302a5;
905 
906 	STRESS_MWC_SEED();
907 	random_buffer(buffer, sizeof(buffer));
908 
909 	for (i = sizeof(buffer) - 1; i; i--) {
910 		buffer[i] = '\0';
911 		i_sum += stress_hash_jenkin(buffer, sizeof(buffer));
912 	}
913 
914 	if ((g_opt_flags & OPT_FLAGS_VERIFY) && (i_sum != sum))
915 		pr_fail("%s: jenkin error detected, failed hash jenkin sum\n",
916 			name);
917 }
918 
/*
 *  stress_cpu_little_endian()
 *	returns true if CPU is little endian, i.e. the least
 *	significant byte of a 32 bit word is stored first
 */
static inline bool stress_cpu_little_endian(void)
{
	const union {
		uint32_t word;
		uint8_t  bytes[sizeof(uint32_t)];
	} probe = { .word = 0x12345678 };

	return probe.bytes[0] == 0x78;
}
930 
931 /*
932  *  stress_cpu_murmur3_32
933  *	 multiple iterations on murmur3_32 hash, based on
934  *	 Austin Appleby's Murmur3 hash, code derived from
935  *	 https://en.wikipedia.org/wiki/MurmurHash
936  */
stress_cpu_murmur3_32(const char * name)937 static void stress_cpu_murmur3_32(const char *name)
938 {
939 	uint8_t buffer[128];
940 	size_t i;
941 	uint32_t sum, i_sum = 0;
942 	const uint32_t seed = 0xf12b35e1; /* arbitrary value */
943 
944 	STRESS_MWC_SEED();
945 	random_buffer(buffer, sizeof(buffer));
946 	for (i = sizeof(buffer) - 1; i; i--) {
947 		buffer[i] = '\0';
948 		i_sum += stress_hash_murmur3_32((uint8_t *)buffer, sizeof(buffer), seed);
949 	}
950 
951 	/*
952 	 *  Murmur produces different results depending on the Endianness
953 	 */
954 	sum = stress_cpu_little_endian() ? 0xa53a4bb1 : 0x71eb83cc;
955 
956 	if ((g_opt_flags & OPT_FLAGS_VERIFY) && (i_sum != sum))
957 		pr_fail("%s: murmur3_32 error detected, failed hash murmur3_32 sum\n",
958 			name);
959 }
960 
961 /*
962  *  stress_cpu_pjw()
963  *	stress test hash pjw
964  */
stress_cpu_pjw(const char * name)965 static void stress_cpu_pjw(const char *name)
966 {
967 	stress_cpu_hash_generic(name, "pjw", stress_hash_pjw, 0xa89a91c0);
968 }
969 
970 /*
971  *  stress_cpu_djb2a()
972  *	stress test hash djb2a
973  */
stress_cpu_djb2a(const char * name)974 static void stress_cpu_djb2a(const char *name)
975 {
976 	stress_cpu_hash_generic(name, "djb2a", stress_hash_djb2a, 0x6a60cb5a);
977 }
978 
979 /*
980  *  stress_cpu_fnv1a()
981  *	stress test hash fnv1a
982  */
stress_cpu_fnv1a(const char * name)983 static void HOT stress_cpu_fnv1a(const char *name)
984 {
985 	stress_cpu_hash_generic(name, "fnv1a", stress_hash_fnv1a, 0x8ef17e80);
986 }
987 
988 /*
989  *  stress_cpu_sdbm()
990  *	stress test hash sdbm
991  */
stress_cpu_sdbm(const char * name)992 static void stress_cpu_sdbm(const char *name)
993 {
994 	stress_cpu_hash_generic(name, "sdbm", stress_hash_sdbm, 0x46357819);
995 }
996 
997 /*
998  *  stress_cpu_nhash()
999  *	stress test hash nhash
1000  */
stress_cpu_nhash(const char * name)1001 static void stress_cpu_nhash(const char *name)
1002 {
1003 	stress_cpu_hash_generic(name, "nhash", stress_hash_nhash, 0x1cc86e3);
1004 }
1005 
1006 /*
1007  *  stress_cpu_idct()
1008  *	compute 8x8 Inverse Discrete Cosine Transform
1009  */
stress_cpu_idct(const char * name)1010 static void HOT OPTIMIZE3 TARGET_CLONES stress_cpu_idct(const char *name)
1011 {
1012 	const double invsqrt2 = 1.0 / shim_sqrt(2.0);
1013 	const double pi_over_16 = (double)PI / 16.0;
1014 	const int sz = 8;
1015 	int i, j, u, v;
1016 	float data[sz][sz], idct[sz][sz];
1017 
1018 	/*
1019 	 *  Set up DCT
1020 	 */
1021 	for (i = 0; i < sz; i++) {
1022 		for (j = 0; j < sz; j++) {
1023 			data[i][j] = (i + j == 0) ? 2040: 0;
1024 		}
1025 	}
1026 	for (i = 0; i < sz; i++) {
1027 		const double pi_i = (i + i + 1) * pi_over_16;
1028 
1029 		for (j = 0; j < sz; j++) {
1030 			const double pi_j = (j + j + 1) * pi_over_16;
1031 			double sum = 0.0;
1032 
1033 			for (u = 0; u < sz; u++) {
1034 				const double cos_pi_i_u = shim_cos(pi_i * u);
1035 
1036 				for (v = 0; v < sz; v++) {
1037 					const double cos_pi_j_v =
1038 						shim_cos(pi_j * v);
1039 
1040 					sum += ((double)data[u][v] *
1041 						(u ? 1.0 : invsqrt2) *
1042 						(v ? 1.0 : invsqrt2) *
1043 						cos_pi_i_u * cos_pi_j_v);
1044 				}
1045 			}
1046 			idct[i][j] = (float)(0.25 * sum);
1047 		}
1048 	}
1049 	/* Final output should be a 8x8 matrix of values 255 */
1050 	if (g_opt_flags & OPT_FLAGS_VERIFY) {
1051 		for (i = 0; i < sz; i++) {
1052 			for (j = 0; j < sz; j++) {
1053 				if ((int)idct[i][j] != 255) {
1054 					pr_fail("%s: IDCT error detected, "
1055 						"IDCT[%d][%d] was %d, "
1056 						"expecting 255\n",
1057 						name, i, j, (int)idct[i][j]);
1058 				}
1059 			}
1060 			if (!keep_stressing_flag())
1061 				return;
1062 		}
1063 	}
1064 }
1065 
/*
 *  int_ops()
 *	one round of integer add, xor, shift, subtract, multiply,
 *	divide, or and and operations on a and b, mixing in the
 *	constants c1, c2, c3 and four stress_mwc32() values.
 *	a and b are modified in place and appear many times in the
 *	expansion, so they must be plain variables of type _type.
 *
 *	NOTE: the expansion already ends with a semicolon, so the
 *	macro is invoked without a trailing ';'
 */
#define int_ops(_type, a, b, c1, c2, c3)\
	do {				\
		a += b;			\
		b ^= a;			\
		a >>= 1;		\
		b <<= 2;		\
		b -= a;			\
		a ^= (_type)~0;		\
		b ^= ~(c1);		\
		a *= 3;			\
		b *= 7;			\
		a += 2;			\
		b -= 3;			\
		a /= 77;		\
		b /= 3;			\
		a <<= 1;		\
		b <<= 2;		\
		a |= 1;			\
		b |= 3;			\
		a *= stress_mwc32();	\
		b ^= stress_mwc32();	\
		a += stress_mwc32();	\
		b -= stress_mwc32();	\
		a /= 7;			\
		b /= 9;			\
		a |= (c2);		\
		b &= (c3);		\
	} while (0);
1094 
/* 64 bit constants passed as _c1, _c2 and _c3 to stress_cpu_int() */
#define C1 	(0xf0f0f0f0f0f0f0f0ULL)
#define C2	(0x1000100010001000ULL)
#define C3	(0xffeffffefebefffeULL)

/*
 *  Generic int stressor macro
 *
 *  Defines the function stress_cpu_int<_sz>() operating on the
 *  integer type _type. The function re-seeds the mwc generator,
 *  takes two random start values, runs 1000 rounds of int_ops()
 *  with _c1, _c2 and _c3 masked to the width of _type and, when
 *  verifying, fails if the final a and b differ from the expected
 *  values _a and _b.
 */
#define stress_cpu_int(_type, _sz, _a, _b, _c1, _c2, _c3)	\
static void HOT OPTIMIZE3 TARGET_CLONES stress_cpu_int ## _sz(const char *name)\
{								\
	const _type mask = (_type)~(_type)0;			\
	const _type a_final = _a;				\
	const _type b_final = _b;				\
	const _type c1 = _c1 & mask;				\
	const _type c2 = _c2 & mask;				\
	const _type c3 = _c3 & mask;				\
	register _type a, b;					\
	int i;							\
								\
	STRESS_MWC_SEED();					\
	a = (_type)stress_mwc32();				\
	b = (_type)stress_mwc32();				\
								\
	for (i = 0; i < 1000; i++) {				\
		int_ops(_type, a, b, c1, c2, c3)		\
	}							\
								\
	if ((g_opt_flags & OPT_FLAGS_VERIFY) &&			\
	    ((a != a_final) || (b != b_final)))			\
		pr_fail("%s: int" # _sz " error detected, " 	\
			"failed int" # _sz 			\
			" math operations\n", name);		\
}								\
1128 
/* For compilers that support int128 .. */
#if defined(HAVE_INT128_T)

/* Build a 128 bit constant from its upper and lower 64 bit halves */
#define _UINT128(hi, lo)	((((__uint128_t)hi << 64) | (__uint128_t)lo))

/* stress_cpu_int128(), with the 64 bit constants repeated in both halves */
stress_cpu_int(__uint128_t, 128,
	_UINT128(0x132af604d8b9183a,0x5e3af8fa7a663d74),
	_UINT128(0x62f086e6160e4e,0xd84c9f800365858),
	_UINT128(C1, C1), _UINT128(C2, C2), _UINT128(C3, C3))
#endif

/* stress_cpu_int64() */
stress_cpu_int(uint64_t, 64, \
	0x013f7f6dc1d79197cULL, 0x01863d2c6969a51ceULL,
	C1, C2, C3)

/* stress_cpu_int32() */
stress_cpu_int(uint32_t, 32, \
	0x1ce9b547UL, 0xa24b33aUL,
	C1, C2, C3)

/* stress_cpu_int16() */
stress_cpu_int(uint16_t, 16, \
	0x1871, 0x07f0,
	C1, C2, C3)

/* stress_cpu_int8() */
stress_cpu_int(uint8_t, 8, \
	0x12, 0x1a,
	C1, C2, C3)
1155 
/*
 *  float_ops()
 *	One round of dependent floating point arithmetic on the four
 *	lvalues va, vb, vc and vd of type _type: add, subtract,
 *	multiply, divide and calls to _sin and _cos, whose results
 *	are cast back to _type. Each statement uses values produced
 *	by earlier ones, so the order is significant.
 *
 *	_sin and _cos are pasted directly in front of the call
 *	parentheses and must not be wrapped in parentheses here.
 */
#define float_ops(_type, va, vb, vc, vd, _sin, _cos)	\
	do {						\
		va = va + vb;				\
		vb = va * vc;				\
		vc = va - vb;				\
		vd = va / vb;				\
		va = vc / (_type)0.1923L;		\
		vb = vc + va;				\
		vc = vb * (_type)3.12L;			\
		vd = vd + vb + (_type)_sin(va);		\
		va = (vb + vc) / vc;			\
		vb = vb * vc;				\
		vc = vc + (_type)1.0L;			\
		vd = vd - (_type)_sin(vc);		\
		va = va * (_type)_cos(vb);		\
		vb = vb + (_type)_cos(vc);		\
		vc = (_type)_sin(va + vb) / (_type)2.344L;	\
		vb = vd - (_type)1.0L;			\
	} while (0)
1175 
/*
 *  stress_cpu_fp()
 *	Generic floating point stressor macro. Expands to the
 *	definition of stress_cpu_<_name>(const char *name), which
 *	runs 1000 rounds of float_ops() on four values of type
 *	_type using _sin and _cos, then passes the sum of the four
 *	values to stress_double_put().
 *
 *	Two of the start values come from stress_mwc32(); the name
 *	argument of the generated function is unused.
 */
#define stress_cpu_fp(_type, _name, _sin, _cos)		\
static void HOT OPTIMIZE3 TARGET_CLONES stress_cpu_ ## _name(const char *name)\
{							\
	const uint32_t seed_b = stress_mwc32();		\
	const uint32_t seed_c = stress_mwc32();		\
	_type fa = (_type)0.18728L;			\
	_type fb = (_type)seed_b;			\
	_type fc = (_type)seed_c;			\
	_type fd = (_type)0.0;				\
	_type total;					\
	int round;					\
							\
	(void)name;					\
							\
	for (round = 0; round < 1000; round++) {	\
		float_ops(_type, fa, fb, fc, fd,	\
			_sin, _cos);			\
	}						\
	total = fa + fb + fc + fd;			\
	stress_double_put((double)total);		\
}
1200 
/* Floating point stressors for the three standard C floating types */
stress_cpu_fp(float, float, shim_sinf, shim_cosf)
stress_cpu_fp(double, double, shim_sin, shim_cos)
stress_cpu_fp(long double, longdouble, shim_sinl, shim_cosl)

/*
 *  Stressors for the extended floating point types. Each one is only
 *  generated when its HAVE_* macro is defined and the compiler is not
 *  clang.
 */
#if defined(HAVE_FLOAT_DECIMAL32) && !defined(__clang__)
stress_cpu_fp(_Decimal32, decimal32, shim_sinf, shim_cosf)
#endif

#if defined(HAVE_FLOAT_DECIMAL64) && !defined(__clang__)
stress_cpu_fp(_Decimal64, decimal64, shim_sin, shim_cos)
#endif

#if defined(HAVE_FLOAT_DECIMAL128) && !defined(__clang__)
stress_cpu_fp(_Decimal128, decimal128, shim_sinl, shim_cosl)
#endif

#if defined(HAVE_FLOAT16) && !defined(__clang__)
stress_cpu_fp(__fp16, float16, shim_sin, shim_cos)
#endif

#if defined(HAVE_FLOAT32) && !defined(__clang__)
stress_cpu_fp(_Float32, float32, shim_sin, shim_cos)
#endif

#if defined(HAVE_FLOAT64) && !defined(__clang__)
stress_cpu_fp(_Float64, float64, shim_sin, shim_cos)
#endif

#if defined(HAVE_FLOAT80) && !defined(__clang__)
stress_cpu_fp(__float80, float80, shim_sinl, shim_cosl)
#endif

#if defined(HAVE_FLOAT128) && !defined(__clang__)
stress_cpu_fp(__float128, float128, shim_sinl, shim_cosl)
#endif
1236 
/*
 *  FP()
 *	Glue a floating point literal suffix (f, l, or nothing at
 *	all) onto a literal value, e.g. FP(0.5, f) becomes 0.5f.
 */
#define FP(literal, suffix)	literal ## suffix
1239 
#if defined(HAVE_COMPLEX_H) && defined(HAVE_COMPLEX) &&	\
    defined(__STDC_IEC_559_COMPLEX__) && !defined(__UCLIBC__)
/*
 *  stress_cpu_complex()
 *	Generic complex stressor macro. Expands to the definition
 *	of stress_cpu_<_name>(const char *name), which runs 1000
 *	rounds of float_ops() on four complex values of type _type
 *	using _csin and _ccos, then passes the sum of the four
 *	values, converted to double, to stress_double_put().
 *
 *	_ltype is the literal suffix (f, l or empty) that FP()
 *	appends to the constant parts. Three stress_mwc32() values
 *	are drawn, in the order: real part of the second value,
 *	real part of the third value, imaginary part of the third
 *	value.
 */
#define stress_cpu_complex(_type, _ltype, _name, _csin, _ccos)	\
static void HOT OPTIMIZE3 TARGET_CLONES stress_cpu_ ## _name(const char *name)\
{								\
	const uint32_t rnd_b = stress_mwc32();			\
	const uint32_t rnd_c = stress_mwc32();			\
	_type cI = (_type)I;					\
	_type za = FP(0.18728, _ltype) +			\
		cI * FP(0.2762, _ltype);			\
	_type zb = (_type)rnd_b - cI * FP(0.11121, _ltype);	\
	_type zc = (_type)rnd_c + cI * stress_mwc32();		\
	_type zd = (_type)0.0;					\
	_type zsum;						\
	int round;						\
								\
	(void)name;						\
								\
	for (round = 0; round < 1000; round++) {		\
		float_ops(_type, za, zb, zc, zd,		\
			_csin, _ccos);				\
	}							\
	zsum = za + zb + zc + zd;				\
	stress_double_put((double)zsum);			\
}

/* One stressor per standard complex type */
stress_cpu_complex(complex float, f, complex_float, shim_csinf, shim_ccosf)
stress_cpu_complex(complex double, , complex_double, shim_csin, shim_ccos)
stress_cpu_complex(complex long double, l, complex_long_double, shim_csinl, shim_ccosl)
#endif
1275 
/*
 *  int_float_ops()
 *	One round of interleaved integer and floating point work:
 *	integer statements on the lvalues ia and ib (of type _itype)
 *	are alternated with floating point statements on the lvalues
 *	fa, fb, fc and fd (of type _ftype). Four stress_mwc32()
 *	values are consumed per round by the integer statements.
 *
 *	_c1, _c2 and _c3 are bit patterns: the inverse of _c1 is
 *	XORed into ib, _c2 is ORed into ia and _c3 is ANDed into ib.
 *
 *	_sin and _cos are pasted directly in front of the call
 *	parentheses; callers in this file pass cast-prefixed names
 *	such as (_Decimal32)shim_sinf, so they must not be wrapped
 *	in parentheses here.
 */
#define int_float_ops(_ftype, fa, fb, fc, fd,			\
	_sin, _cos, _itype, ia, ib, _c1, _c2, _c3)		\
	do {							\
		ia += ib;					\
		ib ^= ia;					\
		fa = fa + fb;					\
		ia >>= 1;					\
		ib <<= 2;					\
		fb = fa * fc;					\
		ib -= ia;					\
		ia ^= ~(_itype)0;				\
		fc = fa - fb;					\
		ib ^= ~(_c1);					\
		ia *= 3;					\
		fd = fa / fb;					\
		ib *= 7;					\
		ia += 2;					\
		fa = fc / (_ftype)0.1923L;			\
		ib -= 3;					\
		ia /= 77;					\
		fb = fc + fa;					\
		ib /= 3;					\
		ia <<= 1;					\
		fc = fb * (_ftype)3.12L;			\
		ib <<= 2;					\
		ia |= 1;					\
		fd = fd + fb + (_ftype)_sin(fa);		\
		ib |= 3;					\
		ia *= stress_mwc32();				\
		fa = (fb + fc) / fc;				\
		ib ^= stress_mwc32();				\
		ia += stress_mwc32();				\
		fb = fb * fc;					\
		ib -= stress_mwc32();				\
		ia /= 7;					\
		fc = fc + (_ftype)1.0L;				\
		ib /= 9;					\
		fd = fd - (_ftype)_sin(fc);			\
		ia |= (_c2);					\
		fa = fa * (_ftype)_cos(fb);			\
		fb = fb + (_ftype)_cos(fc);			\
		ib &= (_c3);					\
		fc = (_ftype)_sin(fa + fb) / (_ftype)2.344L;	\
		fb = fd - (_ftype)1.0L;				\
	} while (0)
1321 
1322 
/*
 *  stress_cpu_int_fp()
 *	Generic integer and floating point stressor macro. Expands
 *	to the definition of
 *	stress_cpu_int<_sz>_<_name>(const char *name), which runs
 *	1000 rounds of int_float_ops() on two integers of type
 *	_inttype and four floating point values of type _ftype.
 *
 *	_a, _b	expected final values of the two integers
 *	_c1, _c2, _c3
 *		bit patterns for int_float_ops(), truncated to
 *		_inttype
 *	_sinf, _cosf
 *		sine and cosine functions; passed on unparenthesised
 *		because callers may prefix them with a cast
 *
 *	The two random floating point start values are drawn before
 *	STRESS_MWC_SEED() is called, the two integer start values
 *	after it. Only the integers are compared with _a/_b when
 *	OPT_FLAGS_VERIFY is set; the sum of the floating point
 *	values is passed to stress_double_put().
 *
 *	The value arguments are parenthesised so that an expression
 *	argument cannot be re-associated by the surrounding '&'.
 */
#define stress_cpu_int_fp(_inttype, _sz, _ftype, _name, _a, _b, \
	_c1, _c2, _c3, _sinf, _cosf)				\
static void HOT OPTIMIZE3 TARGET_CLONES stress_cpu_int ## _sz ## _ ## _name(const char *name)\
{								\
	int i;							\
	_inttype int_a, int_b;					\
	const _inttype mask = (_inttype)~(_inttype)0;		\
	const _inttype a_final = (_a);				\
	const _inttype b_final = (_b);				\
	const _inttype c1 = (_c1) & mask;			\
	const _inttype c2 = (_c2) & mask;			\
	const _inttype c3 = (_c3) & mask;			\
	const uint32_t r1 = stress_mwc32(),			\
		       r2 = stress_mwc32();			\
	_ftype flt_a = (_ftype)0.18728L,			\
	       flt_b = (_ftype)r1,				\
	       flt_c = (_ftype)r2,				\
	       flt_d = (_ftype)0.0,				\
	       flt_r;						\
								\
	STRESS_MWC_SEED();					\
	int_a = stress_mwc32();					\
	int_b = stress_mwc32();					\
								\
	for (i = 0; i < 1000; i++) {				\
		int_float_ops(_ftype, flt_a, flt_b, flt_c, 	\
			flt_d,_sinf, _cosf, _inttype,		\
			int_a, int_b, c1, c2, c3);		\
	}							\
	if ((g_opt_flags & OPT_FLAGS_VERIFY) &&			\
	    ((int_a != a_final) || (int_b != b_final)))		\
		pr_fail("%s: int" # _sz " error detected, "	\
			"failed int" # _sz " " # _ftype		\
			" math operations\n", name);		\
								\
	flt_r = flt_a + flt_b + flt_c + flt_d;			\
	stress_double_put((double)flt_r);			\
}
1364 
1365 stress_cpu_int_fp(uint32_t, 32, float, float,
1366 	0x1ce9b547UL, 0xa24b33aUL,
1367 	C1, C2, C3, shim_sinf, shim_cosf)
1368 stress_cpu_int_fp(uint32_t, 32, double, double,
1369 	0x1ce9b547UL, 0xa24b33aUL,
1370 	C1, C2, C3, shim_sin, shim_cos)
1371 stress_cpu_int_fp(uint32_t, 32, long double, longdouble,
1372 	0x1ce9b547UL, 0xa24b33aUL,
1373 	C1, C2, C3, shim_sinl, shim_cosl)
1374 stress_cpu_int_fp(uint64_t, 64, float, float,
1375 	0x13f7f6dc1d79197cULL, 0x1863d2c6969a51ceULL,
1376 	C1, C2, C3, shim_sinf, shim_cosf)
1377 stress_cpu_int_fp(uint64_t, 64, double, double,
1378 	0x13f7f6dc1d79197cULL, 0x1863d2c6969a51ceULL,
1379 	C1, C2, C3, shim_sin, shim_cos)
1380 stress_cpu_int_fp(uint64_t, 64, long double, longdouble,
1381 	0x13f7f6dc1d79197cULL, 0x1863d2c6969a51ceULL,
1382 	C1, C2, C3, shim_sinl, shim_cosl)
1383 
1384 #if defined(HAVE_INT128_T)
1385 stress_cpu_int_fp(__uint128_t, 128, float, float,
1386 	_UINT128(0x132af604d8b9183a,0x5e3af8fa7a663d74),
1387 	_UINT128(0x0062f086e6160e4e,0x0d84c9f800365858),
1388 	_UINT128(C1, C1), _UINT128(C2, C2), _UINT128(C3, C3),
1389 	shim_sinf, shim_cosf)
1390 stress_cpu_int_fp(__uint128_t, 128, double, double,
1391 	_UINT128(0x132af604d8b9183a,0x5e3af8fa7a663d74),
1392 	_UINT128(0x0062f086e6160e4e,0x0d84c9f800365858),
1393 	_UINT128(C1, C1), _UINT128(C2, C2), _UINT128(C3, C3),
1394 	shim_sin, shim_cos)
1395 stress_cpu_int_fp(__uint128_t, 128, long double, longdouble,
1396 	_UINT128(0x132af604d8b9183a,0x5e3af8fa7a663d74),
1397 	_UINT128(0x0062f086e6160e4e,0x0d84c9f800365858),
1398 	_UINT128(C1, C1), _UINT128(C2, C2), _UINT128(C3, C3),
1399 	shim_sinl, shim_cosl)
1400 #if defined(HAVE_FLOAT_DECIMAL32) &&	\
1401     !defined(__clang__)
1402 stress_cpu_int_fp(__uint128_t, 128, _Decimal32, decimal32,
1403 	_UINT128(0x132af604d8b9183a,0x5e3af8fa7a663d74),
1404 	_UINT128(0x0062f086e6160e4e,0x0d84c9f800365858),
1405 	_UINT128(C1, C1), _UINT128(C2, C2), _UINT128(C3, C3),
1406 	(_Decimal32)shim_sinf, (_Decimal32)shim_cosf)
1407 #endif
1408 #if defined(HAVE_FLOAT_DECIMAL64) &&	\
1409     !defined(__clang__)
1410 stress_cpu_int_fp(__uint128_t, 128, _Decimal64, decimal64,
1411 	_UINT128(0x132af604d8b9183a,0x5e3af8fa7a663d74),
1412 	_UINT128(0x0062f086e6160e4e,0x0d84c9f800365858),
1413 	_UINT128(C1, C1), _UINT128(C2, C2), _UINT128(C3, C3),
1414 	(_Decimal64)shim_sin, (_Decimal64)shim_cos)
1415 #endif
1416 #if defined(HAVE_FLOAT_DECIMAL128) &&	\
1417     !defined(__clang__)
1418 stress_cpu_int_fp(__uint128_t, 128, _Decimal128, decimal128,
1419 	_UINT128(0x132af604d8b9183a,0x5e3af8fa7a663d74),
1420 	_UINT128(0x0062f086e6160e4e,0x0d84c9f800365858),
1421 	_UINT128(C1, C1), _UINT128(C2, C2), _UINT128(C3, C3),
1422 	(_Decimal128)shim_sinl, (_Decimal128)shim_cosl)
1423 #endif
1424 #endif
1425 
1426 /*
1427  *  stress_cpu_rgb()
1428  *	CCIR 601 RGB to YUV to RGB conversion
1429  */
1430 static void HOT OPTIMIZE3 TARGET_CLONES stress_cpu_rgb(const char *name)
1431 {
1432 	int i;
1433 	uint32_t rgb = stress_mwc32() & 0xffffff;
1434 	uint8_t r = (uint8_t)(rgb >> 16);
1435 	uint8_t g = (uint8_t)(rgb >> 8);
1436 	uint8_t b = (uint8_t)rgb;
1437 
1438 	(void)name;
1439 
1440 	/* Do a 1000 colours starting from the rgb seed */
1441 	for (i = 0; i < 1000; i++) {
1442 		float y, u, v;
1443 
1444 		/* RGB to CCIR 601 YUV */
1445 		y = (0.299f * r) + (0.587f * g) + (0.114f * b);
1446 		u = (b - y) * 0.565f;
1447 		v = (r - y) * 0.713f;
1448 
1449 		/* YUV back to RGB */
1450 		r = (uint8_t)(y + (1.403f * v));
1451 		g = (uint8_t)(y - (0.344f * u) - (0.714f * v));
1452 		b = (uint8_t)(y + (1.770f * u));
1453 
1454 		/* And bump each colour to make next round */
1455 		r += 1;
1456 		g += 2;
1457 		b += 3;
1458 		stress_uint64_put(r + g + b);
1459 	}
1460 }
1461 
1462 /*
1463  *  stress_cpu_matrix_prod(void)
1464  *	matrix product
1465  */
stress_cpu_matrix_prod(const char * name)1466 static void HOT OPTIMIZE3 TARGET_CLONES stress_cpu_matrix_prod(const char *name)
1467 {
1468 	int i, j, k;
1469 
1470 	static long double a[MATRIX_PROD_SIZE][MATRIX_PROD_SIZE],
1471 		    	   b[MATRIX_PROD_SIZE][MATRIX_PROD_SIZE],
1472 		    	   r[MATRIX_PROD_SIZE][MATRIX_PROD_SIZE];
1473 	long double v = 1 / (long double)((uint32_t)~0);
1474 	long double sum = 0.0L;
1475 
1476 	(void)name;
1477 
1478 	for (i = 0; i < MATRIX_PROD_SIZE; i++) {
1479 		for (j = 0; j < MATRIX_PROD_SIZE; j++) {
1480 			const uint32_t r1 = stress_mwc32();
1481 			const uint32_t r2 = stress_mwc32();
1482 
1483 			a[i][j] = (long double)r1 * v;
1484 			b[i][j] = (long double)r2 * v;
1485 			r[i][j] = 0.0L;
1486 		}
1487 	}
1488 
1489 	for (i = 0; i < MATRIX_PROD_SIZE; i++) {
1490 		for (j = 0; j < MATRIX_PROD_SIZE; j++) {
1491 			for (k = 0; k < MATRIX_PROD_SIZE; k++) {
1492 				r[i][j] += a[i][k] * b[k][j];
1493 			}
1494 		}
1495 	}
1496 
1497 	for (i = 0; i < MATRIX_PROD_SIZE; i++)
1498 		for (j = 0; j < MATRIX_PROD_SIZE; j++)
1499 			sum += r[i][j];
1500 	stress_long_double_put(sum);
1501 }
1502 
1503 /*
1504  *   stress_cpu_fibonacci()
1505  *	compute fibonacci series
1506  */
stress_cpu_fibonacci(const char * name)1507 static void HOT OPTIMIZE3 stress_cpu_fibonacci(const char *name)
1508 {
1509 	const uint64_t fn_res = 0xa94fad42221f2702ULL;
1510 	register uint64_t f1 = 0, f2 = 1, fn;
1511 
1512 	do {
1513 		fn = f1 + f2;
1514 		f1 = f2;
1515 		f2 = fn;
1516 	} while (!(fn & 0x8000000000000000ULL));
1517 
1518 	if ((g_opt_flags & OPT_FLAGS_VERIFY) && (fn_res != fn))
1519 		pr_fail("%s: fibonacci error detected, summation "
1520 			"or assignment failure\n", name);
1521 }
1522 
1523 /*
1524  *  stress_cpu_psi
1525  *	compute the constant psi,
1526  * 	the reciprocal Fibonacci constant
1527  */
stress_cpu_psi(const char * name)1528 static void HOT OPTIMIZE3 stress_cpu_psi(const char *name)
1529 {
1530 	long double f1 = 0.0L, f2 = 1.0L;
1531 	long double psi = 0.0L, last_psi;
1532 	long double precision = 1.0e-20L;
1533 	int i = 0;
1534 	const int max_iter = 100;
1535 
1536 	do {
1537 		long double fn = f1 + f2;
1538 		f1 = f2;
1539 		f2 = fn;
1540 		last_psi = psi;
1541 		psi += 1.0L / f1;
1542 		i++;
1543 	} while ((i < max_iter) && (shim_fabsl(psi - last_psi) > precision));
1544 
1545 	if (g_opt_flags & OPT_FLAGS_VERIFY) {
1546 		if (shim_fabsl(psi - PSI) > 1.0e-15L)
1547 			pr_fail("%s: calculation of reciprocal "
1548 				"Fibonacci constant phi not as accurate "
1549 				"as expected\n", name);
1550 		if (i >= max_iter)
1551 			pr_fail("%s: calculation of reciprocal "
1552 				"Fibonacci constant took more iterations "
1553 				"than expected\n", name);
1554 	}
1555 
1556 	stress_long_double_put(psi);
1557 }
1558 
1559 /*
1560  *   stress_cpu_ln2
1561  *	compute ln(2) using series
1562  */
stress_cpu_ln2(const char * name)1563 static void HOT OPTIMIZE3 TARGET_CLONES stress_cpu_ln2(const char *name)
1564 {
1565 	long double ln2 = 0.0L, last_ln2 = 0.0L;
1566 	long double precision = 1.0e-7L;
1567 	register int n = 1;
1568 	const int max_iter = 10000;
1569 
1570 	/* Not the fastest converging series */
1571 	do {
1572 		last_ln2 = ln2;
1573 		/* Unroll, do several ops */
1574 		ln2 += (long double)1.0L / (long double)n++;
1575 		ln2 -= (long double)1.0L / (long double)n++;
1576 		ln2 += (long double)1.0L / (long double)n++;
1577 		ln2 -= (long double)1.0L / (long double)n++;
1578 		ln2 += (long double)1.0L / (long double)n++;
1579 		ln2 -= (long double)1.0L / (long double)n++;
1580 		ln2 += (long double)1.0L / (long double)n++;
1581 		ln2 -= (long double)1.0L / (long double)n++;
1582 	} while ((n < max_iter) && (shim_fabsl(ln2 - last_ln2) > precision));
1583 
1584 	if ((g_opt_flags & OPT_FLAGS_VERIFY) && (n >= max_iter))
1585 		pr_fail("%s: calculation of ln(2) took more "
1586 			"iterations than expected\n", name);
1587 
1588 	stress_long_double_put(ln2);
1589 }
1590 
/*
 *  ackermann()
 *	naive recursive evaluation of the two argument Ackermann
 *	function:
 *	  A(0, n) = n + 1
 *	  A(m, 0) = A(m - 1, 1)
 *	  A(m, n) = A(m - 1, A(m, n - 1))
 */
static uint32_t HOT ackermann(const uint32_t m, const uint32_t n)
{
	if (m == 0)
		return n + 1;

	return ackermann(m - 1, (n == 0) ? 1 : ackermann(m, n - 1));
}
1604 
1605 /*
1606  *   stress_cpu_ackermann
1607  *	compute ackermann function
1608  */
stress_cpu_ackermann(const char * name)1609 static void stress_cpu_ackermann(const char *name)
1610 {
1611 	uint32_t a = ackermann(3, 7);
1612 
1613 	if ((g_opt_flags & OPT_FLAGS_VERIFY) && (a != 0x3fd))
1614 		pr_fail("%s: ackermann error detected, "
1615 			"ackermann(3,9) miscalculated\n", name);
1616 }
1617 
1618 /*
1619  *   stress_cpu_explog
1620  *	compute exp(log(n))
1621  */
stress_cpu_explog(const char * name)1622 static void HOT stress_cpu_explog(const char *name)
1623 {
1624 	uint32_t i;
1625 	double n = 1e6;
1626 
1627 	(void)name;
1628 
1629 	for (i = 1; i < 100000; i++)
1630 		n = exp(log(n) / 1.00002);
1631 }
1632 
/*
 *  JMP()
 *	Compare var with rhs using the operator cmp and assign
 *	if_true or if_false to var accordingly. This is written as
 *	an if/else rather than var = (var cmp rhs) ? x : y because
 *	the ternary form may be optimised down; the intent is to
 *	force a compare and jmp with -O0.
 *
 *	Note that the macro also reads the variables next and i of
 *	the calling scope by name.
 */
#define JMP(var, cmp, rhs, if_true, if_false)	\
do {						\
	if (var cmp rhs)			\
		var = if_true;			\
	else					\
		var = if_false;			\
	stress_uint32_put((uint32_t)(next + i));\
} while (0)
1646 
1647 /*
1648  *   stress_cpu_jmp
1649  *	jmp conditionals
1650  */
stress_cpu_jmp(const char * name)1651 static void HOT OPTIMIZE0 stress_cpu_jmp(const char *name)
1652 {
1653 	register int i, next = 0;
1654 
1655 	(void)name;
1656 
1657 	for (i = 1; i < 1000; i++) {
1658 		/* Force lots of compare jmps */
1659 		JMP(next, ==, 1, 2, 3);
1660 		JMP(next, >, 2, 0, 1);
1661 		JMP(next, <, 1, 1, 0);
1662 		JMP(next, ==, 1, 2, 3);
1663 		JMP(next, >, 2, 0, 1);
1664 		JMP(next, <, 1, 1, 0);
1665 		JMP(next, ==, 1, 2, 3);
1666 		JMP(next, >, 2, 0, 1);
1667 		JMP(next, <, 1, 1, 0);
1668 		JMP(next, ==, 1, 2, 3);
1669 		JMP(next, >, 2, 0, 1);
1670 		JMP(next, <, 1, 1, 0);
1671 	}
1672 }
1673 
/*
 *  ccitt_crc16()
 *	naive bit-at-a-time CCITT CRC16 of the n bytes at data.
 *
 *	The CCITT polynomial x^16 + x^12 + x^5 + 1 is 0x11021;
 *	the code uses its bit-reversed form 0x8408 with the top
 *	bit dropped, which is why a 17 bit polynomial fits in a
 *	16 bit value. The register starts at 0xffff, each byte is
 *	consumed least significant bit first, and the final value
 *	is inverted and byte swapped.
 *
 *	Returns 0 when n is 0.
 */
static uint16_t HOT OPTIMIZE3 ccitt_crc16(const uint8_t *data, size_t n)
{
	const uint16_t polynomial = 0x8408;
	register uint16_t crc = 0xffff;
	size_t idx;

	if (n == 0)
		return 0;

	for (idx = 0; idx < n; idx++) {
		uint8_t byte = data[idx];
		int bit;

		for (bit = 0; bit < 8; bit++) {
			/* Decide on the XOR before the register is shifted */
			if ((byte ^ crc) & 1)
				crc = (uint16_t)((crc >> 1) ^ polynomial);
			else
				crc >>= 1;
			byte >>= 1;
		}
	}

	crc = (uint16_t)~crc;
	return (uint16_t)((crc << 8) | (crc >> 8));
}
1712 
/*
 *   stress_cpu_crc16
 *	fill a 1024 byte buffer using random_buffer() and compute
 *	the CCITT CRC16 of each of its leading 1 to 1023 bytes,
 *	passing every result to stress_uint64_put(). The name
 *	argument is unused.
 */
static void stress_cpu_crc16(const char *name)
{
	uint8_t buffer[1024];
	size_t len;

	(void)name;

	random_buffer(buffer, sizeof(buffer));
	for (len = 1; len < sizeof(buffer); len++) {
		const uint16_t crc = ccitt_crc16(buffer, len);

		stress_uint64_put(crc);
	}
}
1728 
/*
 *  fletcher16
 *	naive Fletcher-16 checksum of the len bytes at data: two
 *	running sums, each reduced modulo 255 after every byte.
 *	The second sum forms the high byte of the result and the
 *	first sum the low byte.
 */
static uint16_t HOT OPTIMIZE3 fletcher16(const uint8_t *data, const size_t len)
{
	register uint16_t low = 0, high = 0;
	register const uint8_t *ptr = data;
	register size_t remaining;

	for (remaining = len; remaining > 0; remaining--) {
		low = (uint16_t)((low + *ptr++) % 255);
		high = (uint16_t)((high + low) % 255);
	}
	return (uint16_t)((high << 8) | low);
}
1744 
/*
 *   stress_cpu_fletcher16()
 *	fill a 1024 byte buffer using random_buffer() and compute
 *	the fletcher16 checksum of each of its leading 1 to 1023
 *	bytes, passing every result to stress_uint16_put(). The
 *	name argument is unused.
 */
static void stress_cpu_fletcher16(const char *name)
{
	uint8_t buffer[1024];
	size_t len;

	(void)name;

	random_buffer(buffer, sizeof(buffer));
	for (len = 1; len < sizeof(buffer); len++) {
		const uint16_t checksum = fletcher16(buffer, len);

		stress_uint16_put(checksum);
	}
}
1760 
/*
 *   stress_cpu_ipv4checksum
 *	fill a buffer of 512 16 bit words (1024 bytes) using
 *	random_buffer() and call stress_ipv4_checksum() on it with
 *	every size from 1 to 1023, passing each result to
 *	stress_uint16_put(). The name argument is unused.
 *
 *	NOTE(review): the size runs up to the buffer size in bytes,
 *	so this assumes stress_ipv4_checksum() takes a byte count
 *	rather than a word count - confirm against its definition.
 */
static void stress_cpu_ipv4checksum(const char *name)
{
	uint16_t buffer[512];
	size_t size;

	(void)name;

	random_buffer((uint8_t *)buffer, sizeof(buffer));
	for (size = 1; size < sizeof(buffer); size++) {
		stress_uint16_put(stress_ipv4_checksum(buffer, size));
	}
}
1776 
1777 #if defined(HAVE_COMPLEX_H) &&		\
1778     defined(HAVE_COMPLEX) &&		\
1779     defined(__STDC_IEC_559_COMPLEX__) &&\
1780     !defined(__UCLIBC__)
1781 /*
1782  *  zeta()
1783  *	Riemann zeta function
1784  */
zeta(const long double complex s,long double precision)1785 static inline long double complex HOT OPTIMIZE3 zeta(
1786 	const long double complex s,
1787 	long double precision)
1788 {
1789 	int i = 1;
1790 	long double complex z = 0.0L, zold = 0.0L;
1791 
1792 	do {
1793 		double complex pwr = shim_cpow(i++, (complex double)s);
1794 		zold = z;
1795 		z += 1 / (long double complex)pwr;
1796 	} while (shim_cabsl(z - zold) > precision);
1797 
1798 	return z;
1799 }
1800 
/*
 * stress_cpu_zeta()
 *	evaluate Zeta(2.0) through Zeta(10.0) to a precision of
 *	0.00000001 and pass each result, converted to long
 *	double, to stress_long_double_put(). The name argument is
 *	unused.
 */
static void stress_cpu_zeta(const char *name)
{
	const long double precision = 0.00000001L;
	int s;

	(void)name;

	for (s = 2; s <= 10; s++) {
		const long double complex z =
			zeta((long double complex)s, precision);

		stress_long_double_put((long double)z);
	}
}
1818 #endif
1819 
1820 /*
1821  * stress_cpu_gamma()
1822  *	stress Euler–Mascheroni constant gamma
1823  */
stress_cpu_gamma(const char * name)1824 static void HOT OPTIMIZE3 stress_cpu_gamma(const char *name)
1825 {
1826 	long double precision = 1.0e-10L;
1827 	long double sum = 0.0L, k = 1.0L, _gamma = 0.0L, gammaold;
1828 
1829 	do {
1830 		gammaold = _gamma;
1831 		sum += 1.0L / k;
1832 		_gamma = sum - shim_logl(k);
1833 		k += 1.0L;
1834 	} while ((k < 1e6L) && shim_fabsl(_gamma - gammaold) > precision);
1835 
1836 	stress_long_double_put(_gamma);
1837 
1838 	if (g_opt_flags & OPT_FLAGS_VERIFY) {
1839 		if (shim_fabsl(_gamma - GAMMA) > 1.0e-5L)
1840 			pr_fail("%s: calculation of Euler-Mascheroni "
1841 				"constant not as accurate as expected\n", name);
1842 		if (k > 80000.0L)
1843 			pr_fail("%s: calculation of Euler-Mascheroni "
1844 				"constant took more iterations than "
1845 				"expected\n", name);
1846 	}
1847 
1848 }
1849 
1850 /*
1851  * stress_cpu_correlate()
1852  *
1853  *  Introduction to Signal Processing,
1854  *  Prentice-Hall, 1995, ISBN: 0-13-209172-0.
1855  */
stress_cpu_correlate(const char * name)1856 static void HOT OPTIMIZE3 stress_cpu_correlate(const char *name)
1857 {
1858 	size_t i, j;
1859 	double data_average = 0.0;
1860 	static double data[CORRELATE_DATA_LEN];
1861 	static double corr[CORRELATE_LEN + 1];
1862 
1863 	(void)name;
1864 
1865 	/* Generate some random data */
1866 	for (i = 0; i < CORRELATE_DATA_LEN; i++) {
1867 		const uint64_t r = stress_mwc64();
1868 
1869 		data[i] = (double)r;
1870 		data_average += data[i];
1871 	}
1872 	data_average /= (double)CORRELATE_DATA_LEN;
1873 
1874 	/* And correlate */
1875 	for (i = 0; i <= CORRELATE_LEN; i++) {
1876 		corr[i] = 0.0;
1877 		for (j = 0; j < CORRELATE_DATA_LEN - i; j++) {
1878 			corr[i] += (data[i + j] - data_average) *
1879 				   (data[j] - data_average);
1880 		}
1881 		corr[i] /= (double)CORRELATE_LEN;
1882 		stress_double_put(corr[i]);
1883 	}
1884 }
1885 
1886 
1887 /*
1888  * stress_cpu_sieve()
1889  * 	slightly optimised Sieve of Eratosthenes
1890  */
stress_cpu_sieve(const char * name)1891 static void HOT OPTIMIZE3 stress_cpu_sieve(const char *name)
1892 {
1893 	const double dsqrt = shim_sqrt(SIEVE_SIZE);
1894 	const uint32_t nsqrt = (uint32_t)dsqrt;
1895 	static uint32_t sieve[(SIEVE_SIZE + 31) / 32];
1896 	uint32_t i, j;
1897 
1898 	(void)memset(sieve, 0xff, sizeof(sieve));
1899 	for (i = 2; i < nsqrt; i++)
1900 		if (STRESS_GETBIT(sieve, i))
1901 			for (j = i * i; j < SIEVE_SIZE; j += i)
1902 				STRESS_CLRBIT(sieve, j);
1903 
1904 	/* And count up number of primes */
1905 	for (j = 0, i = 2; i < SIEVE_SIZE; i++) {
1906 		if (STRESS_GETBIT(sieve, i))
1907 			j++;
1908 	}
1909 	if ((g_opt_flags & OPT_FLAGS_VERIFY) && (j != 10000))
1910 		pr_fail("%s: sieve error detected, number of "
1911 			"primes has been miscalculated\n", name);
1912 }
1913 
1914 /*
1915  *  is_prime()
1916  *	return true if n is prime
1917  *	http://en.wikipedia.org/wiki/Primality_test
1918  */
is_prime(uint32_t n)1919 static inline HOT OPTIMIZE3 ALWAYS_INLINE uint32_t is_prime(uint32_t n)
1920 {
1921 	register uint32_t i, max;
1922 	double dsqrt;
1923 
1924 	if (UNLIKELY(n <= 3))
1925 		return n >= 2;
1926 	if ((n % 2 == 0) || (n % 3 == 0))
1927 		return 0;
1928 
1929 	dsqrt = shim_sqrt(n);
1930 	max = (uint32_t)dsqrt + 1;
1931 	for (i = 5; i < max; i+= 6)
1932 		if ((n % i == 0) || (n % (i + 2) == 0))
1933 			return 0;
1934 	return 1;
1935 }
1936 
1937 /*
1938  *  stress_cpu_prime()
1939  *
1940  */
stress_cpu_prime(const char * name)1941 static void stress_cpu_prime(const char *name)
1942 {
1943 	uint32_t i, nprimes = 0;
1944 
1945 	for (i = 0; i < SIEVE_SIZE; i++) {
1946 		nprimes += is_prime(i);
1947 	}
1948 
1949 	if ((g_opt_flags & OPT_FLAGS_VERIFY) && (nprimes != 10000))
1950 		pr_fail("%s: prime error detected, number of primes "
1951 			"has been miscalculated\n", name);
1952 }
1953 
1954 /*
1955  *  stress_cpu_gray()
1956  *	compute gray codes
1957  */
stress_cpu_gray(const char * name)1958 static void HOT OPTIMIZE3 TARGET_CLONES stress_cpu_gray(const char *name)
1959 {
1960 	register uint32_t i;
1961 	register uint64_t sum = 0;
1962 
1963 	for (i = 0; i < 0x10000; i++) {
1964 		register uint32_t gray_code;
1965 
1966 		/* Binary to Gray code */
1967 		gray_code = (i >> 1) ^ i;
1968 		sum += gray_code;
1969 
1970 		/* Gray code back to binary */
1971 #if 0
1972 		{
1973 			/* Slow iterative method */
1974 			register uint32_t mask;
1975 
1976 			for (mask = gray_code >> 1; mask; mask >>= 1)
1977 				gray_code ^= mask;
1978 		}
1979 #else
1980 		/* Fast non-loop method */
1981 		gray_code ^= (gray_code >> 1);
1982 		gray_code ^= (gray_code >> 2);
1983 		gray_code ^= (gray_code >> 4);
1984 		gray_code ^= (gray_code >> 8);
1985 		gray_code ^= (gray_code >> 16);
1986 #endif
1987 		sum += gray_code;
1988 	}
1989 	if ((g_opt_flags & OPT_FLAGS_VERIFY) && (sum != 0xffff0000))
1990 		pr_fail("%s: gray code error detected, sum of gray "
1991 			"codes between 0x00000 and 0x10000 miscalculated\n",
1992 			name);
1993 }
1994 
/*
 * hanoi()
 *	recursive Towers of Hanoi step for n discs between the
 *	pegs p1, p2 and p3. No discs are actually moved; the
 *	function returns 1 for n == 0 and otherwise the sum of its
 *	two recursive calls, i.e. 2 to the power n.
 */
static uint32_t HOT hanoi(
	const uint16_t n,
	const char p1,
	const char p2,
	const char p3)
{
	uint32_t moves;

	if (UNLIKELY(n == 0))
		return 1;	/* Move p1 -> p2 */

	moves = hanoi(n - 1, p1, p3, p2);
	/* Move p1 -> p2 */
	moves += hanoi(n - 1, p3, p2, p1);
	return moves;
}
2015 
2016 /*
2017  *  stress_cpu_hanoi
2018  *	stress with recursive Towers of Hanoi
2019  */
stress_cpu_hanoi(const char * name)2020 static void stress_cpu_hanoi(const char *name)
2021 {
2022 	uint32_t n = hanoi(20, 'X', 'Y', 'Z');
2023 
2024 	if ((g_opt_flags & OPT_FLAGS_VERIFY) && (n != 1048576))
2025 		pr_fail("%s: number of hanoi moves different from "
2026 			"the expected number\n", name);
2027 
2028 	stress_uint64_put(n);
2029 }
2030 
2031 /*
2032  *  stress_floatconversion
2033  *	exercise conversion to/from different floating point values
2034  */
stress_cpu_floatconversion(const char * name)2035 static void TARGET_CLONES stress_cpu_floatconversion(const char *name)
2036 {
2037 	float f_sum = 0.0;
2038 	double d_sum = 0.0;
2039 	long double ld_sum = 0.0L;
2040 	register uint32_t i, j_sum = 0;
2041 
2042 	(void)name;
2043 
2044 	for (i = 0; i < 65536; i++) {
2045 		float f;
2046 		double d;
2047 		long double ld;
2048 
2049 		f = (float)i;
2050 		d = (double)f;
2051 		ld = (long double)d;
2052 
2053 		f_sum += f;
2054 		d_sum += d;
2055 		ld_sum += ld;
2056 		j_sum += (uint32_t)ld;
2057 
2058 		f = (float)(double)i;
2059 		f_sum += f;
2060 		f = (float)(long double)i;
2061 		f_sum += f;
2062 		f = (float)(double)(long double)i;
2063 		f_sum += f;
2064 		f = (float)(long double)(double)i;
2065 		f_sum += f;
2066 
2067 		d = (double)(long double)f;
2068 		d_sum += d;
2069 		d = (double)(float)f;
2070 		d_sum += d;
2071 		d = (double)(long double)(float)f;
2072 		d_sum += d;
2073 		d = (double)(float)(long double)f;
2074 		d_sum += d;
2075 
2076 		ld = (long double)(float)d;
2077 		ld_sum += ld;
2078 		ld = (long double)(double)d;
2079 		ld_sum += ld;
2080 		ld = (long double)(float)(double)d;
2081 		ld_sum += ld;
2082 		ld = (long double)(double)(float)d;
2083 		ld_sum += ld;
2084 	}
2085 	stress_long_double_put(ld_sum);
2086 	stress_double_put(d_sum);
2087 	stress_float_put(f_sum);
2088 	stress_uint32_put(j_sum);
2089 }
2090 
/*
 *  stress_cpu_intconversion
 *	exercise conversion to/from different int values: three
 *	running sums of 16, 32 and 64 bits start from random
 *	values and, for each integer from 0 to 65535, are updated
 *	with values produced by chains of negations and casts
 *	between signed and unsigned 16, 32 and 64 bit types. The
 *	sums are handed to the stress_*_put() functions at the
 *	end. The name argument is unused.
 *
 *	Each group below feeds the 64 bit sum into the 16 bit sum,
 *	the 16 bit sum into the 32 bit sum and the 32 bit sum into
 *	the 64 bit sum, so the statement order matters.
 *
 *	NOTE(review): the 32 and 64 bit sums are signed and start
 *	from arbitrary values, so the additions and negations look
 *	like they can overflow - confirm this is acceptable for
 *	the build flags in use.
 */
static void stress_cpu_intconversion(const char *name)
{
	int16_t i16_sum = (int16_t)stress_mwc16();
	int32_t i32_sum = (int32_t)stress_mwc32();
	int64_t i64_sum = (int64_t)stress_mwc64();
	register uint32_t i;

	(void)name;

	for (i = 0; i < 65536; i++) {
		int16_t i16 = (int16_t)i;
		int32_t i32 = (int32_t)i;
		int64_t i64 = (int64_t)i;

		i16_sum += i16;
		i32_sum += i32;
		i64_sum += i64;

		/* int64 -> uint32 -> int16, subtracted */
		i16 = -(int16_t)(uint32_t)-(int64_t)(uint64_t)i64_sum;
		i16_sum -= i16;
		i32 = -(int16_t)(uint32_t)-(int64_t)(uint64_t)i16_sum;
		i32_sum -= i32;
		i64 = -(int16_t)(uint32_t)-(int64_t)(uint64_t)i32_sum;
		i64_sum -= i64;

		/* int32 -> uint64 -> int16, added */
		i16 = -(int16_t)(uint64_t)-(int32_t)(uint64_t)i64_sum;
		i16_sum += i16;
		i32 = -(int16_t)(uint64_t)-(int32_t)(uint64_t)i16_sum;
		i32_sum += i32;
		i64 = -(int16_t)(uint64_t)-(int32_t)(uint64_t)i32_sum;
		i64_sum += i64;

		/* int64 -> uint16 -> int32, subtracted */
		i16 = (int16_t)-((int32_t)(uint16_t)-(int64_t)(uint64_t)i64_sum);
		i16_sum -= i16;
		i32 = -(int32_t)(uint16_t)-(int64_t)(uint64_t)i16_sum;
		i32_sum -= i32;
		i64 = -(int32_t)(uint16_t)-(int64_t)(uint64_t)i32_sum;
		i64_sum -= i64;

		/* int16 -> uint64 -> int32, added */
		i16 = (int16_t)-((int32_t)(uint64_t)-(int16_t)(uint64_t)i64_sum);
		i16_sum += i16;
		i32 = -(int32_t)(uint64_t)-(int16_t)(uint64_t)i16_sum;
		i32_sum += i32;
		i64 = -(int32_t)(uint64_t)-(int16_t)(uint64_t)i32_sum;
		i64_sum += i64;

		/* int32 -> uint16 -> int64, subtracted */
		i16 = (int16_t)-((int64_t)(uint16_t)-(int32_t)(uint64_t)i64_sum);
		i16_sum -= i16;
		i32 = (int32_t)-((int64_t)(uint16_t)-(int32_t)(uint64_t)i16_sum);
		i32_sum -= i32;
		i64 = (int64_t)(uint16_t)-(int32_t)(uint64_t)i32_sum;
		i64_sum -= i64;

		/* int16 -> uint32 -> int64, added */
		i16 = (int16_t)-((int64_t)(uint32_t)-(int16_t)(uint64_t)i64_sum);
		i16_sum += i16;
		i32 = (int32_t)-((int64_t)(uint32_t)-(int16_t)(uint64_t)i16_sum);
		i32_sum += i32;
		i64 = -(int64_t)(uint32_t)-(int16_t)(uint64_t)i32_sum;
		i64_sum += i64;
	}
	stress_uint16_put((uint16_t)i16_sum);
	stress_uint32_put((uint32_t)i32_sum);
	stress_uint64_put((uint64_t)i64_sum);
}
2164 
2165 /*
2166  *  factorial()
2167  *	compute n!
2168  */
factorial(int n)2169 static inline long double HOT OPTIMIZE3 factorial(int n)
2170 {
2171 	static const long double factorials[] = {
2172 		1.0L,
2173 		1.0L,
2174 		2.0L,
2175 		6.0L,
2176 		24.0L,
2177 		120.0L,
2178 		720.0L,
2179 		5040.0L,
2180 		40320.0L,
2181 		362880.0L,
2182 		3628800.0L,
2183 		39916800.0L,
2184 		479001600.0L,
2185 		6227020800.0L,
2186 		87178291200.0L,
2187 		1307674368000.0L,
2188 		20922789888000.0L,
2189 		355687428096000.0L,
2190 		6402373705728000.0L,
2191 		121645100408832000.0L,
2192 		2432902008176640000.0L,
2193 		51090942171709440000.0L,
2194 		1124000727777607680000.0L,
2195 		25852016738884976640000.0L,
2196 		620448401733239439360000.0L,
2197 		15511210043330985984000000.0L,
2198 		403291461126605635592388608.0L,
2199 		10888869450418352161430700032.0L,
2200 		304888344611713860511469666304.0L,
2201 		8841761993739701954695181369344.0L,
2202 		265252859812191058647452510846976.0L,
2203 		8222838654177922818071027836256256.0L,
2204 		263130836933693530178272890760200192.0L
2205 	};
2206 
2207 	if (n < (int)SIZEOF_ARRAY(factorials))
2208 		return factorials[n];
2209 
2210 	return roundl(shim_expl(shim_lgammal((long double)(n + 1))));
2211 }
2212 
2213 /*
2214  *  stress_cpu_pi()
2215  *	compute pi using the Srinivasa Ramanujan
2216  *	fast convergence algorithm
2217  */
stress_cpu_pi(const char * name)2218 static void HOT OPTIMIZE3 stress_cpu_pi(const char *name)
2219 {
2220 	long double s = 0.0L, pi = 0.0L, last_pi = 0.0L;
2221 	const long double precision = 1.0e-20L;
2222 	const long double c = 2.0L * shim_sqrtl(2.0L) / 9801.0L;
2223 	const int max_iter = 5;
2224 	int k = 0;
2225 
2226 	do {
2227 		last_pi = pi;
2228 		s += (factorial(4 * k) *
2229 			((26390.0L * (long double)k) + 1103)) /
2230 			(shim_powl(factorial(k), 4.0L) * shim_powl(396.0L, 4.0L * k));
2231 		pi = 1 / (s * c);
2232 		k++;
2233 	} while ((k < max_iter) && (shim_fabsl(pi - last_pi) > precision));
2234 
2235 	/* Quick sanity checks */
2236 	if (g_opt_flags & OPT_FLAGS_VERIFY) {
2237 		if (k >= max_iter)
2238 			pr_fail("%s: number of iterations to compute "
2239 				"pi was more than expected\n", name);
2240 		if (shim_fabsl(pi - PI) > 1.0e-15L)
2241 			pr_fail("%s: accuracy of computed pi is not "
2242 				"as good as expected\n", name);
2243 	}
2244 
2245 	stress_long_double_put(pi);
2246 }
2247 
2248 /*
2249  *  stress_cpu_omega()
2250  *	compute the constant omega
2251  *	See http://en.wikipedia.org/wiki/Omega_constant
2252  */
stress_cpu_omega(const char * name)2253 static void HOT OPTIMIZE3 stress_cpu_omega(const char *name)
2254 {
2255 	long double omega = 0.5L, last_omega = 0.0L;
2256 	const long double precision = 1.0e-20L;
2257 	const int max_iter = 6;
2258 	int n = 0;
2259 
2260 	/*
2261 	 * Omega converges very quickly, on most CPUs it is
2262 	 * within 6 iterations.
2263 	 */
2264 	do {
2265 		last_omega = omega;
2266 		omega = (1 + omega) / (1 + shim_expl(omega));
2267 		n++;
2268 	} while ((n < max_iter) && (shim_fabsl(omega - last_omega) > precision));
2269 
2270 	if (g_opt_flags & OPT_FLAGS_VERIFY) {
2271 		if (n > max_iter)
2272 			pr_fail("%s: number of iterations to compute "
2273 				"omega was more than expected (%d vs %d)\n",
2274 				name, n, max_iter);
2275 		if (shim_fabsl(omega - OMEGA) > 1.0e-16L)
2276 			pr_fail("%s: accuracy of computed omega is "
2277 				"not as good as expected\n", name);
2278 	}
2279 
2280 	stress_long_double_put(omega);
2281 }
2282 
/*
 *  HAMMING()
 *	compute bit i of a Hamming code: XOR together bit i of each
 *	generator row G[j] (j = 0..3) for which bit j of nybble is set,
 *	then XOR the resulting bit into code at bit position i.
 *
 *	Every macro argument is parenthesized so that expression
 *	arguments (e.g. "a | b" for nybble) are evaluated as a unit.
 *	Arguments are evaluated more than once, so they should be free
 *	of side effects.
 */
#define HAMMING(G, i, nybble, code) 				\
do {								\
	int8_t res;						\
	res = ((((G)[3] >> (i)) & ((nybble) >> 3)) & 1) ^	\
	      ((((G)[2] >> (i)) & ((nybble) >> 2)) & 1) ^	\
	      ((((G)[1] >> (i)) & ((nybble) >> 1)) & 1) ^	\
	      ((((G)[0] >> (i)) & ((nybble) >> 0)) & 1);	\
	(code) ^= ((res & 1) << (i));				\
} while (0)
2292 
2293 /*
2294  *  hamming84()
2295  *	compute Hamming (8,4) codes
2296  */
hamming84(const uint8_t nybble)2297 static uint8_t HOT OPTIMIZE3 hamming84(const uint8_t nybble)
2298 {
2299 	/*
2300 	 * Hamming (8,4) Generator matrix
2301 	 * (4 parity bits, 4 data bits)
2302 	 *
2303 	 *  p1 p2 p3 p4 d1 d2 d3 d4
2304 	 *  0  1  1  1  1  0  0  0
2305 	 *  1  0  1  1  0  1  0  0
2306 	 *  1  1  0  1  0  0  1  0
2307 	 *  1  1  1  0  0  0  0  1
2308 	 *
2309 	 * Where:
2310 	 *  d1..d4 = 4 data bits
2311 	 *  p1..p4 = 4 parity bits:
2312 	 *    p1 = d2 + d3 + d4
2313 	 *    p2 = d1 + d3 + d4
2314 	 *    p3 = d1 + d2 + d4
2315 	 *    p4 = d1 + d2 + d3
2316 	 *
2317 	 * G[] is reversed to turn G[3-j] into G[j] to save a subtraction
2318 	 */
2319 	static const uint8_t G[] = {
2320 		0xf1,	/* 0b11110001 */
2321 		0xd2,	/* 0b11010010 */
2322 		0xb4,	/* 0b10110100 */
2323 		0x78,	/* 0b01111000 */
2324 	};
2325 
2326 	register uint8_t code = 0;
2327 
2328 	/* Unrolled 8 bit loop x unrolled 4 bit loop  */
2329 	HAMMING(G, 7, nybble, code);
2330 	HAMMING(G, 6, nybble, code);
2331 	HAMMING(G, 5, nybble, code);
2332 	HAMMING(G, 4, nybble, code);
2333 	HAMMING(G, 3, nybble, code);
2334 	HAMMING(G, 2, nybble, code);
2335 	HAMMING(G, 1, nybble, code);
2336 	HAMMING(G, 0, nybble, code);
2337 
2338 	return code;
2339 }
2340 
2341 /*
2342  *  stress_cpu_hamming()
2343  *	compute hamming code on 65536 x 4 nybbles
2344  */
stress_cpu_hamming(const char * name)2345 static void HOT OPTIMIZE3 TARGET_CLONES stress_cpu_hamming(const char *name)
2346 {
2347 	uint32_t i;
2348 	uint32_t sum = 0;
2349 
2350 	for (i = 0; i < 65536; i++) {
2351 		uint32_t encoded;
2352 
2353 		/* 4 x 4 bits to 4 x 8 bits hamming encoded */
2354 		encoded = (uint32_t)(hamming84((i >> 12) & 0xf) << 24) |
2355 			  (uint32_t)(hamming84((i >> 8) & 0xf) << 16) |
2356 			  (uint32_t)(hamming84((i >> 4) & 0xf) << 8) |
2357 			  (uint32_t)(hamming84((i >> 0) & 0xf) << 0);
2358 		sum += encoded;
2359 	}
2360 
2361 	if ((g_opt_flags & OPT_FLAGS_VERIFY) && (sum != 0xffff8000))
2362 		pr_fail("%s: hamming error detected, sum of 65536 "
2363 			"hamming codes not correct\n", name);
2364 }
2365 
2366 
/*
 *  stress_cpu_callfunc_func()
 *	recursive helper for stress_cpu_callfunc(); calls itself
 *	n more times passing every argument through unchanged.
 *	The innermost call returns the distance (in uint64_t units)
 *	between its own copy of u64arg and the location p_u64arg
 *	supplied by the original caller.
 */
static ptrdiff_t stress_cpu_callfunc_func(
	ssize_t		n,
	uint64_t	u64arg,
	uint32_t	u32arg,
	uint16_t	u16arg,
	uint8_t		u8arg,
	uint64_t	*p_u64arg,
	uint32_t	*p_u32arg,
	uint16_t	*p_u16arg,
	uint8_t		*p_u8arg)
{
	/* Bottom of the recursion, the uncommon case */
	if (!LIKELY(n > 0))
		return &u64arg - p_u64arg;

	return stress_cpu_callfunc_func(n - 1,
		u64arg, u32arg, u16arg, u8arg,
		p_u64arg, p_u32arg, p_u16arg, p_u8arg);
}
2385 
/*
 *  stress_cpu_callfunc()
 *	deep function calls; recurse 1024 levels with a mix of
 *	64, 32, 16 and 8 bit random value arguments plus pointers
 *	to the local copies of those values
 */
static void stress_cpu_callfunc(const char *name)
{
	/* Values are drawn in this order: 64, 32, 16 then 8 bit */
	uint64_t	v64 = stress_mwc64();
	uint32_t	v32 = stress_mwc32();
	uint16_t	v16 = stress_mwc16();
	uint8_t		v8  = stress_mwc8();
	ptrdiff_t	distance;

	(void)name;

	distance = stress_cpu_callfunc_func(1024,
			v64, v32, v16, v8,
			&v64, &v32, &v16, &v8);

	stress_uint64_put((uint64_t)distance);
}
2406 
2407 
/*
 *  Parity lookup table, entry [i] is true when i has an odd
 *  number of bits set. The table is built recursively: each
 *  macro level expands to 4 copies of the level below with the
 *  parity seed flipped for the middle two copies, so P2 yields
 *  4 entries, P4 16, P6 64 and 4 x P6 the full 256.
 */
#define P2(n) (n), (n) ^ 1, (n) ^ 1, (n)
#define P4(n) P2(n), P2((n) ^ 1), P2((n) ^ 1), P2(n)
#define P6(n) P4(n), P4((n) ^ 1), P4((n) ^ 1), P4(n)

static const bool stress_cpu_parity_table[256] = {
	P6(0),
	P6(1),
	P6(1),
	P6(0)
};
2415 
2416 /*
2417  *  stress_cpu_parity
2418  *	compute parity different ways
2419  */
stress_cpu_parity(const char * name)2420 static void stress_cpu_parity(const char *name)
2421 {
2422 	uint32_t val = 0x83fb5acf;
2423 	size_t i;
2424 
2425 	for (i = 0; i < 1000; i++, val++) {
2426 		register uint32_t parity, p;
2427 		uint32_t v;
2428 		union {
2429 			uint32_t v32;
2430 			uint8_t  v8[4];
2431 		} u;
2432 
2433 		/*
2434 		 * Naive way
2435 		 */
2436 		v = val;
2437 		parity = 0;
2438 		while (v) {
2439 			if (v & 1)
2440 				parity = !parity;
2441 			v >>= 1;
2442 		}
2443 
2444 		/*
2445 		 * Naive way with Brian Kernigan's bit counting optimisation
2446 		 * https://graphics.stanford.edu/~seander/bithacks.html
2447 		 */
2448 		v = val;
2449 		p = 0;
2450 		while (v) {
2451 			p = !p;
2452 			v = v & (v - 1);
2453 		}
2454 		if ((g_opt_flags & OPT_FLAGS_VERIFY) && (p != parity))
2455 			pr_fail("%s: parity error detected, using "
2456 				"optimised naive method\n",  name);
2457 
2458 		/*
2459 		 * "Compute parity of a word with a multiply"
2460 		 * the Andrew Shapira method,
2461 		 * https://graphics.stanford.edu/~seander/bithacks.html
2462 		 */
2463 		v = val;
2464 		v ^= v >> 1;
2465 		v ^= v >> 2;
2466 		v = (v & 0x11111111U) * 0x11111111U;
2467 		p = (v >> 28) & 1;
2468 		if ((g_opt_flags & OPT_FLAGS_VERIFY) && (p != parity))
2469 			pr_fail("%s: parity error detected, using the "
2470 				"multiply Shapira method\n",  name);
2471 
2472 		/*
2473 		 * "Compute parity in parallel"
2474 		 * https://graphics.stanford.edu/~seander/bithacks.html
2475 		 */
2476 		v = val;
2477 		v ^= v >> 16;
2478 		v ^= v >> 8;
2479 		v ^= v >> 4;
2480 		v &= 0xf;
2481 		p = (0x6996 >> v) & 1;
2482 		if ((g_opt_flags & OPT_FLAGS_VERIFY) && (p != parity))
2483 			pr_fail("%s: parity error detected, using "
2484 				"the parallel method\n",  name);
2485 
2486 		/*
2487 		 * "Compute parity by lookup table"
2488 		 * https://graphics.stanford.edu/~seander/bithacks.html
2489 		 * Variation #1
2490 		 */
2491 		v = val;
2492 		v ^= v >> 16;
2493 		v ^= v >> 8;
2494 		p = stress_cpu_parity_table[v & 0xff];
2495 		if ((g_opt_flags & OPT_FLAGS_VERIFY) && (p != parity))
2496 			pr_fail("%s: parity error detected, using "
2497 				"the lookup method, variation 1\n",  name);
2498 
2499 		/*
2500 		 * "Compute parity by lookup table"
2501 		 * https://graphics.stanford.edu/~seander/bithacks.html
2502 		 * Variation #2
2503 		 */
2504 		u.v32 = val;
2505 		p = stress_cpu_parity_table[u.v8[0] ^ u.v8[1] ^ u.v8[2] ^ u.v8[3]];
2506 		if ((g_opt_flags & OPT_FLAGS_VERIFY) && (p != parity))
2507 			pr_fail("%s: parity error detected, using the "
2508 				"lookup method, variation 2\n",  name);
2509 #if defined(HAVE_BUILTIN_PARITY)
2510 		/*
2511 		 *  Compute parity using built-in function
2512 		 */
2513 		p = __builtin_parity((unsigned int)val);
2514 		if ((g_opt_flags & OPT_FLAGS_VERIFY) && (p != parity))
2515 			pr_fail("%s: parity error detected, using "
2516 				"the __builtin_parity function\n",  name);
2517 #endif
2518 	}
2519 }
2520 
2521 /*
2522  *  stress_cpu_dither
2523  *	perform 8 bit to 1 bit gray scale
2524  *	Floyd–Steinberg dither
2525  */
stress_cpu_dither(const char * name)2526 static void TARGET_CLONES stress_cpu_dither(const char *name)
2527 {
2528 	size_t x, y;
2529 
2530 	(void)name;
2531 
2532 	/*
2533 	 *  Generate some random 8 bit image
2534 	 */
2535 	for (y = 0; y < STRESS_CPU_DITHER_Y; y += 8) {
2536 		for (x = 0; x < STRESS_CPU_DITHER_X; x ++) {
2537 			uint64_t v = stress_mwc64();
2538 
2539 			pixels[x][y + 0] = (uint8_t)v;
2540 			v >>= 8;
2541 			pixels[x][y + 1] = (uint8_t)v;
2542 			v >>= 8;
2543 			pixels[x][y + 2] = (uint8_t)v;
2544 			v >>= 8;
2545 			pixels[x][y + 3] = (uint8_t)v;
2546 			v >>= 8;
2547 			pixels[x][y + 4] = (uint8_t)v;
2548 			v >>= 8;
2549 			pixels[x][y + 5] = (uint8_t)v;
2550 			v >>= 8;
2551 			pixels[x][y + 6] = (uint8_t)v;
2552 			v >>= 8;
2553 			pixels[x][y + 7] = (uint8_t)v;
2554 		}
2555 	}
2556 
2557 	/*
2558 	 *  ..and dither
2559 	 */
2560 	for (y = 0; y < STRESS_CPU_DITHER_Y; y++) {
2561 		for (x = 0; x < STRESS_CPU_DITHER_X; x++) {
2562 			uint8_t pixel = pixels[x][y];
2563 			uint8_t quant = (pixel < 128) ? 0 : 255;
2564 			int32_t error = pixel - quant;
2565 
2566 			bool xok1 = x < (STRESS_CPU_DITHER_X - 1);
2567 			bool xok2 = x > 0;
2568 			bool yok1 = y < (STRESS_CPU_DITHER_Y - 1);
2569 
2570 			if (xok1)
2571 				pixels[x + 1][y] +=
2572 					(error * 7) >> 4;
2573 			if (xok2 && yok1)
2574 				pixels[x - 1][y + 1] +=
2575 					(error * 3) >> 4;
2576 			if (yok1)
2577 				pixels[x][y + 1] +=
2578 					(error * 5) >> 4;
2579 			if (xok1 && yok1)
2580 				pixels[x + 1][y + 1] +=
2581 					error >> 4;
2582 		}
2583 	}
2584 }
2585 
2586 /*
2587  *  stress_cpu_div16
2588  *	perform 50000 x 16 bit divisions, these are traditionally
2589  *	slow ops
2590  */
stress_cpu_div16(const char * name)2591 static void TARGET_CLONES stress_cpu_div16(const char *name)
2592 {
2593 	register uint16_t i, j;
2594 	const uint16_t di = 0xdUL;
2595 	const uint16_t max = 0xfde8;
2596 
2597 	(void)name;
2598 
2599 	for (i = 0, j = 1; i < max; i += di) {
2600 		register uint32_t r = i / j;
2601 
2602 		j = 1 | ((j << 1) ^ j);
2603 		stress_uint16_put(r);
2604 	}
2605 }
2606 
2607 /*
2608  *  stress_cpu_div32
2609  *	perform 50000 x 32 bit divisions, these are traditionally
2610  *	slow ops
2611  */
stress_cpu_div32(const char * name)2612 static void TARGET_CLONES stress_cpu_div32(const char *name)
2613 {
2614 	register uint32_t i, j;
2615 	const uint32_t di = 0x0014e3dUL;
2616 	const uint32_t max = 0xfeff9bd4UL;
2617 
2618 	(void)name;
2619 
2620 	for (i = 0, j = 1; i < max; i += di) {
2621 		register uint32_t r = i / j;
2622 
2623 		j = 1 | ((j << 1) ^ j);
2624 		stress_uint32_put(r);
2625 	}
2626 }
2627 
2628 /*
2629  *  stress_cpu_div64
2630  *	perform 50000 x 64 bit divisions, these are traditionally
2631  *	really slow ops
2632  */
stress_cpu_div64(const char * name)2633 static void TARGET_CLONES stress_cpu_div64(const char *name)
2634 {
2635 	register uint64_t i, j;
2636 	const uint64_t di = 0x000014ced130f7513LL;
2637 	const uint64_t dj = 0x000013cba9876543ULL;
2638 	const uint64_t max = 0xfe00000000000000ULL;
2639 
2640 	(void)name;
2641 
2642 	for (i = 0, j = 0x7fffffffffffULL; i < max; i += di, j -= dj) {
2643 		register uint64_t r = i / j;
2644 		stress_uint64_put(r);
2645 	}
2646 }
2647 
/*
 *  stress_cpu_cpuid()
 *	get CPU id info, x86 only
 *	see https://en.wikipedia.org/wiki/CPUID
 *
 *	Runs 1000 rounds of a fixed sequence of CPUID queries; for
 *	each query eax and ecx are loaded from the table below, ebx
 *	and edx are zeroed and the eax value handed back by
 *	stress_x86_cpuid() is passed to stress_uint32_put().
 */
#if defined(STRESS_ARCH_X86)
static void TARGET_CLONES stress_cpu_cpuid(const char *name)
{
	/* eax / ecx input values, issued in table order */
	static const struct {
		uint32_t eax;
		uint32_t ecx;
	} queries[] = {
		{ 0x00000000, 0 },	/* Highest Function Parameter and Manufacturer ID */
		{ 0x00000001, 0 },	/* Processor Info and Feature Bits */
		{ 0x00000002, 0 },	/* Cache and TLB Descriptor information */
		{ 0x00000003, 0 },	/* Processor Serial Number */
		{ 0x00000004, 0 },	/* Intel thread/core and cache topology */
		{ 0x00000006, 0 },	/* Thermal and power management */
		{ 0x00000006, 0 },	/* eax = 6 is deliberately kept as a second query */
		{ 0x00000007, 0 },	/* Extended Features, ecx must be 0 */
		{ 0x00000007, 1 },	/* Extended Features, ecx must be 1 */
		{ 0x0000000b, 0 },	/* Intel thread/core and cache topology */
		{ 0x80000000, 0 },	/* Get highest extended function index */
		{ 0x80000001, 0 },	/* Extended processor info */
		{ 0x80000002, 0 },	/* Processor brand string */
		{ 0x80000003, 0 },	/* Processor brand string */
		{ 0x80000004, 0 },	/* Processor brand string */
		{ 0x80000005, 0 },	/* L1 Cache and TLB Identifiers */
		{ 0x80000006, 0 },	/* Extended L2 Cache Features */
		{ 0x80000007, 0 },	/* Advanced Power Management information */
		{ 0x80000008, 0 },	/* Virtual and Physical address size */
	};
	register int i;

	(void)name;

	for (i = 0; i < 1000; i++) {
		size_t q;

		for (q = 0; q < sizeof(queries) / sizeof(queries[0]); q++) {
			uint32_t eax = queries[q].eax;
			uint32_t ebx = 0;	/* Not required */
			uint32_t ecx = queries[q].ecx;
			uint32_t edx = 0;	/* Not required */

			stress_x86_cpuid(&eax, &ebx, &ecx, &edx);
			stress_uint32_put(eax);
		}
	}
}
#endif
2817 
2818 /*
2819  *  stress_cpu_union
2820  *	perform bit field operations on a union
2821  */
stress_cpu_union(const char * name)2822 static void TARGET_CLONES stress_cpu_union(const char *name)
2823 {
2824 	typedef union {
2825 		struct {
2826 			uint64_t	b1:1;
2827 			uint64_t	b10:10;
2828 			uint64_t	b2:2;
2829 			uint64_t	b9:9;
2830 			uint64_t	b3:3;
2831 			uint64_t	b8:8;
2832 			uint64_t	b4:4;
2833 			uint64_t	b7:7;
2834 			uint64_t	b5:5;
2835 			uint64_t	b6:6;
2836 		} bits64;
2837 		uint64_t	u64:64;
2838 		union {
2839 			uint8_t		b1:1;
2840 			uint8_t		b7:7;
2841 			uint8_t		b8:8;
2842 		} bits8;
2843 		struct {
2844 			uint16_t	b15:15;
2845 			uint16_t	b1:1;
2846 		} bits16;
2847 		struct {
2848 			uint32_t	b10:10;
2849 			uint32_t	b20:20;
2850 #if defined(__TINYC__)
2851 			uint32_t	f:1;	/* cppcheck-suppress unusedStructMember */
2852 #else
2853 			uint32_t	:1;	/* cppcheck-suppress unusedStructMember */
2854 #endif
2855 			uint32_t	b1:1;
2856 		} bits32;
2857 		uint32_t	u32:30;
2858 	} stress_u_t;
2859 
2860 	static stress_u_t u;
2861 	size_t i;
2862 
2863 	(void)name;
2864 	for (i = 0; i < 1000; i++) {
2865 		u.bits64.b1 ^= 1;
2866 		u.bits64.b2--;
2867 		u.bits32.b10 ^= ~0;
2868 		u.bits64.b3++;
2869 		u.bits16.b1--;
2870 		u.bits8.b1++;
2871 		u.bits64.b4 *= 2;
2872 		u.bits32.b20 += 3;
2873 		u.u64 += 0x1037fc2ae21ef829ULL;
2874 		u.bits64.b6--;
2875 		u.bits8.b7 *= 3;
2876 		u.bits64.b5 += (u.bits64.b4 << 1);
2877 		u.bits32.b1 ^= 1;
2878 		u.bits64.b7++;
2879 		u.bits8.b8 ^= 0xaa;
2880 		u.bits64.b8--;
2881 		u.bits16.b15 ^= 0xbeef;
2882 		u.bits64.b9++;
2883 		u.bits64.b10 *= 5;
2884 		u.u32 += 1;
2885 	}
2886 }
2887 
/*
 *  queens_try()
 *	count the ways of completing a partially filled board,
 *	one row per level of recursion.
 *
 *	Solution from http://www.cl.cam.ac.uk/~mr10/backtrk.pdf
 *	   see section 2.1
 *
 *	left_diag, cols and right_diag are bit masks of the squares
 *	of the current row that are attacked along each direction
 *	by the queens placed so far; all has one bit set per column
 *	of the board. Returns the number of complete placements.
 */
static uint32_t queens_try(
	uint32_t left_diag,
	uint32_t cols,
	uint32_t right_diag,
	uint32_t all)
{
	register uint32_t count = 0;
	register uint32_t avail;

	/* Try each unattacked square of this row, lowest bit first */
	for (avail = all & ~(left_diag | cols | right_diag); avail; ) {
		register uint32_t lowest = avail & -avail;
		register uint32_t placed = cols | lowest;

		avail ^= lowest;

		if (placed == all) {
			/* A queen in every column, that's a solution */
			count++;
		} else {
			count += queens_try((left_diag | lowest) << 1,
					placed, (right_diag | lowest) >> 1, all);
		}
	}
	return count;
}
2913 
2914 
2915 /*
2916  *  stress_cpu_queens
2917  *	solve the queens problem for sizes 1..11
2918  */
stress_cpu_queens(const char * name)2919 static void stress_cpu_queens(const char *name)
2920 {
2921 	uint32_t all, n;
2922 
2923 	static const uint32_t queens_solutions[] = {
2924 		0, 1, 0, 0, 2, 10, 4, 40, 92, 352, 724, 2680, 14200
2925 	};
2926 
2927 	for (all = 1, n = 1; n < 12; n++) {
2928 		const uint32_t solutions = queens_try(0, 0, 0, all);
2929 
2930 		if ((g_opt_flags & OPT_FLAGS_VERIFY) &&
2931 		    (solutions != queens_solutions[n]))
2932 			pr_fail("%s: queens solution error detected "
2933 				"on board size %" PRIu32 "\n",
2934 				name, n);
2935 		all = (all + all) + 1;
2936 	}
2937 }
2938 
2939 /*
2940  *  stress_cpu_factorial
2941  *	find factorials from 1..150 using
2942  *	Stirling's and Ramanujan's Approximations.
2943  */
stress_cpu_factorial(const char * name)2944 static void stress_cpu_factorial(const char *name)
2945 {
2946 	int n;
2947 	long double f = 1.0L;
2948 	const long double precision = 1.0e-6L;
2949 	const long double sqrt_pi = shim_sqrtl(PI);
2950 
2951 	for (n = 1; n < 150; n++) {
2952 		long double np1 = (long double)(n + 1);
2953 		long double fact = roundl(shim_expl(shim_lgammal(np1)));
2954 		long double dn;
2955 
2956 		f *= (long double)n;
2957 
2958 		/* Stirling */
2959 		if ((g_opt_flags & OPT_FLAGS_VERIFY) &&
2960 		    ((f - fact) / fact > precision)) {
2961 			pr_fail("%s: Stirling's approximation of factorial(%d) out of range\n",
2962 				name, n);
2963 		}
2964 
2965 		/* Ramanujan */
2966 		dn = (long double)n;
2967 		fact = sqrt_pi * shim_powl((dn / (long double)M_E), dn);
2968 		fact *= shim_powl((((((((8 * dn) + 4)) * dn) + 1) * dn) + 1.0L/30.0L), (1.0L/6.0L));
2969 		if ((g_opt_flags & OPT_FLAGS_VERIFY) &&
2970 		    ((f - fact) / fact > precision)) {
2971 			pr_fail("%s: Ramanujan's approximation of factorial(%d) out of range\n",
2972 				name, n);
2973 		}
2974 	}
2975 }
2976 
2977 /*
2978  *  stress_cpu_stats
2979  *	Exercise some standard stats computations on random data
2980  */
stress_cpu_stats(const char * name)2981 static void stress_cpu_stats(const char *name)
2982 {
2983 	size_t i;
2984 	double data[STATS_MAX];
2985 	double min, max, am = 0.0, gm, hm = 0.0, stddev = 0.0;
2986 	int64_t expon = 0;
2987 	double mant = 1.0;
2988 	const double inverse_n = 1.0 / (double)STATS_MAX;
2989 
2990 	for (i = 0; i < STATS_MAX; i++)
2991 		data[i] = ((double)(stress_mwc32() + 1)) / 4294967296.0;
2992 
2993 	min = max = data[0];
2994 
2995 	for (i = 0; i < STATS_MAX; i++) {
2996 		double d = data[i];
2997 		double f;
2998 		int e;
2999 
3000 		f = frexp(d, &e);
3001 		mant *= f;
3002 		expon += e;
3003 
3004 		if (min > d)
3005 			min = d;
3006 		if (max < d)
3007 			max = d;
3008 
3009 		am += d;
3010 		hm += 1 / d;
3011 	}
3012 	/* Arithmetic mean (average) */
3013 	am = am / STATS_MAX;
3014 	/* Geometric mean */
3015 	gm = pow(mant, inverse_n) *
3016 	     pow(2.0, (double)expon * inverse_n);
3017 	/* Harmonic mean */
3018 	hm = STATS_MAX / hm;
3019 
3020 	for (i = 0; i < STATS_MAX; i++) {
3021 		double d = data[i] - am;
3022 		stddev += (d * d);
3023 	}
3024 	/* Standard Deviation */
3025 	stddev = shim_sqrt(stddev);
3026 
3027 	stress_double_put(am);
3028 	stress_double_put(gm);
3029 	stress_double_put(hm);
3030 	stress_double_put(stddev);
3031 
3032 	if (min > hm)
3033 		pr_fail("%s: stats: minimum %f > harmonic mean %f\n",
3034 			name, min, hm);
3035 	if (hm > gm)
3036 		pr_fail("%s: stats: harmonic mean %f > geometric mean %f\n",
3037 			name, hm, gm);
3038 	if (gm > am)
3039 		pr_fail("%s: stats: geometric mean %f > arithmetic mean %f\n",
3040 			name, gm, am);
3041 	if (am > max)
3042 		pr_fail("%s: stats: arithmetic mean %f > maximum %f\n",
3043 			name, am, max);
3044 }
3045 
3046 /*
3047  *  stress_cpu_all()
3048  *	iterate over all cpu stressors
3049  */
stress_cpu_all(const char * name)3050 static HOT OPTIMIZE3 void stress_cpu_all(const char *name)
3051 {
3052 	static int i = 1;	/* Skip over stress_cpu_all */
3053 
3054 	cpu_methods[i++].func(name);
3055 	if (!cpu_methods[i].func)
3056 		i = 1;
3057 }
3058 
3059 /*
3060  * Table of cpu stress methods
3061  */
3062 static const stress_cpu_method_info_t cpu_methods[] = {
3063 	{ "all",		stress_cpu_all },	/* Special "all test */
3064 
3065 	{ "ackermann",		stress_cpu_ackermann },
3066 	{ "apery",		stress_cpu_apery },
3067 	{ "bitops",		stress_cpu_bitops },
3068 	{ "callfunc",		stress_cpu_callfunc },
3069 #if defined(HAVE_COMPLEX_H) &&		\
3070     defined(HAVE_COMPLEX) &&		\
3071     defined(__STDC_IEC_559_COMPLEX__) &&\
3072     !defined(__UCLIBC__)
3073 	{ "cdouble",		stress_cpu_complex_double },
3074 	{ "cfloat",		stress_cpu_complex_float },
3075 	{ "clongdouble",	stress_cpu_complex_long_double },
3076 #endif
3077 	{ "collatz",		stress_cpu_collatz },
3078 	{ "correlate",		stress_cpu_correlate },
3079 #if defined(STRESS_ARCH_X86)
3080 	{ "cpuid",		stress_cpu_cpuid },
3081 #endif
3082 	{ "crc16",		stress_cpu_crc16 },
3083 #if defined(HAVE_FLOAT_DECIMAL32) &&	\
3084     !defined(__clang__)
3085 	{ "decimal32",		stress_cpu_decimal32 },
3086 #endif
3087 #if defined(HAVE_FLOAT_DECIMAL64) &&	\
3088     !defined(__clang__)
3089 	{ "decimal64",		stress_cpu_decimal64 },
3090 #endif
3091 #if defined(HAVE_FLOAT_DECIMAL128) &&	\
3092     !defined(__clang__)
3093 	{ "decimal128",		stress_cpu_decimal128 },
3094 #endif
3095 	{ "dither",		stress_cpu_dither },
3096 	{ "div16",		stress_cpu_div16 },
3097 	{ "div32",		stress_cpu_div32 },
3098 	{ "div64",		stress_cpu_div64 },
3099 	{ "djb2a",		stress_cpu_djb2a },
3100 	{ "double",		stress_cpu_double },
3101 	{ "euler",		stress_cpu_euler },
3102 	{ "explog",		stress_cpu_explog },
3103 	{ "factorial",		stress_cpu_factorial },
3104 	{ "fibonacci",		stress_cpu_fibonacci },
3105 #if defined(HAVE_COMPLEX_H) &&		\
3106     defined(HAVE_COMPLEX) &&		\
3107     defined(__STDC_IEC_559_COMPLEX__) &&\
3108     !defined(__UCLIBC__)
3109 	{ "fft",		stress_cpu_fft },
3110 #endif
3111 	{ "fletcher16",		stress_cpu_fletcher16 },
3112 	{ "float",		stress_cpu_float },
3113 #if defined(HAVE_FLOAT16) &&	\
3114     !defined(__clang__)
3115 	{ "float16",		stress_cpu_float16 },
3116 #endif
3117 #if defined(HAVE_FLOAT32) &&	\
3118     !defined(__clang__)
3119 	{ "float32",		stress_cpu_float32 },
3120 #endif
3121 #if defined(HAVE_FLOAT64) &&	\
3122     !defined(__clang__)
3123 	{ "float64",		stress_cpu_float64 },
3124 #endif
3125 #if defined(HAVE_FLOAT80) &&	\
3126     !defined(__clang__)
3127 	{ "float80",		stress_cpu_float80 },
3128 #endif
3129 #if defined(HAVE_FLOAT128) &&	\
3130     !defined(__clang__)
3131 	{ "float128",		stress_cpu_float128 },
3132 #endif
3133 	{ "floatconversion",	stress_cpu_floatconversion },
3134 	{ "fnv1a",		stress_cpu_fnv1a },
3135 	{ "gamma",		stress_cpu_gamma },
3136 	{ "gcd",		stress_cpu_gcd },
3137 	{ "gray",		stress_cpu_gray },
3138 	{ "hamming",		stress_cpu_hamming },
3139 	{ "hanoi",		stress_cpu_hanoi },
3140 	{ "hyperbolic",		stress_cpu_hyperbolic },
3141 	{ "idct",		stress_cpu_idct },
3142 #if defined(HAVE_INT128_T)
3143 	{ "int128",		stress_cpu_int128 },
3144 #endif
3145 	{ "int64",		stress_cpu_int64 },
3146 	{ "int32",		stress_cpu_int32 },
3147 	{ "int16",		stress_cpu_int16 },
3148 	{ "int8",		stress_cpu_int8 },
3149 #if defined(HAVE_INT128_T)
3150 	{ "int128float",	stress_cpu_int128_float },
3151 	{ "int128double",	stress_cpu_int128_double },
3152 	{ "int128longdouble",	stress_cpu_int128_longdouble },
3153 #if defined(HAVE_FLOAT_DECIMAL32) &&	\
3154     !defined(__clang__)
3155 	{ "int128decimal32",	stress_cpu_int128_decimal32 },
3156 #endif
3157 #if defined(HAVE_FLOAT_DECIMAL64) &&	\
3158     !defined(__clang__)
3159 	{ "int128decimal64",	stress_cpu_int128_decimal64 },
3160 #endif
3161 #if defined(HAVE_FLOAT_DECIMAL128) &&	\
3162     !defined(__clang__)
3163 	{ "int128decimal128",	stress_cpu_int128_decimal128 },
3164 #endif
3165 #endif
3166 	{ "int64float",		stress_cpu_int64_float },
3167 	{ "int64double",	stress_cpu_int64_double },
3168 	{ "int64longdouble",	stress_cpu_int64_longdouble },
3169 	{ "int32float",		stress_cpu_int32_float },
3170 	{ "int32double",	stress_cpu_int32_double },
3171 	{ "int32longdouble",	stress_cpu_int32_longdouble },
3172 	{ "intconversion",	stress_cpu_intconversion },
3173 	{ "ipv4checksum",	stress_cpu_ipv4checksum },
3174 	{ "jenkin",		stress_cpu_jenkin },
3175 	{ "jmp",		stress_cpu_jmp },
3176 	{ "lfsr32",		stress_cpu_lfsr32 },
3177 	{ "ln2",		stress_cpu_ln2 },
3178 	{ "longdouble",		stress_cpu_longdouble },
3179 	{ "loop",		stress_cpu_loop },
3180 	{ "matrixprod",		stress_cpu_matrix_prod },
3181 	{ "murmur3_32",		stress_cpu_murmur3_32 },
3182 	{ "nhash",		stress_cpu_nhash },
3183 	{ "nsqrt",		stress_cpu_nsqrt },
3184 	{ "omega",		stress_cpu_omega },
3185 	{ "parity",		stress_cpu_parity },
3186 	{ "phi",		stress_cpu_phi },
3187 	{ "pi",			stress_cpu_pi },
3188 	{ "pjw",		stress_cpu_pjw },
3189 	{ "prime",		stress_cpu_prime },
3190 	{ "psi",		stress_cpu_psi },
3191 	{ "queens",		stress_cpu_queens },
3192 	{ "rand",		stress_cpu_rand },
3193 	{ "rand48",		stress_cpu_rand48 },
3194 	{ "rgb",		stress_cpu_rgb },
3195 	{ "sdbm",		stress_cpu_sdbm },
3196 	{ "sieve",		stress_cpu_sieve },
3197 	{ "stats",		stress_cpu_stats },
3198 	{ "sqrt", 		stress_cpu_sqrt },
3199 	{ "trig",		stress_cpu_trig },
3200 	{ "union",		stress_cpu_union },
3201 #if defined(HAVE_COMPLEX_H) &&		\
3202     defined(HAVE_COMPLEX) &&		\
3203     defined(__STDC_IEC_559_COMPLEX__) &&\
3204     !defined(__UCLIBC__)
3205 	{ "zeta",		stress_cpu_zeta },
3206 #endif
3207 	{ NULL,			NULL }
3208 };
3209 
3210 /*
3211  *  stress_set_cpu_method()
3212  *	set the default cpu stress method
3213  */
stress_set_cpu_method(const char * name)3214 static int stress_set_cpu_method(const char *name)
3215 {
3216 	stress_cpu_method_info_t const *info;
3217 
3218 	for (info = cpu_methods; info->func; info++) {
3219 		if (!strcmp(info->name, name)) {
3220 			stress_set_setting("cpu-method", TYPE_ID_UINTPTR_T, &info);
3221 			return 0;
3222 		}
3223 	}
3224 
3225 	(void)fprintf(stderr, "cpu-method must be one of:");
3226 	for (info = cpu_methods; info->func; info++) {
3227 		(void)fprintf(stderr, " %s", info->name);
3228 	}
3229 	(void)fprintf(stderr, "\n");
3230 
3231 	return -1;
3232 }
3233 
/*
 *  stress_per_cpu_time()
 *	try to get accurate CPU time from the CPUTIME clock,
 *	or fall back to wall clock time if not possible.
 *	The time is returned in seconds.
 */
static double stress_per_cpu_time(void)
{
#if defined(CLOCK_PROCESS_CPUTIME_ID)
	/* Cleared on the first failure so the clock is not retried */
	static bool use_clock_gettime = true;

	/*
	 *  Where possible try to get time used on the CPU
	 *  rather than wall clock time to get more accurate
	 *  CPU consumption measurements
	 */
	if (use_clock_gettime) {
		struct timespec ts;

		if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts) != 0) {
			use_clock_gettime = false;
		} else {
			const double secs = (double)ts.tv_sec;
			const double nsecs = (double)ts.tv_nsec;

			return secs + nsecs / (double)STRESS_NANOSECOND;
		}
	}
#endif
	/*
	 *  Can't get CPU clock time, fall back to wall clock time
	 */
	return stress_time_now();
}
3263 
3264 /*
3265  *  stress_cpu()
3266  *	stress CPU by doing floating point math ops
3267  */
stress_cpu(const stress_args_t * args)3268 static int HOT OPTIMIZE3 stress_cpu(const stress_args_t *args)
3269 {
3270 	double bias;
3271 	const stress_cpu_method_info_t *cpu_method = &cpu_methods[0];
3272 	stress_cpu_func func;
3273 	int32_t cpu_load = 100;
3274 	int32_t cpu_load_slice = -64;
3275 
3276 	(void)stress_get_setting("cpu-load", &cpu_load);
3277 	(void)stress_get_setting("cpu-load-slice", &cpu_load_slice);
3278 	(void)stress_get_setting("cpu-method", &cpu_method);
3279 
3280 	func = cpu_method->func;
3281 
3282 	pr_dbg("%s using method '%s'\n", args->name, cpu_method->name);
3283 
3284 	/*
3285 	 * It is unlikely, but somebody may request to do a zero
3286 	 * load stress test(!)
3287 	 */
3288 	if (cpu_load == 0) {
3289 		(void)sleep((unsigned int)g_opt_timeout);
3290 		return EXIT_SUCCESS;
3291 	}
3292 
3293 	stress_set_proc_state(args->name, STRESS_STATE_RUN);
3294 
3295 	/*
3296 	 * Normal use case, 100% load, simple spinning on CPU
3297 	 */
3298 	if (cpu_load == 100) {
3299 		do {
3300 			(void)func(args->name);
3301 			inc_counter(args);
3302 		} while (keep_stressing(args));
3303 		return EXIT_SUCCESS;
3304 	}
3305 
3306 	/*
3307 	 * More complex percentage CPU utilisation.  This is
3308 	 * not intended to be 100% accurate timing, it is good
3309 	 * enough for most purposes.
3310 	 */
3311 	bias = 0.0;
3312 	do {
3313 		double delay, t1, t2;
3314 		struct timeval tv;
3315 
3316 		t1 = stress_per_cpu_time();
3317 		if (cpu_load_slice < 0) {
3318 			/* < 0 specifies number of iterations to do per slice */
3319 			int j;
3320 
3321 			for (j = 0; j < -cpu_load_slice; j++) {
3322 				(void)func(args->name);
3323 				if (!keep_stressing_flag())
3324 					break;
3325 				inc_counter(args);
3326 			}
3327 			t2 = stress_per_cpu_time();
3328 		} else if (cpu_load_slice == 0) {
3329 			/* == 0, random time slices */
3330 			const uint16_t r = stress_mwc16();
3331 			double slice_end = t1 + ((double)r / 131072.0);
3332 			do {
3333 				(void)func(args->name);
3334 				t2 = stress_per_cpu_time();
3335 				if (!keep_stressing_flag())
3336 					break;
3337 				inc_counter(args);
3338 			} while (t2 < slice_end);
3339 		} else {
3340 			/* > 0, time slice in milliseconds */
3341 			const double slice_end = t1 + ((double)cpu_load_slice / 1000.0);
3342 
3343 			do {
3344 				(void)func(args->name);
3345 				t2 = stress_per_cpu_time();
3346 				if (!keep_stressing_flag())
3347 					break;
3348 				inc_counter(args);
3349 			} while (t2 < slice_end);
3350 		}
3351 
3352 		/* Must not calculate this with zero % load */
3353 		delay = (((100 - cpu_load) * (t2 - t1)) / (double)cpu_load);
3354 		delay -= bias;
3355 
3356 		/* We may have clock warping so don't sleep for -ve delays */
3357 		if (delay < 0.0) {
3358 			bias = 0.0;
3359 		} else {
3360 			/*
3361 			 *  We need to sleep for a small amount of
3362 			 *  time, measurements need to be based on
3363 			 *  wall clock time and NOT on cpu time used.
3364 			 */
3365 			double t3;
3366 
3367 			t2 = stress_time_now();
3368 
3369 			tv.tv_sec = (time_t)delay;
3370 			tv.tv_usec = (long)((delay - (double)tv.tv_sec) * 1000000.0);
3371 			(void)select(0, NULL, NULL, NULL, &tv);
3372 			t3 = stress_time_now();
3373 			/* Bias takes account of the time to do the delay */
3374 			bias = (t3 - t2) - delay;
3375 		}
3376 	} while (keep_stressing(args));
3377 
3378 	if (stress_is_affinity_set() && (args->instance == 0)) {
3379 		pr_inf("%s: CPU affinity probably set, this can affect CPU loading\n",
3380 			args->name);
3381 	}
3382 
3383 	stress_set_proc_state(args->name, STRESS_STATE_DEINIT);
3384 
3385 	return EXIT_SUCCESS;
3386 }
3387 
/*
 *  stress_cpu_set_default()
 *	select the "all" cpu method as the default; the result
 *	of the lookup is deliberately ignored
 */
static void stress_cpu_set_default(void)
{
	(void)stress_set_cpu_method("all");
}
3392 
/*
 *  Table pairing each cpu stressor option id with the function
 *  that handles it.  The final { 0, NULL } entry is presumably
 *  the end-of-table marker (the code that walks this table is
 *  not in this part of the file).
 */
static const stress_opt_set_func_t opt_set_funcs[] = {
	{ OPT_cpu_load,		stress_set_cpu_load },
	{ OPT_cpu_load_slice,	stress_set_cpu_load_slice },
	{ OPT_cpu_method,	stress_set_cpu_method },
	{ 0,			NULL },
};
3399 
3400 stressor_info_t stress_cpu_info = {
3401 	.stressor = stress_cpu,
3402 	.set_default = stress_cpu_set_default,
3403 	.class = CLASS_CPU,
3404 	.opt_set_funcs = opt_set_funcs,
3405 	.help = help
3406 };
3407