1 #define _POSIX_C_SOURCE 199309L
2 
3 #include <float.h>
4 #include <math.h>
5 #include <stdint.h>
6 #include <stdio.h>
7 #include <time.h>
8 
9 #ifdef __MACH__
10   #include <mach/clock.h>
11   #include <mach/mach.h>
12 #endif
13 
14 #include <emmintrin.h>
15 
rcp_ieee(float x)16 static float rcp_ieee(float x)
17 {
18 	return 1.0f / x;
19 }
20 
rcp_sse(float x)21 static float rcp_sse(float x)
22 {
23 	return _mm_cvtss_f32(_mm_rcp_ss(_mm_set1_ps(x)));
24 }
25 
rcp_newton(float x)26 static float rcp_newton(float x)
27 {
28 	float xi;
29 
30 	xi  = rcp_sse(x);
31 	xi = xi + xi * (1.0f - x * xi);
32 
33 	return xi;
34 }
35 
sub_timespec(struct timespec a,struct timespec b)36 static long long sub_timespec(struct timespec a, struct timespec b)
37 {
38 	return (long long)(a.tv_sec - b.tv_sec) * 1000000000 + a.tv_nsec - b.tv_nsec;
39 }
40 
41 __attribute__((noinline))
check_accuracy(void)42 static void check_accuracy(void)
43 {
44 	double abs_err = 0.0;
45 	double rel_err = 0.0;
46 
47 	double abs_err_x = 0.0;
48 	double rel_err_x = 0.0;
49 
50 	float x;
51 
52 	for (x = 1.0f; x < 2.0f; x = nextafterf(x, 2.0f)) {
53 		float exact  = rcp_ieee(x);
54 		float approx = rcp_newton(x);
55 
56 		double abs_err_curr = fabsf(exact - approx);
57 		double rel_err_curr = abs_err_curr / exact;
58 
59 		if (abs_err_curr > abs_err) {
60 			abs_err   = abs_err_curr;
61 			abs_err_x = x;
62 		}
63 		if (rel_err_curr > rel_err) {
64 			rel_err   = rel_err_curr;
65 			rel_err_x = x;
66 		}
67 	}
68 
69 	printf("rcp(1) = %e (%e)\n", rcp_newton(1.0f), 1.0);
70 	printf("rcp(2) = %e (%e)\n", rcp_newton(2.0f), 0.5);
71 
72 	printf("rcp(1-eps) = %e (%e)\n", rcp_newton(1.0f - FLT_EPSILON / 2.0f), rcp_ieee(1.0f - FLT_EPSILON / 2.0f));
73 	printf("rcp(1+eps) = %e (%e)\n", rcp_newton(1.0f + FLT_EPSILON), rcp_ieee(1.0f - FLT_EPSILON));
74 	printf("rcp(2-eps) = %e (%e)\n", rcp_newton(2.0f - FLT_EPSILON), rcp_ieee(2.0f - FLT_EPSILON));
75 
76 	printf("abs err: %e @ %e (%a)\n", abs_err, abs_err_x, abs_err_x);
77 	printf("rel err: %e @ %e (%a)\n", rel_err, rel_err_x, rel_err_x);
78 }
79 
80 #ifdef __MACH__ // OS X does not have clock_gettime, use clock_get_time
81   #define clock_gettime(id, tp) \
82 	do { \
83 		clock_serv_t cclock; \
84 		mach_timespec_t mts; \
85 		host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock); \
86 		clock_get_time(cclock, &mts); \
87 		mach_port_deallocate(mach_task_self(), cclock); \
88 		(tp)->tv_sec = mts.tv_sec; \
89 		(tp)->tv_nsec = mts.tv_nsec; \
90 	} while (0)
91 #endif
92 
93 __attribute__((noinline))
benchmark_approx(void)94 static void benchmark_approx(void)
95 {
96 	struct timespec tp1;
97 	struct timespec tp2;
98 	long long diff;
99 
100 	float x;
101 
102 	clock_gettime(CLOCK_REALTIME, &tp1);
103 
104 	for (x = 1.0f; x < 2.0f; x += FLT_EPSILON) {
105 		volatile float q = rcp_newton(x);
106 	}
107 
108 	clock_gettime(CLOCK_REALTIME, &tp2);
109 
110 	diff = sub_timespec(tp2, tp1);
111 	printf("rcp (approx): %lld (%e rcp/s)\n", diff, (1.0 / FLT_EPSILON) * 1000000000 / diff);
112 }
113 
114 __attribute__((noinline))
benchmark_exact(void)115 static void benchmark_exact(void)
116 {
117 	struct timespec tp1;
118 	struct timespec tp2;
119 	long long diff;
120 
121 	float x;
122 
123 	clock_gettime(CLOCK_REALTIME, &tp1);
124 
125 	for (x = 1.0f; x < 2.0f; x += FLT_EPSILON) {
126 		volatile float q = rcp_ieee(x);
127 	}
128 
129 	clock_gettime(CLOCK_REALTIME, &tp2);
130 
131 	diff = sub_timespec(tp2, tp1);
132 	printf("rcp (exact):  %lld (%e rcp/s)\n", diff, (1.0 / FLT_EPSILON) * 1000000000 / diff);
133 }
134 
main()135 int main()
136 {
137 	check_accuracy();
138 
139 	benchmark_approx();
140 	benchmark_exact();
141 }
142