1 #include "benchmark_helpers.hpp"
2 #include <cmath>
3
4 #ifdef __SSE__
5 #include <xmmintrin.h>
6 #endif
7
8 using namespace std;
9
10 nova::aligned_array<float, 64> out, in, in2;
11
12 typedef float afloat __attribute__ ((__aligned__(16)));
13
/**
 * Scalar reference: add a linear ramp to a buffer.
 *
 * Writes out[i] = in1[i] + ramp for i in [0, n), where ramp starts at in2
 * and is increased by slope after every sample.
 */
void __noinline__ bench_1(float * out, float * in1, float in2, float slope, unsigned int n)
{
    float ramp = in2;
    const float * src = in1;
    float * const last = out + n;

    for (float * dst = out; dst != last; ++dst, ++src) {
        *dst = *src + ramp;
        ramp += slope;
    }
}
22
/**
 * Scalar ramp-add, manually unrolled by four.
 *
 * Writes out[i] = in1[i] + ramp for i in [0, n), where ramp starts at in2
 * and is increased by slope after every sample.
 *
 * n may be any count: samples after the last full group of four are handled
 * by a plain loop, so nothing beyond n elements is read or written.
 */
void __noinline__ bench_2(float * out, float * in1, float in2, float slope, unsigned int n)
{
    // Largest multiple of four that does not exceed n.
    const unsigned int unrolled_end = n & ~3u;
    unsigned int i = 0;

    for (; i != unrolled_end; i += 4)
    {
        out[i]   = in1[i]   + in2; in2 += slope;
        out[i+1] = in1[i+1] + in2; in2 += slope;
        out[i+2] = in1[i+2] + in2; in2 += slope;
        out[i+3] = in1[i+3] + in2; in2 += slope;
    }

    // Remaining 0..3 samples.
    for (; i != n; ++i)
    {
        out[i] = in1[i] + in2;
        in2 += slope;
    }
}
33
34 #ifdef __SSE__
/**
 * SSE ramp-add, four samples per iteration.
 *
 * Writes out[i] = in1[i] + ramp_i for i in [0, n), where ramp_i is in2 plus
 * i steps of slope (up to floating-point rounding). The per-lane offsets and
 * the per-iteration step are built by repeated addition of slope; bench_3a
 * builds them with multiplications instead.
 *
 * out and in1 must be 16-byte aligned, as required by _mm_load_ps and
 * _mm_store_ps. n need not be a multiple of four: the remaining samples are
 * processed by a scalar loop.
 */
void __noinline__ bench_3(float * out, float * in1, float in2, float slope, unsigned int n)
{
    // _mm_setr_ps places its first argument in the lowest lane, which is the
    // lane stored to out[0]. _mm_set_ps takes the lanes highest-first and
    // would reverse the ramp inside every group of four samples.
    __m128 arg2 = _mm_setr_ps(in2, in2+slope, in2+slope+slope, in2+slope+slope+slope);
    const __m128 vslope = _mm_set_ps1(slope+slope+slope+slope);

    // Pre-tested loop: for n < 4 the vector body must not run at all
    // (a do/while on --loops would wrap around when loops starts at 0).
    for (std::size_t loops = n / 4; loops != 0; --loops)
    {
        const __m128 arg1 = _mm_load_ps(in1);
        _mm_store_ps(out, _mm_add_ps(arg1, arg2));
        arg2 = _mm_add_ps(arg2, vslope);
        in1 += 4;
        out += 4;
    }

    // Scalar tail: lane 0 of arg2 holds the ramp value of the next sample.
    float ramp = _mm_cvtss_f32(arg2);
    for (unsigned int rest = n % 4; rest != 0; --rest)
    {
        *out++ = *in1++ + ramp;
        ramp += slope;
    }
}
51
/**
 * SSE ramp-add, four samples per iteration (multiplication-based setup).
 *
 * Writes out[i] = in1[i] + ramp_i for i in [0, n), where ramp_i is in2 plus
 * i steps of slope (up to floating-point rounding). Unlike bench_3, the
 * per-lane offsets and the per-iteration step are built as multiples of
 * slope (2*slope, 3*slope, 4*slope).
 *
 * out and in1 must be 16-byte aligned, as required by _mm_load_ps and
 * _mm_store_ps. n need not be a multiple of four: the remaining samples are
 * processed by a scalar loop.
 */
void __noinline__ bench_3a(float * out, float * in1, float in2, float slope, unsigned int n)
{
    // _mm_setr_ps places its first argument in the lowest lane, which is the
    // lane stored to out[0]. _mm_set_ps takes the lanes highest-first and
    // would reverse the ramp inside every group of four samples.
    __m128 arg2 = _mm_setr_ps(in2, in2+slope, in2+2*slope, in2+3*slope);
    const __m128 vslope = _mm_set_ps1(4 * slope);

    // Pre-tested loop: for n < 4 the vector body must not run at all
    // (a do/while on --loops would wrap around when loops starts at 0).
    for (std::size_t loops = n / 4; loops != 0; --loops)
    {
        const __m128 arg1 = _mm_load_ps(in1);
        _mm_store_ps(out, _mm_add_ps(arg1, arg2));
        arg2 = _mm_add_ps(arg2, vslope);
        in1 += 4;
        out += 4;
    }

    // Scalar tail: lane 0 of arg2 holds the ramp value of the next sample.
    float ramp = _mm_cvtss_f32(arg2);
    for (unsigned int rest = n % 4; rest != 0; --rest)
    {
        *out++ = *in1++ + ramp;
        ramp += slope;
    }
}
68 #endif
69
/**
 * Scalar reference: multiply a buffer by a linear ramp.
 *
 * Writes out[i] = in1[i] * gain for i in [0, n), where gain starts at in2
 * and is increased by slope after every sample.
 */
void __noinline__ bench_4(float * out, float * in1, float in2, float slope, unsigned int n)
{
    float gain = in2;

    for (unsigned int remaining = n; remaining != 0; --remaining) {
        *out++ = *in1++ * gain;
        gain += slope;
    }
}
78
/**
 * Scalar ramp-multiply, manually unrolled by four.
 *
 * Writes out[i] = in1[i] * gain for i in [0, n), where gain starts at in2
 * and is increased by slope after every sample.
 *
 * n may be any count: samples after the last full group of four are handled
 * by a plain loop, so nothing beyond n elements is read or written.
 */
void __noinline__ bench_5(float * out, float * in1, float in2, float slope, unsigned int n)
{
    // Largest multiple of four that does not exceed n.
    const unsigned int unrolled_end = n & ~3u;
    unsigned int i = 0;

    for (; i != unrolled_end; i += 4)
    {
        out[i]   = in1[i]   * in2; in2 += slope;
        out[i+1] = in1[i+1] * in2; in2 += slope;
        out[i+2] = in1[i+2] * in2; in2 += slope;
        out[i+3] = in1[i+3] * in2; in2 += slope;
    }

    // Remaining 0..3 samples.
    for (; i != n; ++i)
    {
        out[i] = in1[i] * in2;
        in2 += slope;
    }
}
89
90 #ifdef __SSE__
/**
 * SSE ramp-multiply, four samples per iteration.
 *
 * Writes out[i] = in1[i] * gain_i for i in [0, n), where gain_i is in2 plus
 * i steps of slope (up to floating-point rounding). The per-lane offsets and
 * the per-iteration step are built by repeated addition of slope; bench_6a
 * builds them with multiplications instead.
 *
 * out and in1 must be 16-byte aligned, as required by _mm_load_ps and
 * _mm_store_ps. n need not be a multiple of four: the remaining samples are
 * processed by a scalar loop.
 */
void __noinline__ bench_6(float * out, float * in1, float in2, float slope, unsigned int n)
{
    // _mm_setr_ps places its first argument in the lowest lane, which is the
    // lane stored to out[0]. _mm_set_ps takes the lanes highest-first and
    // would reverse the ramp inside every group of four samples.
    __m128 arg2 = _mm_setr_ps(in2, in2+slope, in2+slope+slope, in2+slope+slope+slope);
    const __m128 vslope = _mm_set_ps1(slope+slope+slope+slope);

    // Pre-tested loop: for n < 4 the vector body must not run at all
    // (a do/while on --loops would wrap around when loops starts at 0).
    for (std::size_t loops = n / 4; loops != 0; --loops)
    {
        const __m128 arg1 = _mm_load_ps(in1);
        _mm_store_ps(out, _mm_mul_ps(arg1, arg2));
        arg2 = _mm_add_ps(arg2, vslope);
        in1 += 4;
        out += 4;
    }

    // Scalar tail: lane 0 of arg2 holds the gain of the next sample.
    float gain = _mm_cvtss_f32(arg2);
    for (unsigned int rest = n % 4; rest != 0; --rest)
    {
        *out++ = *in1++ * gain;
        gain += slope;
    }
}
107
/**
 * SSE ramp-multiply, four samples per iteration (multiplication-based setup).
 *
 * Writes out[i] = in1[i] * gain_i for i in [0, n), where gain_i is in2 plus
 * i steps of slope (up to floating-point rounding). Unlike bench_6, the
 * per-lane offsets and the per-iteration step are built as multiples of
 * slope (2*slope, 3*slope, 4*slope).
 *
 * out and in1 must be 16-byte aligned, as required by _mm_load_ps and
 * _mm_store_ps. n need not be a multiple of four: the remaining samples are
 * processed by a scalar loop.
 */
void __noinline__ bench_6a(float * out, float * in1, float in2, float slope, unsigned int n)
{
    // _mm_setr_ps places its first argument in the lowest lane, which is the
    // lane stored to out[0]. _mm_set_ps takes the lanes highest-first and
    // would reverse the ramp inside every group of four samples.
    __m128 arg2 = _mm_setr_ps(in2, in2+slope, in2+2*slope, in2+3*slope);
    const __m128 vslope = _mm_set_ps1(4*slope);

    // Pre-tested loop: for n < 4 the vector body must not run at all
    // (a do/while on --loops would wrap around when loops starts at 0).
    for (std::size_t loops = n / 4; loops != 0; --loops)
    {
        const __m128 arg1 = _mm_load_ps(in1);
        _mm_store_ps(out, _mm_mul_ps(arg1, arg2));
        arg2 = _mm_add_ps(arg2, vslope);
        in1 += 4;
        out += 4;
    }

    // Scalar tail: lane 0 of arg2 holds the gain of the next sample.
    float gain = _mm_cvtss_f32(arg2);
    for (unsigned int rest = n % 4; rest != 0; --rest)
    {
        *out++ = *in1++ * gain;
        gain += slope;
    }
}
124 #endif
125
main(void)126 int main(void)
127 {
128 out.assign(0.f);
129 in.assign(0.2f);
130 in2.assign(0.3f);
131
132 const unsigned int iterations = 50000000;
133
134 run_bench(boost::bind(bench_1, out.begin(), in.begin(), 0.1f, 0.001f, 64), iterations);
135 run_bench(boost::bind(bench_2, out.begin(), in.begin(), 0.1f, 0.001f, 64), iterations);
136 #ifdef __SSE__
137 run_bench(boost::bind(bench_3, out.begin(), in.begin(), 0.1f, 0.001f, 64), iterations);
138 run_bench(boost::bind(bench_3a, out.begin(), in.begin(), 0.1f, 0.001f, 64), iterations);
139 #endif
140
141 run_bench(boost::bind(bench_4, out.begin(), in.begin(), 0.1f, 0.001f, 64), iterations);
142 run_bench(boost::bind(bench_5, out.begin(), in.begin(), 0.1f, 0.001f, 64), iterations);
143
144 #ifdef __SSE__
145 run_bench(boost::bind(bench_6, out.begin(), in.begin(), 0.1f, 0.001f, 64), iterations);
146 run_bench(boost::bind(bench_6a, out.begin(), in.begin(), 0.1f, 0.001f, 64), iterations);
147 #endif
148 }
149