1 #include "benchmark_helpers.hpp"
2
3 #ifdef __SSE__
4 #include <xmmintrin.h>
5 #endif
6
7 #include "../simd_binary_arithmetic.hpp"
8
9 using namespace nova;
10 using namespace std;
11
12 aligned_array<float, 64> out, in;
13
14 #ifdef __SSE__
/**
 * Baseline SSE kernel: out[i] = in[i] + f, one 4-float vector per iteration
 * (no unrolling).
 *
 * Uses the aligned intrinsics _mm_load_ps / _mm_store_ps, so both buffers
 * must be 16-byte aligned. Only whole vectors are processed: n is rounded
 * down to a multiple of 4 and any remaining tail elements are left untouched.
 *
 * \param out destination buffer, at least n floats
 * \param in  source buffer, at least n floats
 * \param f   scalar added to every element
 * \param n   number of floats to process
 */
void __noinline__ bench_1(float * out, float * in, float f, unsigned int n)
{
    const __m128 scalar = _mm_set_ps1(f);

    // Pre-tested loop over whole vectors: with n < 4 the count is zero and
    // nothing happens. A post-tested `do ... while (--n)` would execute once,
    // wrap the unsigned counter and run far past the end of both buffers.
    for (unsigned int vectors = n / 4; vectors != 0; --vectors)
    {
        __m128 arg = _mm_load_ps(in);
        __m128 result = _mm_add_ps(arg, scalar);
        _mm_store_ps(out, result);
        in += 4;
        out += 4;
    }
}
31
/**
 * SSE kernel unrolled by two, interleaved: out[i] = in[i] + f, handling 8
 * floats per iteration as two back-to-back load/add/store sequences that
 * reuse the same temporaries.
 *
 * Uses aligned loads/stores, so both buffers must be 16-byte aligned. Only
 * whole 8-float blocks are processed; a tail of n % 8 elements is left
 * untouched.
 *
 * \param out destination buffer, at least n floats
 * \param in  source buffer, at least n floats
 * \param f   scalar added to every element
 * \param n   number of floats to process
 */
void __noinline__ bench_2(float * out, float * in, float f, unsigned int n)
{
    const __m128 scalar = _mm_set_ps1(f);

    // Pre-tested loop: n < 8 yields zero blocks and is a no-op, instead of
    // wrapping the unsigned counter as `do ... while (--n)` would.
    for (unsigned int blocks = n / 8; blocks != 0; --blocks)
    {
        // The statement order (load, add, store, then the second vector) is
        // what distinguishes this variant from bench_4 -- keep it.
        __m128 arg = _mm_load_ps(in);
        __m128 result = _mm_add_ps(arg, scalar);
        _mm_store_ps(out, result);

        arg = _mm_load_ps(in + 4);
        result = _mm_add_ps(arg, scalar);
        _mm_store_ps(out + 4, result);

        in += 8;
        out += 8;
    }
}
50
/**
 * Same interleaved two-vector kernel as bench_2, but with both pointers
 * declared __restrict__, i.e. the caller promises that out and in do not
 * alias.
 *
 * Uses aligned loads/stores, so both buffers must be 16-byte aligned. Only
 * whole 8-float blocks are processed; a tail of n % 8 elements is left
 * untouched.
 *
 * \param out destination buffer, at least n floats, must not overlap in
 * \param in  source buffer, at least n floats
 * \param f   scalar added to every element
 * \param n   number of floats to process
 */
void __noinline__ bench_3(float * __restrict__ out, float * __restrict__ in, float f, unsigned int n)
{
    const __m128 scalar = _mm_set_ps1(f);

    // Pre-tested loop: n < 8 yields zero blocks and is a no-op, instead of
    // wrapping the unsigned counter as `do ... while (--n)` would.
    for (unsigned int blocks = n / 8; blocks != 0; --blocks)
    {
        __m128 arg = _mm_load_ps(in);
        __m128 result = _mm_add_ps(arg, scalar);
        _mm_store_ps(out, result);

        arg = _mm_load_ps(in + 4);
        result = _mm_add_ps(arg, scalar);
        _mm_store_ps(out + 4, result);

        in += 8;
        out += 8;
    }
}
69
70
/**
 * SSE kernel unrolled by two, grouped: out[i] = in[i] + f, handling 8 floats
 * per iteration by issuing both loads first, then both adds, then both
 * stores, each vector in its own temporary.
 *
 * Uses aligned loads/stores, so both buffers must be 16-byte aligned. Only
 * whole 8-float blocks are processed; a tail of n % 8 elements is left
 * untouched.
 *
 * \param out destination buffer, at least n floats
 * \param in  source buffer, at least n floats
 * \param f   scalar added to every element
 * \param n   number of floats to process
 */
void __noinline__ bench_4(float * out, float * in, float f, unsigned int n)
{
    const __m128 scalar = _mm_set_ps1(f);

    // Pre-tested loop: n < 8 yields zero blocks and is a no-op, instead of
    // wrapping the unsigned counter as `do ... while (--n)` would.
    for (unsigned int blocks = n / 8; blocks != 0; --blocks)
    {
        // Loads, adds and stores are deliberately grouped by kind; this
        // ordering is what is being compared against bench_2.
        __m128 arg = _mm_load_ps(in);
        __m128 arg2 = _mm_load_ps(in + 4);
        __m128 result = _mm_add_ps(arg, scalar);
        __m128 result2 = _mm_add_ps(arg2, scalar);
        _mm_store_ps(out, result);
        _mm_store_ps(out + 4, result2);

        in += 8;
        out += 8;
    }
}
90
/**
 * SSE kernel unrolled by four, grouped: out[i] = in[i] + f, handling 16
 * floats per iteration by issuing four loads, then four adds, then four
 * stores.
 *
 * Uses aligned loads/stores, so both buffers must be 16-byte aligned. Only
 * whole 16-float blocks are processed; a tail of n % 16 elements is left
 * untouched.
 *
 * \param out destination buffer, at least n floats
 * \param in  source buffer, at least n floats
 * \param f   scalar added to every element
 * \param n   number of floats to process
 */
void __noinline__ bench_5(float * out, float * in, float f, unsigned int n)
{
    const __m128 scalar = _mm_set_ps1(f);

    // Pre-tested loop: n < 16 yields zero blocks and is a no-op, instead of
    // wrapping the unsigned counter as `do ... while (--n)` would.
    for (unsigned int blocks = n / 16; blocks != 0; --blocks)
    {
        __m128 arg = _mm_load_ps(in);
        __m128 arg2 = _mm_load_ps(in + 4);
        __m128 arg3 = _mm_load_ps(in + 8);
        __m128 arg4 = _mm_load_ps(in + 12);
        __m128 result = _mm_add_ps(arg, scalar);
        __m128 result2 = _mm_add_ps(arg2, scalar);
        __m128 result3 = _mm_add_ps(arg3, scalar);
        __m128 result4 = _mm_add_ps(arg4, scalar);
        _mm_store_ps(out, result);
        _mm_store_ps(out + 4, result2);
        _mm_store_ps(out + 8, result3);
        _mm_store_ps(out + 12, result4);

        in += 16;
        out += 16;
    }
}
116 #endif
117
118 /*void __noinline__ bench_6(float * out, float * in, float f, unsigned int n)
119 {
120 n /= 8;
121
122 do
123 {
124 nova::plus_vec_simd<8>(out, in, f);
125 in += 8;
126 out += 8;
127 }
128 while (--n);
129 }
130 */
bench_7(float * out,float * in,float f,unsigned int n)131 void __noinline__ bench_7(float * out, float * in, float f, unsigned int n)
132 {
133 n /= 16;
134
135 do
136 {
137 nova::plus_vec_simd<16>(out, in, f);
138 in += 16;
139 out += 16;
140 }
141 while (--n);
142 }
143
bench_8(float * out,float * in,float f,unsigned int n)144 void __noinline__ bench_8(float * out, float * in, float f, unsigned int n)
145 {
146 n /= 32;
147
148 do
149 {
150 nova::plus_vec_simd<32>(out, in, f);
151 in += 32;
152 out += 32;
153 }
154 while (--n);
155 }
156
157
main(void)158 int main(void)
159 {
160 out.assign(0.f);
161 in.assign(0.f);
162
163 const unsigned int iterations = 100000000;
164
165 #ifdef __SSE__
166 run_bench(boost::bind(bench_1, out.begin(), in.begin(), 1.f, 64), iterations);
167 run_bench(boost::bind(bench_2, out.begin(), in.begin(), 1.f, 64), iterations);
168 run_bench(boost::bind(bench_3, out.begin(), in.begin(), 1.f, 64), iterations);
169 run_bench(boost::bind(bench_4, out.begin(), in.begin(), 1.f, 64), iterations);
170 run_bench(boost::bind(bench_5, out.begin(), in.begin(), 1.f, 64), iterations);
171 #endif
172 /* run_bench(boost::bind(bench_6, out.begin(), in.begin(), 1.f, 64), iterations);*/
173 run_bench(boost::bind(bench_7, out.begin(), in.begin(), 1.f, 64), iterations);
174 run_bench(boost::bind(bench_8, out.begin(), in.begin(), 1.f, 64), iterations);
175 }
176