1 #include "benchmark_helpers.hpp"
2 
3 #ifdef __SSE__
4 #include <xmmintrin.h>
5 #endif
6 
7 #include "../simd_binary_arithmetic.hpp"
8 
9 using namespace nova;
10 using namespace std;
11 
// Global output and input buffers shared by every benchmark run in main():
// main() fills both with 0.f and passes their begin() pointers, together
// with an element count of 64, to each kernel.
// NOTE(review): the SSE kernels access these buffers with _mm_load_ps /
// _mm_store_ps, which require 16-byte aligned addresses; presumably
// aligned_array guarantees suitable alignment -- confirm in its definition.
aligned_array<float, 64> out, in;
13 
14 #ifdef __SSE__
/**
 * Adds the scalar f to each element of in and stores the sums in out,
 * handling one 4-float SSE vector per loop iteration (no unrolling).
 *
 * @param out destination buffer; written with _mm_store_ps, so it must be
 *            16-byte aligned
 * @param in  source buffer; read with _mm_load_ps, so it must be 16-byte
 *            aligned
 * @param f   value added to every element
 * @param n   number of floats to process; rounded down to a multiple of 4
 */
void __noinline__ bench_1(float * out, float * in, float f, unsigned int n)
{
    n /= 4;

    // Fewer than 4 floats: nothing to do. Without this guard the do/while
    // below would decrement 0 to UINT_MAX and run far past both buffers.
    if (n == 0)
        return;

    const __m128 scalar = _mm_set_ps1(f);

    do
    {
        const __m128 arg = _mm_load_ps(in);
        const __m128 result = _mm_add_ps(arg, scalar);
        _mm_store_ps(out, result);
        in += 4;
        out += 4;
    }
    while (--n);
}
31 
/**
 * Adds the scalar f to each element of in and stores the sums in out,
 * unrolled by two: each loop iteration processes two 4-float SSE vectors
 * one after the other (load, add, store; then load, add, store).
 *
 * @param out destination buffer; written with _mm_store_ps, so it must be
 *            16-byte aligned
 * @param in  source buffer; read with _mm_load_ps, so it must be 16-byte
 *            aligned
 * @param f   value added to every element
 * @param n   number of floats to process; rounded down to a multiple of 8
 */
void __noinline__ bench_2(float * out, float * in, float f, unsigned int n)
{
    n /= 8;

    // Fewer than 8 floats: nothing to do. Without this guard the do/while
    // below would decrement 0 to UINT_MAX and run far past both buffers.
    if (n == 0)
        return;

    const __m128 scalar = _mm_set_ps1(f);
    do
    {
        // first vector: fully finished before the second one is touched
        __m128 arg = _mm_load_ps(in);
        __m128 result = _mm_add_ps(arg, scalar);
        _mm_store_ps(out, result);

        // second vector
        arg = _mm_load_ps(in + 4);
        result = _mm_add_ps(arg, scalar);
        _mm_store_ps(out + 4, result);

        in += 8;
        out += 8;
    }
    while (--n);
}
50 
/**
 * Same kernel as the sequential two-vector unrolled variant, but with both
 * pointers declared __restrict__: adds the scalar f to each element of in
 * and stores the sums in out, two 4-float SSE vectors per loop iteration.
 *
 * @param out destination buffer; written with _mm_store_ps, so it must be
 *            16-byte aligned. Declared __restrict__, so it must not overlap in.
 * @param in  source buffer; read with _mm_load_ps, so it must be 16-byte
 *            aligned. Declared __restrict__, so it must not overlap out.
 * @param f   value added to every element
 * @param n   number of floats to process; rounded down to a multiple of 8
 */
void __noinline__ bench_3(float * __restrict__ out, float * __restrict__ in, float f, unsigned int n)
{
    n /= 8;

    // Fewer than 8 floats: nothing to do. Without this guard the do/while
    // below would decrement 0 to UINT_MAX and run far past both buffers.
    if (n == 0)
        return;

    const __m128 scalar = _mm_set_ps1(f);
    do
    {
        // first vector
        __m128 arg = _mm_load_ps(in);
        __m128 result = _mm_add_ps(arg, scalar);
        _mm_store_ps(out, result);

        // second vector
        arg = _mm_load_ps(in + 4);
        result = _mm_add_ps(arg, scalar);
        _mm_store_ps(out + 4, result);

        in += 8;
        out += 8;
    }
    while (--n);
}
69 
70 
/**
 * Adds the scalar f to each element of in and stores the sums in out,
 * unrolled by two with the stages grouped: each loop iteration issues both
 * loads first, then both additions, then both stores.
 *
 * @param out destination buffer; written with _mm_store_ps, so it must be
 *            16-byte aligned
 * @param in  source buffer; read with _mm_load_ps, so it must be 16-byte
 *            aligned
 * @param f   value added to every element
 * @param n   number of floats to process; rounded down to a multiple of 8
 */
void __noinline__ bench_4(float * out, float * in, float f, unsigned int n)
{
    n /= 8;

    // Fewer than 8 floats: nothing to do. Without this guard the do/while
    // below would decrement 0 to UINT_MAX and run far past both buffers.
    if (n == 0)
        return;

    const __m128 scalar = _mm_set_ps1(f);

    do
    {
        // loads
        const __m128 arg  = _mm_load_ps(in);
        const __m128 arg2 = _mm_load_ps(in + 4);
        // additions
        const __m128 result  = _mm_add_ps(arg, scalar);
        const __m128 result2 = _mm_add_ps(arg2, scalar);
        // stores
        _mm_store_ps(out, result);
        _mm_store_ps(out + 4, result2);

        in += 8;
        out += 8;
    }
    while (--n);
}
90 
/**
 * Adds the scalar f to each element of in and stores the sums in out,
 * unrolled by four with the stages grouped: each loop iteration issues four
 * loads, then four additions, then four stores (16 floats per iteration).
 *
 * @param out destination buffer; written with _mm_store_ps, so it must be
 *            16-byte aligned
 * @param in  source buffer; read with _mm_load_ps, so it must be 16-byte
 *            aligned
 * @param f   value added to every element
 * @param n   number of floats to process; rounded down to a multiple of 16
 */
void __noinline__ bench_5(float * out, float * in, float f, unsigned int n)
{
    n /= 16;

    // Fewer than 16 floats: nothing to do. Without this guard the do/while
    // below would decrement 0 to UINT_MAX and run far past both buffers.
    if (n == 0)
        return;

    const __m128 scalar = _mm_set_ps1(f);

    do
    {
        // loads
        const __m128 arg  = _mm_load_ps(in);
        const __m128 arg2 = _mm_load_ps(in + 4);
        const __m128 arg3 = _mm_load_ps(in + 8);
        const __m128 arg4 = _mm_load_ps(in + 12);
        // additions
        const __m128 result  = _mm_add_ps(arg, scalar);
        const __m128 result2 = _mm_add_ps(arg2, scalar);
        const __m128 result3 = _mm_add_ps(arg3, scalar);
        const __m128 result4 = _mm_add_ps(arg4, scalar);
        // stores
        _mm_store_ps(out, result);
        _mm_store_ps(out + 4, result2);
        _mm_store_ps(out + 8, result3);
        _mm_store_ps(out + 12, result4);

        in += 16;
        out += 16;
    }
    while (--n);
}
116 #endif
117 
118 /*void __noinline__ bench_6(float * out, float * in, float f, unsigned int n)
119 {
120     n /= 8;
121 
122     do
123     {
124         nova::plus_vec_simd<8>(out, in, f);
125         in += 8;
126         out += 8;
127     }
128     while (--n);
129 }
130 */
bench_7(float * out,float * in,float f,unsigned int n)131 void __noinline__ bench_7(float * out, float * in, float f, unsigned int n)
132 {
133     n /= 16;
134 
135     do
136     {
137         nova::plus_vec_simd<16>(out, in, f);
138         in += 16;
139         out += 16;
140     }
141     while (--n);
142 }
143 
bench_8(float * out,float * in,float f,unsigned int n)144 void __noinline__ bench_8(float * out, float * in, float f, unsigned int n)
145 {
146     n /= 32;
147 
148     do
149     {
150         nova::plus_vec_simd<32>(out, in, f);
151         in += 32;
152         out += 32;
153     }
154     while (--n);
155 }
156 
157 
main(void)158 int main(void)
159 {
160     out.assign(0.f);
161     in.assign(0.f);
162 
163     const unsigned int iterations = 100000000;
164 
165 #ifdef __SSE__
166     run_bench(boost::bind(bench_1, out.begin(), in.begin(), 1.f, 64), iterations);
167     run_bench(boost::bind(bench_2, out.begin(), in.begin(), 1.f, 64), iterations);
168     run_bench(boost::bind(bench_3, out.begin(), in.begin(), 1.f, 64), iterations);
169     run_bench(boost::bind(bench_4, out.begin(), in.begin(), 1.f, 64), iterations);
170     run_bench(boost::bind(bench_5, out.begin(), in.begin(), 1.f, 64), iterations);
171 #endif
172  /*   run_bench(boost::bind(bench_6, out.begin(), in.begin(), 1.f, 64), iterations);*/
173     run_bench(boost::bind(bench_7, out.begin(), in.begin(), 1.f, 64), iterations);
174     run_bench(boost::bind(bench_8, out.begin(), in.begin(), 1.f, 64), iterations);
175 }
176