1 #include "benchmark_helpers.hpp"
2 #include <cmath>
3 
4 #ifdef __SSE__
5 #include <xmmintrin.h>
6 #endif
7 
8 using namespace std;
9 
10 nova::aligned_array<float, 64> out, in, in2;
11 
12 typedef float afloat __attribute__ ((__aligned__(16)));
13 
bench_1(float * out,float * in1,float in2,float slope,unsigned int n)14 void __noinline__ bench_1(float * out, float * in1, float in2, float slope, unsigned int n)
15 {
16     for (unsigned int i = 0; i != n; ++i)
17     {
18         out[i] = in1[i] + in2;
19         in2 += slope;
20     }
21 }
22 
bench_2(float * out,float * in1,float in2,float slope,unsigned int n)23 void __noinline__ bench_2(float * out, float * in1, float in2, float slope, unsigned int n)
24 {
25     for (unsigned int i = 0; i != n; i += 4)
26     {
27         out[i] = in1[i] + in2; in2 += slope;
28         out[i+1] = in1[i+1] + in2; in2 += slope;
29         out[i+2] = in1[i+2] + in2; in2 += slope;
30         out[i+3] = in1[i+3] + in2; in2 += slope;
31     }
32 }
33 
34 #ifdef __SSE__
bench_3(float * out,float * in1,float in2,float slope,unsigned int n)35 void __noinline__ bench_3(float * out, float * in1, float in2, float slope, unsigned int n)
36 {
37     __m128 arg2 = _mm_set_ps(in2, in2+slope, in2+slope+slope, in2+slope+slope+slope);
38     const __m128 vslope = _mm_set_ps1(slope+slope+slope+slope);
39 
40     std::size_t loops = n / 4;
41 
42     do {
43         __m128 arg1 = _mm_load_ps(in1);
44         __m128 result = _mm_add_ps(arg1, arg2);
45         arg2 = _mm_add_ps(arg2, vslope);
46         _mm_store_ps(out, result);
47         in1+=4;
48         out+=4;
49     } while (--loops);
50 }
51 
bench_3a(float * out,float * in1,float in2,float slope,unsigned int n)52 void __noinline__ bench_3a(float * out, float * in1, float in2, float slope, unsigned int n)
53 {
54     __m128 arg2 = _mm_set_ps(in2, in2+slope, in2+ 2*slope, in2+3*slope);
55     const __m128 vslope = _mm_set_ps1(4 * slope);
56 
57     std::size_t loops = n / 4;
58 
59     do {
60         __m128 arg1 = _mm_load_ps(in1);
61         __m128 result = _mm_add_ps(arg1, arg2);
62         arg2 = _mm_add_ps(arg2, vslope);
63         _mm_store_ps(out, result);
64         in1+=4;
65         out+=4;
66     } while (--loops);
67 }
68 #endif
69 
bench_4(float * out,float * in1,float in2,float slope,unsigned int n)70 void __noinline__ bench_4(float * out, float * in1, float in2, float slope, unsigned int n)
71 {
72     for (unsigned int i = 0; i != n; ++i)
73     {
74         out[i] = in1[i] * in2;
75         in2 += slope;
76     }
77 }
78 
bench_5(float * out,float * in1,float in2,float slope,unsigned int n)79 void __noinline__ bench_5(float * out, float * in1, float in2, float slope, unsigned int n)
80 {
81     for (unsigned int i = 0; i != n; i += 4)
82     {
83         out[i] = in1[i] * in2; in2 += slope;
84         out[i+1] = in1[i+1] * in2; in2 += slope;
85         out[i+2] = in1[i+2] * in2; in2 += slope;
86         out[i+3] = in1[i+3] * in2; in2 += slope;
87     }
88 }
89 
90 #ifdef __SSE__
bench_6(float * out,float * in1,float in2,float slope,unsigned int n)91 void __noinline__ bench_6(float * out, float * in1, float in2, float slope, unsigned int n)
92 {
93     __m128 arg2 = _mm_set_ps(in2, in2+slope, in2+slope+slope, in2+slope+slope+slope);
94     const __m128 vslope = _mm_set_ps1(slope+slope+slope+slope);
95 
96     std::size_t loops = n / 4;
97 
98     do {
99         __m128 arg1 = _mm_load_ps(in1);
100         __m128 result = _mm_mul_ps(arg1, arg2);
101         arg2 = _mm_add_ps(arg2, vslope);
102         _mm_store_ps(out, result);
103         in1+=4;
104         out+=4;
105     } while (--loops);
106 }
107 
bench_6a(float * out,float * in1,float in2,float slope,unsigned int n)108 void __noinline__ bench_6a(float * out, float * in1, float in2, float slope, unsigned int n)
109 {
110     __m128 arg2 = _mm_set_ps(in2, in2+slope, in2+2*slope, in2+3*slope);
111     const __m128 vslope = _mm_set_ps1(4*slope);
112 
113     std::size_t loops = n / 4;
114 
115     do {
116         __m128 arg1 = _mm_load_ps(in1);
117         __m128 result = _mm_mul_ps(arg1, arg2);
118         arg2 = _mm_add_ps(arg2, vslope);
119         _mm_store_ps(out, result);
120         in1+=4;
121         out+=4;
122     } while (--loops);
123 }
124 #endif
125 
main(void)126 int main(void)
127 {
128     out.assign(0.f);
129     in.assign(0.2f);
130     in2.assign(0.3f);
131 
132     const unsigned int iterations = 50000000;
133 
134     run_bench(boost::bind(bench_1, out.begin(), in.begin(), 0.1f, 0.001f, 64), iterations);
135     run_bench(boost::bind(bench_2, out.begin(), in.begin(), 0.1f, 0.001f, 64), iterations);
136 #ifdef __SSE__
137     run_bench(boost::bind(bench_3, out.begin(), in.begin(), 0.1f, 0.001f, 64), iterations);
138     run_bench(boost::bind(bench_3a, out.begin(), in.begin(), 0.1f, 0.001f, 64), iterations);
139 #endif
140 
141     run_bench(boost::bind(bench_4, out.begin(), in.begin(), 0.1f, 0.001f, 64), iterations);
142     run_bench(boost::bind(bench_5, out.begin(), in.begin(), 0.1f, 0.001f, 64), iterations);
143 
144 #ifdef __SSE__
145     run_bench(boost::bind(bench_6, out.begin(), in.begin(), 0.1f, 0.001f, 64), iterations);
146     run_bench(boost::bind(bench_6a, out.begin(), in.begin(), 0.1f, 0.001f, 64), iterations);
147 #endif
148 }
149