1 /**
2  * Copyright 2014 Andreas Schäfer
3  *
4  * Distributed under the Boost Software License, Version 1.0. (See accompanying
5  * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt)
6  */
7 
8 #include <iostream>
9 #include <libflatarray/flat_array.hpp>
10 #include <libflatarray/short_vec.hpp>
11 #include <libflatarray/testbed/cpu_benchmark.hpp>
12 #include <libflatarray/testbed/evaluate.hpp>
13 
14 #ifdef __SSE__
15 #include <xmmintrin.h>
16 #endif
17 
18 #ifdef __AVX__
19 #include <immintrin.h>
20 #endif
21 
22 #define WEIGHT_S 0.11
23 #define WEIGHT_T 0.12
24 #define WEIGHT_W 0.13
25 #define WEIGHT_C 0.20
26 #define WEIGHT_E 0.15
27 #define WEIGHT_B 0.16
28 #define WEIGHT_N 0.17
29 #define DELTA_T 0.001
30 #define SOFTENING 0.1
31 
32 using namespace LibFlatArray;
33 
34 class JacobiD3Q7 : public cpu_benchmark
35 {
36 public:
family()37     std::string family()
38     {
39         return "JacobiD3Q7";
40     }
41 
unit()42     std::string unit()
43     {
44         return "GLUPS";
45     }
46 
glups(std::vector<int> dim,int steps,double time) const47     double glups(std::vector<int> dim, int steps, double time) const
48     {
49         double updates = steps;
50         for (std::size_t i = 0; i < dim.size(); ++i) {
51             updates *= dim[i] - 2;
52         }
53 
54         double gigaLatticeUpdatesPerSecond = updates / time * 1e-9;
55         return gigaLatticeUpdatesPerSecond;
56     }
57 };
58 
59 class JacobiD3Q7Vanilla : public JacobiD3Q7
60 {
61 public:
species()62     std::string species()
63     {
64         return "vanilla";
65     }
66 
performance(std::vector<int> dim)67     double performance(std::vector<int> dim)
68     {
69         int dim_x = dim[0];
70         int dim_y = dim[1];
71         int dim_z = dim[2];
72         int maxT = 200000000 / dim_x / dim_y / dim_z;
73         maxT = std::max(16, maxT);
74 
75         int offsetZ = dim_x * dim_y;
76         int gridVolume = dim_x * dim_y * dim_z;
77         std::vector<double> compressedGrid(2 * gridVolume);
78         double *gridOld = &compressedGrid[0];
79         double *gridNew = &compressedGrid[gridVolume];
80 
81         for (int z = 0; z < dim_z; ++z) {
82             for (int y = 0; y < dim_y; ++y) {
83                 for (int x = 0; x < dim_x; ++x) {
84                     gridOld[z * offsetZ + y * dim_y + x] = x + y + z;
85                     gridNew[z * offsetZ + y * dim_y + x] = x + y + z;
86                 }
87             }
88         }
89 
90         double tStart = time();
91 
92         for (int t = 0; t < maxT; ++t) {
93             for (int z = 1; z < (dim_z - 1); ++z) {
94                 for (int y = 1; y < (dim_y - 1); ++y) {
95                     updateLine(gridOld, gridNew, 1, y, z, dim_x - 1, dim_x, offsetZ);
96                 }
97             }
98         }
99 
100         double tEnd = time();
101 
102         if (gridOld[1 * offsetZ + 1 * dim_y + 1] ==
103             gridNew[1 * offsetZ + 1 * dim_y + 1]) {
104             std::cout << "this is a debug statement to prevent the compiler from optimizing away the update routine\n";
105         }
106 
107         return glups(dim, maxT, tEnd - tStart);
108     }
109 
110 private:
updateLine(double * gridOld,double * gridNew,const int xStart,const int y,const int z,const int xEnd,const int offsetY,const int offsetZ) const111     void updateLine(double *gridOld, double *gridNew,
112                     const int xStart, const int y,       const int z,
113                     const int xEnd,   const int offsetY, const int offsetZ) const
114     {
115         for (int x = xStart; x < xEnd; ++x) {
116             gridNew[x + y * offsetY + z * offsetZ] =
117                 gridOld[x + y * offsetY + z * offsetZ - 1 * offsetZ] * WEIGHT_S +
118                 gridOld[x + y * offsetY + z * offsetZ - 1 * offsetY] * WEIGHT_T +
119                 gridOld[x + y * offsetY + z * offsetZ - 1          ] * WEIGHT_W +
120                 gridOld[x + y * offsetY + z * offsetZ + 0          ] * WEIGHT_C +
121                 gridOld[x + y * offsetY + z * offsetZ + 1          ] * WEIGHT_E +
122                 gridOld[x + y * offsetY + z * offsetZ + 1 * offsetY] * WEIGHT_B +
123                 gridOld[x + y * offsetY + z * offsetZ + 1 * offsetZ] * WEIGHT_N;
124         }
125     }
126 };
127 
128 #ifdef __SSE__
129 
130 class JacobiD3Q7Pepper : public JacobiD3Q7
131 {
132 public:
species()133     std::string species()
134     {
135         return "pepper";
136     }
137 
performance(std::vector<int> dim)138     double performance(std::vector<int> dim)
139     {
140         int dim_x = dim[0];
141         int dim_y = dim[1];
142         int dim_z = dim[2];
143         int maxT = 200000000 / dim_x / dim_y / dim_z;
144         maxT = std::max(16, maxT);
145 
146         int offsetZ = dim_x * dim_y;
147         int gridVolume = dim_x * dim_y * dim_z;
148         std::vector<double> compressedGrid(2 * gridVolume);
149         double *gridOld = &compressedGrid[0];
150         double *gridNew = &compressedGrid[gridVolume];
151 
152         for (int z = 0; z < dim_z; ++z) {
153             for (int y = 0; y < dim_y; ++y) {
154                 for (int x = 0; x < dim_x; ++x) {
155                     gridOld[z * offsetZ + y * dim_y + x] = x + y + z;
156                     gridNew[z * offsetZ + y * dim_y + x] = x + y + z;
157                 }
158             }
159         }
160 
161         double tStart = time();
162 
163         for (int t = 0; t < maxT; ++t) {
164             for (int z = 1; z < (dim_z - 1); ++z) {
165                 for (int y = 1; y < (dim_y - 1); ++y) {
166                     updateLine(gridOld, gridNew, 1, y, z, dim_x - 1, dim_x, offsetZ);
167                 }
168             }
169         }
170 
171         double tEnd = time();
172 
173         if (gridOld[1 * offsetZ + 1 * dim_y + 1] ==
174             gridNew[1 * offsetZ + 1 * dim_y + 1]) {
175             std::cout << "this is a debug statement to prevent the compiler from optimizing away the update routine\n";
176         }
177 
178         return glups(dim, maxT, tEnd - tStart);
179     }
180 
181 private:
updateLine(double * gridOld,double * gridNew,const int xStart,const int y,const int z,const int xEnd,const int offsetY,const int offsetZ) const182     void updateLine(double *gridOld, double *gridNew,
183                     const int xStart, const int y,       const int z,
184                     const int xEnd,   const int offsetY, const int offsetZ) const
185     {
186         __m128d factorS = _mm_set1_pd(WEIGHT_S);
187         __m128d factorT = _mm_set1_pd(WEIGHT_T);
188         __m128d factorW = _mm_set1_pd(WEIGHT_W);
189         __m128d factorE = _mm_set1_pd(WEIGHT_E);
190         __m128d factorB = _mm_set1_pd(WEIGHT_B);
191         __m128d factorN = _mm_set1_pd(WEIGHT_N);
192 
193         int x = xStart;
194 
195         if (x % 2) {
196             gridNew[x + y * offsetY + z * offsetZ] =
197                 gridOld[x + y * offsetY + z * offsetZ - 1 * offsetZ] * WEIGHT_S +
198                 gridOld[x + y * offsetY + z * offsetZ - 1 * offsetY] * WEIGHT_N +
199                 gridOld[x + y * offsetY + z * offsetZ - 1          ] * WEIGHT_W +
200                 gridOld[x + y * offsetY + z * offsetZ + 0          ] * WEIGHT_C +
201                 gridOld[x + y * offsetY + z * offsetZ + 1          ] * WEIGHT_E +
202                 gridOld[x + y * offsetY + z * offsetZ + 1 * offsetY] * WEIGHT_B +
203                 gridOld[x + y * offsetY + z * offsetZ + 1 * offsetZ] * WEIGHT_N;
204             ++x;
205         }
206 
207         for (; x < (xEnd - 7); x += 8) {
208             // load south row:
209             __m128d bufA = _mm_load_pd(gridOld + x + y * offsetY + z * offsetZ - 1 * offsetZ + 0);
210             __m128d bufB = _mm_load_pd(gridOld + x + y * offsetY + z * offsetZ - 1 * offsetZ + 2);
211             __m128d bufC = _mm_load_pd(gridOld + x + y * offsetY + z * offsetZ - 1 * offsetZ + 4);
212             __m128d bufD = _mm_load_pd(gridOld + x + y * offsetY + z * offsetZ - 1 * offsetZ + 6);
213             __m128d bufE;
214 
215             bufA = _mm_mul_pd(bufA, factorS);
216             bufB = _mm_mul_pd(bufB, factorS);
217             bufC = _mm_mul_pd(bufC, factorS);
218             bufD = _mm_mul_pd(bufD, factorS);
219 
220             // load top row:
221             __m128d sumA = _mm_load_pd(gridOld + x + y * offsetY + z * offsetZ - 1 * offsetY + 0);
222             __m128d sumB = _mm_load_pd(gridOld + x + y * offsetY + z * offsetZ - 1 * offsetY + 2);
223             __m128d sumC = _mm_load_pd(gridOld + x + y * offsetY + z * offsetZ - 1 * offsetY + 4);
224             __m128d sumD = _mm_load_pd(gridOld + x + y * offsetY + z * offsetZ - 1 * offsetY + 6);
225 
226             sumA = _mm_mul_pd(sumA, factorT);
227             sumB = _mm_mul_pd(sumB, factorT);
228             sumC = _mm_mul_pd(sumC, factorT);
229             sumD = _mm_mul_pd(sumD, factorT);
230 
231             sumA = _mm_add_pd(sumA, bufA);
232             sumB = _mm_add_pd(sumB, bufB);
233             sumC = _mm_add_pd(sumC, bufC);
234             sumD = _mm_add_pd(sumD, bufD);
235 
236             // load left/right row:
237             bufA = _mm_loadu_pd(gridOld + x + y * offsetY + z * offsetZ + 0 * offsetZ - 1);
238             bufB = _mm_loadu_pd(gridOld + x + y * offsetY + z * offsetZ + 0 * offsetZ + 1);
239             bufC = _mm_loadu_pd(gridOld + x + y * offsetY + z * offsetZ + 0 * offsetZ + 3);
240             bufD = _mm_loadu_pd(gridOld + x + y * offsetY + z * offsetZ + 0 * offsetZ + 5);
241             bufE = _mm_loadu_pd(gridOld + x + y * offsetY + z * offsetZ + 0 * offsetZ + 7);
242 
243             sumA = _mm_add_pd(sumA, _mm_mul_pd(bufA, factorW));
244             sumB = _mm_add_pd(sumB, _mm_mul_pd(bufB, factorW));
245             sumC = _mm_add_pd(sumC, _mm_mul_pd(bufC, factorW));
246             sumD = _mm_add_pd(sumD, _mm_mul_pd(bufD, factorW));
247 
248             sumA = _mm_add_pd(sumA, _mm_mul_pd(bufB, factorE));
249             sumB = _mm_add_pd(sumB, _mm_mul_pd(bufC, factorE));
250             sumC = _mm_add_pd(sumC, _mm_mul_pd(bufD, factorE));
251             sumD = _mm_add_pd(sumD, _mm_mul_pd(bufE, factorE));
252 
253             // load bottom row:
254             bufA = _mm_load_pd(gridOld + x + y * offsetY + z * offsetZ + 1 * offsetY + 0);
255             bufB = _mm_load_pd(gridOld + x + y * offsetY + z * offsetZ + 1 * offsetY + 2);
256             bufC = _mm_load_pd(gridOld + x + y * offsetY + z * offsetZ + 1 * offsetY + 4);
257             bufD = _mm_load_pd(gridOld + x + y * offsetY + z * offsetZ + 1 * offsetY + 6);
258 
259             bufA = _mm_mul_pd(bufA, factorB);
260             bufB = _mm_mul_pd(bufB, factorB);
261             bufC = _mm_mul_pd(bufC, factorB);
262             bufD = _mm_mul_pd(bufD, factorB);
263 
264             sumA = _mm_add_pd(sumA, bufA);
265             sumB = _mm_add_pd(sumB, bufB);
266             sumC = _mm_add_pd(sumC, bufC);
267             sumD = _mm_add_pd(sumD, bufD);
268 
269             // load north row:
270             bufA = _mm_load_pd(gridOld + x + y * offsetY + z * offsetZ + 1 * offsetZ + 0);
271             bufB = _mm_load_pd(gridOld + x + y * offsetY + z * offsetZ + 1 * offsetZ + 2);
272             bufC = _mm_load_pd(gridOld + x + y * offsetY + z * offsetZ + 1 * offsetZ + 4);
273             bufD = _mm_load_pd(gridOld + x + y * offsetY + z * offsetZ + 1 * offsetZ + 6);
274 
275             bufA = _mm_mul_pd(bufA, factorN);
276             bufB = _mm_mul_pd(bufB, factorN);
277             bufC = _mm_mul_pd(bufC, factorN);
278             bufD = _mm_mul_pd(bufD, factorN);
279 
280             sumA = _mm_add_pd(sumA, bufA);
281             sumB = _mm_add_pd(sumB, bufB);
282             sumC = _mm_add_pd(sumC, bufC);
283             sumD = _mm_add_pd(sumD, bufD);
284 
285             _mm_stream_pd(gridNew + x + y * offsetY + z * offsetZ + 0, sumA);
286             _mm_stream_pd(gridNew + x + y * offsetY + z * offsetZ + 2, sumB);
287             _mm_stream_pd(gridNew + x + y * offsetY + z * offsetZ + 4, sumC);
288             _mm_stream_pd(gridNew + x + y * offsetY + z * offsetZ + 6, sumD);
289         }
290 
291         for (; x < xEnd; ++x) {
292             gridNew[x + y * offsetY + z * offsetZ] =
293                 gridOld[x + y * offsetY + z * offsetZ - 1 * offsetZ] * WEIGHT_S +
294                 gridOld[x + y * offsetY + z * offsetZ - 1 * offsetY] * WEIGHT_N +
295                 gridOld[x + y * offsetY + z * offsetZ - 1          ] * WEIGHT_W +
296                 gridOld[x + y * offsetY + z * offsetZ + 0          ] * WEIGHT_C +
297                 gridOld[x + y * offsetY + z * offsetZ + 1          ] * WEIGHT_E +
298                 gridOld[x + y * offsetY + z * offsetZ + 1 * offsetY] * WEIGHT_B +
299                 gridOld[x + y * offsetY + z * offsetZ + 1 * offsetZ] * WEIGHT_N;
300         }
301     }
302 };
303 
304 #endif
305 
306 class JacobiCell
307 {
308 public:
JacobiCell(double temp=0)309     explicit JacobiCell(double temp = 0) :
310         temp(temp)
311     {}
312 
313     double temp;
314 };
315 
316 LIBFLATARRAY_REGISTER_SOA(JacobiCell,
317                           ((double)(temp)))
318 
319 class JacobiD3Q7Bronze : public JacobiD3Q7
320 {
321 public:
322     class UpdateFunctor
323     {
324     public:
UpdateFunctor(long dim_x,long dim_y,long dim_z)325         UpdateFunctor(long dim_x, long dim_y, long dim_z) :
326             dim_x(dim_x),
327             dim_y(dim_y),
328             dim_z(dim_z)
329         {}
330 
331         template<typename accessor_type1, typename accessor_type2>
operator ()(accessor_type1 & accessor1,accessor_type2 & accessor2) const332         void operator()(accessor_type1& accessor1, accessor_type2& accessor2) const
333         {
334             for (long z = 1; z < (dim_z - 1); ++z) {
335                 for (long y = 1; y < (dim_y - 1); ++y) {
336                     long indexStart = accessor1.gen_index(1,        y, z);
337                     long indexEnd   = accessor1.gen_index(dim_x - 1, y, z);
338 
339                     for (accessor1.index() = indexStart, accessor2.index() = indexStart;
340                          accessor1.index() < indexEnd;
341                          accessor1 += 1, accessor2 += 1) {
342 
343                         accessor2.temp() =
344                             accessor1[coord< 0,  0, -1>()].temp() * WEIGHT_S +
345                             accessor1[coord< 0, -1,  0>()].temp() * WEIGHT_T +
346                             accessor1[coord<-1,  0,  0>()].temp() * WEIGHT_W +
347                             accessor1[coord< 0,  0,  0>()].temp() * WEIGHT_C +
348                             accessor1[coord< 1,  0,  0>()].temp() * WEIGHT_E +
349                             accessor1[coord< 0,  1,  0>()].temp() * WEIGHT_B +
350                             accessor1[coord< 0,  0,  1>()].temp() * WEIGHT_N;
351                     }
352                 }
353             }
354         }
355 
356     private:
357         long dim_x;
358         long dim_y;
359         long dim_z;
360     };
361 
species()362     std::string species()
363     {
364         return "bronze";
365     }
366 
performance(std::vector<int> dim)367     double performance(std::vector<int> dim)
368     {
369         long dim_x = dim[0];
370         long dim_y = dim[1];
371         long dim_z = dim[2];
372         int maxT = 200000000 / dim_x / dim_y / dim_z;
373         maxT = std::max(16, maxT);
374 
375         soa_grid<JacobiCell> gridOld(dim_x, dim_y, dim_z);
376         soa_grid<JacobiCell> gridNew(dim_x, dim_y, dim_z);
377 
378         for (long z = 0; z < dim_z; ++z) {
379             for (long y = 0; y < dim_y; ++y) {
380                 for (long x = 0; x < dim_x; ++x) {
381                     gridOld.set(x, y, z, JacobiCell(x + y + z));
382                     gridNew.set(x, y, z, JacobiCell(x + y + z));
383                 }
384             }
385         }
386 
387         double tStart = time();
388 
389         UpdateFunctor functor(dim_x, dim_y, dim_z);
390         for (int t = 0; t < maxT; ++t) {
391             gridOld.callback(&gridNew, functor);
392             using std::swap;
393             swap(gridOld, gridNew);
394         }
395 
396         double tEnd = time();
397 
398         if (gridOld.get(1, 1, 1).temp ==
399             gridNew.get(1, 1, 1).temp) {
400             std::cout << gridOld.get(1, 1, 1).temp << "\n";
401             std::cout << gridNew.get(1, 1, 1).temp << "\n";
402             std::cout << "this is a debug statement to prevent the compiler from optimizing away the update routine\n";
403         }
404 
405         return glups(dim, maxT, tEnd - tStart);
406     }
407 
408 private:
updateLine(double * gridOld,double * gridNew,const long xStart,const long y,const long z,const long xEnd,const long offsetY,const long offsetZ) const409     void updateLine(double *gridOld, double *gridNew,
410                     const long xStart, const long y,       const long z,
411                     const long xEnd,   const long offsetY, const long offsetZ) const
412     {
413         for (long x = xStart; x < xEnd; ++x) {
414             gridNew[x + y * offsetY + z * offsetZ] =
415                 gridOld[x + y * offsetY + z * offsetZ - 1 * offsetZ] * WEIGHT_S +
416                 gridOld[x + y * offsetY + z * offsetZ - 1 * offsetY] * WEIGHT_T +
417                 gridOld[x + y * offsetY + z * offsetZ - 1          ] * WEIGHT_W +
418                 gridOld[x + y * offsetY + z * offsetZ + 0          ] * WEIGHT_C +
419                 gridOld[x + y * offsetY + z * offsetZ + 1          ] * WEIGHT_E +
420                 gridOld[x + y * offsetY + z * offsetZ + 1 * offsetY] * WEIGHT_B +
421                 gridOld[x + y * offsetY + z * offsetZ + 1 * offsetZ] * WEIGHT_N;
422         }
423     }
424 };
425 
426 #ifdef __SSE__
427 
428 class JacobiD3Q7Silver : public JacobiD3Q7
429 {
430 public:
431     class UpdateFunctor
432     {
433     public:
UpdateFunctor(long dim_x,long dim_y,long dim_z)434         UpdateFunctor(long dim_x, long dim_y, long dim_z) :
435             dim_x(dim_x),
436             dim_y(dim_y),
437             dim_z(dim_z)
438         {}
439 
440         template<typename accessor_type1, typename accessor_type2>
operator ()(accessor_type1 & accessor1,accessor_type2 & accessor2) const441         void operator()(accessor_type1& accessor1,
442                         accessor_type2& accessor2) const
443         {
444             __m128d factorS = _mm_set1_pd(WEIGHT_S);
445             __m128d factorT = _mm_set1_pd(WEIGHT_T);
446             __m128d factorW = _mm_set1_pd(WEIGHT_W);
447             __m128d factorE = _mm_set1_pd(WEIGHT_E);
448             __m128d factorB = _mm_set1_pd(WEIGHT_B);
449             __m128d factorN = _mm_set1_pd(WEIGHT_N);
450 
451             for (long z = 1; z < (dim_z - 1); ++z) {
452                 for (long y = 1; y < (dim_y - 1); ++y) {
453                     long indexStart = accessor1.gen_index(1,         y, z);
454                     long indexEnd   = accessor1.gen_index(dim_x - 1, y, z);
455 
456                     accessor1.index() = indexStart;
457                     accessor2.index() = indexStart;
458 
459                     accessor2.temp() =
460                         accessor1[coord< 0,  0, -1>()].temp() * WEIGHT_S +
461                         accessor1[coord< 0, -1,  0>()].temp() * WEIGHT_T +
462                         accessor1[coord<-1,  0,  0>()].temp() * WEIGHT_W +
463                         accessor1[coord< 0,  0,  0>()].temp() * WEIGHT_C +
464                         accessor1[coord< 1,  0,  0>()].temp() * WEIGHT_E +
465                         accessor1[coord< 0,  1,  0>()].temp() * WEIGHT_B +
466                         accessor1[coord< 0,  0,  1>()].temp() * WEIGHT_N;
467 
468                     accessor1.index() += 1;
469                     accessor2.index() += 1;
470 
471                     for (;
472                          accessor1.index() < (indexEnd - 7);
473                          accessor1.index() += 8, accessor2.index() += 8) {
474 
475                         // load south row:
476                         __m128d bufA = _mm_load_pd(&accessor1[coord<0, 0, -1>()].temp() + 0);
477                         __m128d bufB = _mm_load_pd(&accessor1[coord<0, 0, -1>()].temp() + 2);
478                         __m128d bufC = _mm_load_pd(&accessor1[coord<0, 0, -1>()].temp() + 4);
479                         __m128d bufD = _mm_load_pd(&accessor1[coord<0, 0, -1>()].temp() + 6);
480                         __m128d bufE;
481 
482                         bufA = _mm_mul_pd(bufA, factorS);
483                         bufB = _mm_mul_pd(bufB, factorS);
484                         bufC = _mm_mul_pd(bufC, factorS);
485                         bufD = _mm_mul_pd(bufD, factorS);
486 
487                         // load top row:
488                         __m128d sumA = _mm_load_pd(&accessor1[coord<0, -1, 0>()].temp() + 0);
489                         __m128d sumB = _mm_load_pd(&accessor1[coord<0, -1, 0>()].temp() + 2);
490                         __m128d sumC = _mm_load_pd(&accessor1[coord<0, -1, 0>()].temp() + 4);
491                         __m128d sumD = _mm_load_pd(&accessor1[coord<0, -1, 0>()].temp() + 6);
492 
493                         sumA = _mm_mul_pd(sumA, factorT);
494                         sumB = _mm_mul_pd(sumB, factorT);
495                         sumC = _mm_mul_pd(sumC, factorT);
496                         sumD = _mm_mul_pd(sumD, factorT);
497 
498                         sumA = _mm_add_pd(sumA, bufA);
499                         sumB = _mm_add_pd(sumB, bufB);
500                         sumC = _mm_add_pd(sumC, bufC);
501                         sumD = _mm_add_pd(sumD, bufD);
502 
503                         // load left/right row:
504                         bufA = _mm_loadu_pd(&accessor1[coord<0, 0, 0>()].temp() - 1);
505                         bufB = _mm_loadu_pd(&accessor1[coord<0, 0, 0>()].temp() + 1);
506                         bufC = _mm_loadu_pd(&accessor1[coord<0, 0, 0>()].temp() + 3);
507                         bufD = _mm_loadu_pd(&accessor1[coord<0, 0, 0>()].temp() + 5);
508                         bufE = _mm_loadu_pd(&accessor1[coord<0, 0, 0>()].temp() + 7);
509 
510                         sumA = _mm_add_pd(sumA, _mm_mul_pd(bufA, factorW));
511                         sumB = _mm_add_pd(sumB, _mm_mul_pd(bufB, factorW));
512                         sumC = _mm_add_pd(sumC, _mm_mul_pd(bufC, factorW));
513                         sumD = _mm_add_pd(sumD, _mm_mul_pd(bufD, factorW));
514 
515                         sumA = _mm_add_pd(sumA, _mm_mul_pd(bufB, factorE));
516                         sumB = _mm_add_pd(sumB, _mm_mul_pd(bufC, factorE));
517                         sumC = _mm_add_pd(sumC, _mm_mul_pd(bufD, factorE));
518                         sumD = _mm_add_pd(sumD, _mm_mul_pd(bufE, factorE));
519 
520                         // load bottom row:
521                         bufA = _mm_load_pd(&accessor1[coord<0, 1, 0>()].temp() + 0);
522                         bufB = _mm_load_pd(&accessor1[coord<0, 1, 0>()].temp() + 2);
523                         bufC = _mm_load_pd(&accessor1[coord<0, 1, 0>()].temp() + 4);
524                         bufD = _mm_load_pd(&accessor1[coord<0, 1, 0>()].temp() + 6);
525 
526                         bufA = _mm_mul_pd(bufA, factorB);
527                         bufB = _mm_mul_pd(bufB, factorB);
528                         bufC = _mm_mul_pd(bufC, factorB);
529                         bufD = _mm_mul_pd(bufD, factorB);
530 
531                         sumA = _mm_add_pd(sumA, bufA);
532                         sumB = _mm_add_pd(sumB, bufB);
533                         sumC = _mm_add_pd(sumC, bufC);
534                         sumD = _mm_add_pd(sumD, bufD);
535 
536                         // load north row:
537                         bufA = _mm_load_pd(&accessor1[coord<0, 0, 1>()].temp() + 0);
538                         bufB = _mm_load_pd(&accessor1[coord<0, 0, 1>()].temp() + 2);
539                         bufC = _mm_load_pd(&accessor1[coord<0, 0, 1>()].temp() + 4);
540                         bufD = _mm_load_pd(&accessor1[coord<0, 0, 1>()].temp() + 6);
541 
542                         bufA = _mm_mul_pd(bufA, factorN);
543                         bufB = _mm_mul_pd(bufB, factorN);
544                         bufC = _mm_mul_pd(bufC, factorN);
545                         bufD = _mm_mul_pd(bufD, factorN);
546 
547                         sumA = _mm_add_pd(sumA, bufA);
548                         sumB = _mm_add_pd(sumB, bufB);
549                         sumC = _mm_add_pd(sumC, bufC);
550                         sumD = _mm_add_pd(sumD, bufD);
551 
552                         _mm_stream_pd(&accessor2[coord<0, 0, 0>()].temp() + 0, sumA);
553                         _mm_stream_pd(&accessor2[coord<0, 0, 0>()].temp() + 2, sumB);
554                         _mm_stream_pd(&accessor2[coord<0, 0, 0>()].temp() + 4, sumC);
555                         _mm_stream_pd(&accessor2[coord<0, 0, 0>()].temp() + 6, sumD);
556                     }
557 
558 
559                     for (;
560                          accessor1.index() < (indexEnd - 1);
561                          accessor1.index() += 1, accessor2.index() += 1) {
562                         accessor2.temp() =
563                             accessor1[coord< 0,  0, -1>()].temp() * WEIGHT_S +
564                             accessor1[coord< 0, -1,  0>()].temp() * WEIGHT_T +
565                             accessor1[coord<-1,  0,  0>()].temp() * WEIGHT_W +
566                             accessor1[coord< 0,  0,  0>()].temp() * WEIGHT_C +
567                             accessor1[coord< 1,  0,  0>()].temp() * WEIGHT_E +
568                             accessor1[coord< 0,  1,  0>()].temp() * WEIGHT_B +
569                             accessor1[coord< 0,  0,  1>()].temp() * WEIGHT_N;
570 
571                     }
572                 }
573             }
574         }
575 
576     private:
577         long dim_x;
578         long dim_y;
579         long dim_z;
580     };
581 
species()582     std::string species()
583     {
584         return "silver";
585     }
586 
performance(std::vector<int> dim)587     double performance(std::vector<int> dim)
588     {
589         long dim_x = dim[0];
590         long dim_y = dim[1];
591         long dim_z = dim[2];
592         int maxT = 200000000 / dim_x / dim_y / dim_z;
593         maxT = std::max(16, maxT);
594 
595         soa_grid<JacobiCell> gridOld(dim_x, dim_y, dim_z);
596         soa_grid<JacobiCell> gridNew(dim_x, dim_y, dim_z);
597 
598         for (long z = 0; z < dim_z; ++z) {
599             for (long y = 0; y < dim_y; ++y) {
600                 for (long x = 0; x < dim_x; ++x) {
601                     gridOld.set(x, y, z, JacobiCell(x + y + z));
602                     gridNew.set(x, y, z, JacobiCell(x + y + z));
603                 }
604             }
605         }
606 
607         double tStart = time();
608 
609         UpdateFunctor functor(dim_x, dim_y, dim_z);
610         for (int t = 0; t < maxT; ++t) {
611             gridOld.callback(&gridNew, functor);
612             using std::swap;
613             swap(gridOld, gridNew);
614         }
615 
616         double tEnd = time();
617 
618         if (gridOld.get(20, 20, 20).temp ==
619             gridNew.get(10, 10, 10).temp) {
620             std::cout << "this is a debug statement to prevent the compiler from optimizing away the update routine\n";
621         }
622 
623         return glups(dim, maxT, tEnd - tStart);
624     }
625 
626 private:
updateLine(double * gridOld,double * gridNew,const long xStart,const long y,const long z,const long xEnd,const long offsetY,const long offsetZ) const627     void updateLine(double *gridOld, double *gridNew,
628                     const long xStart, const long y,       const long z,
629                     const long xEnd,   const long offsetY, const long offsetZ) const
630     {
631         for (long x = xStart; x < xEnd; ++x) {
632             gridNew[x + y * offsetY + z * offsetZ] =
633                 gridOld[x + y * offsetY + z * offsetZ - 1 * offsetZ] * WEIGHT_S +
634                 gridOld[x + y * offsetY + z * offsetZ - 1 * offsetY] * WEIGHT_T +
635                 gridOld[x + y * offsetY + z * offsetZ - 1          ] * WEIGHT_W +
636                 gridOld[x + y * offsetY + z * offsetZ + 0          ] * WEIGHT_C +
637                 gridOld[x + y * offsetY + z * offsetZ + 1          ] * WEIGHT_E +
638                 gridOld[x + y * offsetY + z * offsetZ + 1 * offsetY] * WEIGHT_B +
639                 gridOld[x + y * offsetY + z * offsetZ + 1 * offsetZ] * WEIGHT_N;
640         }
641     }
642 };
643 
644 #endif
645 
646 class JacobiD3Q7Gold : public JacobiD3Q7
647 {
648 public:
649     class UpdateFunctor
650     {
651     public:
UpdateFunctor(long dim_x,long dim_y,long dim_z)652         UpdateFunctor(long dim_x, long dim_y, long dim_z) :
653             dim_x(dim_x),
654             dim_y(dim_y),
655             dim_z(dim_z)
656         {}
657 
658         template<typename accessor_type1, typename accessor_type2>
operator ()(accessor_type1 & accessor_old,accessor_type2 & accessor_new) const659         void operator()(accessor_type1& accessor_old,
660                         accessor_type2& accessor_new) const
661         {
662             typedef typename LibFlatArray::estimate_optimum_short_vec_type<double, accessor_type1>::VALUE my_short_vec;
663 
664 #ifdef _OPENMP
665 #pragma omp parallel for schedule(static) firstprivate(accessorOld, accessorNew)
666 #endif
667             for (std::size_t z = 1; z < (dim_z - 1); ++z) {
668                 for (std::size_t y = 1; y < (dim_y - 1); ++y) {
669                     std::size_t x = 1;
670                     std::size_t end_x = dim_x - 1;
671 
672                     LIBFLATARRAY_LOOP_PEELER_TEMPLATE(
673                         my_short_vec,
674                         std::size_t,
675                         x,
676                         end_x,
677                         update_line,
678                         y,
679                         z,
680                         accessor_old,
681                         accessor_new);
682                 }
683             }
684         }
685 
686         template<typename SHORT_VEC, typename SOA_ACCESSOR_1, typename SOA_ACCESSOR_2>
update_line(std::size_t & x,std::size_t end_x,std::size_t y,std::size_t z,SOA_ACCESSOR_1 & accessor_old,SOA_ACCESSOR_2 & accessor_new) const687         void update_line(
688             std::size_t& x,
689             std::size_t end_x,
690             std::size_t y,
691             std::size_t z,
692             SOA_ACCESSOR_1& accessor_old,
693             SOA_ACCESSOR_2& accessor_new) const
694         {
695             accessor_old.index() = SOA_ACCESSOR_1::gen_index(x, y, z);
696             accessor_new.index() = SOA_ACCESSOR_2::gen_index(x, y, z);
697 
698             SHORT_VEC buf;
699             SHORT_VEC factorS = WEIGHT_S;
700             SHORT_VEC factorT = WEIGHT_T;
701             SHORT_VEC factorW = WEIGHT_W;
702             SHORT_VEC factorE = WEIGHT_E;
703             SHORT_VEC factorB = WEIGHT_B;
704             SHORT_VEC factorN = WEIGHT_N;
705 
706             for (; x < end_x; x += SHORT_VEC::ARITY) {
707                 using LibFlatArray::coord;
708                 buf =  SHORT_VEC(&accessor_old[coord< 0,  0, -1>()].temp()) * factorS;
709                 buf += SHORT_VEC(&accessor_old[coord< 0, -1,  0>()].temp()) * factorT;
710                 buf += SHORT_VEC(&accessor_old[coord<-1,  0,  0>()].temp()) * factorW;
711                 buf += SHORT_VEC(&accessor_old[coord< 1,  0,  0>()].temp()) * factorE;
712                 buf += SHORT_VEC(&accessor_old[coord< 0,  1,  0>()].temp()) * factorB;
713                 buf += SHORT_VEC(&accessor_old[coord< 0,  0,  1>()].temp()) * factorN;
714 
715                 &accessor_new.temp() << buf;
716 
717                 accessor_new += SHORT_VEC::ARITY;
718                 accessor_old += SHORT_VEC::ARITY;
719             }
720 
721         }
722 
723     private:
724         std::size_t dim_x;
725         std::size_t dim_y;
726         std::size_t dim_z;
727     };
728 
species()729     std::string species()
730     {
731         return "gold";
732     }
733 
performance(std::vector<int> dim)734     double performance(std::vector<int> dim)
735     {
736         long dim_x = dim[0];
737         long dim_y = dim[1];
738         long dim_z = dim[2];
739         int maxT = 200000000 / dim_x / dim_y / dim_z;
740         maxT = std::max(16, maxT);
741 
742         soa_grid<JacobiCell> gridOld(dim_x, dim_y, dim_z);
743         soa_grid<JacobiCell> gridNew(dim_x, dim_y, dim_z);
744 
745         for (long z = 0; z < dim_z; ++z) {
746             for (long y = 0; y < dim_y; ++y) {
747                 for (long x = 0; x < dim_x; ++x) {
748                     gridOld.set(x, y, z, JacobiCell(x + y + z));
749                     gridNew.set(x, y, z, JacobiCell(x + y + z));
750                 }
751             }
752         }
753 
754         double tStart = time();
755 
756         UpdateFunctor functor(dim_x, dim_y, dim_z);
757         for (int t = 0; t < maxT; ++t) {
758             gridOld.callback(&gridNew, functor);
759             using std::swap;
760             swap(gridOld, gridNew);
761         }
762 
763         double tEnd = time();
764 
765         if (gridOld.get(20, 20, 20).temp ==
766             gridNew.get(10, 10, 10).temp) {
767             std::cout << "this is a debug statement to prevent the compiler from optimizing away the update routine\n";
768         }
769 
770         return glups(dim, maxT, tEnd - tStart);
771     }
772 
773 private:
updateLine(double * gridOld,double * gridNew,const long xStart,const long y,const long z,const long xEnd,const long offsetY,const long offsetZ) const774     void updateLine(double *gridOld, double *gridNew,
775                     const long xStart, const long y,       const long z,
776                     const long xEnd,   const long offsetY, const long offsetZ) const
777     {
778         for (long x = xStart; x < xEnd; ++x) {
779             gridNew[x + y * offsetY + z * offsetZ] =
780                 gridOld[x + y * offsetY + z * offsetZ - 1 * offsetZ] * WEIGHT_S +
781                 gridOld[x + y * offsetY + z * offsetZ - 1 * offsetY] * WEIGHT_T +
782                 gridOld[x + y * offsetY + z * offsetZ - 1          ] * WEIGHT_W +
783                 gridOld[x + y * offsetY + z * offsetZ + 0          ] * WEIGHT_C +
784                 gridOld[x + y * offsetY + z * offsetZ + 1          ] * WEIGHT_E +
785                 gridOld[x + y * offsetY + z * offsetZ + 1 * offsetY] * WEIGHT_B +
786                 gridOld[x + y * offsetY + z * offsetZ + 1 * offsetZ] * WEIGHT_N;
787         }
788     }
789 };
790 
791 class Particle
792 {
793 public:
Particle(const float posX=0,const float posY=0,const float posZ=0,const float velX=0,const float velY=0,const float velZ=0,const float charge=0)794     explicit inline Particle(
795         const float posX = 0,
796         const float posY = 0,
797         const float posZ = 0,
798         const float velX = 0,
799         const float velY = 0,
800         const float velZ = 0,
801         const float charge = 0) :
802         posX(posX),
803         posY(posY),
804         posZ(posZ),
805         velX(velX),
806         velY(velY),
807         velZ(velZ),
808         charge(charge)
809     {}
810 
811     float posX;
812     float posY;
813     float posZ;
814     float velX;
815     float velY;
816     float velZ;
817     float charge;
818 };
819 
820 LIBFLATARRAY_REGISTER_SOA(Particle,
821                           ((float)(posX))
822                           ((float)(posY))
823                           ((float)(posZ))
824                           ((float)(velX))
825                           ((float)(velY))
826                           ((float)(velZ))
827                           ((float)(charge)))
828 
829 class ArrayParticle
830 {
831 public:
832     inline
ArrayParticle(const float posX=0,const float posY=0,const float posZ=0,const float velX=0,const float velY=0,const float velZ=0,const float charge=0)833     explicit ArrayParticle(
834         const float posX = 0,
835         const float posY = 0,
836         const float posZ = 0,
837         const float velX = 0,
838         const float velY = 0,
839         const float velZ = 0,
840         const float charge = 0) :
841         charge(charge)
842     {
843         pos[0] = posX;
844         pos[1] = posY;
845         pos[2] = posZ;
846         vel[0] = velX;
847         vel[1] = velY;
848         vel[2] = velZ;
849     }
850 
851     float pos[3];
852     float vel[3];
853     float charge;
854 };
855 
856 LIBFLATARRAY_REGISTER_SOA(ArrayParticle,
857                           ((float)(pos)(3))
858                           ((float)(vel)(3))
859                           ((float)(charge)))
860 
861 class NBody : public cpu_benchmark
862 {
863 public:
family()864     std::string family()
865     {
866         return "NBody";
867     }
868 
unit()869     std::string unit()
870     {
871         return "GFLOPS";
872     }
873 
gflops(double numParticles,double repeats,double tStart,double tEnd)874     double gflops(double numParticles, double repeats, double tStart, double tEnd)
875     {
876         double flops = repeats * numParticles * (9 + numParticles * (3 + 6 + 5 + 3 + 3));
877         double gflops = flops / (tEnd - tStart) * 1e-9;
878         return gflops;
879     }
880 };
881 
882 class NBodyVanilla : public NBody
883 {
884 public:
species()885     std::string species()
886     {
887         return "vanilla";
888     }
889 
performance(std::vector<int> dim)890     double performance(std::vector<int> dim)
891     {
892         int numParticles = dim[0];
893         int repeats = dim[1];
894 
895         std::vector<Particle> particlesA;
896         std::vector<Particle> particlesB;
897         particlesA.reserve(numParticles);
898         particlesB.reserve(numParticles);
899 
900         for (int i = 0; i < numParticles; ++i) {
901             Particle p(
902                 i, i * i, sin(i),
903                 i % 11, i % 13, i % 19,
904                 10 + cos(2 * i));
905 
906             particlesA.push_back(p);
907             particlesB.push_back(p);
908         }
909 
910         double tStart = time();
911 
912         for (int t = 0; t < repeats; ++t) {
913             for (int i = 0; i < numParticles; ++i) {
914                 float posX = particlesA[i].posX;
915                 float posY = particlesA[i].posY;
916                 float posZ = particlesA[i].posZ;
917 
918                 float velX = particlesA[i].velX;
919                 float velY = particlesA[i].velY;
920                 float velZ = particlesA[i].velZ;
921 
922                 float charge = particlesA[i].charge;
923 
924                 float accelerationX = 0;
925                 float accelerationY = 0;
926                 float accelerationZ = 0;
927 
928                 for (int j = 0; j < numParticles; ++j) {
929                     float deltaX = posX - particlesA[j].posX;
930                     float deltaY = posY - particlesA[j].posY;
931                     float deltaZ = posZ - particlesA[j].posZ;
932                     float distance2 = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ + SOFTENING;
933                     float factor = charge * particlesA[j].charge * DELTA_T / distance2 / sqrt(distance2);
934                     float forceX = deltaX * factor;
935                     float forceY = deltaY * factor;
936                     float forceZ = deltaZ * factor;
937 
938                     accelerationX += forceX;
939                     accelerationY += forceY;
940                     accelerationZ += forceZ;
941                 }
942 
943                 particlesB[i].posX = posX + velX * DELTA_T;
944                 particlesB[i].posY = posY + velY * DELTA_T;
945                 particlesB[i].posZ = posZ + velZ * DELTA_T;
946 
947                 particlesB[i].velX = velX + accelerationX;
948                 particlesB[i].velY = velY + accelerationY;
949                 particlesB[i].velZ = velZ + accelerationZ;
950 
951                 particlesB[i].charge = charge;
952             }
953 
954             using std::swap;
955             swap(particlesA, particlesB);
956         }
957 
958         double tEnd = time();
959 
960         if (particlesA[0].posX == 0.12345) {
961             std::cout << "this is a debug statement to prevent the compiler from optimizing away the update routine\n";
962         }
963 
964         return gflops(numParticles, repeats, tStart, tEnd);
965     }
966 };
967 
968 #ifdef __AVX__
969 
970 class NBodyPepper : public NBody
971 {
972 public:
species()973     std::string species()
974     {
975         return "pepper";
976     }
977 
performance(std::vector<int> dim)978     double performance(std::vector<int> dim)
979     {
980         int numParticles = dim[0];
981         int repeats = dim[1];
982 
983         std::vector<float> posXA;
984         std::vector<float> posYA;
985         std::vector<float> posZA;
986         std::vector<float> posXB;
987         std::vector<float> posYB;
988         std::vector<float> posZB;
989 
990         std::vector<float> velXA;
991         std::vector<float> velYA;
992         std::vector<float> velZA;
993         std::vector<float> velXB;
994         std::vector<float> velYB;
995         std::vector<float> velZB;
996 
997         std::vector<float> chargeA;
998         std::vector<float> chargeB;
999 
1000         for (int i = 0; i < numParticles; ++i) {
1001             Particle p(
1002                 i, i * i, sin(i),
1003                 i % 11, i % 13, i % 19,
1004                 10 + cos(2 * i));
1005 
1006             posXA.push_back(p.posX);
1007             posXB.push_back(p.posX);
1008             posYA.push_back(p.posY);
1009             posYB.push_back(p.posY);
1010             posZA.push_back(p.posZ);
1011             posZB.push_back(p.posZ);
1012 
1013             velXA.push_back(p.velX);
1014             velXB.push_back(p.velX);
1015             velYA.push_back(p.velY);
1016             velYB.push_back(p.velY);
1017             velZA.push_back(p.velZ);
1018             velZB.push_back(p.velZ);
1019 
1020             chargeA.push_back(p.charge);
1021             chargeB.push_back(p.charge);
1022         }
1023 
1024         double tStart = time();
1025 
1026         for (int t = 0; t < repeats; ++t) {
1027             for (int i = 0; i < numParticles; ++i) {
1028                 int j;
1029 
1030                 float posX = posXA[i];
1031                 float posY = posYA[i];
1032                 float posZ = posZA[i];
1033                 __m256 posXV = _mm256_set1_ps(posX);
1034                 __m256 posYV = _mm256_set1_ps(posY);
1035                 __m256 posZV = _mm256_set1_ps(posZ);
1036 
1037                 float velX = velXA[i];
1038                 float velY = velYA[i];
1039                 float velZ = velZA[i];
1040 
1041                 float charge = chargeA[i];
1042                 __m256 chargeV = _mm256_set1_ps(charge);
1043 
1044                 __m256 accelerationXV = _mm256_set1_ps(0);
1045                 __m256 accelerationYV = _mm256_set1_ps(0);
1046                 __m256 accelerationZV = _mm256_set1_ps(0);
1047 
1048                 for (j = 0; j < (numParticles - 7); j += 8) {
1049                     __m256 deltaX = _mm256_sub_ps(posXV, _mm256_loadu_ps(&posXA[j]));
1050                     __m256 deltaY = _mm256_sub_ps(posYV, _mm256_loadu_ps(&posYA[j]));
1051                     __m256 deltaZ = _mm256_sub_ps(posZV, _mm256_loadu_ps(&posZA[j]));
1052                     __m256 distance2 =
1053                         _mm256_add_ps(_mm256_mul_ps(deltaX, deltaX),
1054                                       _mm256_mul_ps(deltaY, deltaY));
1055                     distance2 =
1056                         _mm256_add_ps(_mm256_mul_ps(deltaZ, deltaZ),
1057                                       distance2);
1058                     distance2 =
1059                         _mm256_add_ps(_mm256_set1_ps(SOFTENING),
1060                                       distance2);
1061 
1062                     __m256 factor = _mm256_mul_ps(chargeV, _mm256_loadu_ps(&chargeA[j]));
1063                     factor = _mm256_mul_ps(factor, _mm256_set1_ps(DELTA_T));
1064                     factor = _mm256_mul_ps(factor, _mm256_rcp_ps(distance2));
1065                     factor = _mm256_mul_ps(factor, _mm256_rsqrt_ps(distance2));
1066 
1067                     __m256 forceX = _mm256_mul_ps(deltaX, factor);
1068                     __m256 forceY = _mm256_mul_ps(deltaY, factor);
1069                     __m256 forceZ = _mm256_mul_ps(deltaZ, factor);
1070 
1071                     accelerationXV = _mm256_add_ps(accelerationXV, forceX);
1072                     accelerationYV = _mm256_add_ps(accelerationYV, forceY);
1073                     accelerationZV = _mm256_add_ps(accelerationZV, forceZ);
1074                 }
1075 
1076                 float buf[8];
1077                 _mm256_store_ps(buf, accelerationXV);
1078                 float accelerationX = buf[0] + buf[1] + buf[2] + buf[3] + buf[4] + buf[5] + buf[6] + buf[7];
1079                 _mm256_store_ps(buf, accelerationYV);
1080                 float accelerationY = buf[0] + buf[1] + buf[2] + buf[3] + buf[4] + buf[5] + buf[6] + buf[7];
1081                 _mm256_store_ps(buf, accelerationZV);
1082                 float accelerationZ = buf[0] + buf[1] + buf[2] + buf[3] + buf[4] + buf[5] + buf[6] + buf[7];
1083 
1084                 for (; j < numParticles; ++j) {
1085                     float deltaX = posX - posXA[j];
1086                     float deltaY = posY - posYA[j];
1087                     float deltaZ = posZ - posZA[j];
1088                     float distance2 = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ + SOFTENING;
1089                     float factor = charge * chargeA[j] * DELTA_T / distance2 / sqrt(distance2);
1090                     float forceX = deltaX * factor;
1091                     float forceY = deltaY * factor;
1092                     float forceZ = deltaZ * factor;
1093 
1094                     accelerationX += forceX;
1095                     accelerationY += forceY;
1096                     accelerationZ += forceZ;
1097                 }
1098 
1099                 posXB[i] = posX + velX * DELTA_T;
1100                 posYB[i] = posY + velY * DELTA_T;
1101                 posZB[i] = posZ + velZ * DELTA_T;
1102 
1103                 velXB[i] = velX + accelerationX;
1104                 velYB[i] = velY + accelerationY;
1105                 velZB[i] = velZ + accelerationZ;
1106 
1107                 chargeB[i] = charge;
1108             }
1109 
1110             using std::swap;
1111             swap(posXA, posXB);
1112             swap(posYA, posYB);
1113             swap(posZA, posZB);
1114 
1115             swap(velXA, velXB);
1116             swap(velYA, velYB);
1117             swap(velZA, velZB);
1118 
1119             swap(chargeA, chargeB);
1120         }
1121 
1122         double tEnd = time();
1123 
1124         if (posXA[0] == 0.12345) {
1125             std::cout << "this is a debug statement to prevent the compiler from optimizing away the update routine\n";
1126         }
1127 
1128         return gflops(numParticles, repeats, tStart, tEnd);
1129     }
1130 };
1131 
1132 class NBodyCurry : public NBody
1133 {
1134 public:
species()1135     std::string species()
1136     {
1137         return "curry";
1138     }
1139 
performance(std::vector<int> dim)1140     double performance(std::vector<int> dim)
1141     {
1142         int numParticles = dim[0];
1143         int repeats = dim[1];
1144 
1145         std::vector<float> posXA;
1146         std::vector<float> posYA;
1147         std::vector<float> posZA;
1148         std::vector<float> posXB;
1149         std::vector<float> posYB;
1150         std::vector<float> posZB;
1151 
1152         std::vector<float> velXA;
1153         std::vector<float> velYA;
1154         std::vector<float> velZA;
1155         std::vector<float> velXB;
1156         std::vector<float> velYB;
1157         std::vector<float> velZB;
1158 
1159         std::vector<float> chargeA;
1160         std::vector<float> chargeB;
1161 
1162         for (int i = 0; i < numParticles; ++i) {
1163             Particle p(
1164                 i, i * i, sin(i),
1165                 i % 11, i % 13, i % 19,
1166                 10 + cos(2 * i));
1167 
1168             posXA.push_back(p.posX);
1169             posXB.push_back(p.posX);
1170             posYA.push_back(p.posY);
1171             posYB.push_back(p.posY);
1172             posZA.push_back(p.posZ);
1173             posZB.push_back(p.posZ);
1174 
1175             velXA.push_back(p.velX);
1176             velXB.push_back(p.velX);
1177             velYA.push_back(p.velY);
1178             velYB.push_back(p.velY);
1179             velZA.push_back(p.velZ);
1180             velZB.push_back(p.velZ);
1181 
1182             chargeA.push_back(p.charge);
1183             chargeB.push_back(p.charge);
1184         }
1185 
1186         double tStart = time();
1187 
1188         for (int t = 0; t < repeats; ++t) {
1189             for (int i = 0; i < (numParticles - 7); i += 8) {
1190                 int j;
1191 
1192                 __m256 posXV = _mm256_loadu_ps(&posXA[i]);
1193                 __m256 posYV = _mm256_loadu_ps(&posYA[i]);
1194                 __m256 posZV = _mm256_loadu_ps(&posZA[i]);
1195 
1196                 __m256 velXV = _mm256_loadu_ps(&velXA[i]);
1197                 __m256 velYV = _mm256_loadu_ps(&velYA[i]);
1198                 __m256 velZV = _mm256_loadu_ps(&velZA[i]);
1199 
1200                 __m256 chargeV = _mm256_loadu_ps(&chargeA[i]);
1201 
1202                 __m256 accelerationXV = _mm256_set1_ps(0);
1203                 __m256 accelerationYV = _mm256_set1_ps(0);
1204                 __m256 accelerationZV = _mm256_set1_ps(0);
1205 
1206                 __m256 deltaT = _mm256_set1_ps(DELTA_T);
1207 
1208                 for (j = 0; j < numParticles; ++j) {
1209                     __m256 deltaX = _mm256_sub_ps(posXV, _mm256_broadcast_ss(&posXA[j]));
1210                     __m256 deltaY = _mm256_sub_ps(posYV, _mm256_broadcast_ss(&posYA[j]));
1211                     __m256 deltaZ = _mm256_sub_ps(posZV, _mm256_broadcast_ss(&posZA[j]));
1212                     __m256 distance2 =
1213                         _mm256_add_ps(_mm256_mul_ps(deltaX, deltaX),
1214                                       _mm256_mul_ps(deltaY, deltaY));
1215                     distance2 =
1216                         _mm256_add_ps(_mm256_mul_ps(deltaZ, deltaZ),
1217                                       distance2);
1218                     distance2 =
1219                         _mm256_add_ps(_mm256_set1_ps(SOFTENING),
1220                                       distance2);
1221 
1222                     __m256 factor = _mm256_mul_ps(chargeV, _mm256_broadcast_ss(&chargeA[j]));
1223                     factor = _mm256_mul_ps(factor, deltaT);
1224                     factor = _mm256_mul_ps(factor, _mm256_rcp_ps(distance2));
1225                     factor = _mm256_mul_ps(factor, _mm256_rsqrt_ps(distance2));
1226 
1227                     __m256 forceX = _mm256_mul_ps(deltaX, factor);
1228                     __m256 forceY = _mm256_mul_ps(deltaY, factor);
1229                     __m256 forceZ = _mm256_mul_ps(deltaZ, factor);
1230 
1231                     accelerationXV = _mm256_add_ps(accelerationXV, forceX);
1232                     accelerationYV = _mm256_add_ps(accelerationYV, forceY);
1233                     accelerationZV = _mm256_add_ps(accelerationZV, forceZ);
1234                 }
1235 
1236                 posXV = _mm256_add_ps(posXV, _mm256_mul_ps(velXV, deltaT));
1237                 posYV = _mm256_add_ps(posYV, _mm256_mul_ps(velYV, deltaT));
1238                 posZV = _mm256_add_ps(posZV, _mm256_mul_ps(velZV, deltaT));
1239 
1240                 _mm256_storeu_ps(&posXB[i], posXV);
1241                 _mm256_storeu_ps(&posYB[i], posYV);
1242                 _mm256_storeu_ps(&posZB[i], posZV);
1243 
1244                 velXV = _mm256_add_ps(velXV, accelerationXV);
1245                 velYV = _mm256_add_ps(velYV, accelerationYV);
1246                 velZV = _mm256_add_ps(velZV, accelerationZV);
1247 
1248                 _mm256_storeu_ps(&velXB[i], velXV);
1249                 _mm256_storeu_ps(&velYB[i], velYV);
1250                 _mm256_storeu_ps(&velZB[i], velZV);
1251 
1252                 _mm256_storeu_ps(&chargeB[i], chargeV);
1253             }
1254 
1255             using std::swap;
1256             swap(posXA, posXB);
1257             swap(posYA, posYB);
1258             swap(posZA, posZB);
1259 
1260             swap(velXA, velXB);
1261             swap(velYA, velYB);
1262             swap(velZA, velZB);
1263 
1264             swap(chargeA, chargeB);
1265         }
1266 
1267         double tEnd = time();
1268 
1269         if (posXA[0] == 0.12345) {
1270             std::cout << "this is a debug statement to prevent the compiler from optimizing away the update routine\n";
1271         }
1272 
1273         return gflops(numParticles, repeats, tStart, tEnd);
1274     }
1275 };
1276 
1277 #endif
1278 
1279 class NBodyIron : public NBody
1280 {
1281 public:
species()1282     std::string species()
1283     {
1284         return "iron";
1285     }
1286 
performance(std::vector<int> dim)1287     double performance(std::vector<int> dim)
1288     {
1289         using namespace LibFlatArray;
1290 
1291         int numParticles = dim[0];
1292         int repeats = dim[1];
1293 
1294         soa_array<Particle, 8192> particlesA;
1295         soa_array<Particle, 8192> particlesB;
1296 
1297         for (int i = 0; i < numParticles; ++i) {
1298             Particle p(
1299                 i, i * i, sin(i),
1300                 i % 11, i % 13, i % 19,
1301                 10 + cos(2 * i));
1302 
1303             particlesA.push_back(p);
1304             particlesB.push_back(p);
1305         }
1306 
1307         double tStart = time();
1308 
1309         for (int t = 0; t < repeats; ++t) {
1310             soa_accessor<Particle, 8192, 1, 1, 0> accessorA = particlesA[0];
1311             soa_accessor<Particle, 8192, 1, 1, 0> accessorB = particlesB[0];
1312 
1313             for (; accessorA.index() < numParticles; ++accessorA, ++accessorB ) {
1314                 float posX = accessorA.posX();
1315                 float posY = accessorA.posY();
1316                 float posZ = accessorA.posZ();
1317 
1318                 float velX = accessorA.velX();
1319                 float velY = accessorA.velY();
1320                 float velZ = accessorA.velZ();
1321 
1322                 float charge = accessorA.charge();
1323 
1324                 float accelerationX = 0;
1325                 float accelerationY = 0;
1326                 float accelerationZ = 0;
1327 
1328                 soa_accessor<Particle, 8192, 1, 1, 0> accessorA2 = particlesA[0];
1329 
1330                 for (accessorA2.index() = 0; accessorA2.index() < numParticles; ++accessorA2) {
1331                     float deltaX = posX - accessorA2.posX();
1332                     float deltaY = posY - accessorA2.posY();
1333                     float deltaZ = posZ - accessorA2.posZ();
1334                     float distance2 = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ + SOFTENING;
1335                     float factor = charge * accessorA2.charge() * DELTA_T / distance2 / sqrt(distance2);
1336                     float forceX = deltaX * factor;
1337                     float forceY = deltaY * factor;
1338                     float forceZ = deltaZ * factor;
1339 
1340                     accelerationX += forceX;
1341                     accelerationY += forceY;
1342                     accelerationZ += forceZ;
1343                 }
1344 
1345                 accessorB.posX() = posX + velX * DELTA_T;
1346                 accessorB.posY() = posY + velY * DELTA_T;
1347                 accessorB.posZ() = posZ + velZ * DELTA_T;
1348 
1349                 accessorB.velX() = velX + accelerationX;
1350                 accessorB.velY() = velY + accelerationY;
1351                 accessorB.velZ() = velZ + accelerationZ;
1352 
1353                 accessorB.charge() = charge;
1354             }
1355 
1356             using std::swap;
1357             swap(particlesA, particlesB);
1358         }
1359 
1360         double tEnd = time();
1361 
1362         if (particlesA[0].posX() == 0.12345) {
1363             std::cout << "this is a debug statement to prevent the compiler from optimizing away the update routine\n";
1364         }
1365 
1366         return gflops(numParticles, repeats, tStart, tEnd);
1367     }
1368 };
1369 
1370 #ifdef __AVX__
1371 
1372 class NBodyBronze : public NBody
1373 {
1374 public:
species()1375     std::string species()
1376     {
1377         return "bronze";
1378     }
1379 
performance(std::vector<int> dim)1380     double performance(std::vector<int> dim)
1381     {
1382         if (dim[0] <= 128) {
1383             return performance<128>(dim);
1384         }
1385         if (dim[0] <= 256) {
1386             return performance<256>(dim);
1387         }
1388         if (dim[0] <= 512) {
1389             return performance<512>(dim);
1390         }
1391         if (dim[0] <= 1024) {
1392             return performance<1024>(dim);
1393         }
1394         if (dim[0] <= 2048) {
1395             return performance<2048>(dim);
1396         }
1397         if (dim[0] <= 4096) {
1398             return performance<4096>(dim);
1399         }
1400         if (dim[0] <= 8192) {
1401             return performance<8192>(dim);
1402         }
1403 
1404         throw std::out_of_range("could not run test NBodyBronze as grid dimension X was too large");
1405     }
1406 
1407     template<int DIM>
performance(std::vector<int> dim)1408     double performance(std::vector<int> dim)
1409     {
1410         using namespace LibFlatArray;
1411 
1412         int numParticles = dim[0];
1413         int repeats = dim[1];
1414 
1415         soa_array<Particle, DIM> particlesA;
1416         soa_array<Particle, DIM> particlesB;
1417 
1418         for (int i = 0; i < numParticles; ++i) {
1419             Particle p(
1420                 i, i * i, sin(i),
1421                 i % 11, i % 13, i % 19,
1422                 10 + cos(2 * i));
1423 
1424             particlesA.push_back(p);
1425             particlesB.push_back(p);
1426         }
1427 
1428         double tStart = time();
1429 
1430         for (int t = 0; t < repeats; ++t) {
1431             soa_accessor<Particle, DIM, 1, 1, 0> accessorA = particlesA[0];
1432             soa_accessor<Particle, DIM, 1, 1, 0> accessorB = particlesB[0];
1433             soa_accessor<Particle, DIM, 1, 1, 0> accessorA2 = particlesA[0];
1434 
1435             for (; accessorA.index() < (numParticles - 7); accessorA += 8, accessorB += 8) {
1436                 __m256 posX = _mm256_loadu_ps(&accessorA.posX());
1437                 __m256 posY = _mm256_loadu_ps(&accessorA.posY());
1438                 __m256 posZ = _mm256_loadu_ps(&accessorA.posZ());
1439 
1440                 __m256 velX = _mm256_loadu_ps(&accessorA.velX());
1441                 __m256 velY = _mm256_loadu_ps(&accessorA.velY());
1442                 __m256 velZ = _mm256_loadu_ps(&accessorA.velZ());
1443 
1444                 __m256 charge = _mm256_loadu_ps(&accessorA.charge());
1445 
1446                 __m256 accelerationX = _mm256_set1_ps(0.0);
1447                 __m256 accelerationY = _mm256_set1_ps(0.0);
1448                 __m256 accelerationZ = _mm256_set1_ps(0.0);
1449 
1450                 __m256 deltaT = _mm256_set1_ps(DELTA_T);
1451 
1452                 for (accessorA2.index() = 0; accessorA2.index() < numParticles; ++accessorA2) {
1453                     __m256 deltaX = _mm256_sub_ps(posX, _mm256_broadcast_ss(&accessorA2.posX()));
1454                     __m256 deltaY = _mm256_sub_ps(posY, _mm256_broadcast_ss(&accessorA2.posY()));
1455                     __m256 deltaZ = _mm256_sub_ps(posZ, _mm256_broadcast_ss(&accessorA2.posZ()));
1456                     __m256 distance2 =
1457                         _mm256_add_ps(_mm256_mul_ps(deltaX, deltaX),
1458                                       _mm256_mul_ps(deltaY, deltaY));
1459                     distance2 =
1460                         _mm256_add_ps(_mm256_mul_ps(deltaZ, deltaZ),
1461                                       distance2);
1462                     distance2 =
1463                         _mm256_add_ps(_mm256_set1_ps(SOFTENING),
1464                                       distance2);
1465 
1466                     __m256 factor = _mm256_mul_ps(charge, _mm256_broadcast_ss(&accessorA2.charge()));
1467                     factor = _mm256_mul_ps(factor, _mm256_set1_ps(DELTA_T));
1468                     factor = _mm256_mul_ps(factor, _mm256_rcp_ps(distance2));
1469                     factor = _mm256_mul_ps(factor, _mm256_rsqrt_ps(distance2));
1470 
1471                     __m256 forceX = _mm256_mul_ps(deltaX, factor);
1472                     __m256 forceY = _mm256_mul_ps(deltaY, factor);
1473                     __m256 forceZ = _mm256_mul_ps(deltaZ, factor);
1474 
1475                     accelerationX = _mm256_add_ps(accelerationX, forceX);
1476                     accelerationY = _mm256_add_ps(accelerationY, forceY);
1477                     accelerationZ = _mm256_add_ps(accelerationZ, forceZ);
1478                 }
1479 
1480                 posX = _mm256_add_ps(posX, _mm256_mul_ps(velX, deltaT));
1481                 posY = _mm256_add_ps(posY, _mm256_mul_ps(velY, deltaT));
1482                 posZ = _mm256_add_ps(posZ, _mm256_mul_ps(velZ, deltaT));
1483 
1484                 _mm256_storeu_ps(&accessorB.posX(), posX);
1485                 _mm256_storeu_ps(&accessorB.posY(), posY);
1486                 _mm256_storeu_ps(&accessorB.posZ(), posZ);
1487 
1488                 velX = _mm256_add_ps(velX, accelerationX);
1489                 velY = _mm256_add_ps(velY, accelerationY);
1490                 velZ = _mm256_add_ps(velZ, accelerationZ);
1491 
1492                 _mm256_storeu_ps(&accessorB.velX(), velX);
1493                 _mm256_storeu_ps(&accessorB.velY(), velY);
1494                 _mm256_storeu_ps(&accessorB.velZ(), velZ);
1495 
1496                 _mm256_storeu_ps(&accessorB.charge(), charge);
1497             }
1498 
1499             using std::swap;
1500             swap(particlesA, particlesB);
1501         }
1502 
1503         double tEnd = time();
1504 
1505         if (particlesA[0].posX() == 0.12345) {
1506             std::cout << "this is a debug statement to prevent the compiler from optimizing away the update routine\n";
1507         }
1508 
1509         return gflops(numParticles, repeats, tStart, tEnd);
1510     }
1511 };
1512 
1513 #endif
1514 
1515 class NBodySilver : public NBody
1516 {
1517 public:
species()1518     std::string species()
1519     {
1520         return "silver";
1521     }
1522 
performance(std::vector<int> dim)1523     double performance(std::vector<int> dim)
1524     {
1525         if (dim[0] <= 128) {
1526             return performance<128,  short_vec<float, 8> >(dim);
1527         }
1528         if (dim[0] <= 256) {
1529             return performance<256,  short_vec<float, 8> >(dim);
1530         }
1531         if (dim[0] <= 512) {
1532             return performance<512,  short_vec<float, 8> >(dim);
1533         }
1534         if (dim[0] <= 1024) {
1535             return performance<1024, short_vec<float, 8> >(dim);
1536         }
1537         if (dim[0] <= 2048) {
1538             return performance<2048, short_vec<float, 8> >(dim);
1539         }
1540         if (dim[0] <= 4096) {
1541             return performance<4096, short_vec<float, 8> >(dim);
1542         }
1543         if (dim[0] <= 8192) {
1544             return performance<8192, short_vec<float, 8> >(dim);
1545         }
1546 
1547         throw std::out_of_range("could not run test NBodySilver as grid dimension X was too large");
1548     }
1549 
1550     template<int DIM, typename REAL>
performance(std::vector<int> dim)1551     double performance(std::vector<int> dim)
1552     {
1553         using namespace LibFlatArray;
1554 
1555         int numParticles = dim[0];
1556         int repeats = dim[1];
1557 
1558         soa_array<Particle, DIM> particlesA;
1559         soa_array<Particle, DIM> particlesB;
1560 
1561         for (int i = 0; i < numParticles; ++i) {
1562             Particle p(
1563                 i, i * i, sin(i),
1564                 i % 11, i % 13, i % 19,
1565                 10 + cos(2 * i));
1566 
1567             particlesA.push_back(p);
1568             particlesB.push_back(p);
1569         }
1570 
1571         double tStart = time();
1572 
1573         for (int t = 0; t < repeats; ++t) {
1574             soa_accessor<Particle, DIM, 1, 1, 0> accessorA = particlesA[0];
1575             soa_accessor<Particle, DIM, 1, 1, 0> accessorB = particlesB[0];
1576             soa_accessor<Particle, DIM, 1, 1, 0> accessorA2 = particlesA[0];
1577 
1578             for (; accessorA.index() < numParticles; accessorA += REAL::ARITY, accessorB += REAL::ARITY) {
1579                 REAL posX = &accessorA.posX();
1580                 REAL posY = &accessorA.posY();
1581                 REAL posZ = &accessorA.posZ();
1582 
1583                 REAL velX = &accessorA.velX();
1584                 REAL velY = &accessorA.velY();
1585                 REAL velZ = &accessorA.velZ();
1586 
1587                 REAL charge = &accessorA.charge();
1588 
1589                 REAL accelerationX = 0.0;
1590                 REAL accelerationY = 0.0;
1591                 REAL accelerationZ = 0.0;
1592 
1593                 for (accessorA2.index() = 0; accessorA2.index() < numParticles; ++accessorA2) {
1594                     REAL deltaX = posX - REAL(accessorA2.posX());
1595                     REAL deltaY = posY - REAL(accessorA2.posY());
1596                     REAL deltaZ = posZ - REAL(accessorA2.posZ());
1597                     REAL distance2 = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ + SOFTENING;
1598 
1599                     REAL factor = charge * accessorA2.charge() * DELTA_T / distance2 / sqrt(distance2);
1600                     REAL forceX = deltaX * factor;
1601                     REAL forceY = deltaY * factor;
1602                     REAL forceZ = deltaZ * factor;
1603 
1604                     accelerationX += forceX;
1605                     accelerationY += forceY;
1606                     accelerationZ += forceZ;
1607                 }
1608 
1609                 &accessorB.posX() << (posX + velX * DELTA_T);
1610                 &accessorB.posY() << (posY + velY * DELTA_T);
1611                 &accessorB.posZ() << (posZ + velZ * DELTA_T);
1612 
1613                 &accessorB.velX() << (velX + accelerationX);
1614                 &accessorB.velY() << (velY + accelerationY);
1615                 &accessorB.velZ() << (velZ + accelerationZ);
1616 
1617                 &accessorB.charge() << charge;
1618             }
1619 
1620             using std::swap;
1621             swap(particlesA, particlesB);
1622         }
1623 
1624         double tEnd = time();
1625 
1626         if (particlesA[0].posX() == 0.12345) {
1627             std::cout << "this is a debug statement to prevent the compiler from optimizing away the update routine\n";
1628         }
1629 
1630         return gflops(numParticles, repeats, tStart, tEnd);
1631     }
1632 };
1633 
1634 class NBodyGold : public NBody
1635 {
1636 public:
species()1637     std::string species()
1638     {
1639         return "gold";
1640     }
1641 
performance(std::vector<int> dim)1642     double performance(std::vector<int> dim)
1643     {
1644         if (dim[0] <= 128) {
1645             return performance<128,  short_vec<float, 8> >(dim);
1646         }
1647         if (dim[0] <= 256) {
1648             return performance<256,  short_vec<float, 8> >(dim);
1649         }
1650         if (dim[0] <= 512) {
1651             return performance<512,  short_vec<float, 8> >(dim);
1652         }
1653         if (dim[0] <= 1024) {
1654             return performance<1024, short_vec<float, 8> >(dim);
1655         }
1656         if (dim[0] <= 2048) {
1657             return performance<2048, short_vec<float, 8> >(dim);
1658         }
1659         if (dim[0] <= 4096) {
1660             return performance<4096, short_vec<float, 8> >(dim);
1661         }
1662         if (dim[0] <= 8192) {
1663             return performance<8192, short_vec<float, 8> >(dim);
1664         }
1665 
1666         throw std::out_of_range("could not run test NBodySilver as grid dimension X was too large");
1667     }
1668 
1669     template<int DIM, typename REAL>
performance(std::vector<int> dim)1670     double performance(std::vector<int> dim)
1671     {
1672         using namespace LibFlatArray;
1673 
1674         int numParticles = dim[0];
1675         int repeats = dim[1];
1676 
1677         soa_array<ArrayParticle, DIM> particlesA;
1678         soa_array<ArrayParticle, DIM> particlesB;
1679 
1680         for (int i = 0; i < numParticles; ++i) {
1681             ArrayParticle p(
1682                 i, i * i, sin(i),
1683                 i % 11, i % 13, i % 19,
1684                 10 + cos(2 * i));
1685 
1686             particlesA.push_back(p);
1687             particlesB.push_back(p);
1688         }
1689 
1690         double tStart = time();
1691 
1692         for (int t = 0; t < repeats; ++t) {
1693             soa_accessor<ArrayParticle, DIM, 1, 1, 0> accessorA = particlesA[0];
1694             soa_accessor<ArrayParticle, DIM, 1, 1, 0> accessorB = particlesB[0];
1695             soa_accessor<ArrayParticle, DIM, 1, 1, 0> accessorA2 = particlesA[0];
1696 
1697             for (; accessorA.index() < numParticles; accessorA += REAL::ARITY, accessorB += REAL::ARITY) {
1698                 REAL posX = &accessorA.pos()[0];
1699                 REAL posY = &accessorA.pos()[1];
1700                 REAL posZ = &accessorA.pos()[2];
1701 
1702                 REAL velX = &accessorA.vel()[0];
1703                 REAL velY = &accessorA.vel()[1];
1704                 REAL velZ = &accessorA.vel()[2];
1705 
1706                 REAL charge = &accessorA.charge();
1707 
1708                 REAL accelerationX = 0.0;
1709                 REAL accelerationY = 0.0;
1710                 REAL accelerationZ = 0.0;
1711 
1712                 for (accessorA2.index() = 0; accessorA2.index() < numParticles; ++accessorA2) {
1713                     REAL deltaX = posX - REAL(accessorA2.pos()[0]);
1714                     REAL deltaY = posY - REAL(accessorA2.pos()[1]);
1715                     REAL deltaZ = posZ - REAL(accessorA2.pos()[2]);
1716                     REAL distance2 = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ + SOFTENING;
1717 
1718                     REAL factor = charge * accessorA2.charge() * DELTA_T / distance2 / sqrt(distance2);
1719                     REAL forceX = deltaX * factor;
1720                     REAL forceY = deltaY * factor;
1721                     REAL forceZ = deltaZ * factor;
1722 
1723                     accelerationX += forceX;
1724                     accelerationY += forceY;
1725                     accelerationZ += forceZ;
1726                 }
1727 
1728                 &accessorB.pos()[0] << (posX + velX * DELTA_T);
1729                 &accessorB.pos()[1] << (posY + velY * DELTA_T);
1730                 &accessorB.pos()[2] << (posZ + velZ * DELTA_T);
1731 
1732                 &accessorB.vel()[0] << (velX + accelerationX);
1733                 &accessorB.vel()[1] << (velY + accelerationY);
1734                 &accessorB.vel()[2] << (velZ + accelerationZ);
1735 
1736                 &accessorB.charge() << charge;
1737             }
1738 
1739             using std::swap;
1740             swap(particlesA, particlesB);
1741         }
1742 
1743         double tEnd = time();
1744 
1745         if (particlesA[0].pos()[0] == 0.12345) {
1746             std::cout << "this is a debug statement to prevent the compiler from optimizing away the update routine\n";
1747         }
1748 
1749         return gflops(numParticles, repeats, tStart, tEnd);
1750     }
1751 };
1752 
main(int argc,char ** argv)1753 int main(int argc, char **argv)
1754 {
1755     if ((argc < 3) || (argc == 4) || (argc > 5)) {
1756         std::cerr << "usage: " << argv[0] << " [-n,--name SUBSTRING] REVISION CUDA_DEVICE \n"
1757                   << "  - optional: only run tests whose name contains a SUBSTRING,\n"
1758                   << "  - REVISION is purely for output reasons,\n"
1759                   << "  - CUDA_DEVICE causes CUDA tests to run on the device with the given ID.\n";
1760         return 1;
1761     }
1762     std::string name = "";
1763     int argumentIndex = 1;
1764     if (argc == 5) {
1765         if ((std::string(argv[1]) == "-n") ||
1766             (std::string(argv[1]) == "--name")) {
1767             name = std::string(argv[2]);
1768         }
1769         argumentIndex = 3;
1770     }
1771     std::string revision = argv[argumentIndex + 0];
1772 
1773     std::stringstream s;
1774     s << argv[argumentIndex + 1];
1775     int cudaDevice;
1776     s >> cudaDevice;
1777 
1778     evaluate eval(name, revision);
1779     eval.print_header();
1780 
1781     std::vector<std::vector<int> > sizes;
1782     for (int d = 32; d <= 544; d += 4) {
1783         std::vector<int> dim(3, d);
1784 
1785         sizes.push_back(dim);
1786     }
1787 
1788     for (std::vector<std::vector<int> >::iterator i = sizes.begin(); i != sizes.end(); ++i) {
1789         eval(JacobiD3Q7Vanilla(), *i);
1790     }
1791 
1792 #ifdef __SSE__
1793     for (std::vector<std::vector<int> >::iterator i = sizes.begin(); i != sizes.end(); ++i) {
1794         eval(JacobiD3Q7Pepper(), *i);
1795     }
1796 #endif
1797 
1798     for (std::vector<std::vector<int> >::iterator i = sizes.begin(); i != sizes.end(); ++i) {
1799         eval(JacobiD3Q7Bronze(), *i);
1800     }
1801 
1802 #ifdef __SSE__
1803     for (std::vector<std::vector<int> >::iterator i = sizes.begin(); i != sizes.end(); ++i) {
1804         eval(JacobiD3Q7Silver(), *i);
1805     }
1806 #endif
1807 
1808     for (std::vector<std::vector<int> >::iterator i = sizes.begin(); i != sizes.end(); ++i) {
1809         eval(JacobiD3Q7Gold(), *i);
1810     }
1811 
1812     sizes.clear();
1813 
1814     for (int n = 128; n <= 8192; n *= 2) {
1815         std::vector<int> dim(3);
1816         dim[0] = n;
1817         dim[1] = std::size_t(4) * 512 * 1024 * 1024 / n / n;
1818         dim[2] = 0;
1819 
1820         sizes.push_back(dim);
1821     }
1822 
1823     for (std::vector<std::vector<int> >::iterator i = sizes.begin(); i != sizes.end(); ++i) {
1824         eval(NBodyVanilla(), *i);
1825     }
1826 
1827 #ifdef __AVX__
1828     for (std::vector<std::vector<int> >::iterator i = sizes.begin(); i != sizes.end(); ++i) {
1829         eval(NBodyPepper(),  *i);
1830     }
1831 
1832     for (std::vector<std::vector<int> >::iterator i = sizes.begin(); i != sizes.end(); ++i) {
1833         eval(NBodyCurry(),  *i);
1834     }
1835 #endif
1836 
1837     for (std::vector<std::vector<int> >::iterator i = sizes.begin(); i != sizes.end(); ++i) {
1838         eval(NBodyIron(),  *i);
1839     }
1840 
1841 #ifdef __AVX__
1842     for (std::vector<std::vector<int> >::iterator i = sizes.begin(); i != sizes.end(); ++i) {
1843         eval(NBodyBronze(),  *i);
1844     }
1845 #endif
1846 
1847     for (std::vector<std::vector<int> >::iterator i = sizes.begin(); i != sizes.end(); ++i) {
1848         eval(NBodySilver(),  *i);
1849     }
1850 
1851     for (std::vector<std::vector<int> >::iterator i = sizes.begin(); i != sizes.end(); ++i) {
1852         eval(NBodyGold(),  *i);
1853     }
1854 
1855     return 0;
1856 }
1857