1 /**
2 * Copyright 2014 Andreas Schäfer
3 *
4 * Distributed under the Boost Software License, Version 1.0. (See accompanying
5 * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt)
6 */
7
8 #include <iostream>
9 #include <libflatarray/flat_array.hpp>
10 #include <libflatarray/short_vec.hpp>
11 #include <libflatarray/testbed/cpu_benchmark.hpp>
12 #include <libflatarray/testbed/evaluate.hpp>
13
14 #ifdef __SSE__
15 #include <xmmintrin.h>
16 #endif
17
18 #ifdef __AVX__
19 #include <immintrin.h>
20 #endif
21
22 #define WEIGHT_S 0.11
23 #define WEIGHT_T 0.12
24 #define WEIGHT_W 0.13
25 #define WEIGHT_C 0.20
26 #define WEIGHT_E 0.15
27 #define WEIGHT_B 0.16
28 #define WEIGHT_N 0.17
29 #define DELTA_T 0.001
30 #define SOFTENING 0.1
31
32 using namespace LibFlatArray;
33
34 class JacobiD3Q7 : public cpu_benchmark
35 {
36 public:
family()37 std::string family()
38 {
39 return "JacobiD3Q7";
40 }
41
unit()42 std::string unit()
43 {
44 return "GLUPS";
45 }
46
glups(std::vector<int> dim,int steps,double time) const47 double glups(std::vector<int> dim, int steps, double time) const
48 {
49 double updates = steps;
50 for (std::size_t i = 0; i < dim.size(); ++i) {
51 updates *= dim[i] - 2;
52 }
53
54 double gigaLatticeUpdatesPerSecond = updates / time * 1e-9;
55 return gigaLatticeUpdatesPerSecond;
56 }
57 };
58
59 class JacobiD3Q7Vanilla : public JacobiD3Q7
60 {
61 public:
species()62 std::string species()
63 {
64 return "vanilla";
65 }
66
performance(std::vector<int> dim)67 double performance(std::vector<int> dim)
68 {
69 int dim_x = dim[0];
70 int dim_y = dim[1];
71 int dim_z = dim[2];
72 int maxT = 200000000 / dim_x / dim_y / dim_z;
73 maxT = std::max(16, maxT);
74
75 int offsetZ = dim_x * dim_y;
76 int gridVolume = dim_x * dim_y * dim_z;
77 std::vector<double> compressedGrid(2 * gridVolume);
78 double *gridOld = &compressedGrid[0];
79 double *gridNew = &compressedGrid[gridVolume];
80
81 for (int z = 0; z < dim_z; ++z) {
82 for (int y = 0; y < dim_y; ++y) {
83 for (int x = 0; x < dim_x; ++x) {
84 gridOld[z * offsetZ + y * dim_y + x] = x + y + z;
85 gridNew[z * offsetZ + y * dim_y + x] = x + y + z;
86 }
87 }
88 }
89
90 double tStart = time();
91
92 for (int t = 0; t < maxT; ++t) {
93 for (int z = 1; z < (dim_z - 1); ++z) {
94 for (int y = 1; y < (dim_y - 1); ++y) {
95 updateLine(gridOld, gridNew, 1, y, z, dim_x - 1, dim_x, offsetZ);
96 }
97 }
98 }
99
100 double tEnd = time();
101
102 if (gridOld[1 * offsetZ + 1 * dim_y + 1] ==
103 gridNew[1 * offsetZ + 1 * dim_y + 1]) {
104 std::cout << "this is a debug statement to prevent the compiler from optimizing away the update routine\n";
105 }
106
107 return glups(dim, maxT, tEnd - tStart);
108 }
109
110 private:
updateLine(double * gridOld,double * gridNew,const int xStart,const int y,const int z,const int xEnd,const int offsetY,const int offsetZ) const111 void updateLine(double *gridOld, double *gridNew,
112 const int xStart, const int y, const int z,
113 const int xEnd, const int offsetY, const int offsetZ) const
114 {
115 for (int x = xStart; x < xEnd; ++x) {
116 gridNew[x + y * offsetY + z * offsetZ] =
117 gridOld[x + y * offsetY + z * offsetZ - 1 * offsetZ] * WEIGHT_S +
118 gridOld[x + y * offsetY + z * offsetZ - 1 * offsetY] * WEIGHT_T +
119 gridOld[x + y * offsetY + z * offsetZ - 1 ] * WEIGHT_W +
120 gridOld[x + y * offsetY + z * offsetZ + 0 ] * WEIGHT_C +
121 gridOld[x + y * offsetY + z * offsetZ + 1 ] * WEIGHT_E +
122 gridOld[x + y * offsetY + z * offsetZ + 1 * offsetY] * WEIGHT_B +
123 gridOld[x + y * offsetY + z * offsetZ + 1 * offsetZ] * WEIGHT_N;
124 }
125 }
126 };
127
128 #ifdef __SSE__
129
130 class JacobiD3Q7Pepper : public JacobiD3Q7
131 {
132 public:
species()133 std::string species()
134 {
135 return "pepper";
136 }
137
performance(std::vector<int> dim)138 double performance(std::vector<int> dim)
139 {
140 int dim_x = dim[0];
141 int dim_y = dim[1];
142 int dim_z = dim[2];
143 int maxT = 200000000 / dim_x / dim_y / dim_z;
144 maxT = std::max(16, maxT);
145
146 int offsetZ = dim_x * dim_y;
147 int gridVolume = dim_x * dim_y * dim_z;
148 std::vector<double> compressedGrid(2 * gridVolume);
149 double *gridOld = &compressedGrid[0];
150 double *gridNew = &compressedGrid[gridVolume];
151
152 for (int z = 0; z < dim_z; ++z) {
153 for (int y = 0; y < dim_y; ++y) {
154 for (int x = 0; x < dim_x; ++x) {
155 gridOld[z * offsetZ + y * dim_y + x] = x + y + z;
156 gridNew[z * offsetZ + y * dim_y + x] = x + y + z;
157 }
158 }
159 }
160
161 double tStart = time();
162
163 for (int t = 0; t < maxT; ++t) {
164 for (int z = 1; z < (dim_z - 1); ++z) {
165 for (int y = 1; y < (dim_y - 1); ++y) {
166 updateLine(gridOld, gridNew, 1, y, z, dim_x - 1, dim_x, offsetZ);
167 }
168 }
169 }
170
171 double tEnd = time();
172
173 if (gridOld[1 * offsetZ + 1 * dim_y + 1] ==
174 gridNew[1 * offsetZ + 1 * dim_y + 1]) {
175 std::cout << "this is a debug statement to prevent the compiler from optimizing away the update routine\n";
176 }
177
178 return glups(dim, maxT, tEnd - tStart);
179 }
180
181 private:
updateLine(double * gridOld,double * gridNew,const int xStart,const int y,const int z,const int xEnd,const int offsetY,const int offsetZ) const182 void updateLine(double *gridOld, double *gridNew,
183 const int xStart, const int y, const int z,
184 const int xEnd, const int offsetY, const int offsetZ) const
185 {
186 __m128d factorS = _mm_set1_pd(WEIGHT_S);
187 __m128d factorT = _mm_set1_pd(WEIGHT_T);
188 __m128d factorW = _mm_set1_pd(WEIGHT_W);
189 __m128d factorE = _mm_set1_pd(WEIGHT_E);
190 __m128d factorB = _mm_set1_pd(WEIGHT_B);
191 __m128d factorN = _mm_set1_pd(WEIGHT_N);
192
193 int x = xStart;
194
195 if (x % 2) {
196 gridNew[x + y * offsetY + z * offsetZ] =
197 gridOld[x + y * offsetY + z * offsetZ - 1 * offsetZ] * WEIGHT_S +
198 gridOld[x + y * offsetY + z * offsetZ - 1 * offsetY] * WEIGHT_N +
199 gridOld[x + y * offsetY + z * offsetZ - 1 ] * WEIGHT_W +
200 gridOld[x + y * offsetY + z * offsetZ + 0 ] * WEIGHT_C +
201 gridOld[x + y * offsetY + z * offsetZ + 1 ] * WEIGHT_E +
202 gridOld[x + y * offsetY + z * offsetZ + 1 * offsetY] * WEIGHT_B +
203 gridOld[x + y * offsetY + z * offsetZ + 1 * offsetZ] * WEIGHT_N;
204 ++x;
205 }
206
207 for (; x < (xEnd - 7); x += 8) {
208 // load south row:
209 __m128d bufA = _mm_load_pd(gridOld + x + y * offsetY + z * offsetZ - 1 * offsetZ + 0);
210 __m128d bufB = _mm_load_pd(gridOld + x + y * offsetY + z * offsetZ - 1 * offsetZ + 2);
211 __m128d bufC = _mm_load_pd(gridOld + x + y * offsetY + z * offsetZ - 1 * offsetZ + 4);
212 __m128d bufD = _mm_load_pd(gridOld + x + y * offsetY + z * offsetZ - 1 * offsetZ + 6);
213 __m128d bufE;
214
215 bufA = _mm_mul_pd(bufA, factorS);
216 bufB = _mm_mul_pd(bufB, factorS);
217 bufC = _mm_mul_pd(bufC, factorS);
218 bufD = _mm_mul_pd(bufD, factorS);
219
220 // load top row:
221 __m128d sumA = _mm_load_pd(gridOld + x + y * offsetY + z * offsetZ - 1 * offsetY + 0);
222 __m128d sumB = _mm_load_pd(gridOld + x + y * offsetY + z * offsetZ - 1 * offsetY + 2);
223 __m128d sumC = _mm_load_pd(gridOld + x + y * offsetY + z * offsetZ - 1 * offsetY + 4);
224 __m128d sumD = _mm_load_pd(gridOld + x + y * offsetY + z * offsetZ - 1 * offsetY + 6);
225
226 sumA = _mm_mul_pd(sumA, factorT);
227 sumB = _mm_mul_pd(sumB, factorT);
228 sumC = _mm_mul_pd(sumC, factorT);
229 sumD = _mm_mul_pd(sumD, factorT);
230
231 sumA = _mm_add_pd(sumA, bufA);
232 sumB = _mm_add_pd(sumB, bufB);
233 sumC = _mm_add_pd(sumC, bufC);
234 sumD = _mm_add_pd(sumD, bufD);
235
236 // load left/right row:
237 bufA = _mm_loadu_pd(gridOld + x + y * offsetY + z * offsetZ + 0 * offsetZ - 1);
238 bufB = _mm_loadu_pd(gridOld + x + y * offsetY + z * offsetZ + 0 * offsetZ + 1);
239 bufC = _mm_loadu_pd(gridOld + x + y * offsetY + z * offsetZ + 0 * offsetZ + 3);
240 bufD = _mm_loadu_pd(gridOld + x + y * offsetY + z * offsetZ + 0 * offsetZ + 5);
241 bufE = _mm_loadu_pd(gridOld + x + y * offsetY + z * offsetZ + 0 * offsetZ + 7);
242
243 sumA = _mm_add_pd(sumA, _mm_mul_pd(bufA, factorW));
244 sumB = _mm_add_pd(sumB, _mm_mul_pd(bufB, factorW));
245 sumC = _mm_add_pd(sumC, _mm_mul_pd(bufC, factorW));
246 sumD = _mm_add_pd(sumD, _mm_mul_pd(bufD, factorW));
247
248 sumA = _mm_add_pd(sumA, _mm_mul_pd(bufB, factorE));
249 sumB = _mm_add_pd(sumB, _mm_mul_pd(bufC, factorE));
250 sumC = _mm_add_pd(sumC, _mm_mul_pd(bufD, factorE));
251 sumD = _mm_add_pd(sumD, _mm_mul_pd(bufE, factorE));
252
253 // load bottom row:
254 bufA = _mm_load_pd(gridOld + x + y * offsetY + z * offsetZ + 1 * offsetY + 0);
255 bufB = _mm_load_pd(gridOld + x + y * offsetY + z * offsetZ + 1 * offsetY + 2);
256 bufC = _mm_load_pd(gridOld + x + y * offsetY + z * offsetZ + 1 * offsetY + 4);
257 bufD = _mm_load_pd(gridOld + x + y * offsetY + z * offsetZ + 1 * offsetY + 6);
258
259 bufA = _mm_mul_pd(bufA, factorB);
260 bufB = _mm_mul_pd(bufB, factorB);
261 bufC = _mm_mul_pd(bufC, factorB);
262 bufD = _mm_mul_pd(bufD, factorB);
263
264 sumA = _mm_add_pd(sumA, bufA);
265 sumB = _mm_add_pd(sumB, bufB);
266 sumC = _mm_add_pd(sumC, bufC);
267 sumD = _mm_add_pd(sumD, bufD);
268
269 // load north row:
270 bufA = _mm_load_pd(gridOld + x + y * offsetY + z * offsetZ + 1 * offsetZ + 0);
271 bufB = _mm_load_pd(gridOld + x + y * offsetY + z * offsetZ + 1 * offsetZ + 2);
272 bufC = _mm_load_pd(gridOld + x + y * offsetY + z * offsetZ + 1 * offsetZ + 4);
273 bufD = _mm_load_pd(gridOld + x + y * offsetY + z * offsetZ + 1 * offsetZ + 6);
274
275 bufA = _mm_mul_pd(bufA, factorN);
276 bufB = _mm_mul_pd(bufB, factorN);
277 bufC = _mm_mul_pd(bufC, factorN);
278 bufD = _mm_mul_pd(bufD, factorN);
279
280 sumA = _mm_add_pd(sumA, bufA);
281 sumB = _mm_add_pd(sumB, bufB);
282 sumC = _mm_add_pd(sumC, bufC);
283 sumD = _mm_add_pd(sumD, bufD);
284
285 _mm_stream_pd(gridNew + x + y * offsetY + z * offsetZ + 0, sumA);
286 _mm_stream_pd(gridNew + x + y * offsetY + z * offsetZ + 2, sumB);
287 _mm_stream_pd(gridNew + x + y * offsetY + z * offsetZ + 4, sumC);
288 _mm_stream_pd(gridNew + x + y * offsetY + z * offsetZ + 6, sumD);
289 }
290
291 for (; x < xEnd; ++x) {
292 gridNew[x + y * offsetY + z * offsetZ] =
293 gridOld[x + y * offsetY + z * offsetZ - 1 * offsetZ] * WEIGHT_S +
294 gridOld[x + y * offsetY + z * offsetZ - 1 * offsetY] * WEIGHT_N +
295 gridOld[x + y * offsetY + z * offsetZ - 1 ] * WEIGHT_W +
296 gridOld[x + y * offsetY + z * offsetZ + 0 ] * WEIGHT_C +
297 gridOld[x + y * offsetY + z * offsetZ + 1 ] * WEIGHT_E +
298 gridOld[x + y * offsetY + z * offsetZ + 1 * offsetY] * WEIGHT_B +
299 gridOld[x + y * offsetY + z * offsetZ + 1 * offsetZ] * WEIGHT_N;
300 }
301 }
302 };
303
304 #endif
305
306 class JacobiCell
307 {
308 public:
JacobiCell(double temp=0)309 explicit JacobiCell(double temp = 0) :
310 temp(temp)
311 {}
312
313 double temp;
314 };
315
316 LIBFLATARRAY_REGISTER_SOA(JacobiCell,
317 ((double)(temp)))
318
319 class JacobiD3Q7Bronze : public JacobiD3Q7
320 {
321 public:
322 class UpdateFunctor
323 {
324 public:
UpdateFunctor(long dim_x,long dim_y,long dim_z)325 UpdateFunctor(long dim_x, long dim_y, long dim_z) :
326 dim_x(dim_x),
327 dim_y(dim_y),
328 dim_z(dim_z)
329 {}
330
331 template<typename accessor_type1, typename accessor_type2>
operator ()(accessor_type1 & accessor1,accessor_type2 & accessor2) const332 void operator()(accessor_type1& accessor1, accessor_type2& accessor2) const
333 {
334 for (long z = 1; z < (dim_z - 1); ++z) {
335 for (long y = 1; y < (dim_y - 1); ++y) {
336 long indexStart = accessor1.gen_index(1, y, z);
337 long indexEnd = accessor1.gen_index(dim_x - 1, y, z);
338
339 for (accessor1.index() = indexStart, accessor2.index() = indexStart;
340 accessor1.index() < indexEnd;
341 accessor1 += 1, accessor2 += 1) {
342
343 accessor2.temp() =
344 accessor1[coord< 0, 0, -1>()].temp() * WEIGHT_S +
345 accessor1[coord< 0, -1, 0>()].temp() * WEIGHT_T +
346 accessor1[coord<-1, 0, 0>()].temp() * WEIGHT_W +
347 accessor1[coord< 0, 0, 0>()].temp() * WEIGHT_C +
348 accessor1[coord< 1, 0, 0>()].temp() * WEIGHT_E +
349 accessor1[coord< 0, 1, 0>()].temp() * WEIGHT_B +
350 accessor1[coord< 0, 0, 1>()].temp() * WEIGHT_N;
351 }
352 }
353 }
354 }
355
356 private:
357 long dim_x;
358 long dim_y;
359 long dim_z;
360 };
361
species()362 std::string species()
363 {
364 return "bronze";
365 }
366
performance(std::vector<int> dim)367 double performance(std::vector<int> dim)
368 {
369 long dim_x = dim[0];
370 long dim_y = dim[1];
371 long dim_z = dim[2];
372 int maxT = 200000000 / dim_x / dim_y / dim_z;
373 maxT = std::max(16, maxT);
374
375 soa_grid<JacobiCell> gridOld(dim_x, dim_y, dim_z);
376 soa_grid<JacobiCell> gridNew(dim_x, dim_y, dim_z);
377
378 for (long z = 0; z < dim_z; ++z) {
379 for (long y = 0; y < dim_y; ++y) {
380 for (long x = 0; x < dim_x; ++x) {
381 gridOld.set(x, y, z, JacobiCell(x + y + z));
382 gridNew.set(x, y, z, JacobiCell(x + y + z));
383 }
384 }
385 }
386
387 double tStart = time();
388
389 UpdateFunctor functor(dim_x, dim_y, dim_z);
390 for (int t = 0; t < maxT; ++t) {
391 gridOld.callback(&gridNew, functor);
392 using std::swap;
393 swap(gridOld, gridNew);
394 }
395
396 double tEnd = time();
397
398 if (gridOld.get(1, 1, 1).temp ==
399 gridNew.get(1, 1, 1).temp) {
400 std::cout << gridOld.get(1, 1, 1).temp << "\n";
401 std::cout << gridNew.get(1, 1, 1).temp << "\n";
402 std::cout << "this is a debug statement to prevent the compiler from optimizing away the update routine\n";
403 }
404
405 return glups(dim, maxT, tEnd - tStart);
406 }
407
408 private:
updateLine(double * gridOld,double * gridNew,const long xStart,const long y,const long z,const long xEnd,const long offsetY,const long offsetZ) const409 void updateLine(double *gridOld, double *gridNew,
410 const long xStart, const long y, const long z,
411 const long xEnd, const long offsetY, const long offsetZ) const
412 {
413 for (long x = xStart; x < xEnd; ++x) {
414 gridNew[x + y * offsetY + z * offsetZ] =
415 gridOld[x + y * offsetY + z * offsetZ - 1 * offsetZ] * WEIGHT_S +
416 gridOld[x + y * offsetY + z * offsetZ - 1 * offsetY] * WEIGHT_T +
417 gridOld[x + y * offsetY + z * offsetZ - 1 ] * WEIGHT_W +
418 gridOld[x + y * offsetY + z * offsetZ + 0 ] * WEIGHT_C +
419 gridOld[x + y * offsetY + z * offsetZ + 1 ] * WEIGHT_E +
420 gridOld[x + y * offsetY + z * offsetZ + 1 * offsetY] * WEIGHT_B +
421 gridOld[x + y * offsetY + z * offsetZ + 1 * offsetZ] * WEIGHT_N;
422 }
423 }
424 };
425
426 #ifdef __SSE__
427
428 class JacobiD3Q7Silver : public JacobiD3Q7
429 {
430 public:
431 class UpdateFunctor
432 {
433 public:
UpdateFunctor(long dim_x,long dim_y,long dim_z)434 UpdateFunctor(long dim_x, long dim_y, long dim_z) :
435 dim_x(dim_x),
436 dim_y(dim_y),
437 dim_z(dim_z)
438 {}
439
440 template<typename accessor_type1, typename accessor_type2>
operator ()(accessor_type1 & accessor1,accessor_type2 & accessor2) const441 void operator()(accessor_type1& accessor1,
442 accessor_type2& accessor2) const
443 {
444 __m128d factorS = _mm_set1_pd(WEIGHT_S);
445 __m128d factorT = _mm_set1_pd(WEIGHT_T);
446 __m128d factorW = _mm_set1_pd(WEIGHT_W);
447 __m128d factorE = _mm_set1_pd(WEIGHT_E);
448 __m128d factorB = _mm_set1_pd(WEIGHT_B);
449 __m128d factorN = _mm_set1_pd(WEIGHT_N);
450
451 for (long z = 1; z < (dim_z - 1); ++z) {
452 for (long y = 1; y < (dim_y - 1); ++y) {
453 long indexStart = accessor1.gen_index(1, y, z);
454 long indexEnd = accessor1.gen_index(dim_x - 1, y, z);
455
456 accessor1.index() = indexStart;
457 accessor2.index() = indexStart;
458
459 accessor2.temp() =
460 accessor1[coord< 0, 0, -1>()].temp() * WEIGHT_S +
461 accessor1[coord< 0, -1, 0>()].temp() * WEIGHT_T +
462 accessor1[coord<-1, 0, 0>()].temp() * WEIGHT_W +
463 accessor1[coord< 0, 0, 0>()].temp() * WEIGHT_C +
464 accessor1[coord< 1, 0, 0>()].temp() * WEIGHT_E +
465 accessor1[coord< 0, 1, 0>()].temp() * WEIGHT_B +
466 accessor1[coord< 0, 0, 1>()].temp() * WEIGHT_N;
467
468 accessor1.index() += 1;
469 accessor2.index() += 1;
470
471 for (;
472 accessor1.index() < (indexEnd - 7);
473 accessor1.index() += 8, accessor2.index() += 8) {
474
475 // load south row:
476 __m128d bufA = _mm_load_pd(&accessor1[coord<0, 0, -1>()].temp() + 0);
477 __m128d bufB = _mm_load_pd(&accessor1[coord<0, 0, -1>()].temp() + 2);
478 __m128d bufC = _mm_load_pd(&accessor1[coord<0, 0, -1>()].temp() + 4);
479 __m128d bufD = _mm_load_pd(&accessor1[coord<0, 0, -1>()].temp() + 6);
480 __m128d bufE;
481
482 bufA = _mm_mul_pd(bufA, factorS);
483 bufB = _mm_mul_pd(bufB, factorS);
484 bufC = _mm_mul_pd(bufC, factorS);
485 bufD = _mm_mul_pd(bufD, factorS);
486
487 // load top row:
488 __m128d sumA = _mm_load_pd(&accessor1[coord<0, -1, 0>()].temp() + 0);
489 __m128d sumB = _mm_load_pd(&accessor1[coord<0, -1, 0>()].temp() + 2);
490 __m128d sumC = _mm_load_pd(&accessor1[coord<0, -1, 0>()].temp() + 4);
491 __m128d sumD = _mm_load_pd(&accessor1[coord<0, -1, 0>()].temp() + 6);
492
493 sumA = _mm_mul_pd(sumA, factorT);
494 sumB = _mm_mul_pd(sumB, factorT);
495 sumC = _mm_mul_pd(sumC, factorT);
496 sumD = _mm_mul_pd(sumD, factorT);
497
498 sumA = _mm_add_pd(sumA, bufA);
499 sumB = _mm_add_pd(sumB, bufB);
500 sumC = _mm_add_pd(sumC, bufC);
501 sumD = _mm_add_pd(sumD, bufD);
502
503 // load left/right row:
504 bufA = _mm_loadu_pd(&accessor1[coord<0, 0, 0>()].temp() - 1);
505 bufB = _mm_loadu_pd(&accessor1[coord<0, 0, 0>()].temp() + 1);
506 bufC = _mm_loadu_pd(&accessor1[coord<0, 0, 0>()].temp() + 3);
507 bufD = _mm_loadu_pd(&accessor1[coord<0, 0, 0>()].temp() + 5);
508 bufE = _mm_loadu_pd(&accessor1[coord<0, 0, 0>()].temp() + 7);
509
510 sumA = _mm_add_pd(sumA, _mm_mul_pd(bufA, factorW));
511 sumB = _mm_add_pd(sumB, _mm_mul_pd(bufB, factorW));
512 sumC = _mm_add_pd(sumC, _mm_mul_pd(bufC, factorW));
513 sumD = _mm_add_pd(sumD, _mm_mul_pd(bufD, factorW));
514
515 sumA = _mm_add_pd(sumA, _mm_mul_pd(bufB, factorE));
516 sumB = _mm_add_pd(sumB, _mm_mul_pd(bufC, factorE));
517 sumC = _mm_add_pd(sumC, _mm_mul_pd(bufD, factorE));
518 sumD = _mm_add_pd(sumD, _mm_mul_pd(bufE, factorE));
519
520 // load bottom row:
521 bufA = _mm_load_pd(&accessor1[coord<0, 1, 0>()].temp() + 0);
522 bufB = _mm_load_pd(&accessor1[coord<0, 1, 0>()].temp() + 2);
523 bufC = _mm_load_pd(&accessor1[coord<0, 1, 0>()].temp() + 4);
524 bufD = _mm_load_pd(&accessor1[coord<0, 1, 0>()].temp() + 6);
525
526 bufA = _mm_mul_pd(bufA, factorB);
527 bufB = _mm_mul_pd(bufB, factorB);
528 bufC = _mm_mul_pd(bufC, factorB);
529 bufD = _mm_mul_pd(bufD, factorB);
530
531 sumA = _mm_add_pd(sumA, bufA);
532 sumB = _mm_add_pd(sumB, bufB);
533 sumC = _mm_add_pd(sumC, bufC);
534 sumD = _mm_add_pd(sumD, bufD);
535
536 // load north row:
537 bufA = _mm_load_pd(&accessor1[coord<0, 0, 1>()].temp() + 0);
538 bufB = _mm_load_pd(&accessor1[coord<0, 0, 1>()].temp() + 2);
539 bufC = _mm_load_pd(&accessor1[coord<0, 0, 1>()].temp() + 4);
540 bufD = _mm_load_pd(&accessor1[coord<0, 0, 1>()].temp() + 6);
541
542 bufA = _mm_mul_pd(bufA, factorN);
543 bufB = _mm_mul_pd(bufB, factorN);
544 bufC = _mm_mul_pd(bufC, factorN);
545 bufD = _mm_mul_pd(bufD, factorN);
546
547 sumA = _mm_add_pd(sumA, bufA);
548 sumB = _mm_add_pd(sumB, bufB);
549 sumC = _mm_add_pd(sumC, bufC);
550 sumD = _mm_add_pd(sumD, bufD);
551
552 _mm_stream_pd(&accessor2[coord<0, 0, 0>()].temp() + 0, sumA);
553 _mm_stream_pd(&accessor2[coord<0, 0, 0>()].temp() + 2, sumB);
554 _mm_stream_pd(&accessor2[coord<0, 0, 0>()].temp() + 4, sumC);
555 _mm_stream_pd(&accessor2[coord<0, 0, 0>()].temp() + 6, sumD);
556 }
557
558
559 for (;
560 accessor1.index() < (indexEnd - 1);
561 accessor1.index() += 1, accessor2.index() += 1) {
562 accessor2.temp() =
563 accessor1[coord< 0, 0, -1>()].temp() * WEIGHT_S +
564 accessor1[coord< 0, -1, 0>()].temp() * WEIGHT_T +
565 accessor1[coord<-1, 0, 0>()].temp() * WEIGHT_W +
566 accessor1[coord< 0, 0, 0>()].temp() * WEIGHT_C +
567 accessor1[coord< 1, 0, 0>()].temp() * WEIGHT_E +
568 accessor1[coord< 0, 1, 0>()].temp() * WEIGHT_B +
569 accessor1[coord< 0, 0, 1>()].temp() * WEIGHT_N;
570
571 }
572 }
573 }
574 }
575
576 private:
577 long dim_x;
578 long dim_y;
579 long dim_z;
580 };
581
species()582 std::string species()
583 {
584 return "silver";
585 }
586
performance(std::vector<int> dim)587 double performance(std::vector<int> dim)
588 {
589 long dim_x = dim[0];
590 long dim_y = dim[1];
591 long dim_z = dim[2];
592 int maxT = 200000000 / dim_x / dim_y / dim_z;
593 maxT = std::max(16, maxT);
594
595 soa_grid<JacobiCell> gridOld(dim_x, dim_y, dim_z);
596 soa_grid<JacobiCell> gridNew(dim_x, dim_y, dim_z);
597
598 for (long z = 0; z < dim_z; ++z) {
599 for (long y = 0; y < dim_y; ++y) {
600 for (long x = 0; x < dim_x; ++x) {
601 gridOld.set(x, y, z, JacobiCell(x + y + z));
602 gridNew.set(x, y, z, JacobiCell(x + y + z));
603 }
604 }
605 }
606
607 double tStart = time();
608
609 UpdateFunctor functor(dim_x, dim_y, dim_z);
610 for (int t = 0; t < maxT; ++t) {
611 gridOld.callback(&gridNew, functor);
612 using std::swap;
613 swap(gridOld, gridNew);
614 }
615
616 double tEnd = time();
617
618 if (gridOld.get(20, 20, 20).temp ==
619 gridNew.get(10, 10, 10).temp) {
620 std::cout << "this is a debug statement to prevent the compiler from optimizing away the update routine\n";
621 }
622
623 return glups(dim, maxT, tEnd - tStart);
624 }
625
626 private:
updateLine(double * gridOld,double * gridNew,const long xStart,const long y,const long z,const long xEnd,const long offsetY,const long offsetZ) const627 void updateLine(double *gridOld, double *gridNew,
628 const long xStart, const long y, const long z,
629 const long xEnd, const long offsetY, const long offsetZ) const
630 {
631 for (long x = xStart; x < xEnd; ++x) {
632 gridNew[x + y * offsetY + z * offsetZ] =
633 gridOld[x + y * offsetY + z * offsetZ - 1 * offsetZ] * WEIGHT_S +
634 gridOld[x + y * offsetY + z * offsetZ - 1 * offsetY] * WEIGHT_T +
635 gridOld[x + y * offsetY + z * offsetZ - 1 ] * WEIGHT_W +
636 gridOld[x + y * offsetY + z * offsetZ + 0 ] * WEIGHT_C +
637 gridOld[x + y * offsetY + z * offsetZ + 1 ] * WEIGHT_E +
638 gridOld[x + y * offsetY + z * offsetZ + 1 * offsetY] * WEIGHT_B +
639 gridOld[x + y * offsetY + z * offsetZ + 1 * offsetZ] * WEIGHT_N;
640 }
641 }
642 };
643
644 #endif
645
646 class JacobiD3Q7Gold : public JacobiD3Q7
647 {
648 public:
649 class UpdateFunctor
650 {
651 public:
UpdateFunctor(long dim_x,long dim_y,long dim_z)652 UpdateFunctor(long dim_x, long dim_y, long dim_z) :
653 dim_x(dim_x),
654 dim_y(dim_y),
655 dim_z(dim_z)
656 {}
657
658 template<typename accessor_type1, typename accessor_type2>
operator ()(accessor_type1 & accessor_old,accessor_type2 & accessor_new) const659 void operator()(accessor_type1& accessor_old,
660 accessor_type2& accessor_new) const
661 {
662 typedef typename LibFlatArray::estimate_optimum_short_vec_type<double, accessor_type1>::VALUE my_short_vec;
663
664 #ifdef _OPENMP
665 #pragma omp parallel for schedule(static) firstprivate(accessorOld, accessorNew)
666 #endif
667 for (std::size_t z = 1; z < (dim_z - 1); ++z) {
668 for (std::size_t y = 1; y < (dim_y - 1); ++y) {
669 std::size_t x = 1;
670 std::size_t end_x = dim_x - 1;
671
672 LIBFLATARRAY_LOOP_PEELER_TEMPLATE(
673 my_short_vec,
674 std::size_t,
675 x,
676 end_x,
677 update_line,
678 y,
679 z,
680 accessor_old,
681 accessor_new);
682 }
683 }
684 }
685
686 template<typename SHORT_VEC, typename SOA_ACCESSOR_1, typename SOA_ACCESSOR_2>
update_line(std::size_t & x,std::size_t end_x,std::size_t y,std::size_t z,SOA_ACCESSOR_1 & accessor_old,SOA_ACCESSOR_2 & accessor_new) const687 void update_line(
688 std::size_t& x,
689 std::size_t end_x,
690 std::size_t y,
691 std::size_t z,
692 SOA_ACCESSOR_1& accessor_old,
693 SOA_ACCESSOR_2& accessor_new) const
694 {
695 accessor_old.index() = SOA_ACCESSOR_1::gen_index(x, y, z);
696 accessor_new.index() = SOA_ACCESSOR_2::gen_index(x, y, z);
697
698 SHORT_VEC buf;
699 SHORT_VEC factorS = WEIGHT_S;
700 SHORT_VEC factorT = WEIGHT_T;
701 SHORT_VEC factorW = WEIGHT_W;
702 SHORT_VEC factorE = WEIGHT_E;
703 SHORT_VEC factorB = WEIGHT_B;
704 SHORT_VEC factorN = WEIGHT_N;
705
706 for (; x < end_x; x += SHORT_VEC::ARITY) {
707 using LibFlatArray::coord;
708 buf = SHORT_VEC(&accessor_old[coord< 0, 0, -1>()].temp()) * factorS;
709 buf += SHORT_VEC(&accessor_old[coord< 0, -1, 0>()].temp()) * factorT;
710 buf += SHORT_VEC(&accessor_old[coord<-1, 0, 0>()].temp()) * factorW;
711 buf += SHORT_VEC(&accessor_old[coord< 1, 0, 0>()].temp()) * factorE;
712 buf += SHORT_VEC(&accessor_old[coord< 0, 1, 0>()].temp()) * factorB;
713 buf += SHORT_VEC(&accessor_old[coord< 0, 0, 1>()].temp()) * factorN;
714
715 &accessor_new.temp() << buf;
716
717 accessor_new += SHORT_VEC::ARITY;
718 accessor_old += SHORT_VEC::ARITY;
719 }
720
721 }
722
723 private:
724 std::size_t dim_x;
725 std::size_t dim_y;
726 std::size_t dim_z;
727 };
728
species()729 std::string species()
730 {
731 return "gold";
732 }
733
performance(std::vector<int> dim)734 double performance(std::vector<int> dim)
735 {
736 long dim_x = dim[0];
737 long dim_y = dim[1];
738 long dim_z = dim[2];
739 int maxT = 200000000 / dim_x / dim_y / dim_z;
740 maxT = std::max(16, maxT);
741
742 soa_grid<JacobiCell> gridOld(dim_x, dim_y, dim_z);
743 soa_grid<JacobiCell> gridNew(dim_x, dim_y, dim_z);
744
745 for (long z = 0; z < dim_z; ++z) {
746 for (long y = 0; y < dim_y; ++y) {
747 for (long x = 0; x < dim_x; ++x) {
748 gridOld.set(x, y, z, JacobiCell(x + y + z));
749 gridNew.set(x, y, z, JacobiCell(x + y + z));
750 }
751 }
752 }
753
754 double tStart = time();
755
756 UpdateFunctor functor(dim_x, dim_y, dim_z);
757 for (int t = 0; t < maxT; ++t) {
758 gridOld.callback(&gridNew, functor);
759 using std::swap;
760 swap(gridOld, gridNew);
761 }
762
763 double tEnd = time();
764
765 if (gridOld.get(20, 20, 20).temp ==
766 gridNew.get(10, 10, 10).temp) {
767 std::cout << "this is a debug statement to prevent the compiler from optimizing away the update routine\n";
768 }
769
770 return glups(dim, maxT, tEnd - tStart);
771 }
772
773 private:
updateLine(double * gridOld,double * gridNew,const long xStart,const long y,const long z,const long xEnd,const long offsetY,const long offsetZ) const774 void updateLine(double *gridOld, double *gridNew,
775 const long xStart, const long y, const long z,
776 const long xEnd, const long offsetY, const long offsetZ) const
777 {
778 for (long x = xStart; x < xEnd; ++x) {
779 gridNew[x + y * offsetY + z * offsetZ] =
780 gridOld[x + y * offsetY + z * offsetZ - 1 * offsetZ] * WEIGHT_S +
781 gridOld[x + y * offsetY + z * offsetZ - 1 * offsetY] * WEIGHT_T +
782 gridOld[x + y * offsetY + z * offsetZ - 1 ] * WEIGHT_W +
783 gridOld[x + y * offsetY + z * offsetZ + 0 ] * WEIGHT_C +
784 gridOld[x + y * offsetY + z * offsetZ + 1 ] * WEIGHT_E +
785 gridOld[x + y * offsetY + z * offsetZ + 1 * offsetY] * WEIGHT_B +
786 gridOld[x + y * offsetY + z * offsetZ + 1 * offsetZ] * WEIGHT_N;
787 }
788 }
789 };
790
791 class Particle
792 {
793 public:
Particle(const float posX=0,const float posY=0,const float posZ=0,const float velX=0,const float velY=0,const float velZ=0,const float charge=0)794 explicit inline Particle(
795 const float posX = 0,
796 const float posY = 0,
797 const float posZ = 0,
798 const float velX = 0,
799 const float velY = 0,
800 const float velZ = 0,
801 const float charge = 0) :
802 posX(posX),
803 posY(posY),
804 posZ(posZ),
805 velX(velX),
806 velY(velY),
807 velZ(velZ),
808 charge(charge)
809 {}
810
811 float posX;
812 float posY;
813 float posZ;
814 float velX;
815 float velY;
816 float velZ;
817 float charge;
818 };
819
820 LIBFLATARRAY_REGISTER_SOA(Particle,
821 ((float)(posX))
822 ((float)(posY))
823 ((float)(posZ))
824 ((float)(velX))
825 ((float)(velY))
826 ((float)(velZ))
827 ((float)(charge)))
828
829 class ArrayParticle
830 {
831 public:
832 inline
ArrayParticle(const float posX=0,const float posY=0,const float posZ=0,const float velX=0,const float velY=0,const float velZ=0,const float charge=0)833 explicit ArrayParticle(
834 const float posX = 0,
835 const float posY = 0,
836 const float posZ = 0,
837 const float velX = 0,
838 const float velY = 0,
839 const float velZ = 0,
840 const float charge = 0) :
841 charge(charge)
842 {
843 pos[0] = posX;
844 pos[1] = posY;
845 pos[2] = posZ;
846 vel[0] = velX;
847 vel[1] = velY;
848 vel[2] = velZ;
849 }
850
851 float pos[3];
852 float vel[3];
853 float charge;
854 };
855
856 LIBFLATARRAY_REGISTER_SOA(ArrayParticle,
857 ((float)(pos)(3))
858 ((float)(vel)(3))
859 ((float)(charge)))
860
861 class NBody : public cpu_benchmark
862 {
863 public:
family()864 std::string family()
865 {
866 return "NBody";
867 }
868
unit()869 std::string unit()
870 {
871 return "GFLOPS";
872 }
873
gflops(double numParticles,double repeats,double tStart,double tEnd)874 double gflops(double numParticles, double repeats, double tStart, double tEnd)
875 {
876 double flops = repeats * numParticles * (9 + numParticles * (3 + 6 + 5 + 3 + 3));
877 double gflops = flops / (tEnd - tStart) * 1e-9;
878 return gflops;
879 }
880 };
881
882 class NBodyVanilla : public NBody
883 {
884 public:
species()885 std::string species()
886 {
887 return "vanilla";
888 }
889
performance(std::vector<int> dim)890 double performance(std::vector<int> dim)
891 {
892 int numParticles = dim[0];
893 int repeats = dim[1];
894
895 std::vector<Particle> particlesA;
896 std::vector<Particle> particlesB;
897 particlesA.reserve(numParticles);
898 particlesB.reserve(numParticles);
899
900 for (int i = 0; i < numParticles; ++i) {
901 Particle p(
902 i, i * i, sin(i),
903 i % 11, i % 13, i % 19,
904 10 + cos(2 * i));
905
906 particlesA.push_back(p);
907 particlesB.push_back(p);
908 }
909
910 double tStart = time();
911
912 for (int t = 0; t < repeats; ++t) {
913 for (int i = 0; i < numParticles; ++i) {
914 float posX = particlesA[i].posX;
915 float posY = particlesA[i].posY;
916 float posZ = particlesA[i].posZ;
917
918 float velX = particlesA[i].velX;
919 float velY = particlesA[i].velY;
920 float velZ = particlesA[i].velZ;
921
922 float charge = particlesA[i].charge;
923
924 float accelerationX = 0;
925 float accelerationY = 0;
926 float accelerationZ = 0;
927
928 for (int j = 0; j < numParticles; ++j) {
929 float deltaX = posX - particlesA[j].posX;
930 float deltaY = posY - particlesA[j].posY;
931 float deltaZ = posZ - particlesA[j].posZ;
932 float distance2 = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ + SOFTENING;
933 float factor = charge * particlesA[j].charge * DELTA_T / distance2 / sqrt(distance2);
934 float forceX = deltaX * factor;
935 float forceY = deltaY * factor;
936 float forceZ = deltaZ * factor;
937
938 accelerationX += forceX;
939 accelerationY += forceY;
940 accelerationZ += forceZ;
941 }
942
943 particlesB[i].posX = posX + velX * DELTA_T;
944 particlesB[i].posY = posY + velY * DELTA_T;
945 particlesB[i].posZ = posZ + velZ * DELTA_T;
946
947 particlesB[i].velX = velX + accelerationX;
948 particlesB[i].velY = velY + accelerationY;
949 particlesB[i].velZ = velZ + accelerationZ;
950
951 particlesB[i].charge = charge;
952 }
953
954 using std::swap;
955 swap(particlesA, particlesB);
956 }
957
958 double tEnd = time();
959
960 if (particlesA[0].posX == 0.12345) {
961 std::cout << "this is a debug statement to prevent the compiler from optimizing away the update routine\n";
962 }
963
964 return gflops(numParticles, repeats, tStart, tEnd);
965 }
966 };
967
968 #ifdef __AVX__
969
970 class NBodyPepper : public NBody
971 {
972 public:
species()973 std::string species()
974 {
975 return "pepper";
976 }
977
performance(std::vector<int> dim)978 double performance(std::vector<int> dim)
979 {
980 int numParticles = dim[0];
981 int repeats = dim[1];
982
983 std::vector<float> posXA;
984 std::vector<float> posYA;
985 std::vector<float> posZA;
986 std::vector<float> posXB;
987 std::vector<float> posYB;
988 std::vector<float> posZB;
989
990 std::vector<float> velXA;
991 std::vector<float> velYA;
992 std::vector<float> velZA;
993 std::vector<float> velXB;
994 std::vector<float> velYB;
995 std::vector<float> velZB;
996
997 std::vector<float> chargeA;
998 std::vector<float> chargeB;
999
1000 for (int i = 0; i < numParticles; ++i) {
1001 Particle p(
1002 i, i * i, sin(i),
1003 i % 11, i % 13, i % 19,
1004 10 + cos(2 * i));
1005
1006 posXA.push_back(p.posX);
1007 posXB.push_back(p.posX);
1008 posYA.push_back(p.posY);
1009 posYB.push_back(p.posY);
1010 posZA.push_back(p.posZ);
1011 posZB.push_back(p.posZ);
1012
1013 velXA.push_back(p.velX);
1014 velXB.push_back(p.velX);
1015 velYA.push_back(p.velY);
1016 velYB.push_back(p.velY);
1017 velZA.push_back(p.velZ);
1018 velZB.push_back(p.velZ);
1019
1020 chargeA.push_back(p.charge);
1021 chargeB.push_back(p.charge);
1022 }
1023
1024 double tStart = time();
1025
1026 for (int t = 0; t < repeats; ++t) {
1027 for (int i = 0; i < numParticles; ++i) {
1028 int j;
1029
1030 float posX = posXA[i];
1031 float posY = posYA[i];
1032 float posZ = posZA[i];
1033 __m256 posXV = _mm256_set1_ps(posX);
1034 __m256 posYV = _mm256_set1_ps(posY);
1035 __m256 posZV = _mm256_set1_ps(posZ);
1036
1037 float velX = velXA[i];
1038 float velY = velYA[i];
1039 float velZ = velZA[i];
1040
1041 float charge = chargeA[i];
1042 __m256 chargeV = _mm256_set1_ps(charge);
1043
1044 __m256 accelerationXV = _mm256_set1_ps(0);
1045 __m256 accelerationYV = _mm256_set1_ps(0);
1046 __m256 accelerationZV = _mm256_set1_ps(0);
1047
1048 for (j = 0; j < (numParticles - 7); j += 8) {
1049 __m256 deltaX = _mm256_sub_ps(posXV, _mm256_loadu_ps(&posXA[j]));
1050 __m256 deltaY = _mm256_sub_ps(posYV, _mm256_loadu_ps(&posYA[j]));
1051 __m256 deltaZ = _mm256_sub_ps(posZV, _mm256_loadu_ps(&posZA[j]));
1052 __m256 distance2 =
1053 _mm256_add_ps(_mm256_mul_ps(deltaX, deltaX),
1054 _mm256_mul_ps(deltaY, deltaY));
1055 distance2 =
1056 _mm256_add_ps(_mm256_mul_ps(deltaZ, deltaZ),
1057 distance2);
1058 distance2 =
1059 _mm256_add_ps(_mm256_set1_ps(SOFTENING),
1060 distance2);
1061
1062 __m256 factor = _mm256_mul_ps(chargeV, _mm256_loadu_ps(&chargeA[j]));
1063 factor = _mm256_mul_ps(factor, _mm256_set1_ps(DELTA_T));
1064 factor = _mm256_mul_ps(factor, _mm256_rcp_ps(distance2));
1065 factor = _mm256_mul_ps(factor, _mm256_rsqrt_ps(distance2));
1066
1067 __m256 forceX = _mm256_mul_ps(deltaX, factor);
1068 __m256 forceY = _mm256_mul_ps(deltaY, factor);
1069 __m256 forceZ = _mm256_mul_ps(deltaZ, factor);
1070
1071 accelerationXV = _mm256_add_ps(accelerationXV, forceX);
1072 accelerationYV = _mm256_add_ps(accelerationYV, forceY);
1073 accelerationZV = _mm256_add_ps(accelerationZV, forceZ);
1074 }
1075
1076 float buf[8];
1077 _mm256_store_ps(buf, accelerationXV);
1078 float accelerationX = buf[0] + buf[1] + buf[2] + buf[3] + buf[4] + buf[5] + buf[6] + buf[7];
1079 _mm256_store_ps(buf, accelerationYV);
1080 float accelerationY = buf[0] + buf[1] + buf[2] + buf[3] + buf[4] + buf[5] + buf[6] + buf[7];
1081 _mm256_store_ps(buf, accelerationZV);
1082 float accelerationZ = buf[0] + buf[1] + buf[2] + buf[3] + buf[4] + buf[5] + buf[6] + buf[7];
1083
1084 for (; j < numParticles; ++j) {
1085 float deltaX = posX - posXA[j];
1086 float deltaY = posY - posYA[j];
1087 float deltaZ = posZ - posZA[j];
1088 float distance2 = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ + SOFTENING;
1089 float factor = charge * chargeA[j] * DELTA_T / distance2 / sqrt(distance2);
1090 float forceX = deltaX * factor;
1091 float forceY = deltaY * factor;
1092 float forceZ = deltaZ * factor;
1093
1094 accelerationX += forceX;
1095 accelerationY += forceY;
1096 accelerationZ += forceZ;
1097 }
1098
1099 posXB[i] = posX + velX * DELTA_T;
1100 posYB[i] = posY + velY * DELTA_T;
1101 posZB[i] = posZ + velZ * DELTA_T;
1102
1103 velXB[i] = velX + accelerationX;
1104 velYB[i] = velY + accelerationY;
1105 velZB[i] = velZ + accelerationZ;
1106
1107 chargeB[i] = charge;
1108 }
1109
1110 using std::swap;
1111 swap(posXA, posXB);
1112 swap(posYA, posYB);
1113 swap(posZA, posZB);
1114
1115 swap(velXA, velXB);
1116 swap(velYA, velYB);
1117 swap(velZA, velZB);
1118
1119 swap(chargeA, chargeB);
1120 }
1121
1122 double tEnd = time();
1123
1124 if (posXA[0] == 0.12345) {
1125 std::cout << "this is a debug statement to prevent the compiler from optimizing away the update routine\n";
1126 }
1127
1128 return gflops(numParticles, repeats, tStart, tEnd);
1129 }
1130 };
1131
1132 class NBodyCurry : public NBody
1133 {
1134 public:
species()1135 std::string species()
1136 {
1137 return "curry";
1138 }
1139
performance(std::vector<int> dim)1140 double performance(std::vector<int> dim)
1141 {
1142 int numParticles = dim[0];
1143 int repeats = dim[1];
1144
1145 std::vector<float> posXA;
1146 std::vector<float> posYA;
1147 std::vector<float> posZA;
1148 std::vector<float> posXB;
1149 std::vector<float> posYB;
1150 std::vector<float> posZB;
1151
1152 std::vector<float> velXA;
1153 std::vector<float> velYA;
1154 std::vector<float> velZA;
1155 std::vector<float> velXB;
1156 std::vector<float> velYB;
1157 std::vector<float> velZB;
1158
1159 std::vector<float> chargeA;
1160 std::vector<float> chargeB;
1161
1162 for (int i = 0; i < numParticles; ++i) {
1163 Particle p(
1164 i, i * i, sin(i),
1165 i % 11, i % 13, i % 19,
1166 10 + cos(2 * i));
1167
1168 posXA.push_back(p.posX);
1169 posXB.push_back(p.posX);
1170 posYA.push_back(p.posY);
1171 posYB.push_back(p.posY);
1172 posZA.push_back(p.posZ);
1173 posZB.push_back(p.posZ);
1174
1175 velXA.push_back(p.velX);
1176 velXB.push_back(p.velX);
1177 velYA.push_back(p.velY);
1178 velYB.push_back(p.velY);
1179 velZA.push_back(p.velZ);
1180 velZB.push_back(p.velZ);
1181
1182 chargeA.push_back(p.charge);
1183 chargeB.push_back(p.charge);
1184 }
1185
1186 double tStart = time();
1187
1188 for (int t = 0; t < repeats; ++t) {
1189 for (int i = 0; i < (numParticles - 7); i += 8) {
1190 int j;
1191
1192 __m256 posXV = _mm256_loadu_ps(&posXA[i]);
1193 __m256 posYV = _mm256_loadu_ps(&posYA[i]);
1194 __m256 posZV = _mm256_loadu_ps(&posZA[i]);
1195
1196 __m256 velXV = _mm256_loadu_ps(&velXA[i]);
1197 __m256 velYV = _mm256_loadu_ps(&velYA[i]);
1198 __m256 velZV = _mm256_loadu_ps(&velZA[i]);
1199
1200 __m256 chargeV = _mm256_loadu_ps(&chargeA[i]);
1201
1202 __m256 accelerationXV = _mm256_set1_ps(0);
1203 __m256 accelerationYV = _mm256_set1_ps(0);
1204 __m256 accelerationZV = _mm256_set1_ps(0);
1205
1206 __m256 deltaT = _mm256_set1_ps(DELTA_T);
1207
1208 for (j = 0; j < numParticles; ++j) {
1209 __m256 deltaX = _mm256_sub_ps(posXV, _mm256_broadcast_ss(&posXA[j]));
1210 __m256 deltaY = _mm256_sub_ps(posYV, _mm256_broadcast_ss(&posYA[j]));
1211 __m256 deltaZ = _mm256_sub_ps(posZV, _mm256_broadcast_ss(&posZA[j]));
1212 __m256 distance2 =
1213 _mm256_add_ps(_mm256_mul_ps(deltaX, deltaX),
1214 _mm256_mul_ps(deltaY, deltaY));
1215 distance2 =
1216 _mm256_add_ps(_mm256_mul_ps(deltaZ, deltaZ),
1217 distance2);
1218 distance2 =
1219 _mm256_add_ps(_mm256_set1_ps(SOFTENING),
1220 distance2);
1221
1222 __m256 factor = _mm256_mul_ps(chargeV, _mm256_broadcast_ss(&chargeA[j]));
1223 factor = _mm256_mul_ps(factor, deltaT);
1224 factor = _mm256_mul_ps(factor, _mm256_rcp_ps(distance2));
1225 factor = _mm256_mul_ps(factor, _mm256_rsqrt_ps(distance2));
1226
1227 __m256 forceX = _mm256_mul_ps(deltaX, factor);
1228 __m256 forceY = _mm256_mul_ps(deltaY, factor);
1229 __m256 forceZ = _mm256_mul_ps(deltaZ, factor);
1230
1231 accelerationXV = _mm256_add_ps(accelerationXV, forceX);
1232 accelerationYV = _mm256_add_ps(accelerationYV, forceY);
1233 accelerationZV = _mm256_add_ps(accelerationZV, forceZ);
1234 }
1235
1236 posXV = _mm256_add_ps(posXV, _mm256_mul_ps(velXV, deltaT));
1237 posYV = _mm256_add_ps(posYV, _mm256_mul_ps(velYV, deltaT));
1238 posZV = _mm256_add_ps(posZV, _mm256_mul_ps(velZV, deltaT));
1239
1240 _mm256_storeu_ps(&posXB[i], posXV);
1241 _mm256_storeu_ps(&posYB[i], posYV);
1242 _mm256_storeu_ps(&posZB[i], posZV);
1243
1244 velXV = _mm256_add_ps(velXV, accelerationXV);
1245 velYV = _mm256_add_ps(velYV, accelerationYV);
1246 velZV = _mm256_add_ps(velZV, accelerationZV);
1247
1248 _mm256_storeu_ps(&velXB[i], velXV);
1249 _mm256_storeu_ps(&velYB[i], velYV);
1250 _mm256_storeu_ps(&velZB[i], velZV);
1251
1252 _mm256_storeu_ps(&chargeB[i], chargeV);
1253 }
1254
1255 using std::swap;
1256 swap(posXA, posXB);
1257 swap(posYA, posYB);
1258 swap(posZA, posZB);
1259
1260 swap(velXA, velXB);
1261 swap(velYA, velYB);
1262 swap(velZA, velZB);
1263
1264 swap(chargeA, chargeB);
1265 }
1266
1267 double tEnd = time();
1268
1269 if (posXA[0] == 0.12345) {
1270 std::cout << "this is a debug statement to prevent the compiler from optimizing away the update routine\n";
1271 }
1272
1273 return gflops(numParticles, repeats, tStart, tEnd);
1274 }
1275 };
1276
1277 #endif
1278
1279 class NBodyIron : public NBody
1280 {
1281 public:
species()1282 std::string species()
1283 {
1284 return "iron";
1285 }
1286
performance(std::vector<int> dim)1287 double performance(std::vector<int> dim)
1288 {
1289 using namespace LibFlatArray;
1290
1291 int numParticles = dim[0];
1292 int repeats = dim[1];
1293
1294 soa_array<Particle, 8192> particlesA;
1295 soa_array<Particle, 8192> particlesB;
1296
1297 for (int i = 0; i < numParticles; ++i) {
1298 Particle p(
1299 i, i * i, sin(i),
1300 i % 11, i % 13, i % 19,
1301 10 + cos(2 * i));
1302
1303 particlesA.push_back(p);
1304 particlesB.push_back(p);
1305 }
1306
1307 double tStart = time();
1308
1309 for (int t = 0; t < repeats; ++t) {
1310 soa_accessor<Particle, 8192, 1, 1, 0> accessorA = particlesA[0];
1311 soa_accessor<Particle, 8192, 1, 1, 0> accessorB = particlesB[0];
1312
1313 for (; accessorA.index() < numParticles; ++accessorA, ++accessorB ) {
1314 float posX = accessorA.posX();
1315 float posY = accessorA.posY();
1316 float posZ = accessorA.posZ();
1317
1318 float velX = accessorA.velX();
1319 float velY = accessorA.velY();
1320 float velZ = accessorA.velZ();
1321
1322 float charge = accessorA.charge();
1323
1324 float accelerationX = 0;
1325 float accelerationY = 0;
1326 float accelerationZ = 0;
1327
1328 soa_accessor<Particle, 8192, 1, 1, 0> accessorA2 = particlesA[0];
1329
1330 for (accessorA2.index() = 0; accessorA2.index() < numParticles; ++accessorA2) {
1331 float deltaX = posX - accessorA2.posX();
1332 float deltaY = posY - accessorA2.posY();
1333 float deltaZ = posZ - accessorA2.posZ();
1334 float distance2 = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ + SOFTENING;
1335 float factor = charge * accessorA2.charge() * DELTA_T / distance2 / sqrt(distance2);
1336 float forceX = deltaX * factor;
1337 float forceY = deltaY * factor;
1338 float forceZ = deltaZ * factor;
1339
1340 accelerationX += forceX;
1341 accelerationY += forceY;
1342 accelerationZ += forceZ;
1343 }
1344
1345 accessorB.posX() = posX + velX * DELTA_T;
1346 accessorB.posY() = posY + velY * DELTA_T;
1347 accessorB.posZ() = posZ + velZ * DELTA_T;
1348
1349 accessorB.velX() = velX + accelerationX;
1350 accessorB.velY() = velY + accelerationY;
1351 accessorB.velZ() = velZ + accelerationZ;
1352
1353 accessorB.charge() = charge;
1354 }
1355
1356 using std::swap;
1357 swap(particlesA, particlesB);
1358 }
1359
1360 double tEnd = time();
1361
1362 if (particlesA[0].posX() == 0.12345) {
1363 std::cout << "this is a debug statement to prevent the compiler from optimizing away the update routine\n";
1364 }
1365
1366 return gflops(numParticles, repeats, tStart, tEnd);
1367 }
1368 };
1369
1370 #ifdef __AVX__
1371
1372 class NBodyBronze : public NBody
1373 {
1374 public:
species()1375 std::string species()
1376 {
1377 return "bronze";
1378 }
1379
performance(std::vector<int> dim)1380 double performance(std::vector<int> dim)
1381 {
1382 if (dim[0] <= 128) {
1383 return performance<128>(dim);
1384 }
1385 if (dim[0] <= 256) {
1386 return performance<256>(dim);
1387 }
1388 if (dim[0] <= 512) {
1389 return performance<512>(dim);
1390 }
1391 if (dim[0] <= 1024) {
1392 return performance<1024>(dim);
1393 }
1394 if (dim[0] <= 2048) {
1395 return performance<2048>(dim);
1396 }
1397 if (dim[0] <= 4096) {
1398 return performance<4096>(dim);
1399 }
1400 if (dim[0] <= 8192) {
1401 return performance<8192>(dim);
1402 }
1403
1404 throw std::out_of_range("could not run test NBodyBronze as grid dimension X was too large");
1405 }
1406
1407 template<int DIM>
performance(std::vector<int> dim)1408 double performance(std::vector<int> dim)
1409 {
1410 using namespace LibFlatArray;
1411
1412 int numParticles = dim[0];
1413 int repeats = dim[1];
1414
1415 soa_array<Particle, DIM> particlesA;
1416 soa_array<Particle, DIM> particlesB;
1417
1418 for (int i = 0; i < numParticles; ++i) {
1419 Particle p(
1420 i, i * i, sin(i),
1421 i % 11, i % 13, i % 19,
1422 10 + cos(2 * i));
1423
1424 particlesA.push_back(p);
1425 particlesB.push_back(p);
1426 }
1427
1428 double tStart = time();
1429
1430 for (int t = 0; t < repeats; ++t) {
1431 soa_accessor<Particle, DIM, 1, 1, 0> accessorA = particlesA[0];
1432 soa_accessor<Particle, DIM, 1, 1, 0> accessorB = particlesB[0];
1433 soa_accessor<Particle, DIM, 1, 1, 0> accessorA2 = particlesA[0];
1434
1435 for (; accessorA.index() < (numParticles - 7); accessorA += 8, accessorB += 8) {
1436 __m256 posX = _mm256_loadu_ps(&accessorA.posX());
1437 __m256 posY = _mm256_loadu_ps(&accessorA.posY());
1438 __m256 posZ = _mm256_loadu_ps(&accessorA.posZ());
1439
1440 __m256 velX = _mm256_loadu_ps(&accessorA.velX());
1441 __m256 velY = _mm256_loadu_ps(&accessorA.velY());
1442 __m256 velZ = _mm256_loadu_ps(&accessorA.velZ());
1443
1444 __m256 charge = _mm256_loadu_ps(&accessorA.charge());
1445
1446 __m256 accelerationX = _mm256_set1_ps(0.0);
1447 __m256 accelerationY = _mm256_set1_ps(0.0);
1448 __m256 accelerationZ = _mm256_set1_ps(0.0);
1449
1450 __m256 deltaT = _mm256_set1_ps(DELTA_T);
1451
1452 for (accessorA2.index() = 0; accessorA2.index() < numParticles; ++accessorA2) {
1453 __m256 deltaX = _mm256_sub_ps(posX, _mm256_broadcast_ss(&accessorA2.posX()));
1454 __m256 deltaY = _mm256_sub_ps(posY, _mm256_broadcast_ss(&accessorA2.posY()));
1455 __m256 deltaZ = _mm256_sub_ps(posZ, _mm256_broadcast_ss(&accessorA2.posZ()));
1456 __m256 distance2 =
1457 _mm256_add_ps(_mm256_mul_ps(deltaX, deltaX),
1458 _mm256_mul_ps(deltaY, deltaY));
1459 distance2 =
1460 _mm256_add_ps(_mm256_mul_ps(deltaZ, deltaZ),
1461 distance2);
1462 distance2 =
1463 _mm256_add_ps(_mm256_set1_ps(SOFTENING),
1464 distance2);
1465
1466 __m256 factor = _mm256_mul_ps(charge, _mm256_broadcast_ss(&accessorA2.charge()));
1467 factor = _mm256_mul_ps(factor, _mm256_set1_ps(DELTA_T));
1468 factor = _mm256_mul_ps(factor, _mm256_rcp_ps(distance2));
1469 factor = _mm256_mul_ps(factor, _mm256_rsqrt_ps(distance2));
1470
1471 __m256 forceX = _mm256_mul_ps(deltaX, factor);
1472 __m256 forceY = _mm256_mul_ps(deltaY, factor);
1473 __m256 forceZ = _mm256_mul_ps(deltaZ, factor);
1474
1475 accelerationX = _mm256_add_ps(accelerationX, forceX);
1476 accelerationY = _mm256_add_ps(accelerationY, forceY);
1477 accelerationZ = _mm256_add_ps(accelerationZ, forceZ);
1478 }
1479
1480 posX = _mm256_add_ps(posX, _mm256_mul_ps(velX, deltaT));
1481 posY = _mm256_add_ps(posY, _mm256_mul_ps(velY, deltaT));
1482 posZ = _mm256_add_ps(posZ, _mm256_mul_ps(velZ, deltaT));
1483
1484 _mm256_storeu_ps(&accessorB.posX(), posX);
1485 _mm256_storeu_ps(&accessorB.posY(), posY);
1486 _mm256_storeu_ps(&accessorB.posZ(), posZ);
1487
1488 velX = _mm256_add_ps(velX, accelerationX);
1489 velY = _mm256_add_ps(velY, accelerationY);
1490 velZ = _mm256_add_ps(velZ, accelerationZ);
1491
1492 _mm256_storeu_ps(&accessorB.velX(), velX);
1493 _mm256_storeu_ps(&accessorB.velY(), velY);
1494 _mm256_storeu_ps(&accessorB.velZ(), velZ);
1495
1496 _mm256_storeu_ps(&accessorB.charge(), charge);
1497 }
1498
1499 using std::swap;
1500 swap(particlesA, particlesB);
1501 }
1502
1503 double tEnd = time();
1504
1505 if (particlesA[0].posX() == 0.12345) {
1506 std::cout << "this is a debug statement to prevent the compiler from optimizing away the update routine\n";
1507 }
1508
1509 return gflops(numParticles, repeats, tStart, tEnd);
1510 }
1511 };
1512
1513 #endif
1514
1515 class NBodySilver : public NBody
1516 {
1517 public:
species()1518 std::string species()
1519 {
1520 return "silver";
1521 }
1522
performance(std::vector<int> dim)1523 double performance(std::vector<int> dim)
1524 {
1525 if (dim[0] <= 128) {
1526 return performance<128, short_vec<float, 8> >(dim);
1527 }
1528 if (dim[0] <= 256) {
1529 return performance<256, short_vec<float, 8> >(dim);
1530 }
1531 if (dim[0] <= 512) {
1532 return performance<512, short_vec<float, 8> >(dim);
1533 }
1534 if (dim[0] <= 1024) {
1535 return performance<1024, short_vec<float, 8> >(dim);
1536 }
1537 if (dim[0] <= 2048) {
1538 return performance<2048, short_vec<float, 8> >(dim);
1539 }
1540 if (dim[0] <= 4096) {
1541 return performance<4096, short_vec<float, 8> >(dim);
1542 }
1543 if (dim[0] <= 8192) {
1544 return performance<8192, short_vec<float, 8> >(dim);
1545 }
1546
1547 throw std::out_of_range("could not run test NBodySilver as grid dimension X was too large");
1548 }
1549
1550 template<int DIM, typename REAL>
performance(std::vector<int> dim)1551 double performance(std::vector<int> dim)
1552 {
1553 using namespace LibFlatArray;
1554
1555 int numParticles = dim[0];
1556 int repeats = dim[1];
1557
1558 soa_array<Particle, DIM> particlesA;
1559 soa_array<Particle, DIM> particlesB;
1560
1561 for (int i = 0; i < numParticles; ++i) {
1562 Particle p(
1563 i, i * i, sin(i),
1564 i % 11, i % 13, i % 19,
1565 10 + cos(2 * i));
1566
1567 particlesA.push_back(p);
1568 particlesB.push_back(p);
1569 }
1570
1571 double tStart = time();
1572
1573 for (int t = 0; t < repeats; ++t) {
1574 soa_accessor<Particle, DIM, 1, 1, 0> accessorA = particlesA[0];
1575 soa_accessor<Particle, DIM, 1, 1, 0> accessorB = particlesB[0];
1576 soa_accessor<Particle, DIM, 1, 1, 0> accessorA2 = particlesA[0];
1577
1578 for (; accessorA.index() < numParticles; accessorA += REAL::ARITY, accessorB += REAL::ARITY) {
1579 REAL posX = &accessorA.posX();
1580 REAL posY = &accessorA.posY();
1581 REAL posZ = &accessorA.posZ();
1582
1583 REAL velX = &accessorA.velX();
1584 REAL velY = &accessorA.velY();
1585 REAL velZ = &accessorA.velZ();
1586
1587 REAL charge = &accessorA.charge();
1588
1589 REAL accelerationX = 0.0;
1590 REAL accelerationY = 0.0;
1591 REAL accelerationZ = 0.0;
1592
1593 for (accessorA2.index() = 0; accessorA2.index() < numParticles; ++accessorA2) {
1594 REAL deltaX = posX - REAL(accessorA2.posX());
1595 REAL deltaY = posY - REAL(accessorA2.posY());
1596 REAL deltaZ = posZ - REAL(accessorA2.posZ());
1597 REAL distance2 = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ + SOFTENING;
1598
1599 REAL factor = charge * accessorA2.charge() * DELTA_T / distance2 / sqrt(distance2);
1600 REAL forceX = deltaX * factor;
1601 REAL forceY = deltaY * factor;
1602 REAL forceZ = deltaZ * factor;
1603
1604 accelerationX += forceX;
1605 accelerationY += forceY;
1606 accelerationZ += forceZ;
1607 }
1608
1609 &accessorB.posX() << (posX + velX * DELTA_T);
1610 &accessorB.posY() << (posY + velY * DELTA_T);
1611 &accessorB.posZ() << (posZ + velZ * DELTA_T);
1612
1613 &accessorB.velX() << (velX + accelerationX);
1614 &accessorB.velY() << (velY + accelerationY);
1615 &accessorB.velZ() << (velZ + accelerationZ);
1616
1617 &accessorB.charge() << charge;
1618 }
1619
1620 using std::swap;
1621 swap(particlesA, particlesB);
1622 }
1623
1624 double tEnd = time();
1625
1626 if (particlesA[0].posX() == 0.12345) {
1627 std::cout << "this is a debug statement to prevent the compiler from optimizing away the update routine\n";
1628 }
1629
1630 return gflops(numParticles, repeats, tStart, tEnd);
1631 }
1632 };
1633
1634 class NBodyGold : public NBody
1635 {
1636 public:
species()1637 std::string species()
1638 {
1639 return "gold";
1640 }
1641
performance(std::vector<int> dim)1642 double performance(std::vector<int> dim)
1643 {
1644 if (dim[0] <= 128) {
1645 return performance<128, short_vec<float, 8> >(dim);
1646 }
1647 if (dim[0] <= 256) {
1648 return performance<256, short_vec<float, 8> >(dim);
1649 }
1650 if (dim[0] <= 512) {
1651 return performance<512, short_vec<float, 8> >(dim);
1652 }
1653 if (dim[0] <= 1024) {
1654 return performance<1024, short_vec<float, 8> >(dim);
1655 }
1656 if (dim[0] <= 2048) {
1657 return performance<2048, short_vec<float, 8> >(dim);
1658 }
1659 if (dim[0] <= 4096) {
1660 return performance<4096, short_vec<float, 8> >(dim);
1661 }
1662 if (dim[0] <= 8192) {
1663 return performance<8192, short_vec<float, 8> >(dim);
1664 }
1665
1666 throw std::out_of_range("could not run test NBodySilver as grid dimension X was too large");
1667 }
1668
1669 template<int DIM, typename REAL>
performance(std::vector<int> dim)1670 double performance(std::vector<int> dim)
1671 {
1672 using namespace LibFlatArray;
1673
1674 int numParticles = dim[0];
1675 int repeats = dim[1];
1676
1677 soa_array<ArrayParticle, DIM> particlesA;
1678 soa_array<ArrayParticle, DIM> particlesB;
1679
1680 for (int i = 0; i < numParticles; ++i) {
1681 ArrayParticle p(
1682 i, i * i, sin(i),
1683 i % 11, i % 13, i % 19,
1684 10 + cos(2 * i));
1685
1686 particlesA.push_back(p);
1687 particlesB.push_back(p);
1688 }
1689
1690 double tStart = time();
1691
1692 for (int t = 0; t < repeats; ++t) {
1693 soa_accessor<ArrayParticle, DIM, 1, 1, 0> accessorA = particlesA[0];
1694 soa_accessor<ArrayParticle, DIM, 1, 1, 0> accessorB = particlesB[0];
1695 soa_accessor<ArrayParticle, DIM, 1, 1, 0> accessorA2 = particlesA[0];
1696
1697 for (; accessorA.index() < numParticles; accessorA += REAL::ARITY, accessorB += REAL::ARITY) {
1698 REAL posX = &accessorA.pos()[0];
1699 REAL posY = &accessorA.pos()[1];
1700 REAL posZ = &accessorA.pos()[2];
1701
1702 REAL velX = &accessorA.vel()[0];
1703 REAL velY = &accessorA.vel()[1];
1704 REAL velZ = &accessorA.vel()[2];
1705
1706 REAL charge = &accessorA.charge();
1707
1708 REAL accelerationX = 0.0;
1709 REAL accelerationY = 0.0;
1710 REAL accelerationZ = 0.0;
1711
1712 for (accessorA2.index() = 0; accessorA2.index() < numParticles; ++accessorA2) {
1713 REAL deltaX = posX - REAL(accessorA2.pos()[0]);
1714 REAL deltaY = posY - REAL(accessorA2.pos()[1]);
1715 REAL deltaZ = posZ - REAL(accessorA2.pos()[2]);
1716 REAL distance2 = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ + SOFTENING;
1717
1718 REAL factor = charge * accessorA2.charge() * DELTA_T / distance2 / sqrt(distance2);
1719 REAL forceX = deltaX * factor;
1720 REAL forceY = deltaY * factor;
1721 REAL forceZ = deltaZ * factor;
1722
1723 accelerationX += forceX;
1724 accelerationY += forceY;
1725 accelerationZ += forceZ;
1726 }
1727
1728 &accessorB.pos()[0] << (posX + velX * DELTA_T);
1729 &accessorB.pos()[1] << (posY + velY * DELTA_T);
1730 &accessorB.pos()[2] << (posZ + velZ * DELTA_T);
1731
1732 &accessorB.vel()[0] << (velX + accelerationX);
1733 &accessorB.vel()[1] << (velY + accelerationY);
1734 &accessorB.vel()[2] << (velZ + accelerationZ);
1735
1736 &accessorB.charge() << charge;
1737 }
1738
1739 using std::swap;
1740 swap(particlesA, particlesB);
1741 }
1742
1743 double tEnd = time();
1744
1745 if (particlesA[0].pos()[0] == 0.12345) {
1746 std::cout << "this is a debug statement to prevent the compiler from optimizing away the update routine\n";
1747 }
1748
1749 return gflops(numParticles, repeats, tStart, tEnd);
1750 }
1751 };
1752
main(int argc,char ** argv)1753 int main(int argc, char **argv)
1754 {
1755 if ((argc < 3) || (argc == 4) || (argc > 5)) {
1756 std::cerr << "usage: " << argv[0] << " [-n,--name SUBSTRING] REVISION CUDA_DEVICE \n"
1757 << " - optional: only run tests whose name contains a SUBSTRING,\n"
1758 << " - REVISION is purely for output reasons,\n"
1759 << " - CUDA_DEVICE causes CUDA tests to run on the device with the given ID.\n";
1760 return 1;
1761 }
1762 std::string name = "";
1763 int argumentIndex = 1;
1764 if (argc == 5) {
1765 if ((std::string(argv[1]) == "-n") ||
1766 (std::string(argv[1]) == "--name")) {
1767 name = std::string(argv[2]);
1768 }
1769 argumentIndex = 3;
1770 }
1771 std::string revision = argv[argumentIndex + 0];
1772
1773 std::stringstream s;
1774 s << argv[argumentIndex + 1];
1775 int cudaDevice;
1776 s >> cudaDevice;
1777
1778 evaluate eval(name, revision);
1779 eval.print_header();
1780
1781 std::vector<std::vector<int> > sizes;
1782 for (int d = 32; d <= 544; d += 4) {
1783 std::vector<int> dim(3, d);
1784
1785 sizes.push_back(dim);
1786 }
1787
1788 for (std::vector<std::vector<int> >::iterator i = sizes.begin(); i != sizes.end(); ++i) {
1789 eval(JacobiD3Q7Vanilla(), *i);
1790 }
1791
1792 #ifdef __SSE__
1793 for (std::vector<std::vector<int> >::iterator i = sizes.begin(); i != sizes.end(); ++i) {
1794 eval(JacobiD3Q7Pepper(), *i);
1795 }
1796 #endif
1797
1798 for (std::vector<std::vector<int> >::iterator i = sizes.begin(); i != sizes.end(); ++i) {
1799 eval(JacobiD3Q7Bronze(), *i);
1800 }
1801
1802 #ifdef __SSE__
1803 for (std::vector<std::vector<int> >::iterator i = sizes.begin(); i != sizes.end(); ++i) {
1804 eval(JacobiD3Q7Silver(), *i);
1805 }
1806 #endif
1807
1808 for (std::vector<std::vector<int> >::iterator i = sizes.begin(); i != sizes.end(); ++i) {
1809 eval(JacobiD3Q7Gold(), *i);
1810 }
1811
1812 sizes.clear();
1813
1814 for (int n = 128; n <= 8192; n *= 2) {
1815 std::vector<int> dim(3);
1816 dim[0] = n;
1817 dim[1] = std::size_t(4) * 512 * 1024 * 1024 / n / n;
1818 dim[2] = 0;
1819
1820 sizes.push_back(dim);
1821 }
1822
1823 for (std::vector<std::vector<int> >::iterator i = sizes.begin(); i != sizes.end(); ++i) {
1824 eval(NBodyVanilla(), *i);
1825 }
1826
1827 #ifdef __AVX__
1828 for (std::vector<std::vector<int> >::iterator i = sizes.begin(); i != sizes.end(); ++i) {
1829 eval(NBodyPepper(), *i);
1830 }
1831
1832 for (std::vector<std::vector<int> >::iterator i = sizes.begin(); i != sizes.end(); ++i) {
1833 eval(NBodyCurry(), *i);
1834 }
1835 #endif
1836
1837 for (std::vector<std::vector<int> >::iterator i = sizes.begin(); i != sizes.end(); ++i) {
1838 eval(NBodyIron(), *i);
1839 }
1840
1841 #ifdef __AVX__
1842 for (std::vector<std::vector<int> >::iterator i = sizes.begin(); i != sizes.end(); ++i) {
1843 eval(NBodyBronze(), *i);
1844 }
1845 #endif
1846
1847 for (std::vector<std::vector<int> >::iterator i = sizes.begin(); i != sizes.end(); ++i) {
1848 eval(NBodySilver(), *i);
1849 }
1850
1851 for (std::vector<std::vector<int> >::iterator i = sizes.begin(); i != sizes.end(); ++i) {
1852 eval(NBodyGold(), *i);
1853 }
1854
1855 return 0;
1856 }
1857