1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 //                        Kokkos v. 3.0
6 //       Copyright (2020) National Technology & Engineering
7 //               Solutions of Sandia, LLC (NTESS).
8 //
9 // Under the terms of Contract DE-NA0003525 with NTESS,
10 // the U.S. Government retains certain rights in this software.
11 //
12 // Redistribution and use in source and binary forms, with or without
13 // modification, are permitted provided that the following conditions are
14 // met:
15 //
16 // 1. Redistributions of source code must retain the above copyright
17 // notice, this list of conditions and the following disclaimer.
18 //
19 // 2. Redistributions in binary form must reproduce the above copyright
20 // notice, this list of conditions and the following disclaimer in the
21 // documentation and/or other materials provided with the distribution.
22 //
23 // 3. Neither the name of the Corporation nor the names of the
24 // contributors may be used to endorse or promote products derived from
25 // this software without specific prior written permission.
26 //
27 // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
28 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
31 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 //
39 // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
40 //
41 // ************************************************************************
42 //@HEADER
43 */
44 
45 #include <cstdio>
46 #include <cstring>
47 #include <cstdlib>
48 
49 #include <Kokkos_Core.hpp>
50 #include <impl/Kokkos_Timer.hpp>
51 
// Execution space used by every parallel benchmark kernel in this file:
// whatever Kokkos was configured to use as its default.
using exec_space = Kokkos::DefaultExecutionSpace;
53 
// Values for the "attr" argument of textcolor(), which emits them unchanged
// as the first numeric parameter of the terminal escape sequence.
// NOTE(review): in the standard SGR table 3 is italic, 4 is underline and
// 5 is blink, so UNDERLINE and BLINK look off by one -- confirm before use.
#define RESET 0
#define BRIGHT 1
#define DIM 2
#define UNDERLINE 3
#define BLINK 4
#define REVERSE 7
#define HIDDEN 8

// Color indices for the "fg" / "bg" arguments of textcolor(), which adds 30
// (foreground) or 40 (background) to them before printing.
// NOTE(review): index 8 produces 38 / 48, which are not plain color codes in
// the standard SGR table (white is 37 / 47, i.e. GREY here) -- confirm intent.
#define BLACK 0
#define RED 1
#define GREEN 2
#define YELLOW 3
#define BLUE 4
#define MAGENTA 5
#define CYAN 6
#define GREY 7
#define WHITE 8
71 
// Writes a terminal control sequence of the form ESC[<attr>;<fg+30>;<bg+40>m
// to stdout.
//
//   attr  numeric attribute, printed unchanged
//   fg    foreground color index, printed with 30 added
//   bg    background color index, printed with 40 added
void textcolor(int attr, int fg, int bg) {
  const int foreground_code = fg + 30;
  const int background_code = bg + 40;

  // 0x1B is the ESC character that introduces the control sequence.
  printf("%c[%d;%d;%dm", 0x1B, attr, foreground_code, background_code);
}
textcolor_standard()79 void textcolor_standard() { textcolor(RESET, BLACK, WHITE); }
80 
81 template <class T, class DEVICE_TYPE>
82 struct ZeroFunctor {
83   using execution_space = DEVICE_TYPE;
84   using type            = typename Kokkos::View<T, execution_space>;
85   using h_type          = typename Kokkos::View<T, execution_space>::HostMirror;
86   type data;
87   KOKKOS_INLINE_FUNCTION
operator ()ZeroFunctor88   void operator()(int) const { data() = 0; }
89 };
90 
91 //---------------------------------------------------
92 //--------------atomic_fetch_add---------------------
93 //---------------------------------------------------
94 
95 template <class T, class DEVICE_TYPE>
96 struct AddFunctor {
97   using execution_space = DEVICE_TYPE;
98   using type            = Kokkos::View<T, execution_space>;
99   type data;
100 
101   KOKKOS_INLINE_FUNCTION
operator ()AddFunctor102   void operator()(int) const { Kokkos::atomic_fetch_add(&data(), (T)1); }
103 };
104 
105 template <class T>
AddLoop(int loop)106 T AddLoop(int loop) {
107   struct ZeroFunctor<T, exec_space> f_zero;
108   typename ZeroFunctor<T, exec_space>::type data("Data");
109   typename ZeroFunctor<T, exec_space>::h_type h_data("HData");
110   f_zero.data = data;
111   Kokkos::parallel_for(1, f_zero);
112   exec_space().fence();
113 
114   struct AddFunctor<T, exec_space> f_add;
115   f_add.data = data;
116   Kokkos::parallel_for(loop, f_add);
117   exec_space().fence();
118 
119   Kokkos::deep_copy(h_data, data);
120   T val = h_data();
121   return val;
122 }
123 
124 template <class T, class DEVICE_TYPE>
125 struct AddNonAtomicFunctor {
126   using execution_space = DEVICE_TYPE;
127   using type            = Kokkos::View<T, execution_space>;
128   type data;
129 
130   KOKKOS_INLINE_FUNCTION
operator ()AddNonAtomicFunctor131   void operator()(int) const { data() += (T)1; }
132 };
133 
134 template <class T>
AddLoopNonAtomic(int loop)135 T AddLoopNonAtomic(int loop) {
136   struct ZeroFunctor<T, exec_space> f_zero;
137   typename ZeroFunctor<T, exec_space>::type data("Data");
138   typename ZeroFunctor<T, exec_space>::h_type h_data("HData");
139 
140   f_zero.data = data;
141   Kokkos::parallel_for(1, f_zero);
142   exec_space().fence();
143 
144   struct AddNonAtomicFunctor<T, exec_space> f_add;
145   f_add.data = data;
146   Kokkos::parallel_for(loop, f_add);
147   exec_space().fence();
148 
149   Kokkos::deep_copy(h_data, data);
150   T val = h_data();
151 
152   return val;
153 }
154 
// Serial reference for the add benchmark: increments a heap-allocated counter
// once per iteration on the calling thread and returns its final value.
// A non-positive `loop` performs no increments and yields 0.
template <class T>
T AddLoopSerial(int loop) {
  T* const counter = new T[1];
  counter[0]       = 0;

  for (int remaining = loop; remaining > 0; --remaining) {
    counter[0] += static_cast<T>(1);
  }

  const T total = counter[0];
  delete[] counter;
  return total;
}
166 
167 template <class T, class DEVICE_TYPE>
168 struct CASFunctor {
169   using execution_space = DEVICE_TYPE;
170   using type            = Kokkos::View<T, execution_space>;
171   type data;
172 
173   KOKKOS_INLINE_FUNCTION
operator ()CASFunctor174   void operator()(int) const {
175     T old = data();
176     T newval, assumed;
177     do {
178       assumed = old;
179       newval  = assumed + (T)1;
180       old     = Kokkos::atomic_compare_exchange(&data(), assumed, newval);
181     } while (old != assumed);
182   }
183 };
184 
185 template <class T>
CASLoop(int loop)186 T CASLoop(int loop) {
187   struct ZeroFunctor<T, exec_space> f_zero;
188   typename ZeroFunctor<T, exec_space>::type data("Data");
189   typename ZeroFunctor<T, exec_space>::h_type h_data("HData");
190   f_zero.data = data;
191   Kokkos::parallel_for(1, f_zero);
192   exec_space().fence();
193 
194   struct CASFunctor<T, exec_space> f_cas;
195   f_cas.data = data;
196   Kokkos::parallel_for(loop, f_cas);
197   exec_space().fence();
198 
199   Kokkos::deep_copy(h_data, data);
200   T val = h_data();
201 
202   return val;
203 }
204 
205 template <class T, class DEVICE_TYPE>
206 struct CASNonAtomicFunctor {
207   using execution_space = DEVICE_TYPE;
208   using type            = Kokkos::View<T, execution_space>;
209   type data;
210 
211   KOKKOS_INLINE_FUNCTION
operator ()CASNonAtomicFunctor212   void operator()(int) const {
213     volatile T assumed;
214     volatile T newval;
215     bool fail = 1;
216     do {
217       assumed = data();
218       newval  = assumed + (T)1;
219       if (data() == assumed) {
220         data() = newval;
221         fail   = 0;
222       }
223     } while (fail);
224   }
225 };
226 
227 template <class T>
CASLoopNonAtomic(int loop)228 T CASLoopNonAtomic(int loop) {
229   struct ZeroFunctor<T, exec_space> f_zero;
230   typename ZeroFunctor<T, exec_space>::type data("Data");
231   typename ZeroFunctor<T, exec_space>::h_type h_data("HData");
232   f_zero.data = data;
233   Kokkos::parallel_for(1, f_zero);
234   exec_space().fence();
235 
236   struct CASNonAtomicFunctor<T, exec_space> f_cas;
237   f_cas.data = data;
238   Kokkos::parallel_for(loop, f_cas);
239   exec_space().fence();
240 
241   Kokkos::deep_copy(h_data, data);
242   T val = h_data();
243 
244   return val;
245 }
246 
// Serial reference for the compare-and-swap benchmark: mimics the
// read / compute / re-read / store sequence on a heap-allocated counter,
// once per iteration, and returns the final counter value.
// A non-positive `loop` performs no updates and yields 0.
template <class T>
T CASLoopSerial(int loop) {
  T* const cell = new T[1];
  cell[0]       = 0;

  for (int remaining = loop; remaining > 0; --remaining) {
    for (;;) {
      const T expected = cell[0];
      const T desired  = expected + static_cast<T>(1);
      const T observed = cell[0];
      cell[0]          = desired;
      // Stop retrying once both reads agreed.
      if (expected == observed) break;
    }
  }

  const T result = cell[0];
  delete[] cell;
  return result;
}
268 
269 template <class T, class DEVICE_TYPE>
270 struct ExchFunctor {
271   using execution_space = DEVICE_TYPE;
272   using type            = Kokkos::View<T, execution_space>;
273   type data, data2;
274 
275   KOKKOS_INLINE_FUNCTION
operator ()ExchFunctor276   void operator()(int i) const {
277     T old = Kokkos::atomic_exchange(&data(), (T)i);
278     Kokkos::atomic_fetch_add(&data2(), old);
279   }
280 };
281 
282 template <class T>
ExchLoop(int loop)283 T ExchLoop(int loop) {
284   struct ZeroFunctor<T, exec_space> f_zero;
285   typename ZeroFunctor<T, exec_space>::type data("Data");
286   typename ZeroFunctor<T, exec_space>::h_type h_data("HData");
287   f_zero.data = data;
288   Kokkos::parallel_for(1, f_zero);
289   exec_space().fence();
290 
291   typename ZeroFunctor<T, exec_space>::type data2("Data");
292   typename ZeroFunctor<T, exec_space>::h_type h_data2("HData");
293   f_zero.data = data2;
294   Kokkos::parallel_for(1, f_zero);
295   exec_space().fence();
296 
297   struct ExchFunctor<T, exec_space> f_exch;
298   f_exch.data  = data;
299   f_exch.data2 = data2;
300   Kokkos::parallel_for(loop, f_exch);
301   exec_space().fence();
302 
303   Kokkos::deep_copy(h_data, data);
304   Kokkos::deep_copy(h_data2, data2);
305   T val = h_data() + h_data2();
306 
307   return val;
308 }
309 
310 template <class T, class DEVICE_TYPE>
311 struct ExchNonAtomicFunctor {
312   using execution_space = DEVICE_TYPE;
313   using type            = Kokkos::View<T, execution_space>;
314   type data, data2;
315 
316   KOKKOS_INLINE_FUNCTION
operator ()ExchNonAtomicFunctor317   void operator()(int i) const {
318     T old  = data();
319     data() = (T)i;
320     data2() += old;
321   }
322 };
323 
324 template <class T>
ExchLoopNonAtomic(int loop)325 T ExchLoopNonAtomic(int loop) {
326   struct ZeroFunctor<T, exec_space> f_zero;
327   typename ZeroFunctor<T, exec_space>::type data("Data");
328   typename ZeroFunctor<T, exec_space>::h_type h_data("HData");
329   f_zero.data = data;
330   Kokkos::parallel_for(1, f_zero);
331   exec_space().fence();
332 
333   typename ZeroFunctor<T, exec_space>::type data2("Data");
334   typename ZeroFunctor<T, exec_space>::h_type h_data2("HData");
335   f_zero.data = data2;
336   Kokkos::parallel_for(1, f_zero);
337   exec_space().fence();
338 
339   struct ExchNonAtomicFunctor<T, exec_space> f_exch;
340   f_exch.data  = data;
341   f_exch.data2 = data2;
342   Kokkos::parallel_for(loop, f_exch);
343   exec_space().fence();
344 
345   Kokkos::deep_copy(h_data, data);
346   Kokkos::deep_copy(h_data2, data2);
347   T val = h_data() + h_data2();
348 
349   return val;
350 }
351 
// Serial reference for the exchange benchmark: for i in [0, loop) stores i
// into one heap-allocated cell and adds the value it replaced to a second
// cell, then returns the sum of both cells.
// A non-positive `loop` leaves both cells at 0 and yields 0.
template <class T>
T ExchLoopSerial(int loop) {
  T* const slot      = new T[1];
  T* const collected = new T[1];
  slot[0]            = 0;
  collected[0]       = 0;

  int i = 0;
  while (i < loop) {
    const T previous = slot[0];
    slot[0]          = static_cast<T>(i);
    collected[0] += previous;
    ++i;
  }

  const T result = collected[0] + slot[0];
  delete[] slot;
  delete[] collected;
  return result;
}
369 
370 template <class T>
LoopVariant(int loop,int test)371 T LoopVariant(int loop, int test) {
372   switch (test) {
373     case 1: return AddLoop<T>(loop);
374     case 2: return CASLoop<T>(loop);
375     case 3: return ExchLoop<T>(loop);
376   }
377   return 0;
378 }
379 
380 template <class T>
LoopVariantSerial(int loop,int test)381 T LoopVariantSerial(int loop, int test) {
382   switch (test) {
383     case 1: return AddLoopSerial<T>(loop);
384     case 2: return CASLoopSerial<T>(loop);
385     case 3: return ExchLoopSerial<T>(loop);
386   }
387   return 0;
388 }
389 
390 template <class T>
LoopVariantNonAtomic(int loop,int test)391 T LoopVariantNonAtomic(int loop, int test) {
392   switch (test) {
393     case 1: return AddLoopNonAtomic<T>(loop);
394     case 2: return CASLoopNonAtomic<T>(loop);
395     case 3: return ExchLoopNonAtomic<T>(loop);
396   }
397   return 0;
398 }
399 
400 template <class T>
Loop(int loop,int test,const char * type_name)401 void Loop(int loop, int test, const char* type_name) {
402   LoopVariant<T>(loop, test);
403 
404   Kokkos::Impl::Timer timer;
405   T res       = LoopVariant<T>(loop, test);
406   double time = timer.seconds();
407 
408   timer.reset();
409   T resNonAtomic       = LoopVariantNonAtomic<T>(loop, test);
410   double timeNonAtomic = timer.seconds();
411 
412   timer.reset();
413   T resSerial       = LoopVariantSerial<T>(loop, test);
414   double timeSerial = timer.seconds();
415 
416   time *= 1e6 / loop;
417   timeNonAtomic *= 1e6 / loop;
418   timeSerial *= 1e6 / loop;
419   // textcolor_standard();
420   bool passed = true;
421   if (resSerial != res) passed = false;
422   // if(!passed) textcolor(RESET,BLACK,YELLOW);
423   printf(
424       "%s Test %i %s  --- Loop: %i Value (S,A,NA): %e %e %e Time: %7.4e %7.4e "
425       "%7.4e Size of Type %i)",
426       type_name, test, passed ? "PASSED" : "FAILED", loop, 1.0 * resSerial,
427       1.0 * res, 1.0 * resNonAtomic, timeSerial, time, timeNonAtomic,
428       (int)sizeof(T));
429   // if(!passed) textcolor_standard();
430   printf("\n");
431 }
432 
433 template <class T>
Test(int loop,int test,const char * type_name)434 void Test(int loop, int test, const char* type_name) {
435   if (test == -1) {
436     Loop<T>(loop, 1, type_name);
437     Loop<T>(loop, 2, type_name);
438     Loop<T>(loop, 3, type_name);
439 
440   } else
441     Loop<T>(loop, test, type_name);
442 }
443 
main(int argc,char * argv[])444 int main(int argc, char* argv[]) {
445   int type = -1;
446   int loop = 100000;
447   int test = -1;
448 
449   for (int i = 0; i < argc; i++) {
450     if ((strcmp(argv[i], "--test") == 0)) {
451       test = std::stoi(argv[++i]);
452       continue;
453     }
454     if ((strcmp(argv[i], "--type") == 0)) {
455       type = std::stoi(argv[++i]);
456       continue;
457     }
458     if ((strcmp(argv[i], "-l") == 0) || (strcmp(argv[i], "--loop") == 0)) {
459       loop = std::stoi(argv[++i]);
460       continue;
461     }
462   }
463 
464   Kokkos::initialize(argc, argv);
465 
466   printf("Using %s\n", Kokkos::atomic_query_version());
467   bool all_tests = false;
468   if (type == -1) all_tests = true;
469   while (type < 100) {
470     if (type == 1) {
471       Test<int>(loop, test, "int                    ");
472     }
473     if (type == 2) {
474       Test<long int>(loop, test, "long int               ");
475     }
476     if (type == 3) {
477       Test<long long int>(loop, test, "long long int          ");
478     }
479     if (type == 4) {
480       Test<unsigned int>(loop, test, "unsigned int           ");
481     }
482     if (type == 5) {
483       Test<unsigned long int>(loop, test, "unsigned long int      ");
484     }
485     if (type == 6) {
486       Test<unsigned long long int>(loop, test, "unsigned long long int ");
487     }
488     if (type == 10) {
489       // Test<float>(loop,test,"float                  ");
490     }
491     if (type == 11) {
492       Test<double>(loop, test, "double                 ");
493     }
494     if (!all_tests)
495       type = 100;
496     else
497       type++;
498   }
499 
500   Kokkos::finalize();
501 }
502