1 //============================================================================
2 //  Copyright (c) Kitware, Inc.
3 //  All rights reserved.
4 //  See LICENSE.txt for details.
5 //  This software is distributed WITHOUT ANY WARRANTY; without even
6 //  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
7 //  PURPOSE.  See the above copyright notice for more information.
8 //
9 //  Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
10 //  Copyright 2018 UT-Battelle, LLC.
11 //  Copyright 2018 Los Alamos National Security.
12 //
13 //  Under the terms of Contract DE-NA0003525 with NTESS,
14 //  the U.S. Government retains certain rights in this software.
15 //
16 //  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
17 //  Laboratory (LANL), the U.S. Government retains certain rights in
18 //  this software.
19 //============================================================================
20 
21 #include "Benchmarker.h"
22 
23 #include <vtkm/cont/ArrayHandle.h>
24 #include <vtkm/cont/AtomicArray.h>
25 #include <vtkm/cont/RuntimeDeviceTracker.h>
26 #include <vtkm/cont/Timer.h>
27 
28 #include <vtkm/exec/FunctorBase.h>
29 
30 #include <iomanip>
31 #include <sstream>
32 #include <string>
33 
34 namespace vtkm
35 {
36 namespace benchmarking
37 {
38 
39 // This is 32x larger than the largest array size.
40 static constexpr vtkm::Id NumWrites = 33554432; // 2^25
41 
// Expands to one VTKM_MAKE_BENCHMARK invocation per benchmarked array size
// (1, 8, 32, 512, 2048, 32768 and 1048576). Each invocation receives `Name`
// with the size pasted on as a suffix, the benchmark class template `Class`,
// and the size itself as the trailing argument.
// The last invocation deliberately has no trailing semicolon; the call site
// supplies it.
// NOTE(review): VTKM_MAKE_BENCHMARK is not defined in this file (presumably
// it comes from Benchmarker.h) -- confirm how the trailing argument is used.
#define MAKE_ATOMIC_BENCHMARKS(Name, Class)                                                        \
  VTKM_MAKE_BENCHMARK(Name##1, Class, 1);                                                          \
  VTKM_MAKE_BENCHMARK(Name##8, Class, 8);                                                          \
  VTKM_MAKE_BENCHMARK(Name##32, Class, 32);                                                        \
  VTKM_MAKE_BENCHMARK(Name##512, Class, 512);                                                      \
  VTKM_MAKE_BENCHMARK(Name##2048, Class, 2048);                                                    \
  VTKM_MAKE_BENCHMARK(Name##32768, Class, 32768);                                                  \
  VTKM_MAKE_BENCHMARK(Name##1048576, Class, 1048576)
50 
// Expands to one VTKM_RUN_BENCHMARK invocation per benchmarked array size,
// using the same size suffixes as MAKE_ATOMIC_BENCHMARKS. Every invocation is
// given a vtkm::cont::AtomicArrayTypeListTag instance as its second argument.
// The last invocation has no trailing semicolon; the call site supplies it.
// NOTE(review): VTKM_RUN_BENCHMARK is not defined in this file -- presumably
// it runs the benchmark once per type in the list; confirm in Benchmarker.h.
#define RUN_ATOMIC_BENCHMARKS(Name)                                                                \
  VTKM_RUN_BENCHMARK(Name##1, vtkm::cont::AtomicArrayTypeListTag{});                               \
  VTKM_RUN_BENCHMARK(Name##8, vtkm::cont::AtomicArrayTypeListTag{});                               \
  VTKM_RUN_BENCHMARK(Name##32, vtkm::cont::AtomicArrayTypeListTag{});                              \
  VTKM_RUN_BENCHMARK(Name##512, vtkm::cont::AtomicArrayTypeListTag{});                             \
  VTKM_RUN_BENCHMARK(Name##2048, vtkm::cont::AtomicArrayTypeListTag{});                            \
  VTKM_RUN_BENCHMARK(Name##32768, vtkm::cont::AtomicArrayTypeListTag{});                           \
  VTKM_RUN_BENCHMARK(Name##1048576, vtkm::cont::AtomicArrayTypeListTag{})
59 
60 template <class Device>
61 class BenchmarkAtomicArray
62 {
63 public:
64   using Algo = vtkm::cont::DeviceAdapterAlgorithm<Device>;
65   using Timer = vtkm::cont::Timer<Device>;
66 
67   // Benchmarks AtomicArray::Add such that each work index writes to adjacent
68   // indices.
69   template <typename ValueType>
70   struct BenchAddSeq
71   {
72     vtkm::Id ArraySize;
73     vtkm::cont::ArrayHandle<ValueType> Data;
74 
75     template <typename PortalType>
76     struct Worker : public vtkm::exec::FunctorBase
77     {
78       vtkm::Id ArraySize;
79       PortalType Portal;
80 
81       VTKM_CONT
Workervtkm::benchmarking::BenchmarkAtomicArray::BenchAddSeq::Worker82       Worker(vtkm::Id arraySize, PortalType portal)
83         : ArraySize(arraySize)
84         , Portal(portal)
85       {
86       }
87 
88       VTKM_EXEC
operator ()vtkm::benchmarking::BenchmarkAtomicArray::BenchAddSeq::Worker89       void operator()(vtkm::Id i) const { this->Portal.Add(i % this->ArraySize, 1); }
90     };
91 
BenchAddSeqvtkm::benchmarking::BenchmarkAtomicArray::BenchAddSeq92     BenchAddSeq(vtkm::Id arraySize)
93       : ArraySize(arraySize)
94     {
95       this->Data.PrepareForOutput(this->ArraySize, Device{});
96     }
97 
98     VTKM_CONT
operator ()vtkm::benchmarking::BenchmarkAtomicArray::BenchAddSeq99     vtkm::Float64 operator()()
100     {
101       vtkm::cont::AtomicArray<ValueType> array(this->Data);
102       auto portal = array.PrepareForExecution(Device{});
103       Worker<decltype(portal)> worker{ this->ArraySize, portal };
104 
105       Timer timer;
106       Algo::Schedule(worker, NumWrites);
107       return timer.GetElapsedTime();
108     }
109 
110     VTKM_CONT
Descriptionvtkm::benchmarking::BenchmarkAtomicArray::BenchAddSeq111     std::string Description() const
112     {
113       std::ostringstream desc;
114       desc << "Add (Seq, Atomic, " << std::setw(7) << std::setfill('0') << this->ArraySize << ")";
115       return desc.str();
116     }
117   };
118   MAKE_ATOMIC_BENCHMARKS(AddSeq, BenchAddSeq);
119 
120   // Provides a non-atomic baseline for BenchAddSeq
121   template <typename ValueType>
122   struct BenchAddSeqBaseline
123   {
124     vtkm::Id ArraySize;
125     vtkm::cont::ArrayHandle<ValueType> Data;
126 
127     template <typename PortalType>
128     struct Worker : public vtkm::exec::FunctorBase
129     {
130       vtkm::Id ArraySize;
131       PortalType Portal;
132 
133       VTKM_CONT
Workervtkm::benchmarking::BenchmarkAtomicArray::BenchAddSeqBaseline::Worker134       Worker(vtkm::Id arraySize, PortalType portal)
135         : ArraySize(arraySize)
136         , Portal(portal)
137       {
138       }
139 
140       VTKM_EXEC
operator ()vtkm::benchmarking::BenchmarkAtomicArray::BenchAddSeqBaseline::Worker141       void operator()(vtkm::Id i) const
142       {
143         vtkm::Id idx = i % this->ArraySize;
144         this->Portal.Set(idx, this->Portal.Get(idx) + 1);
145       }
146     };
147 
BenchAddSeqBaselinevtkm::benchmarking::BenchmarkAtomicArray::BenchAddSeqBaseline148     BenchAddSeqBaseline(vtkm::Id arraySize)
149       : ArraySize(arraySize)
150     {
151     }
152 
153     VTKM_CONT
operator ()vtkm::benchmarking::BenchmarkAtomicArray::BenchAddSeqBaseline154     vtkm::Float64 operator()()
155     {
156       auto portal = this->Data.PrepareForOutput(this->ArraySize, Device{});
157       Worker<decltype(portal)> worker{ this->ArraySize, portal };
158 
159       Timer timer;
160       Algo::Schedule(worker, NumWrites);
161       return timer.GetElapsedTime();
162     }
163 
164     VTKM_CONT
Descriptionvtkm::benchmarking::BenchmarkAtomicArray::BenchAddSeqBaseline165     std::string Description() const
166     {
167       std::ostringstream desc;
168       desc << "Add (Seq, Baseline, " << std::setw(7) << std::setfill('0') << this->ArraySize << ")";
169       return desc.str();
170     }
171   };
172   MAKE_ATOMIC_BENCHMARKS(AddSeqBase, BenchAddSeqBaseline);
173 
174   // Benchmarks AtomicArray::Add such that each work index writes to a strided
175   // index ( floor(i / stride) + stride * (i % stride)
176   template <typename ValueType>
177   struct BenchAddStride
178   {
179     vtkm::Id ArraySize;
180     vtkm::Id Stride;
181     vtkm::cont::ArrayHandle<ValueType> Data;
182 
183     template <typename PortalType>
184     struct Worker : public vtkm::exec::FunctorBase
185     {
186       vtkm::Id ArraySize;
187       vtkm::Id Stride;
188       PortalType Portal;
189 
190       VTKM_CONT
Workervtkm::benchmarking::BenchmarkAtomicArray::BenchAddStride::Worker191       Worker(vtkm::Id arraySize, vtkm::Id stride, PortalType portal)
192         : ArraySize(arraySize)
193         , Stride(stride)
194         , Portal(portal)
195       {
196       }
197 
198       VTKM_EXEC
operator ()vtkm::benchmarking::BenchmarkAtomicArray::BenchAddStride::Worker199       void operator()(vtkm::Id i) const
200       {
201         vtkm::Id idx = (i / this->Stride + this->Stride * (i % this->Stride)) % this->ArraySize;
202         this->Portal.Add(idx % this->ArraySize, 1);
203       }
204     };
205 
BenchAddStridevtkm::benchmarking::BenchmarkAtomicArray::BenchAddStride206     BenchAddStride(vtkm::Id arraySize, vtkm::Id stride = 32)
207       : ArraySize(arraySize)
208       , Stride(stride)
209     {
210       this->Data.PrepareForOutput(this->ArraySize, Device{});
211     }
212 
213     VTKM_CONT
operator ()vtkm::benchmarking::BenchmarkAtomicArray::BenchAddStride214     vtkm::Float64 operator()()
215     {
216       vtkm::cont::AtomicArray<ValueType> array(this->Data);
217       auto portal = array.PrepareForExecution(Device{});
218       Worker<decltype(portal)> worker{ this->ArraySize, this->Stride, portal };
219 
220       Timer timer;
221       Algo::Schedule(worker, NumWrites);
222       return timer.GetElapsedTime();
223     }
224 
225     VTKM_CONT
Descriptionvtkm::benchmarking::BenchmarkAtomicArray::BenchAddStride226     std::string Description() const
227     {
228       std::ostringstream desc;
229       desc << "Add (Stride=" << this->Stride << ", Atomic, " << std::setw(7) << std::setfill('0')
230            << this->ArraySize << ")";
231       return desc.str();
232     }
233   };
234   MAKE_ATOMIC_BENCHMARKS(AddStride, BenchAddStride);
235 
236   // Non-atomic baseline for AddStride
237   template <typename ValueType>
238   struct BenchAddStrideBaseline
239   {
240     vtkm::Id ArraySize;
241     vtkm::Id Stride;
242     vtkm::cont::ArrayHandle<ValueType> Data;
243 
244     template <typename PortalType>
245     struct Worker : public vtkm::exec::FunctorBase
246     {
247       vtkm::Id ArraySize;
248       vtkm::Id Stride;
249       PortalType Portal;
250 
251       VTKM_CONT
Workervtkm::benchmarking::BenchmarkAtomicArray::BenchAddStrideBaseline::Worker252       Worker(vtkm::Id arraySize, vtkm::Id stride, PortalType portal)
253         : ArraySize(arraySize)
254         , Stride(stride)
255         , Portal(portal)
256       {
257       }
258 
259       VTKM_EXEC
operator ()vtkm::benchmarking::BenchmarkAtomicArray::BenchAddStrideBaseline::Worker260       void operator()(vtkm::Id i) const
261       {
262         vtkm::Id idx = (i / this->Stride + this->Stride * (i % this->Stride)) % this->ArraySize;
263         this->Portal.Set(idx, this->Portal.Get(idx) + 1);
264       }
265     };
266 
BenchAddStrideBaselinevtkm::benchmarking::BenchmarkAtomicArray::BenchAddStrideBaseline267     BenchAddStrideBaseline(vtkm::Id arraySize, vtkm::Id stride = 32)
268       : ArraySize(arraySize)
269       , Stride(stride)
270     {
271     }
272 
273     VTKM_CONT
operator ()vtkm::benchmarking::BenchmarkAtomicArray::BenchAddStrideBaseline274     vtkm::Float64 operator()()
275     {
276       auto portal = this->Data.PrepareForOutput(this->ArraySize, Device{});
277       Worker<decltype(portal)> worker{ this->ArraySize, this->Stride, portal };
278 
279       Timer timer;
280       Algo::Schedule(worker, NumWrites);
281       return timer.GetElapsedTime();
282     }
283 
284     VTKM_CONT
Descriptionvtkm::benchmarking::BenchmarkAtomicArray::BenchAddStrideBaseline285     std::string Description() const
286     {
287       std::ostringstream desc;
288       desc << "Add (Stride=" << this->Stride << ", Baseline, " << std::setw(7) << std::setfill('0')
289            << this->ArraySize << ")";
290       return desc.str();
291     }
292   };
293   MAKE_ATOMIC_BENCHMARKS(AddStrideBase, BenchAddStrideBaseline);
294 
295   // Benchmarks AtomicArray::CompareAndSwap such that each work index writes to adjacent
296   // indices.
297   template <typename ValueType>
298   struct BenchCASSeq
299   {
300     vtkm::Id ArraySize;
301     vtkm::cont::ArrayHandle<ValueType> Data;
302 
303     template <typename PortalType>
304     struct Worker : public vtkm::exec::FunctorBase
305     {
306       vtkm::Id ArraySize;
307       PortalType Portal;
308 
309       VTKM_CONT
Workervtkm::benchmarking::BenchmarkAtomicArray::BenchCASSeq::Worker310       Worker(vtkm::Id arraySize, PortalType portal)
311         : ArraySize(arraySize)
312         , Portal(portal)
313       {
314       }
315 
316       VTKM_EXEC
operator ()vtkm::benchmarking::BenchmarkAtomicArray::BenchCASSeq::Worker317       void operator()(vtkm::Id i) const
318       {
319         vtkm::Id idx = i % this->ArraySize;
320         ValueType val = static_cast<ValueType>(i);
321         // Get the old val with a no-op
322         ValueType oldVal = this->Portal.Add(idx, static_cast<ValueType>(0));
323         ValueType assumed = static_cast<ValueType>(0);
324         do
325         {
326           assumed = oldVal;
327           oldVal = this->Portal.CompareAndSwap(idx, assumed + val, assumed);
328         } while (assumed != oldVal);
329       }
330     };
331 
BenchCASSeqvtkm::benchmarking::BenchmarkAtomicArray::BenchCASSeq332     BenchCASSeq(vtkm::Id arraySize)
333       : ArraySize(arraySize)
334     {
335       this->Data.PrepareForOutput(this->ArraySize, Device{});
336     }
337 
338     VTKM_CONT
operator ()vtkm::benchmarking::BenchmarkAtomicArray::BenchCASSeq339     vtkm::Float64 operator()()
340     {
341       vtkm::cont::AtomicArray<ValueType> array(this->Data);
342       auto portal = array.PrepareForExecution(Device{});
343       Worker<decltype(portal)> worker{ this->ArraySize, portal };
344 
345       Timer timer;
346       Algo::Schedule(worker, NumWrites);
347       return timer.GetElapsedTime();
348     }
349 
350     VTKM_CONT
Descriptionvtkm::benchmarking::BenchmarkAtomicArray::BenchCASSeq351     std::string Description() const
352     {
353       std::ostringstream desc;
354       desc << "CAS (Seq, Atomic, " << std::setw(7) << std::setfill('0') << this->ArraySize << ")";
355       return desc.str();
356     }
357   };
358   MAKE_ATOMIC_BENCHMARKS(CASSeq, BenchCASSeq);
359 
360   // Provides a non-atomic baseline for BenchCASSeq
361   template <typename ValueType>
362   struct BenchCASSeqBaseline
363   {
364     vtkm::Id ArraySize;
365     vtkm::cont::ArrayHandle<ValueType> Data;
366 
367     template <typename PortalType>
368     struct Worker : public vtkm::exec::FunctorBase
369     {
370       vtkm::Id ArraySize;
371       PortalType Portal;
372 
373       VTKM_CONT
Workervtkm::benchmarking::BenchmarkAtomicArray::BenchCASSeqBaseline::Worker374       Worker(vtkm::Id arraySize, PortalType portal)
375         : ArraySize(arraySize)
376         , Portal(portal)
377       {
378       }
379 
380       VTKM_EXEC
operator ()vtkm::benchmarking::BenchmarkAtomicArray::BenchCASSeqBaseline::Worker381       void operator()(vtkm::Id i) const
382       {
383         vtkm::Id idx = i % this->ArraySize;
384         ValueType val = static_cast<ValueType>(i);
385         ValueType oldVal = this->Portal.Get(idx);
386         this->Portal.Set(idx, oldVal + val);
387       }
388     };
389 
BenchCASSeqBaselinevtkm::benchmarking::BenchmarkAtomicArray::BenchCASSeqBaseline390     BenchCASSeqBaseline(vtkm::Id arraySize)
391       : ArraySize(arraySize)
392     {
393     }
394 
395     VTKM_CONT
operator ()vtkm::benchmarking::BenchmarkAtomicArray::BenchCASSeqBaseline396     vtkm::Float64 operator()()
397     {
398       auto portal = this->Data.PrepareForOutput(this->ArraySize, Device{});
399       Worker<decltype(portal)> worker{ this->ArraySize, portal };
400 
401       Timer timer;
402       Algo::Schedule(worker, NumWrites);
403       return timer.GetElapsedTime();
404     }
405 
406     VTKM_CONT
Descriptionvtkm::benchmarking::BenchmarkAtomicArray::BenchCASSeqBaseline407     std::string Description() const
408     {
409       std::ostringstream desc;
410       desc << "CAS (Seq, Baseline, " << std::setw(7) << std::setfill('0') << this->ArraySize << ")";
411       return desc.str();
412     }
413   };
414   MAKE_ATOMIC_BENCHMARKS(CASSeqBase, BenchCASSeqBaseline);
415 
416   // Benchmarks AtomicArray::CompareAndSwap such that each work index writes to
417   // a strided index:
418   // ( floor(i / stride) + stride * (i % stride)
419   template <typename ValueType>
420   struct BenchCASStride
421   {
422     vtkm::Id ArraySize;
423     vtkm::Id Stride;
424     vtkm::cont::ArrayHandle<ValueType> Data;
425 
426     template <typename PortalType>
427     struct Worker : public vtkm::exec::FunctorBase
428     {
429       vtkm::Id ArraySize;
430       vtkm::Id Stride;
431       PortalType Portal;
432 
433       VTKM_CONT
Workervtkm::benchmarking::BenchmarkAtomicArray::BenchCASStride::Worker434       Worker(vtkm::Id arraySize, vtkm::Id stride, PortalType portal)
435         : ArraySize(arraySize)
436         , Stride(stride)
437         , Portal(portal)
438       {
439       }
440 
441       VTKM_EXEC
operator ()vtkm::benchmarking::BenchmarkAtomicArray::BenchCASStride::Worker442       void operator()(vtkm::Id i) const
443       {
444         vtkm::Id idx = (i / this->Stride + this->Stride * (i % this->Stride)) % this->ArraySize;
445         ValueType val = static_cast<ValueType>(i);
446         // Get the old val with a no-op
447         ValueType oldVal = this->Portal.Add(idx, static_cast<ValueType>(0));
448         ValueType assumed = static_cast<ValueType>(0);
449         do
450         {
451           assumed = oldVal;
452           oldVal = this->Portal.CompareAndSwap(idx, assumed + val, assumed);
453         } while (assumed != oldVal);
454       }
455     };
456 
BenchCASStridevtkm::benchmarking::BenchmarkAtomicArray::BenchCASStride457     BenchCASStride(vtkm::Id arraySize, vtkm::Id stride = 32)
458       : ArraySize(arraySize)
459       , Stride(stride)
460     {
461       this->Data.PrepareForOutput(this->ArraySize, Device{});
462     }
463 
464     VTKM_CONT
operator ()vtkm::benchmarking::BenchmarkAtomicArray::BenchCASStride465     vtkm::Float64 operator()()
466     {
467       vtkm::cont::AtomicArray<ValueType> array(this->Data);
468       auto portal = array.PrepareForExecution(Device{});
469       Worker<decltype(portal)> worker{ this->ArraySize, this->Stride, portal };
470 
471       Timer timer;
472       Algo::Schedule(worker, NumWrites);
473       return timer.GetElapsedTime();
474     }
475 
476     VTKM_CONT
Descriptionvtkm::benchmarking::BenchmarkAtomicArray::BenchCASStride477     std::string Description() const
478     {
479       std::ostringstream desc;
480       desc << "CAS (Stride=" << this->Stride << ", Atomic, " << std::setw(7) << std::setfill('0')
481            << this->ArraySize << ")";
482       return desc.str();
483     }
484   };
485   MAKE_ATOMIC_BENCHMARKS(CASStride, BenchCASStride);
486 
487   // Non-atomic baseline for CASStride
488   template <typename ValueType>
489   struct BenchCASStrideBaseline
490   {
491     vtkm::Id ArraySize;
492     vtkm::Id Stride;
493     vtkm::cont::ArrayHandle<ValueType> Data;
494 
495     template <typename PortalType>
496     struct Worker : public vtkm::exec::FunctorBase
497     {
498       vtkm::Id ArraySize;
499       vtkm::Id Stride;
500       PortalType Portal;
501 
502       VTKM_CONT
Workervtkm::benchmarking::BenchmarkAtomicArray::BenchCASStrideBaseline::Worker503       Worker(vtkm::Id arraySize, vtkm::Id stride, PortalType portal)
504         : ArraySize(arraySize)
505         , Stride(stride)
506         , Portal(portal)
507       {
508       }
509 
510       VTKM_EXEC
operator ()vtkm::benchmarking::BenchmarkAtomicArray::BenchCASStrideBaseline::Worker511       void operator()(vtkm::Id i) const
512       {
513         vtkm::Id idx = (i / this->Stride + this->Stride * (i % this->Stride)) % this->ArraySize;
514         ValueType val = static_cast<ValueType>(i);
515         ValueType oldVal = this->Portal.Get(idx);
516         this->Portal.Set(idx, oldVal + val);
517       }
518     };
519 
BenchCASStrideBaselinevtkm::benchmarking::BenchmarkAtomicArray::BenchCASStrideBaseline520     BenchCASStrideBaseline(vtkm::Id arraySize, vtkm::Id stride = 32)
521       : ArraySize(arraySize)
522       , Stride(stride)
523     {
524     }
525 
526     VTKM_CONT
operator ()vtkm::benchmarking::BenchmarkAtomicArray::BenchCASStrideBaseline527     vtkm::Float64 operator()()
528     {
529       auto portal = this->Data.PrepareForOutput(this->ArraySize, Device{});
530       Worker<decltype(portal)> worker{ this->ArraySize, this->Stride, portal };
531 
532       Timer timer;
533       Algo::Schedule(worker, NumWrites);
534       return timer.GetElapsedTime();
535     }
536 
537     VTKM_CONT
Descriptionvtkm::benchmarking::BenchmarkAtomicArray::BenchCASStrideBaseline538     std::string Description() const
539     {
540       std::ostringstream desc;
541       desc << "CAS (Stride=" << this->Stride << ", Baseline, " << std::setw(7) << std::setfill('0')
542            << this->ArraySize << ")";
543       return desc.str();
544     }
545   };
546   MAKE_ATOMIC_BENCHMARKS(CASStrideBase, BenchCASStrideBaseline);
547 
  // Runs every benchmark family of this class in a fixed order: the four Add
  // families first, then the four CompareAndSwap families. Within each pair
  // the atomic variant is run immediately before its non-atomic baseline.
  // Each RUN_ATOMIC_BENCHMARKS line expands to one VTKM_RUN_BENCHMARK
  // invocation per benchmarked array size.
  static void Run()
  {
    // AtomicArray::Add: sequential, then strided access.
    RUN_ATOMIC_BENCHMARKS(AddSeq);
    RUN_ATOMIC_BENCHMARKS(AddSeqBase);
    RUN_ATOMIC_BENCHMARKS(AddStride);
    RUN_ATOMIC_BENCHMARKS(AddStrideBase);

    // AtomicArray::CompareAndSwap: sequential, then strided access.
    RUN_ATOMIC_BENCHMARKS(CASSeq);
    RUN_ATOMIC_BENCHMARKS(CASSeqBase);
    RUN_ATOMIC_BENCHMARKS(CASStride);
    RUN_ATOMIC_BENCHMARKS(CASStrideBase);
  }
560 };
561 }
562 } // end namespace vtkm::benchmarking
563 
main(int,char * [])564 int main(int, char* [])
565 {
566   using Device = VTKM_DEFAULT_DEVICE_ADAPTER_TAG;
567   auto tracker = vtkm::cont::GetGlobalRuntimeDeviceTracker();
568   tracker.ForceDevice(Device{});
569 
570   try
571   {
572     vtkm::benchmarking::BenchmarkAtomicArray<Device>::Run();
573   }
574   catch (std::exception& e)
575   {
576     std::cerr << "Benchmark encountered an exception: " << e.what() << "\n";
577     return 1;
578   }
579 
580   return 0;
581 }
582