1 //============================================================================
2 //  Copyright (c) Kitware, Inc.
3 //  All rights reserved.
4 //  See LICENSE.txt for details.
5 //  This software is distributed WITHOUT ANY WARRANTY; without even
6 //  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
7 //  PURPOSE.  See the above copyright notice for more information.
8 //
9 //  Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
10 //  Copyright 2018 UT-Battelle, LLC.
11 //  Copyright 2018 Los Alamos National Security.
12 //
13 //  Under the terms of Contract DE-NA0003525 with NTESS,
14 //  the U.S. Government retains certain rights in this software.
15 //
16 //  Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National
17 //  Laboratory (LANL), the U.S. Government retains certain rights in
18 //  this software.
19 //============================================================================
20 
21 #include <vtkm/cont/openmp/internal/DeviceAdapterTagOpenMP.h>
22 #include <vtkm/cont/openmp/internal/FunctorsOpenMP.h>
23 
24 #include <vtkm/cont/internal/FunctorsGeneral.h>
25 
26 #include <vtkm/Types.h>
27 #include <vtkm/cont/ArrayHandle.h>
28 
29 #include <omp.h>
30 
31 namespace vtkm
32 {
33 namespace cont
34 {
35 namespace openmp
36 {
37 namespace scan
38 {
39 
40 enum class ChildType
41 {
42   Left,
43   Right
44 };
45 
46 // Generic implementation of modified Ladner & Fischer 1977 "adder" algorithm
47 // used for backbone of exclusive/inclusive scans. Language in comments is
48 // specific to computing a sum, but the implementation should be generic enough
49 // for any scan operation.
50 //
51 // The basic idea is that a tree structure is used to partition the input into
52 // sets of LeafSize. Each leaf of the tree is processed in two stages: First,
53 // the sum of each leaf is computed, and this information is pushed up the tree
54 // to compute the sum of each node's child leaves. Then the partial sum at the
55 // start of each node is computed and pushed down the tree (the "carry"
56 // values). In the second pass through each leaf's data, these partial sums are
57 // used to compute the final output from the carry value and the input data.
58 //
59 // The passes will likely overlap due to the "leftEdge" optimizations, which
60 // allow each leaf to start the second pass as soon as the first pass of all
61 // previous leaves is completed. Additionally, the first leaf in the data will
62 // combine both passes into one, computing the final output data while
63 // generating its sum for the communication stage.
64 template <typename ScanBody>
65 struct Adder : public ScanBody
66 {
67   template <typename NodeImpl>
68   struct NodeWrapper : public NodeImpl
69   {
70     // Range of IDs this node represents
71     vtkm::Id2 Range{ -1, -1 };
72 
73     // Connections:
74     NodeWrapper* Parent{ nullptr };
75     NodeWrapper* Left{ nullptr };
76     NodeWrapper* Right{ nullptr };
77 
78     // Special flag to mark nodes on the far left edge of the tree. This allows
79     // various optimization that start the second pass sooner on some ranges.
80     bool LeftEdge{ false };
81 
82     // Pad the node out to the size of a cache line to prevent false sharing:
83     static constexpr size_t DataSize =
84       sizeof(NodeImpl) + sizeof(vtkm::Id2) + 3 * sizeof(NodeWrapper*) + sizeof(bool);
85     static constexpr size_t NumCacheLines = CeilDivide<size_t>(DataSize, CACHE_LINE_SIZE);
86     static constexpr size_t PaddingSize = NumCacheLines * CACHE_LINE_SIZE - DataSize;
87     unsigned char Padding[PaddingSize];
88   };
89 
90   using Node = NodeWrapper<typename ScanBody::Node>;
91   using ValueType = typename ScanBody::ValueType;
92 
93   vtkm::Id LeafSize;
94   std::vector<Node> Nodes;
95   size_t NextNode;
96 
97   // Use ScanBody's ctor:
98   using ScanBody::ScanBody;
99 
100   // Returns the total array sum:
ExecuteAdder101   ValueType Execute(const vtkm::Id2& range)
102   {
103     Node* rootNode = nullptr;
104 
105     VTKM_OPENMP_DIRECTIVE(parallel default(shared))
106     {
107       VTKM_OPENMP_DIRECTIVE(single)
108       {
109         // Allocate nodes, prep metadata:
110         this->Prepare(range);
111 
112         // Compute the partition and node sums:
113         rootNode = this->AllocNode();
114         rootNode->Range = range;
115         rootNode->LeftEdge = true;
116         ScanBody::InitializeRootNode(rootNode);
117 
118         this->Scan(rootNode);
119       } // end single
120     }   // end parallel
121 
122     return rootNode ? ScanBody::GetFinalResult(rootNode)
123                     : vtkm::TypeTraits<ValueType>::ZeroInitialization();
124   }
125 
126 private:
127   // Returns the next available node in a thread-safe manner.
AllocNodeAdder128   Node* AllocNode()
129   {
130     size_t nodeIdx;
131 
132 // GCC emits a false positive "value computed but not used" for this block:
133 #pragma GCC diagnostic push
134 #pragma GCC diagnostic ignored "-Wunused-value"
135 
136     VTKM_OPENMP_DIRECTIVE(atomic capture)
137     {
138       nodeIdx = this->NextNode;
139       ++this->NextNode;
140     }
141 
142 #pragma GCC diagnostic pop
143 
144     VTKM_ASSERT(nodeIdx < this->Nodes.size());
145 
146     return &this->Nodes[nodeIdx];
147   }
148 
149   // Does the range represent a leave node?
IsLeafAdder150   bool IsLeaf(const vtkm::Id2& range) const { return (range[1] - range[0]) <= this->LeafSize; }
151 
152   // Use to split ranges. Ensures that the first range is always a multiple of
153   // LeafSize, when possible.
ComputeMidpointAdder154   vtkm::Id ComputeMidpoint(const vtkm::Id2& range) const
155   {
156     const vtkm::Id n = range[1] - range[0];
157     const vtkm::Id np = this->LeafSize;
158 
159     return (((n / 2) + (np - 1)) / np) * np + range[0];
160   }
161 
PrepareAdder162   void Prepare(const vtkm::Id2& range)
163   {
164     // Figure out how many values each thread should handle:
165     vtkm::Id numVals = range[1] - range[0];
166     int numThreads = omp_get_num_threads();
167     vtkm::Id chunksPerThread = 8;
168     vtkm::Id numChunks;
169     ComputeChunkSize(
170       numVals, numThreads, chunksPerThread, sizeof(ValueType), numChunks, this->LeafSize);
171 
172     // Compute an upper-bound of the number of nodes in the tree:
173     size_t numNodes = numChunks;
174     while (numChunks > 1)
175     {
176       numChunks = (numChunks + 1) / 2;
177       numNodes += numChunks;
178     }
179     this->Nodes.resize(numNodes);
180     this->NextNode = 0;
181   }
182 
183   // Build the tree and compute the sums:
ScanAdder184   void Scan(Node* node)
185   {
186     if (!this->IsLeaf(node->Range))
187     { // split range:
188       vtkm::Id midpoint = this->ComputeMidpoint(node->Range);
189 
190       Node* right = this->AllocNode();
191       right->Parent = node;
192       node->Right = right;
193       right->Range = vtkm::Id2(midpoint, node->Range[1]);
194       ScanBody::InitializeChildNode(right, node, ChildType::Right, false);
195 
196       // Intel compilers seem to have trouble following the 'this' pointer
197       // when launching tasks, resulting in a corrupt task environment.
198       // Explicitly copying the pointer into a local variable seems to fix this.
199       auto explicitThis = this;
200 
201       VTKM_OPENMP_DIRECTIVE(taskgroup)
202       {
203         VTKM_OPENMP_DIRECTIVE(task) { explicitThis->Scan(right); } // end right task
204 
205         Node* left = this->AllocNode();
206         left->Parent = node;
207         node->Left = left;
208         left->Range = vtkm::Id2(node->Range[0], midpoint);
209         left->LeftEdge = node->LeftEdge;
210         ScanBody::InitializeChildNode(left, node, ChildType::Left, left->LeftEdge);
211         this->Scan(left);
212 
213       } // end task group. Both l/r sums will be finished here.
214 
215       ScanBody::CombineSummaries(node, node->Left, node->Right);
216       if (node->LeftEdge)
217       {
218         this->UpdateOutput(node);
219       }
220     }
221     else
222     { // Compute sums:
223       ScanBody::ComputeSummary(node, node->Range, node->LeftEdge);
224     }
225   }
226 
UpdateOutputAdder227   void UpdateOutput(Node* node)
228   {
229     if (node->Left != nullptr)
230     {
231       assert(node->Right != nullptr);
232       ScanBody::PropagateSummaries(node, node->Left, node->Right, node->LeftEdge);
233 
234       // if this node is on the left edge, we know that the left child's
235       // output is already updated, so only descend to the right:
236       if (node->LeftEdge)
237       {
238         this->UpdateOutput(node->Right);
239       }
240       else // Otherwise descent into both:
241       {
242         // Intel compilers seem to have trouble following the 'this' pointer
243         // when launching tasks, resulting in a corrupt task environment.
244         // Explicitly copying the pointer into a local variable seems to fix
245         // this.
246         auto explicitThis = this;
247 
248         // no taskgroup/sync needed other than the final barrier of the parallel
249         // section.
250         VTKM_OPENMP_DIRECTIVE(task) { explicitThis->UpdateOutput(node->Right); } // end task
251         this->UpdateOutput(node->Left);
252       }
253     }
254     else
255     {
256       ScanBody::UpdateOutput(node, node->Range, node->LeftEdge);
257     }
258   }
259 };
260 
261 template <typename InPortalT, typename OutPortalT, typename RawFunctorT>
262 struct ScanExclusiveBody
263 {
264   using ValueType = typename InPortalT::ValueType;
265   using FunctorType = internal::WrappedBinaryOperator<ValueType, RawFunctorT>;
266 
267   InPortalT InPortal;
268   OutPortalT OutPortal;
269   FunctorType Functor;
270   ValueType InitialValue;
271 
272   struct Node
273   {
274     // Sum of all values in range
275     ValueType Sum{ vtkm::TypeTraits<ValueType>::ZeroInitialization() };
276 
277     // The sum of all elements prior to this node's range
278     ValueType Carry{ vtkm::TypeTraits<ValueType>::ZeroInitialization() };
279   };
280 
ScanExclusiveBodyScanExclusiveBody281   ScanExclusiveBody(const InPortalT& inPortal,
282                     const OutPortalT& outPortal,
283                     const RawFunctorT& functor,
284                     const ValueType& init)
285     : InPortal(inPortal)
286     , OutPortal(outPortal)
287     , Functor(functor)
288     , InitialValue(init)
289   {
290   }
291 
292   // Initialize the root of the node tree
InitializeRootNodeScanExclusiveBody293   void InitializeRootNode(Node* /*root*/) {}
294 
InitializeChildNodeScanExclusiveBody295   void InitializeChildNode(Node* /*node*/,
296                            const Node* /*parent*/,
297                            ChildType /*type*/,
298                            bool /*leftEdge*/)
299   {
300   }
301 
ComputeSummaryScanExclusiveBody302   void ComputeSummary(Node* node, const vtkm::Id2& range, bool leftEdge)
303   {
304     auto input = vtkm::cont::ArrayPortalToIteratorBegin(this->InPortal);
305     node->Sum = input[range[0]];
306 
307     // If this block is on the left edge, we can update the output while we
308     // compute the sum:
309     if (leftEdge)
310     {
311       // Set leftEdge arg to false to force the update:
312       node->Sum = UpdateOutputImpl(node, range, false, true);
313     }
314     else // Otherwise, only compute the sum and update the output in pass 2.
315     {
316       for (vtkm::Id i = range[0] + 1; i < range[1]; ++i)
317       {
318         node->Sum = this->Functor(node->Sum, input[i]);
319       }
320     }
321   }
322 
CombineSummariesScanExclusiveBody323   void CombineSummaries(Node* parent, const Node* left, const Node* right)
324   {
325     parent->Sum = this->Functor(left->Sum, right->Sum);
326   }
327 
PropagateSummariesScanExclusiveBody328   void PropagateSummaries(const Node* parent, Node* left, Node* right, bool leftEdge)
329   {
330     left->Carry = parent->Carry;
331     right->Carry = leftEdge ? left->Sum : this->Functor(parent->Carry, left->Sum);
332   }
333 
UpdateOutputScanExclusiveBody334   void UpdateOutput(const Node* node, const vtkm::Id2& range, bool leftEdge)
335   {
336     this->UpdateOutputImpl(node, range, leftEdge, false);
337   }
338 
UpdateOutputImplScanExclusiveBody339   ValueType UpdateOutputImpl(const Node* node, const vtkm::Id2& range, bool skip, bool useInit)
340   {
341     if (skip)
342     {
343       // Do nothing; this was already done in ComputeSummary.
344       return vtkm::TypeTraits<ValueType>::ZeroInitialization();
345     }
346 
347     auto input = vtkm::cont::ArrayPortalToIteratorBegin(this->InPortal);
348     auto output = vtkm::cont::ArrayPortalToIteratorBegin(this->OutPortal);
349 
350     // Be careful with the order input/output are modified. They might be
351     // pointing at the same data:
352     ValueType carry = useInit ? this->InitialValue : node->Carry;
353     vtkm::Id end = range[1];
354 
355     for (vtkm::Id i = range[0]; i < end; ++i)
356     {
357       output[i] = this->Functor(carry, input[i]);
358 
359       using std::swap; // Enable ADL
360       swap(output[i], carry);
361     }
362 
363     return carry;
364   }
365 
366   // Compute the final sum from the node's metadata:
GetFinalResultScanExclusiveBody367   ValueType GetFinalResult(const Node* node) const { return this->Functor(node->Sum, node->Carry); }
368 };
369 
370 template <typename InPortalT, typename OutPortalT, typename RawFunctorT>
371 struct ScanInclusiveBody
372 {
373   using ValueType = typename InPortalT::ValueType;
374   using FunctorType = internal::WrappedBinaryOperator<ValueType, RawFunctorT>;
375 
376   InPortalT InPortal;
377   OutPortalT OutPortal;
378   FunctorType Functor;
379 
380   struct Node
381   {
382     // Sum of all values in range
383     ValueType Sum{ vtkm::TypeTraits<ValueType>::ZeroInitialization() };
384 
385     // The sum of all elements prior to this node's range
386     ValueType Carry{ vtkm::TypeTraits<ValueType>::ZeroInitialization() };
387   };
388 
ScanInclusiveBodyScanInclusiveBody389   ScanInclusiveBody(const InPortalT& inPortal,
390                     const OutPortalT& outPortal,
391                     const RawFunctorT& functor)
392     : InPortal(inPortal)
393     , OutPortal(outPortal)
394     , Functor(functor)
395   {
396   }
397 
398   // Initialize the root of the node tree
InitializeRootNodeScanInclusiveBody399   void InitializeRootNode(Node*)
400   {
401     // no-op
402   }
403 
InitializeChildNodeScanInclusiveBody404   void InitializeChildNode(Node*, const Node*, ChildType, bool)
405   {
406     // no-op
407   }
408 
ComputeSummaryScanInclusiveBody409   void ComputeSummary(Node* node, const vtkm::Id2& range, bool leftEdge)
410   {
411     // If this block is on the left edge, we can update the output while we
412     // compute the sum:
413     if (leftEdge)
414     {
415       node->Sum = UpdateOutputImpl(node, range, false, false);
416     }
417     else // Otherwise, only compute the sum and update the output in pass 2.
418     {
419       auto input = vtkm::cont::ArrayPortalToIteratorBegin(this->InPortal);
420       node->Sum = input[range[0]];
421       for (vtkm::Id i = range[0] + 1; i < range[1]; ++i)
422       {
423         node->Sum = this->Functor(node->Sum, input[i]);
424       }
425     }
426   }
427 
CombineSummariesScanInclusiveBody428   void CombineSummaries(Node* parent, const Node* left, const Node* right)
429   {
430     parent->Sum = this->Functor(left->Sum, right->Sum);
431   }
432 
PropagateSummariesScanInclusiveBody433   void PropagateSummaries(const Node* parent, Node* left, Node* right, bool leftEdge)
434   {
435     left->Carry = parent->Carry;
436     right->Carry = leftEdge ? left->Sum : this->Functor(parent->Carry, left->Sum);
437   }
438 
UpdateOutputScanInclusiveBody439   void UpdateOutput(const Node* node, const vtkm::Id2& range, bool leftEdge)
440   {
441     UpdateOutputImpl(node, range, leftEdge, true);
442   }
443 
UpdateOutputImplScanInclusiveBody444   ValueType UpdateOutputImpl(const Node* node, const vtkm::Id2& range, bool skip, bool useCarry)
445   {
446     if (skip)
447     {
448       // Do nothing; this was already done in ComputeSummary.
449       return vtkm::TypeTraits<ValueType>::ZeroInitialization();
450     }
451 
452     auto input = vtkm::cont::ArrayPortalToIteratorBegin(this->InPortal);
453     auto output = vtkm::cont::ArrayPortalToIteratorBegin(this->OutPortal);
454 
455     vtkm::Id start = range[0];
456     vtkm::Id end = range[1];
457     ValueType carry = node->Carry;
458 
459     // Initialize with the first value if this is the first range:
460     if (!useCarry && start < end)
461     {
462       carry = input[start];
463       output[start] = carry;
464       ++start;
465     }
466 
467     for (vtkm::Id i = start; i < end; ++i)
468     {
469       output[i] = this->Functor(carry, input[i]);
470       carry = output[i];
471     }
472 
473     return output[end - 1];
474   }
475 
476   // Compute the final sum from the node's metadata:
GetFinalResultScanInclusiveBody477   ValueType GetFinalResult(const Node* node) const { return node->Sum; }
478 };
479 
480 } // end namespace scan
481 
482 template <typename InPortalT, typename OutPortalT, typename FunctorT>
483 using ScanExclusiveHelper = scan::Adder<scan::ScanExclusiveBody<InPortalT, OutPortalT, FunctorT>>;
484 
485 template <typename InPortalT, typename OutPortalT, typename FunctorT>
486 using ScanInclusiveHelper = scan::Adder<scan::ScanInclusiveBody<InPortalT, OutPortalT, FunctorT>>;
487 }
488 }
489 } // end namespace vtkm::cont::openmp
490