1 //============================================================================
2 //  Copyright (c) Kitware, Inc.
3 //  All rights reserved.
4 //  See LICENSE.txt for details.
5 //
6 //  This software is distributed WITHOUT ANY WARRANTY; without even
7 //  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
8 //  PURPOSE.  See the above copyright notice for more information.
9 //============================================================================
10 
11 #include <vtkm/cont/openmp/internal/DeviceAdapterTagOpenMP.h>
12 #include <vtkm/cont/openmp/internal/FunctorsOpenMP.h>
13 
14 #include <vtkm/cont/internal/FunctorsGeneral.h>
15 
16 #include <vtkm/Types.h>
17 #include <vtkm/cont/ArrayHandle.h>
18 
19 #include <omp.h>
20 
21 namespace vtkm
22 {
23 namespace cont
24 {
25 namespace openmp
26 {
27 namespace scan
28 {
29 
30 enum class ChildType
31 {
32   Left,
33   Right
34 };
35 
36 // Generic implementation of modified Ladner & Fischer 1977 "adder" algorithm
37 // used for backbone of exclusive/inclusive scans. Language in comments is
38 // specific to computing a sum, but the implementation should be generic enough
39 // for any scan operation.
40 //
41 // The basic idea is that a tree structure is used to partition the input into
42 // sets of LeafSize. Each leaf of the tree is processed in two stages: First,
43 // the sum of each leaf is computed, and this information is pushed up the tree
44 // to compute the sum of each node's child leaves. Then the partial sum at the
45 // start of each node is computed and pushed down the tree (the "carry"
46 // values). In the second pass through each leaf's data, these partial sums are
47 // used to compute the final output from the carry value and the input data.
48 //
49 // The passes will likely overlap due to the "leftEdge" optimizations, which
50 // allow each leaf to start the second pass as soon as the first pass of all
51 // previous leaves is completed. Additionally, the first leaf in the data will
52 // combine both passes into one, computing the final output data while
53 // generating its sum for the communication stage.
54 template <typename ScanBody>
55 struct Adder : public ScanBody
56 {
57   template <typename NodeImpl>
58   struct NodeWrapper : public NodeImpl
59   {
60     // Range of IDs this node represents
61     vtkm::Id2 Range{ -1, -1 };
62 
63     // Connections:
64     NodeWrapper* Parent{ nullptr };
65     NodeWrapper* Left{ nullptr };
66     NodeWrapper* Right{ nullptr };
67 
68     // Special flag to mark nodes on the far left edge of the tree. This allows
69     // various optimization that start the second pass sooner on some ranges.
70     bool LeftEdge{ false };
71 
72     // Pad the node out to the size of a cache line to prevent false sharing:
73     static constexpr size_t DataSize =
74       sizeof(NodeImpl) + sizeof(vtkm::Id2) + 3 * sizeof(NodeWrapper*) + sizeof(bool);
75     static constexpr size_t NumCacheLines = CeilDivide<size_t>(DataSize, VTKM_CACHE_LINE_SIZE);
76     static constexpr size_t PaddingSize = NumCacheLines * VTKM_CACHE_LINE_SIZE - DataSize;
77     unsigned char Padding[PaddingSize];
78   };
79 
80   using Node = NodeWrapper<typename ScanBody::Node>;
81   using ValueType = typename ScanBody::ValueType;
82 
83   vtkm::Id LeafSize;
84   std::vector<Node> Nodes;
85   size_t NextNode;
86 
87   // Use ScanBody's ctor:
88   using ScanBody::ScanBody;
89 
90   // Returns the total array sum:
ExecuteAdder91   ValueType Execute(const vtkm::Id2& range)
92   {
93     Node* rootNode = nullptr;
94 
95     VTKM_OPENMP_DIRECTIVE(parallel default(shared))
96     {
97       VTKM_OPENMP_DIRECTIVE(single)
98       {
99         // Allocate nodes, prep metadata:
100         this->Prepare(range);
101 
102         // Compute the partition and node sums:
103         rootNode = this->AllocNode();
104         rootNode->Range = range;
105         rootNode->LeftEdge = true;
106         ScanBody::InitializeRootNode(rootNode);
107 
108         this->Scan(rootNode);
109       } // end single
110     }   // end parallel
111 
112     return rootNode ? ScanBody::GetFinalResult(rootNode)
113                     : vtkm::TypeTraits<ValueType>::ZeroInitialization();
114   }
115 
116 private:
117   // Returns the next available node in a thread-safe manner.
AllocNodeAdder118   Node* AllocNode()
119   {
120     size_t nodeIdx;
121 
122 // GCC emits a false positive "value computed but not used" for this block:
123 #pragma GCC diagnostic push
124 #pragma GCC diagnostic ignored "-Wunused-value"
125 
126     VTKM_OPENMP_DIRECTIVE(atomic capture)
127     {
128       nodeIdx = this->NextNode;
129       ++this->NextNode;
130     }
131 
132 #pragma GCC diagnostic pop
133 
134     VTKM_ASSERT(nodeIdx < this->Nodes.size());
135 
136     return &this->Nodes[nodeIdx];
137   }
138 
139   // Does the range represent a leave node?
IsLeafAdder140   bool IsLeaf(const vtkm::Id2& range) const { return (range[1] - range[0]) <= this->LeafSize; }
141 
142   // Use to split ranges. Ensures that the first range is always a multiple of
143   // LeafSize, when possible.
ComputeMidpointAdder144   vtkm::Id ComputeMidpoint(const vtkm::Id2& range) const
145   {
146     const vtkm::Id n = range[1] - range[0];
147     const vtkm::Id np = this->LeafSize;
148 
149     return (((n / 2) + (np - 1)) / np) * np + range[0];
150   }
151 
PrepareAdder152   void Prepare(const vtkm::Id2& range)
153   {
154     // Figure out how many values each thread should handle:
155     vtkm::Id numVals = range[1] - range[0];
156     int numThreads = omp_get_num_threads();
157     vtkm::Id chunksPerThread = 8;
158     vtkm::Id numChunks;
159     ComputeChunkSize(
160       numVals, numThreads, chunksPerThread, sizeof(ValueType), numChunks, this->LeafSize);
161 
162     // Compute an upper-bound of the number of nodes in the tree:
163     std::size_t numNodes = static_cast<std::size_t>(numChunks);
164     while (numChunks > 1)
165     {
166       numChunks = (numChunks + 1) / 2;
167       numNodes += static_cast<std::size_t>(numChunks);
168     }
169     this->Nodes.resize(numNodes);
170     this->NextNode = 0;
171   }
172 
173   // Build the tree and compute the sums:
ScanAdder174   void Scan(Node* node)
175   {
176     if (!this->IsLeaf(node->Range))
177     { // split range:
178       vtkm::Id midpoint = this->ComputeMidpoint(node->Range);
179 
180       Node* right = this->AllocNode();
181       right->Parent = node;
182       node->Right = right;
183       right->Range = vtkm::Id2(midpoint, node->Range[1]);
184       ScanBody::InitializeChildNode(right, node, ChildType::Right, false);
185 
186       // Intel compilers seem to have trouble following the 'this' pointer
187       // when launching tasks, resulting in a corrupt task environment.
188       // Explicitly copying the pointer into a local variable seems to fix this.
189       auto explicitThis = this;
190 
191       VTKM_OPENMP_DIRECTIVE(taskgroup)
192       {
193         VTKM_OPENMP_DIRECTIVE(task) { explicitThis->Scan(right); } // end right task
194 
195         Node* left = this->AllocNode();
196         left->Parent = node;
197         node->Left = left;
198         left->Range = vtkm::Id2(node->Range[0], midpoint);
199         left->LeftEdge = node->LeftEdge;
200         ScanBody::InitializeChildNode(left, node, ChildType::Left, left->LeftEdge);
201         this->Scan(left);
202 
203       } // end task group. Both l/r sums will be finished here.
204 
205       ScanBody::CombineSummaries(node, node->Left, node->Right);
206       if (node->LeftEdge)
207       {
208         this->UpdateOutput(node);
209       }
210     }
211     else
212     { // Compute sums:
213       ScanBody::ComputeSummary(node, node->Range, node->LeftEdge);
214     }
215   }
216 
UpdateOutputAdder217   void UpdateOutput(Node* node)
218   {
219     if (node->Left != nullptr)
220     {
221       assert(node->Right != nullptr);
222       ScanBody::PropagateSummaries(node, node->Left, node->Right, node->LeftEdge);
223 
224       // if this node is on the left edge, we know that the left child's
225       // output is already updated, so only descend to the right:
226       if (node->LeftEdge)
227       {
228         this->UpdateOutput(node->Right);
229       }
230       else // Otherwise descent into both:
231       {
232         // Intel compilers seem to have trouble following the 'this' pointer
233         // when launching tasks, resulting in a corrupt task environment.
234         // Explicitly copying the pointer into a local variable seems to fix
235         // this.
236         auto explicitThis = this;
237 
238         // no taskgroup/sync needed other than the final barrier of the parallel
239         // section.
240         VTKM_OPENMP_DIRECTIVE(task) { explicitThis->UpdateOutput(node->Right); } // end task
241         this->UpdateOutput(node->Left);
242       }
243     }
244     else
245     {
246       ScanBody::UpdateOutput(node, node->Range, node->LeftEdge);
247     }
248   }
249 };
250 
251 template <typename InPortalT, typename OutPortalT, typename RawFunctorT>
252 struct ScanExclusiveBody
253 {
254   using ValueType = typename InPortalT::ValueType;
255   using FunctorType = internal::WrappedBinaryOperator<ValueType, RawFunctorT>;
256 
257   InPortalT InPortal;
258   OutPortalT OutPortal;
259   FunctorType Functor;
260   ValueType InitialValue;
261 
262   struct Node
263   {
264     // Sum of all values in range
265     ValueType Sum{ vtkm::TypeTraits<ValueType>::ZeroInitialization() };
266 
267     // The sum of all elements prior to this node's range
268     ValueType Carry{ vtkm::TypeTraits<ValueType>::ZeroInitialization() };
269   };
270 
ScanExclusiveBodyScanExclusiveBody271   ScanExclusiveBody(const InPortalT& inPortal,
272                     const OutPortalT& outPortal,
273                     const RawFunctorT& functor,
274                     const ValueType& init)
275     : InPortal(inPortal)
276     , OutPortal(outPortal)
277     , Functor(functor)
278     , InitialValue(init)
279   {
280   }
281 
282   // Initialize the root of the node tree
InitializeRootNodeScanExclusiveBody283   void InitializeRootNode(Node* /*root*/) {}
284 
InitializeChildNodeScanExclusiveBody285   void InitializeChildNode(Node* /*node*/,
286                            const Node* /*parent*/,
287                            ChildType /*type*/,
288                            bool /*leftEdge*/)
289   {
290   }
291 
ComputeSummaryScanExclusiveBody292   void ComputeSummary(Node* node, const vtkm::Id2& range, bool leftEdge)
293   {
294     auto input = vtkm::cont::ArrayPortalToIteratorBegin(this->InPortal);
295     node->Sum = input[range[0]];
296 
297     // If this block is on the left edge, we can update the output while we
298     // compute the sum:
299     if (leftEdge)
300     {
301       // Set leftEdge arg to false to force the update:
302       node->Sum = UpdateOutputImpl(node, range, false, true);
303     }
304     else // Otherwise, only compute the sum and update the output in pass 2.
305     {
306       for (vtkm::Id i = range[0] + 1; i < range[1]; ++i)
307       {
308         node->Sum = this->Functor(node->Sum, input[i]);
309       }
310     }
311   }
312 
CombineSummariesScanExclusiveBody313   void CombineSummaries(Node* parent, const Node* left, const Node* right)
314   {
315     parent->Sum = this->Functor(left->Sum, right->Sum);
316   }
317 
PropagateSummariesScanExclusiveBody318   void PropagateSummaries(const Node* parent, Node* left, Node* right, bool leftEdge)
319   {
320     left->Carry = parent->Carry;
321     right->Carry = leftEdge ? left->Sum : this->Functor(parent->Carry, left->Sum);
322   }
323 
UpdateOutputScanExclusiveBody324   void UpdateOutput(const Node* node, const vtkm::Id2& range, bool leftEdge)
325   {
326     this->UpdateOutputImpl(node, range, leftEdge, false);
327   }
328 
UpdateOutputImplScanExclusiveBody329   ValueType UpdateOutputImpl(const Node* node, const vtkm::Id2& range, bool skip, bool useInit)
330   {
331     if (skip)
332     {
333       // Do nothing; this was already done in ComputeSummary.
334       return vtkm::TypeTraits<ValueType>::ZeroInitialization();
335     }
336 
337     auto input = vtkm::cont::ArrayPortalToIteratorBegin(this->InPortal);
338     auto output = vtkm::cont::ArrayPortalToIteratorBegin(this->OutPortal);
339 
340     // Be careful with the order input/output are modified. They might be
341     // pointing at the same data:
342     ValueType carry = useInit ? this->InitialValue : node->Carry;
343     vtkm::Id end = range[1];
344 
345     for (vtkm::Id i = range[0]; i < end; ++i)
346     {
347       output[i] = this->Functor(carry, input[i]);
348 
349       using std::swap; // Enable ADL
350       swap(output[i], carry);
351     }
352 
353     return carry;
354   }
355 
356   // Compute the final sum from the node's metadata:
GetFinalResultScanExclusiveBody357   ValueType GetFinalResult(const Node* node) const { return this->Functor(node->Sum, node->Carry); }
358 };
359 
360 template <typename InPortalT, typename OutPortalT, typename RawFunctorT>
361 struct ScanInclusiveBody
362 {
363   using ValueType = typename InPortalT::ValueType;
364   using FunctorType = internal::WrappedBinaryOperator<ValueType, RawFunctorT>;
365 
366   InPortalT InPortal;
367   OutPortalT OutPortal;
368   FunctorType Functor;
369 
370   struct Node
371   {
372     // Sum of all values in range
373     ValueType Sum{ vtkm::TypeTraits<ValueType>::ZeroInitialization() };
374 
375     // The sum of all elements prior to this node's range
376     ValueType Carry{ vtkm::TypeTraits<ValueType>::ZeroInitialization() };
377   };
378 
ScanInclusiveBodyScanInclusiveBody379   ScanInclusiveBody(const InPortalT& inPortal,
380                     const OutPortalT& outPortal,
381                     const RawFunctorT& functor)
382     : InPortal(inPortal)
383     , OutPortal(outPortal)
384     , Functor(functor)
385   {
386   }
387 
388   // Initialize the root of the node tree
InitializeRootNodeScanInclusiveBody389   void InitializeRootNode(Node*)
390   {
391     // no-op
392   }
393 
InitializeChildNodeScanInclusiveBody394   void InitializeChildNode(Node*, const Node*, ChildType, bool)
395   {
396     // no-op
397   }
398 
ComputeSummaryScanInclusiveBody399   void ComputeSummary(Node* node, const vtkm::Id2& range, bool leftEdge)
400   {
401     // If this block is on the left edge, we can update the output while we
402     // compute the sum:
403     if (leftEdge)
404     {
405       node->Sum = UpdateOutputImpl(node, range, false, false);
406     }
407     else // Otherwise, only compute the sum and update the output in pass 2.
408     {
409       auto input = vtkm::cont::ArrayPortalToIteratorBegin(this->InPortal);
410       node->Sum = input[range[0]];
411       for (vtkm::Id i = range[0] + 1; i < range[1]; ++i)
412       {
413         node->Sum = this->Functor(node->Sum, input[i]);
414       }
415     }
416   }
417 
CombineSummariesScanInclusiveBody418   void CombineSummaries(Node* parent, const Node* left, const Node* right)
419   {
420     parent->Sum = this->Functor(left->Sum, right->Sum);
421   }
422 
PropagateSummariesScanInclusiveBody423   void PropagateSummaries(const Node* parent, Node* left, Node* right, bool leftEdge)
424   {
425     left->Carry = parent->Carry;
426     right->Carry = leftEdge ? left->Sum : this->Functor(parent->Carry, left->Sum);
427   }
428 
UpdateOutputScanInclusiveBody429   void UpdateOutput(const Node* node, const vtkm::Id2& range, bool leftEdge)
430   {
431     UpdateOutputImpl(node, range, leftEdge, true);
432   }
433 
UpdateOutputImplScanInclusiveBody434   ValueType UpdateOutputImpl(const Node* node, const vtkm::Id2& range, bool skip, bool useCarry)
435   {
436     if (skip)
437     {
438       // Do nothing; this was already done in ComputeSummary.
439       return vtkm::TypeTraits<ValueType>::ZeroInitialization();
440     }
441 
442     auto input = vtkm::cont::ArrayPortalToIteratorBegin(this->InPortal);
443     auto output = vtkm::cont::ArrayPortalToIteratorBegin(this->OutPortal);
444 
445     vtkm::Id start = range[0];
446     vtkm::Id end = range[1];
447     ValueType carry = node->Carry;
448 
449     // Initialize with the first value if this is the first range:
450     if (!useCarry && start < end)
451     {
452       carry = input[start];
453       output[start] = carry;
454       ++start;
455     }
456 
457     for (vtkm::Id i = start; i < end; ++i)
458     {
459       output[i] = this->Functor(carry, input[i]);
460       carry = output[i];
461     }
462 
463     return output[end - 1];
464   }
465 
466   // Compute the final sum from the node's metadata:
GetFinalResultScanInclusiveBody467   ValueType GetFinalResult(const Node* node) const { return node->Sum; }
468 };
469 
470 } // end namespace scan
471 
472 template <typename InPortalT, typename OutPortalT, typename FunctorT>
473 using ScanExclusiveHelper = scan::Adder<scan::ScanExclusiveBody<InPortalT, OutPortalT, FunctorT>>;
474 
475 template <typename InPortalT, typename OutPortalT, typename FunctorT>
476 using ScanInclusiveHelper = scan::Adder<scan::ScanInclusiveBody<InPortalT, OutPortalT, FunctorT>>;
477 }
478 }
479 } // end namespace vtkm::cont::openmp
480