1 //============================================================================ 2 // Copyright (c) Kitware, Inc. 3 // All rights reserved. 4 // See LICENSE.txt for details. 5 // 6 // This software is distributed WITHOUT ANY WARRANTY; without even 7 // the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR 8 // PURPOSE. See the above copyright notice for more information. 9 //============================================================================ 10 11 #include <vtkm/cont/openmp/internal/DeviceAdapterTagOpenMP.h> 12 #include <vtkm/cont/openmp/internal/FunctorsOpenMP.h> 13 14 #include <vtkm/cont/internal/FunctorsGeneral.h> 15 16 #include <vtkm/Types.h> 17 #include <vtkm/cont/ArrayHandle.h> 18 19 #include <omp.h> 20 21 namespace vtkm 22 { 23 namespace cont 24 { 25 namespace openmp 26 { 27 namespace scan 28 { 29 30 enum class ChildType 31 { 32 Left, 33 Right 34 }; 35 36 // Generic implementation of modified Ladner & Fischer 1977 "adder" algorithm 37 // used for backbone of exclusive/inclusive scans. Language in comments is 38 // specific to computing a sum, but the implementation should be generic enough 39 // for any scan operation. 40 // 41 // The basic idea is that a tree structure is used to partition the input into 42 // sets of LeafSize. Each leaf of the tree is processed in two stages: First, 43 // the sum of each leaf is computed, and this information is pushed up the tree 44 // to compute the sum of each node's child leaves. Then the partial sum at the 45 // start of each node is computed and pushed down the tree (the "carry" 46 // values). In the second pass through each leaf's data, these partial sums are 47 // used to compute the final output from the carry value and the input data. 48 // 49 // The passes will likely overlap due to the "leftEdge" optimizations, which 50 // allow each leaf to start the second pass as soon as the first pass of all 51 // previous leaves is completed. Additionally, the first leaf in the data will 52 // combine both passes into one, computing the final output data while 53 // generating its sum for the communication stage. 54 template <typename ScanBody> 55 struct Adder : public ScanBody 56 { 57 template <typename NodeImpl> 58 struct NodeWrapper : public NodeImpl 59 { 60 // Range of IDs this node represents 61 vtkm::Id2 Range{ -1, -1 }; 62 63 // Connections: 64 NodeWrapper* Parent{ nullptr }; 65 NodeWrapper* Left{ nullptr }; 66 NodeWrapper* Right{ nullptr }; 67 68 // Special flag to mark nodes on the far left edge of the tree. This allows 69 // various optimization that start the second pass sooner on some ranges. 70 bool LeftEdge{ false }; 71 72 // Pad the node out to the size of a cache line to prevent false sharing: 73 static constexpr size_t DataSize = 74 sizeof(NodeImpl) + sizeof(vtkm::Id2) + 3 * sizeof(NodeWrapper*) + sizeof(bool); 75 static constexpr size_t NumCacheLines = CeilDivide<size_t>(DataSize, VTKM_CACHE_LINE_SIZE); 76 static constexpr size_t PaddingSize = NumCacheLines * VTKM_CACHE_LINE_SIZE - DataSize; 77 unsigned char Padding[PaddingSize]; 78 }; 79 80 using Node = NodeWrapper<typename ScanBody::Node>; 81 using ValueType = typename ScanBody::ValueType; 82 83 vtkm::Id LeafSize; 84 std::vector<Node> Nodes; 85 size_t NextNode; 86 87 // Use ScanBody's ctor: 88 using ScanBody::ScanBody; 89 90 // Returns the total array sum: ExecuteAdder91 ValueType Execute(const vtkm::Id2& range) 92 { 93 Node* rootNode = nullptr; 94 95 VTKM_OPENMP_DIRECTIVE(parallel default(shared)) 96 { 97 VTKM_OPENMP_DIRECTIVE(single) 98 { 99 // Allocate nodes, prep metadata: 100 this->Prepare(range); 101 102 // Compute the partition and node sums: 103 rootNode = this->AllocNode(); 104 rootNode->Range = range; 105 rootNode->LeftEdge = true; 106 ScanBody::InitializeRootNode(rootNode); 107 108 this->Scan(rootNode); 109 } // end single 110 } // end parallel 111 112 return rootNode ? ScanBody::GetFinalResult(rootNode) 113 : vtkm::TypeTraits<ValueType>::ZeroInitialization(); 114 } 115 116 private: 117 // Returns the next available node in a thread-safe manner. AllocNodeAdder118 Node* AllocNode() 119 { 120 size_t nodeIdx; 121 122 // GCC emits a false positive "value computed but not used" for this block: 123 #pragma GCC diagnostic push 124 #pragma GCC diagnostic ignored "-Wunused-value" 125 126 VTKM_OPENMP_DIRECTIVE(atomic capture) 127 { 128 nodeIdx = this->NextNode; 129 ++this->NextNode; 130 } 131 132 #pragma GCC diagnostic pop 133 134 VTKM_ASSERT(nodeIdx < this->Nodes.size()); 135 136 return &this->Nodes[nodeIdx]; 137 } 138 139 // Does the range represent a leave node? IsLeafAdder140 bool IsLeaf(const vtkm::Id2& range) const { return (range[1] - range[0]) <= this->LeafSize; } 141 142 // Use to split ranges. Ensures that the first range is always a multiple of 143 // LeafSize, when possible. ComputeMidpointAdder144 vtkm::Id ComputeMidpoint(const vtkm::Id2& range) const 145 { 146 const vtkm::Id n = range[1] - range[0]; 147 const vtkm::Id np = this->LeafSize; 148 149 return (((n / 2) + (np - 1)) / np) * np + range[0]; 150 } 151 PrepareAdder152 void Prepare(const vtkm::Id2& range) 153 { 154 // Figure out how many values each thread should handle: 155 vtkm::Id numVals = range[1] - range[0]; 156 int numThreads = omp_get_num_threads(); 157 vtkm::Id chunksPerThread = 8; 158 vtkm::Id numChunks; 159 ComputeChunkSize( 160 numVals, numThreads, chunksPerThread, sizeof(ValueType), numChunks, this->LeafSize); 161 162 // Compute an upper-bound of the number of nodes in the tree: 163 std::size_t numNodes = static_cast<std::size_t>(numChunks); 164 while (numChunks > 1) 165 { 166 numChunks = (numChunks + 1) / 2; 167 numNodes += static_cast<std::size_t>(numChunks); 168 } 169 this->Nodes.resize(numNodes); 170 this->NextNode = 0; 171 } 172 173 // Build the tree and compute the sums: ScanAdder174 void Scan(Node* node) 175 { 176 if (!this->IsLeaf(node->Range)) 177 { // split range: 178 vtkm::Id midpoint = this->ComputeMidpoint(node->Range); 179 180 Node* right = this->AllocNode(); 181 right->Parent = node; 182 node->Right = right; 183 right->Range = vtkm::Id2(midpoint, node->Range[1]); 184 ScanBody::InitializeChildNode(right, node, ChildType::Right, false); 185 186 // Intel compilers seem to have trouble following the 'this' pointer 187 // when launching tasks, resulting in a corrupt task environment. 188 // Explicitly copying the pointer into a local variable seems to fix this. 189 auto explicitThis = this; 190 191 VTKM_OPENMP_DIRECTIVE(taskgroup) 192 { 193 VTKM_OPENMP_DIRECTIVE(task) { explicitThis->Scan(right); } // end right task 194 195 Node* left = this->AllocNode(); 196 left->Parent = node; 197 node->Left = left; 198 left->Range = vtkm::Id2(node->Range[0], midpoint); 199 left->LeftEdge = node->LeftEdge; 200 ScanBody::InitializeChildNode(left, node, ChildType::Left, left->LeftEdge); 201 this->Scan(left); 202 203 } // end task group. Both l/r sums will be finished here. 204 205 ScanBody::CombineSummaries(node, node->Left, node->Right); 206 if (node->LeftEdge) 207 { 208 this->UpdateOutput(node); 209 } 210 } 211 else 212 { // Compute sums: 213 ScanBody::ComputeSummary(node, node->Range, node->LeftEdge); 214 } 215 } 216 UpdateOutputAdder217 void UpdateOutput(Node* node) 218 { 219 if (node->Left != nullptr) 220 { 221 assert(node->Right != nullptr); 222 ScanBody::PropagateSummaries(node, node->Left, node->Right, node->LeftEdge); 223 224 // if this node is on the left edge, we know that the left child's 225 // output is already updated, so only descend to the right: 226 if (node->LeftEdge) 227 { 228 this->UpdateOutput(node->Right); 229 } 230 else // Otherwise descent into both: 231 { 232 // Intel compilers seem to have trouble following the 'this' pointer 233 // when launching tasks, resulting in a corrupt task environment. 234 // Explicitly copying the pointer into a local variable seems to fix 235 // this. 236 auto explicitThis = this; 237 238 // no taskgroup/sync needed other than the final barrier of the parallel 239 // section. 240 VTKM_OPENMP_DIRECTIVE(task) { explicitThis->UpdateOutput(node->Right); } // end task 241 this->UpdateOutput(node->Left); 242 } 243 } 244 else 245 { 246 ScanBody::UpdateOutput(node, node->Range, node->LeftEdge); 247 } 248 } 249 }; 250 251 template <typename InPortalT, typename OutPortalT, typename RawFunctorT> 252 struct ScanExclusiveBody 253 { 254 using ValueType = typename InPortalT::ValueType; 255 using FunctorType = internal::WrappedBinaryOperator<ValueType, RawFunctorT>; 256 257 InPortalT InPortal; 258 OutPortalT OutPortal; 259 FunctorType Functor; 260 ValueType InitialValue; 261 262 struct Node 263 { 264 // Sum of all values in range 265 ValueType Sum{ vtkm::TypeTraits<ValueType>::ZeroInitialization() }; 266 267 // The sum of all elements prior to this node's range 268 ValueType Carry{ vtkm::TypeTraits<ValueType>::ZeroInitialization() }; 269 }; 270 ScanExclusiveBodyScanExclusiveBody271 ScanExclusiveBody(const InPortalT& inPortal, 272 const OutPortalT& outPortal, 273 const RawFunctorT& functor, 274 const ValueType& init) 275 : InPortal(inPortal) 276 , OutPortal(outPortal) 277 , Functor(functor) 278 , InitialValue(init) 279 { 280 } 281 282 // Initialize the root of the node tree InitializeRootNodeScanExclusiveBody283 void InitializeRootNode(Node* /*root*/) {} 284 InitializeChildNodeScanExclusiveBody285 void InitializeChildNode(Node* /*node*/, 286 const Node* /*parent*/, 287 ChildType /*type*/, 288 bool /*leftEdge*/) 289 { 290 } 291 ComputeSummaryScanExclusiveBody292 void ComputeSummary(Node* node, const vtkm::Id2& range, bool leftEdge) 293 { 294 auto input = vtkm::cont::ArrayPortalToIteratorBegin(this->InPortal); 295 node->Sum = input[range[0]]; 296 297 // If this block is on the left edge, we can update the output while we 298 // compute the sum: 299 if (leftEdge) 300 { 301 // Set leftEdge arg to false to force the update: 302 node->Sum = UpdateOutputImpl(node, range, false, true); 303 } 304 else // Otherwise, only compute the sum and update the output in pass 2. 305 { 306 for (vtkm::Id i = range[0] + 1; i < range[1]; ++i) 307 { 308 node->Sum = this->Functor(node->Sum, input[i]); 309 } 310 } 311 } 312 CombineSummariesScanExclusiveBody313 void CombineSummaries(Node* parent, const Node* left, const Node* right) 314 { 315 parent->Sum = this->Functor(left->Sum, right->Sum); 316 } 317 PropagateSummariesScanExclusiveBody318 void PropagateSummaries(const Node* parent, Node* left, Node* right, bool leftEdge) 319 { 320 left->Carry = parent->Carry; 321 right->Carry = leftEdge ? left->Sum : this->Functor(parent->Carry, left->Sum); 322 } 323 UpdateOutputScanExclusiveBody324 void UpdateOutput(const Node* node, const vtkm::Id2& range, bool leftEdge) 325 { 326 this->UpdateOutputImpl(node, range, leftEdge, false); 327 } 328 UpdateOutputImplScanExclusiveBody329 ValueType UpdateOutputImpl(const Node* node, const vtkm::Id2& range, bool skip, bool useInit) 330 { 331 if (skip) 332 { 333 // Do nothing; this was already done in ComputeSummary. 334 return vtkm::TypeTraits<ValueType>::ZeroInitialization(); 335 } 336 337 auto input = vtkm::cont::ArrayPortalToIteratorBegin(this->InPortal); 338 auto output = vtkm::cont::ArrayPortalToIteratorBegin(this->OutPortal); 339 340 // Be careful with the order input/output are modified. They might be 341 // pointing at the same data: 342 ValueType carry = useInit ? this->InitialValue : node->Carry; 343 vtkm::Id end = range[1]; 344 345 for (vtkm::Id i = range[0]; i < end; ++i) 346 { 347 output[i] = this->Functor(carry, input[i]); 348 349 using std::swap; // Enable ADL 350 swap(output[i], carry); 351 } 352 353 return carry; 354 } 355 356 // Compute the final sum from the node's metadata: GetFinalResultScanExclusiveBody357 ValueType GetFinalResult(const Node* node) const { return this->Functor(node->Sum, node->Carry); } 358 }; 359 360 template <typename InPortalT, typename OutPortalT, typename RawFunctorT> 361 struct ScanInclusiveBody 362 { 363 using ValueType = typename InPortalT::ValueType; 364 using FunctorType = internal::WrappedBinaryOperator<ValueType, RawFunctorT>; 365 366 InPortalT InPortal; 367 OutPortalT OutPortal; 368 FunctorType Functor; 369 370 struct Node 371 { 372 // Sum of all values in range 373 ValueType Sum{ vtkm::TypeTraits<ValueType>::ZeroInitialization() }; 374 375 // The sum of all elements prior to this node's range 376 ValueType Carry{ vtkm::TypeTraits<ValueType>::ZeroInitialization() }; 377 }; 378 ScanInclusiveBodyScanInclusiveBody379 ScanInclusiveBody(const InPortalT& inPortal, 380 const OutPortalT& outPortal, 381 const RawFunctorT& functor) 382 : InPortal(inPortal) 383 , OutPortal(outPortal) 384 , Functor(functor) 385 { 386 } 387 388 // Initialize the root of the node tree InitializeRootNodeScanInclusiveBody389 void InitializeRootNode(Node*) 390 { 391 // no-op 392 } 393 InitializeChildNodeScanInclusiveBody394 void InitializeChildNode(Node*, const Node*, ChildType, bool) 395 { 396 // no-op 397 } 398 ComputeSummaryScanInclusiveBody399 void ComputeSummary(Node* node, const vtkm::Id2& range, bool leftEdge) 400 { 401 // If this block is on the left edge, we can update the output while we 402 // compute the sum: 403 if (leftEdge) 404 { 405 node->Sum = UpdateOutputImpl(node, range, false, false); 406 } 407 else // Otherwise, only compute the sum and update the output in pass 2. 408 { 409 auto input = vtkm::cont::ArrayPortalToIteratorBegin(this->InPortal); 410 node->Sum = input[range[0]]; 411 for (vtkm::Id i = range[0] + 1; i < range[1]; ++i) 412 { 413 node->Sum = this->Functor(node->Sum, input[i]); 414 } 415 } 416 } 417 CombineSummariesScanInclusiveBody418 void CombineSummaries(Node* parent, const Node* left, const Node* right) 419 { 420 parent->Sum = this->Functor(left->Sum, right->Sum); 421 } 422 PropagateSummariesScanInclusiveBody423 void PropagateSummaries(const Node* parent, Node* left, Node* right, bool leftEdge) 424 { 425 left->Carry = parent->Carry; 426 right->Carry = leftEdge ? left->Sum : this->Functor(parent->Carry, left->Sum); 427 } 428 UpdateOutputScanInclusiveBody429 void UpdateOutput(const Node* node, const vtkm::Id2& range, bool leftEdge) 430 { 431 UpdateOutputImpl(node, range, leftEdge, true); 432 } 433 UpdateOutputImplScanInclusiveBody434 ValueType UpdateOutputImpl(const Node* node, const vtkm::Id2& range, bool skip, bool useCarry) 435 { 436 if (skip) 437 { 438 // Do nothing; this was already done in ComputeSummary. 439 return vtkm::TypeTraits<ValueType>::ZeroInitialization(); 440 } 441 442 auto input = vtkm::cont::ArrayPortalToIteratorBegin(this->InPortal); 443 auto output = vtkm::cont::ArrayPortalToIteratorBegin(this->OutPortal); 444 445 vtkm::Id start = range[0]; 446 vtkm::Id end = range[1]; 447 ValueType carry = node->Carry; 448 449 // Initialize with the first value if this is the first range: 450 if (!useCarry && start < end) 451 { 452 carry = input[start]; 453 output[start] = carry; 454 ++start; 455 } 456 457 for (vtkm::Id i = start; i < end; ++i) 458 { 459 output[i] = this->Functor(carry, input[i]); 460 carry = output[i]; 461 } 462 463 return output[end - 1]; 464 } 465 466 // Compute the final sum from the node's metadata: GetFinalResultScanInclusiveBody467 ValueType GetFinalResult(const Node* node) const { return node->Sum; } 468 }; 469 470 } // end namespace scan 471 472 template <typename InPortalT, typename OutPortalT, typename FunctorT> 473 using ScanExclusiveHelper = scan::Adder<scan::ScanExclusiveBody<InPortalT, OutPortalT, FunctorT>>; 474 475 template <typename InPortalT, typename OutPortalT, typename FunctorT> 476 using ScanInclusiveHelper = scan::Adder<scan::ScanInclusiveBody<InPortalT, OutPortalT, FunctorT>>; 477 } 478 } 479 } // end namespace vtkm::cont::openmp 480