1 //============================================================================ 2 // Copyright (c) Kitware, Inc. 3 // All rights reserved. 4 // See LICENSE.txt for details. 5 // This software is distributed WITHOUT ANY WARRANTY; without even 6 // the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR 7 // PURPOSE. See the above copyright notice for more information. 8 // 9 // Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS). 10 // Copyright 2018 UT-Battelle, LLC. 11 // Copyright 2018 Los Alamos National Security. 12 // 13 // Under the terms of Contract DE-NA0003525 with NTESS, 14 // the U.S. Government retains certain rights in this software. 15 // 16 // Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National 17 // Laboratory (LANL), the U.S. Government retains certain rights in 18 // this software. 19 //============================================================================ 20 21 #include <vtkm/cont/openmp/internal/DeviceAdapterTagOpenMP.h> 22 #include <vtkm/cont/openmp/internal/FunctorsOpenMP.h> 23 24 #include <vtkm/cont/internal/FunctorsGeneral.h> 25 26 #include <vtkm/Types.h> 27 #include <vtkm/cont/ArrayHandle.h> 28 29 #include <omp.h> 30 31 namespace vtkm 32 { 33 namespace cont 34 { 35 namespace openmp 36 { 37 namespace scan 38 { 39 40 enum class ChildType 41 { 42 Left, 43 Right 44 }; 45 46 // Generic implementation of modified Ladner & Fischer 1977 "adder" algorithm 47 // used for backbone of exclusive/inclusive scans. Language in comments is 48 // specific to computing a sum, but the implementation should be generic enough 49 // for any scan operation. 50 // 51 // The basic idea is that a tree structure is used to partition the input into 52 // sets of LeafSize. Each leaf of the tree is processed in two stages: First, 53 // the sum of each leaf is computed, and this information is pushed up the tree 54 // to compute the sum of each node's child leaves. Then the partial sum at the 55 // start of each node is computed and pushed down the tree (the "carry" 56 // values). In the second pass through each leaf's data, these partial sums are 57 // used to compute the final output from the carry value and the input data. 58 // 59 // The passes will likely overlap due to the "leftEdge" optimizations, which 60 // allow each leaf to start the second pass as soon as the first pass of all 61 // previous leaves is completed. Additionally, the first leaf in the data will 62 // combine both passes into one, computing the final output data while 63 // generating its sum for the communication stage. 64 template <typename ScanBody> 65 struct Adder : public ScanBody 66 { 67 template <typename NodeImpl> 68 struct NodeWrapper : public NodeImpl 69 { 70 // Range of IDs this node represents 71 vtkm::Id2 Range{ -1, -1 }; 72 73 // Connections: 74 NodeWrapper* Parent{ nullptr }; 75 NodeWrapper* Left{ nullptr }; 76 NodeWrapper* Right{ nullptr }; 77 78 // Special flag to mark nodes on the far left edge of the tree. This allows 79 // various optimization that start the second pass sooner on some ranges. 80 bool LeftEdge{ false }; 81 82 // Pad the node out to the size of a cache line to prevent false sharing: 83 static constexpr size_t DataSize = 84 sizeof(NodeImpl) + sizeof(vtkm::Id2) + 3 * sizeof(NodeWrapper*) + sizeof(bool); 85 static constexpr size_t NumCacheLines = CeilDivide<size_t>(DataSize, CACHE_LINE_SIZE); 86 static constexpr size_t PaddingSize = NumCacheLines * CACHE_LINE_SIZE - DataSize; 87 unsigned char Padding[PaddingSize]; 88 }; 89 90 using Node = NodeWrapper<typename ScanBody::Node>; 91 using ValueType = typename ScanBody::ValueType; 92 93 vtkm::Id LeafSize; 94 std::vector<Node> Nodes; 95 size_t NextNode; 96 97 // Use ScanBody's ctor: 98 using ScanBody::ScanBody; 99 100 // Returns the total array sum: ExecuteAdder101 ValueType Execute(const vtkm::Id2& range) 102 { 103 Node* rootNode = nullptr; 104 105 VTKM_OPENMP_DIRECTIVE(parallel default(shared)) 106 { 107 VTKM_OPENMP_DIRECTIVE(single) 108 { 109 // Allocate nodes, prep metadata: 110 this->Prepare(range); 111 112 // Compute the partition and node sums: 113 rootNode = this->AllocNode(); 114 rootNode->Range = range; 115 rootNode->LeftEdge = true; 116 ScanBody::InitializeRootNode(rootNode); 117 118 this->Scan(rootNode); 119 } // end single 120 } // end parallel 121 122 return rootNode ? ScanBody::GetFinalResult(rootNode) 123 : vtkm::TypeTraits<ValueType>::ZeroInitialization(); 124 } 125 126 private: 127 // Returns the next available node in a thread-safe manner. AllocNodeAdder128 Node* AllocNode() 129 { 130 size_t nodeIdx; 131 132 // GCC emits a false positive "value computed but not used" for this block: 133 #pragma GCC diagnostic push 134 #pragma GCC diagnostic ignored "-Wunused-value" 135 136 VTKM_OPENMP_DIRECTIVE(atomic capture) 137 { 138 nodeIdx = this->NextNode; 139 ++this->NextNode; 140 } 141 142 #pragma GCC diagnostic pop 143 144 VTKM_ASSERT(nodeIdx < this->Nodes.size()); 145 146 return &this->Nodes[nodeIdx]; 147 } 148 149 // Does the range represent a leave node? IsLeafAdder150 bool IsLeaf(const vtkm::Id2& range) const { return (range[1] - range[0]) <= this->LeafSize; } 151 152 // Use to split ranges. Ensures that the first range is always a multiple of 153 // LeafSize, when possible. ComputeMidpointAdder154 vtkm::Id ComputeMidpoint(const vtkm::Id2& range) const 155 { 156 const vtkm::Id n = range[1] - range[0]; 157 const vtkm::Id np = this->LeafSize; 158 159 return (((n / 2) + (np - 1)) / np) * np + range[0]; 160 } 161 PrepareAdder162 void Prepare(const vtkm::Id2& range) 163 { 164 // Figure out how many values each thread should handle: 165 vtkm::Id numVals = range[1] - range[0]; 166 int numThreads = omp_get_num_threads(); 167 vtkm::Id chunksPerThread = 8; 168 vtkm::Id numChunks; 169 ComputeChunkSize( 170 numVals, numThreads, chunksPerThread, sizeof(ValueType), numChunks, this->LeafSize); 171 172 // Compute an upper-bound of the number of nodes in the tree: 173 size_t numNodes = numChunks; 174 while (numChunks > 1) 175 { 176 numChunks = (numChunks + 1) / 2; 177 numNodes += numChunks; 178 } 179 this->Nodes.resize(numNodes); 180 this->NextNode = 0; 181 } 182 183 // Build the tree and compute the sums: ScanAdder184 void Scan(Node* node) 185 { 186 if (!this->IsLeaf(node->Range)) 187 { // split range: 188 vtkm::Id midpoint = this->ComputeMidpoint(node->Range); 189 190 Node* right = this->AllocNode(); 191 right->Parent = node; 192 node->Right = right; 193 right->Range = vtkm::Id2(midpoint, node->Range[1]); 194 ScanBody::InitializeChildNode(right, node, ChildType::Right, false); 195 196 // Intel compilers seem to have trouble following the 'this' pointer 197 // when launching tasks, resulting in a corrupt task environment. 198 // Explicitly copying the pointer into a local variable seems to fix this. 199 auto explicitThis = this; 200 201 VTKM_OPENMP_DIRECTIVE(taskgroup) 202 { 203 VTKM_OPENMP_DIRECTIVE(task) { explicitThis->Scan(right); } // end right task 204 205 Node* left = this->AllocNode(); 206 left->Parent = node; 207 node->Left = left; 208 left->Range = vtkm::Id2(node->Range[0], midpoint); 209 left->LeftEdge = node->LeftEdge; 210 ScanBody::InitializeChildNode(left, node, ChildType::Left, left->LeftEdge); 211 this->Scan(left); 212 213 } // end task group. Both l/r sums will be finished here. 214 215 ScanBody::CombineSummaries(node, node->Left, node->Right); 216 if (node->LeftEdge) 217 { 218 this->UpdateOutput(node); 219 } 220 } 221 else 222 { // Compute sums: 223 ScanBody::ComputeSummary(node, node->Range, node->LeftEdge); 224 } 225 } 226 UpdateOutputAdder227 void UpdateOutput(Node* node) 228 { 229 if (node->Left != nullptr) 230 { 231 assert(node->Right != nullptr); 232 ScanBody::PropagateSummaries(node, node->Left, node->Right, node->LeftEdge); 233 234 // if this node is on the left edge, we know that the left child's 235 // output is already updated, so only descend to the right: 236 if (node->LeftEdge) 237 { 238 this->UpdateOutput(node->Right); 239 } 240 else // Otherwise descent into both: 241 { 242 // Intel compilers seem to have trouble following the 'this' pointer 243 // when launching tasks, resulting in a corrupt task environment. 244 // Explicitly copying the pointer into a local variable seems to fix 245 // this. 246 auto explicitThis = this; 247 248 // no taskgroup/sync needed other than the final barrier of the parallel 249 // section. 250 VTKM_OPENMP_DIRECTIVE(task) { explicitThis->UpdateOutput(node->Right); } // end task 251 this->UpdateOutput(node->Left); 252 } 253 } 254 else 255 { 256 ScanBody::UpdateOutput(node, node->Range, node->LeftEdge); 257 } 258 } 259 }; 260 261 template <typename InPortalT, typename OutPortalT, typename RawFunctorT> 262 struct ScanExclusiveBody 263 { 264 using ValueType = typename InPortalT::ValueType; 265 using FunctorType = internal::WrappedBinaryOperator<ValueType, RawFunctorT>; 266 267 InPortalT InPortal; 268 OutPortalT OutPortal; 269 FunctorType Functor; 270 ValueType InitialValue; 271 272 struct Node 273 { 274 // Sum of all values in range 275 ValueType Sum{ vtkm::TypeTraits<ValueType>::ZeroInitialization() }; 276 277 // The sum of all elements prior to this node's range 278 ValueType Carry{ vtkm::TypeTraits<ValueType>::ZeroInitialization() }; 279 }; 280 ScanExclusiveBodyScanExclusiveBody281 ScanExclusiveBody(const InPortalT& inPortal, 282 const OutPortalT& outPortal, 283 const RawFunctorT& functor, 284 const ValueType& init) 285 : InPortal(inPortal) 286 , OutPortal(outPortal) 287 , Functor(functor) 288 , InitialValue(init) 289 { 290 } 291 292 // Initialize the root of the node tree InitializeRootNodeScanExclusiveBody293 void InitializeRootNode(Node* /*root*/) {} 294 InitializeChildNodeScanExclusiveBody295 void InitializeChildNode(Node* /*node*/, 296 const Node* /*parent*/, 297 ChildType /*type*/, 298 bool /*leftEdge*/) 299 { 300 } 301 ComputeSummaryScanExclusiveBody302 void ComputeSummary(Node* node, const vtkm::Id2& range, bool leftEdge) 303 { 304 auto input = vtkm::cont::ArrayPortalToIteratorBegin(this->InPortal); 305 node->Sum = input[range[0]]; 306 307 // If this block is on the left edge, we can update the output while we 308 // compute the sum: 309 if (leftEdge) 310 { 311 // Set leftEdge arg to false to force the update: 312 node->Sum = UpdateOutputImpl(node, range, false, true); 313 } 314 else // Otherwise, only compute the sum and update the output in pass 2. 315 { 316 for (vtkm::Id i = range[0] + 1; i < range[1]; ++i) 317 { 318 node->Sum = this->Functor(node->Sum, input[i]); 319 } 320 } 321 } 322 CombineSummariesScanExclusiveBody323 void CombineSummaries(Node* parent, const Node* left, const Node* right) 324 { 325 parent->Sum = this->Functor(left->Sum, right->Sum); 326 } 327 PropagateSummariesScanExclusiveBody328 void PropagateSummaries(const Node* parent, Node* left, Node* right, bool leftEdge) 329 { 330 left->Carry = parent->Carry; 331 right->Carry = leftEdge ? left->Sum : this->Functor(parent->Carry, left->Sum); 332 } 333 UpdateOutputScanExclusiveBody334 void UpdateOutput(const Node* node, const vtkm::Id2& range, bool leftEdge) 335 { 336 this->UpdateOutputImpl(node, range, leftEdge, false); 337 } 338 UpdateOutputImplScanExclusiveBody339 ValueType UpdateOutputImpl(const Node* node, const vtkm::Id2& range, bool skip, bool useInit) 340 { 341 if (skip) 342 { 343 // Do nothing; this was already done in ComputeSummary. 344 return vtkm::TypeTraits<ValueType>::ZeroInitialization(); 345 } 346 347 auto input = vtkm::cont::ArrayPortalToIteratorBegin(this->InPortal); 348 auto output = vtkm::cont::ArrayPortalToIteratorBegin(this->OutPortal); 349 350 // Be careful with the order input/output are modified. They might be 351 // pointing at the same data: 352 ValueType carry = useInit ? this->InitialValue : node->Carry; 353 vtkm::Id end = range[1]; 354 355 for (vtkm::Id i = range[0]; i < end; ++i) 356 { 357 output[i] = this->Functor(carry, input[i]); 358 359 using std::swap; // Enable ADL 360 swap(output[i], carry); 361 } 362 363 return carry; 364 } 365 366 // Compute the final sum from the node's metadata: GetFinalResultScanExclusiveBody367 ValueType GetFinalResult(const Node* node) const { return this->Functor(node->Sum, node->Carry); } 368 }; 369 370 template <typename InPortalT, typename OutPortalT, typename RawFunctorT> 371 struct ScanInclusiveBody 372 { 373 using ValueType = typename InPortalT::ValueType; 374 using FunctorType = internal::WrappedBinaryOperator<ValueType, RawFunctorT>; 375 376 InPortalT InPortal; 377 OutPortalT OutPortal; 378 FunctorType Functor; 379 380 struct Node 381 { 382 // Sum of all values in range 383 ValueType Sum{ vtkm::TypeTraits<ValueType>::ZeroInitialization() }; 384 385 // The sum of all elements prior to this node's range 386 ValueType Carry{ vtkm::TypeTraits<ValueType>::ZeroInitialization() }; 387 }; 388 ScanInclusiveBodyScanInclusiveBody389 ScanInclusiveBody(const InPortalT& inPortal, 390 const OutPortalT& outPortal, 391 const RawFunctorT& functor) 392 : InPortal(inPortal) 393 , OutPortal(outPortal) 394 , Functor(functor) 395 { 396 } 397 398 // Initialize the root of the node tree InitializeRootNodeScanInclusiveBody399 void InitializeRootNode(Node*) 400 { 401 // no-op 402 } 403 InitializeChildNodeScanInclusiveBody404 void InitializeChildNode(Node*, const Node*, ChildType, bool) 405 { 406 // no-op 407 } 408 ComputeSummaryScanInclusiveBody409 void ComputeSummary(Node* node, const vtkm::Id2& range, bool leftEdge) 410 { 411 // If this block is on the left edge, we can update the output while we 412 // compute the sum: 413 if (leftEdge) 414 { 415 node->Sum = UpdateOutputImpl(node, range, false, false); 416 } 417 else // Otherwise, only compute the sum and update the output in pass 2. 418 { 419 auto input = vtkm::cont::ArrayPortalToIteratorBegin(this->InPortal); 420 node->Sum = input[range[0]]; 421 for (vtkm::Id i = range[0] + 1; i < range[1]; ++i) 422 { 423 node->Sum = this->Functor(node->Sum, input[i]); 424 } 425 } 426 } 427 CombineSummariesScanInclusiveBody428 void CombineSummaries(Node* parent, const Node* left, const Node* right) 429 { 430 parent->Sum = this->Functor(left->Sum, right->Sum); 431 } 432 PropagateSummariesScanInclusiveBody433 void PropagateSummaries(const Node* parent, Node* left, Node* right, bool leftEdge) 434 { 435 left->Carry = parent->Carry; 436 right->Carry = leftEdge ? left->Sum : this->Functor(parent->Carry, left->Sum); 437 } 438 UpdateOutputScanInclusiveBody439 void UpdateOutput(const Node* node, const vtkm::Id2& range, bool leftEdge) 440 { 441 UpdateOutputImpl(node, range, leftEdge, true); 442 } 443 UpdateOutputImplScanInclusiveBody444 ValueType UpdateOutputImpl(const Node* node, const vtkm::Id2& range, bool skip, bool useCarry) 445 { 446 if (skip) 447 { 448 // Do nothing; this was already done in ComputeSummary. 449 return vtkm::TypeTraits<ValueType>::ZeroInitialization(); 450 } 451 452 auto input = vtkm::cont::ArrayPortalToIteratorBegin(this->InPortal); 453 auto output = vtkm::cont::ArrayPortalToIteratorBegin(this->OutPortal); 454 455 vtkm::Id start = range[0]; 456 vtkm::Id end = range[1]; 457 ValueType carry = node->Carry; 458 459 // Initialize with the first value if this is the first range: 460 if (!useCarry && start < end) 461 { 462 carry = input[start]; 463 output[start] = carry; 464 ++start; 465 } 466 467 for (vtkm::Id i = start; i < end; ++i) 468 { 469 output[i] = this->Functor(carry, input[i]); 470 carry = output[i]; 471 } 472 473 return output[end - 1]; 474 } 475 476 // Compute the final sum from the node's metadata: GetFinalResultScanInclusiveBody477 ValueType GetFinalResult(const Node* node) const { return node->Sum; } 478 }; 479 480 } // end namespace scan 481 482 template <typename InPortalT, typename OutPortalT, typename FunctorT> 483 using ScanExclusiveHelper = scan::Adder<scan::ScanExclusiveBody<InPortalT, OutPortalT, FunctorT>>; 484 485 template <typename InPortalT, typename OutPortalT, typename FunctorT> 486 using ScanInclusiveHelper = scan::Adder<scan::ScanInclusiveBody<InPortalT, OutPortalT, FunctorT>>; 487 } 488 } 489 } // end namespace vtkm::cont::openmp 490