1 /* 2 //@HEADER 3 // ************************************************************************ 4 // 5 // Kokkos v. 3.0 6 // Copyright (2020) National Technology & Engineering 7 // Solutions of Sandia, LLC (NTESS). 8 // 9 // Under the terms of Contract DE-NA0003525 with NTESS, 10 // the U.S. Government retains certain rights in this software. 11 // 12 // Redistribution and use in source and binary forms, with or without 13 // modification, are permitted provided that the following conditions are 14 // met: 15 // 16 // 1. Redistributions of source code must retain the above copyright 17 // notice, this list of conditions and the following disclaimer. 18 // 19 // 2. Redistributions in binary form must reproduce the above copyright 20 // notice, this list of conditions and the following disclaimer in the 21 // documentation and/or other materials provided with the distribution. 22 // 23 // 3. Neither the name of the Corporation nor the names of the 24 // contributors may be used to endorse or promote products derived from 25 // this software without specific prior written permission. 26 // 27 // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY 28 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 30 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE 31 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 32 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 33 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 34 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 35 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 36 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 37 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 38 // 39 // Questions? Contact Christian R. Trott (crtrott@sandia.gov) 40 // 41 // ************************************************************************ 42 //@HEADER 43 */ 44 45 #ifndef KOKKOS_CUDA_PARALLEL_HPP 46 #define KOKKOS_CUDA_PARALLEL_HPP 47 48 #include <Kokkos_Macros.hpp> 49 #if defined(KOKKOS_ENABLE_CUDA) 50 51 #include <algorithm> 52 #include <string> 53 #include <cstdio> 54 #include <cstdint> 55 56 #include <utility> 57 #include <Kokkos_Parallel.hpp> 58 59 #include <Cuda/Kokkos_Cuda_KernelLaunch.hpp> 60 #include <Cuda/Kokkos_Cuda_ReduceScan.hpp> 61 #include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp> 62 #include <Cuda/Kokkos_Cuda_Locks.hpp> 63 #include <Cuda/Kokkos_Cuda_Team.hpp> 64 #include <Kokkos_Vectorization.hpp> 65 #include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp> 66 67 #include <impl/Kokkos_Tools.hpp> 68 #include <typeinfo> 69 70 #include <KokkosExp_MDRangePolicy.hpp> 71 #include <impl/KokkosExp_IterateTileGPU.hpp> 72 73 //---------------------------------------------------------------------------- 74 //---------------------------------------------------------------------------- 75 76 namespace Kokkos { 77 78 extern bool show_warnings() noexcept; 79 80 namespace Impl { 81 82 template <class... Properties> 83 class TeamPolicyInternal<Kokkos::Cuda, Properties...> 84 : public PolicyTraits<Properties...> { 85 public: 86 //! Tag this class as a kokkos execution policy 87 using execution_policy = TeamPolicyInternal; 88 89 using traits = PolicyTraits<Properties...>; 90 91 template <class ExecSpace, class... OtherProperties> 92 friend class TeamPolicyInternal; 93 94 private: 95 enum { MAX_WARP = 8 }; 96 97 typename traits::execution_space m_space; 98 int m_league_size; 99 int m_team_size; 100 int m_vector_length; 101 int m_team_scratch_size[2]; 102 int m_thread_scratch_size[2]; 103 int m_chunk_size; 104 bool m_tune_team; 105 bool m_tune_vector; 106 107 public: 108 //! Execution space of this execution policy 109 using execution_space = Kokkos::Cuda; 110 111 template <class... OtherProperties> TeamPolicyInternal(const TeamPolicyInternal<OtherProperties...> & p)112 TeamPolicyInternal(const TeamPolicyInternal<OtherProperties...>& p) { 113 m_league_size = p.m_league_size; 114 m_team_size = p.m_team_size; 115 m_vector_length = p.m_vector_length; 116 m_team_scratch_size[0] = p.m_team_scratch_size[0]; 117 m_team_scratch_size[1] = p.m_team_scratch_size[1]; 118 m_thread_scratch_size[0] = p.m_thread_scratch_size[0]; 119 m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; 120 m_chunk_size = p.m_chunk_size; 121 m_space = p.m_space; 122 m_tune_team = p.m_tune_team; 123 m_tune_vector = p.m_tune_vector; 124 } 125 126 //---------------------------------------- 127 128 template <class FunctorType> team_size_max(const FunctorType & f,const ParallelForTag &) const129 int team_size_max(const FunctorType& f, const ParallelForTag&) const { 130 using closure_type = 131 Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>; 132 cudaFuncAttributes attr = 133 CudaParallelLaunch<closure_type, typename traits::launch_bounds>:: 134 get_cuda_func_attributes(); 135 int block_size = 136 Kokkos::Impl::cuda_get_max_block_size<FunctorType, 137 typename traits::launch_bounds>( 138 space().impl_internal_space_instance(), attr, f, 139 (size_t)impl_vector_length(), 140 (size_t)team_scratch_size(0) + 2 * sizeof(double), 141 (size_t)thread_scratch_size(0) + sizeof(double)); 142 return block_size / impl_vector_length(); 143 } 144 145 template <class FunctorType> team_size_max(const FunctorType & f,const ParallelReduceTag &) const146 inline int team_size_max(const FunctorType& f, 147 const ParallelReduceTag&) const { 148 using functor_analysis_type = 149 Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, 150 TeamPolicyInternal, FunctorType>; 151 using reducer_type = typename Impl::ParallelReduceReturnValue< 152 void, typename functor_analysis_type::value_type, 153 FunctorType>::reducer_type; 154 using closure_type = 155 Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>, 156 reducer_type>; 157 return internal_team_size_max<closure_type>(f); 158 } 159 160 template <class FunctorType, class ReducerType> team_size_max(const FunctorType & f,const ReducerType &,const ParallelReduceTag &) const161 inline int team_size_max(const FunctorType& f, const ReducerType& /*r*/, 162 const ParallelReduceTag&) const { 163 using closure_type = 164 Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>, 165 ReducerType>; 166 return internal_team_size_max<closure_type>(f); 167 } 168 169 template <class FunctorType> team_size_recommended(const FunctorType & f,const ParallelForTag &) const170 int team_size_recommended(const FunctorType& f, const ParallelForTag&) const { 171 using closure_type = 172 Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>; 173 cudaFuncAttributes attr = 174 CudaParallelLaunch<closure_type, typename traits::launch_bounds>:: 175 get_cuda_func_attributes(); 176 const int block_size = 177 Kokkos::Impl::cuda_get_opt_block_size<FunctorType, 178 typename traits::launch_bounds>( 179 space().impl_internal_space_instance(), attr, f, 180 (size_t)impl_vector_length(), 181 (size_t)team_scratch_size(0) + 2 * sizeof(double), 182 (size_t)thread_scratch_size(0) + sizeof(double)); 183 return block_size / impl_vector_length(); 184 } 185 186 template <class FunctorType> team_size_recommended(const FunctorType & f,const ParallelReduceTag &) const187 inline int team_size_recommended(const FunctorType& f, 188 const ParallelReduceTag&) const { 189 using functor_analysis_type = 190 Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, 191 TeamPolicyInternal, FunctorType>; 192 using reducer_type = typename Impl::ParallelReduceReturnValue< 193 void, typename functor_analysis_type::value_type, 194 FunctorType>::reducer_type; 195 using closure_type = 196 Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>, 197 reducer_type>; 198 return internal_team_size_recommended<closure_type>(f); 199 } 200 201 template <class FunctorType, class ReducerType> team_size_recommended(const FunctorType & f,const ReducerType &,const ParallelReduceTag &) const202 int team_size_recommended(const FunctorType& f, const ReducerType&, 203 const ParallelReduceTag&) const { 204 using closure_type = 205 Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>, 206 ReducerType>; 207 return internal_team_size_recommended<closure_type>(f); 208 } 209 vector_length_max()210 inline static int vector_length_max() { return Impl::CudaTraits::WarpSize; } 211 verify_requested_vector_length(int requested_vector_length)212 inline static int verify_requested_vector_length( 213 int requested_vector_length) { 214 int test_vector_length = 215 std::min(requested_vector_length, vector_length_max()); 216 217 // Allow only power-of-two vector_length 218 if (!(is_integral_power_of_two(test_vector_length))) { 219 int test_pow2 = 1; 220 for (int i = 0; i < 5; i++) { 221 test_pow2 = test_pow2 << 1; 222 if (test_pow2 > test_vector_length) { 223 break; 224 } 225 } 226 test_vector_length = test_pow2 >> 1; 227 } 228 229 return test_vector_length; 230 } 231 scratch_size_max(int level)232 inline static int scratch_size_max(int level) { 233 return ( 234 level == 0 ? 1024 * 40 : // 48kB is the max for CUDA, but we need some 235 // for team_member.reduce etc. 236 20 * 1024 * 237 1024); // arbitrarily setting this to 20MB, for a Volta V100 238 // that would give us about 3.2GB for 2 teams per SM 239 } 240 241 //---------------------------------------- 242 vector_length() const243 KOKKOS_DEPRECATED inline int vector_length() const { 244 return impl_vector_length(); 245 } impl_vector_length() const246 inline int impl_vector_length() const { return m_vector_length; } team_size() const247 inline int team_size() const { return m_team_size; } league_size() const248 inline int league_size() const { return m_league_size; } impl_auto_team_size() const249 inline bool impl_auto_team_size() const { return m_tune_team; } impl_auto_vector_length() const250 inline bool impl_auto_vector_length() const { return m_tune_vector; } impl_set_team_size(size_t team_size)251 inline void impl_set_team_size(size_t team_size) { m_team_size = team_size; } impl_set_vector_length(size_t vector_length)252 inline void impl_set_vector_length(size_t vector_length) { 253 m_vector_length = vector_length; 254 } scratch_size(int level,int team_size_=-1) const255 inline int scratch_size(int level, int team_size_ = -1) const { 256 if (team_size_ < 0) team_size_ = m_team_size; 257 return m_team_scratch_size[level] + 258 team_size_ * m_thread_scratch_size[level]; 259 } team_scratch_size(int level) const260 inline int team_scratch_size(int level) const { 261 return m_team_scratch_size[level]; 262 } thread_scratch_size(int level) const263 inline int thread_scratch_size(int level) const { 264 return m_thread_scratch_size[level]; 265 } 266 space() const267 const typename traits::execution_space& space() const { return m_space; } 268 TeamPolicyInternal()269 TeamPolicyInternal() 270 : m_space(typename traits::execution_space()), 271 m_league_size(0), 272 m_team_size(-1), 273 m_vector_length(0), 274 m_team_scratch_size{0, 0}, 275 m_thread_scratch_size{0, 0}, 276 m_chunk_size(Impl::CudaTraits::WarpSize), 277 m_tune_team(false), 278 m_tune_vector(false) {} 279 280 /** \brief Specify league size, specify team size, specify vector length */ TeamPolicyInternal(const execution_space space_,int league_size_,int team_size_request,int vector_length_request=1)281 TeamPolicyInternal(const execution_space space_, int league_size_, 282 int team_size_request, int vector_length_request = 1) 283 : m_space(space_), 284 m_league_size(league_size_), 285 m_team_size(team_size_request), 286 m_vector_length( 287 (vector_length_request > 0) 288 ? verify_requested_vector_length(vector_length_request) 289 : verify_requested_vector_length(1)), 290 m_team_scratch_size{0, 0}, 291 m_thread_scratch_size{0, 0}, 292 m_chunk_size(Impl::CudaTraits::WarpSize), 293 m_tune_team(bool(team_size_request <= 0)), 294 m_tune_vector(bool(vector_length_request <= 0)) { 295 // Make sure league size is permissible 296 if (league_size_ >= int(Impl::cuda_internal_maximum_grid_count())) 297 Impl::throw_runtime_exception( 298 "Requested too large league_size for TeamPolicy on Cuda execution " 299 "space."); 300 301 // Make sure total block size is permissible 302 if (m_team_size * m_vector_length > 303 int(Impl::CudaTraits::MaxHierarchicalParallelism)) { 304 Impl::throw_runtime_exception( 305 std::string("Kokkos::TeamPolicy< Cuda > the team size is too large. " 306 "Team size x vector length must be smaller than 1024.")); 307 } 308 } 309 310 /** \brief Specify league size, request team size, specify vector length */ TeamPolicyInternal(const execution_space space_,int league_size_,const Kokkos::AUTO_t &,int vector_length_request=1)311 TeamPolicyInternal(const execution_space space_, int league_size_, 312 const Kokkos::AUTO_t& /* team_size_request */ 313 , 314 int vector_length_request = 1) 315 : TeamPolicyInternal(space_, league_size_, -1, vector_length_request) {} 316 317 /** \brief Specify league size, request team size and vector length */ TeamPolicyInternal(const execution_space space_,int league_size_,const Kokkos::AUTO_t &,const Kokkos::AUTO_t &)318 TeamPolicyInternal(const execution_space space_, int league_size_, 319 const Kokkos::AUTO_t& /* team_size_request */, 320 const Kokkos::AUTO_t& /* vector_length_request */ 321 ) 322 : TeamPolicyInternal(space_, league_size_, -1, -1) {} 323 324 /** \brief Specify league size, specify team size, request vector length */ TeamPolicyInternal(const execution_space space_,int league_size_,int team_size_request,const Kokkos::AUTO_t &)325 TeamPolicyInternal(const execution_space space_, int league_size_, 326 int team_size_request, const Kokkos::AUTO_t&) 327 : TeamPolicyInternal(space_, league_size_, team_size_request, -1) {} 328 TeamPolicyInternal(int league_size_,int team_size_request,int vector_length_request=1)329 TeamPolicyInternal(int league_size_, int team_size_request, 330 int vector_length_request = 1) 331 : TeamPolicyInternal(typename traits::execution_space(), league_size_, 332 team_size_request, vector_length_request) {} 333 TeamPolicyInternal(int league_size_,const Kokkos::AUTO_t & team_size_request,int vector_length_request=1)334 TeamPolicyInternal(int league_size_, const Kokkos::AUTO_t& team_size_request, 335 int vector_length_request = 1) 336 : TeamPolicyInternal(typename traits::execution_space(), league_size_, 337 team_size_request, vector_length_request) 338 339 {} 340 341 /** \brief Specify league size, request team size */ TeamPolicyInternal(int league_size_,const Kokkos::AUTO_t & team_size_request,const Kokkos::AUTO_t & vector_length_request)342 TeamPolicyInternal(int league_size_, const Kokkos::AUTO_t& team_size_request, 343 const Kokkos::AUTO_t& vector_length_request) 344 : TeamPolicyInternal(typename traits::execution_space(), league_size_, 345 team_size_request, vector_length_request) {} 346 347 /** \brief Specify league size, request team size */ TeamPolicyInternal(int league_size_,int team_size_request,const Kokkos::AUTO_t & vector_length_request)348 TeamPolicyInternal(int league_size_, int team_size_request, 349 const Kokkos::AUTO_t& vector_length_request) 350 : TeamPolicyInternal(typename traits::execution_space(), league_size_, 351 team_size_request, vector_length_request) {} 352 chunk_size() const353 inline int chunk_size() const { return m_chunk_size; } 354 355 /** \brief set chunk_size to a discrete value*/ set_chunk_size(typename traits::index_type chunk_size_)356 inline TeamPolicyInternal& set_chunk_size( 357 typename traits::index_type chunk_size_) { 358 m_chunk_size = chunk_size_; 359 return *this; 360 } 361 362 /** \brief set per team scratch size for a specific level of the scratch 363 * hierarchy */ set_scratch_size(const int & level,const PerTeamValue & per_team)364 inline TeamPolicyInternal& set_scratch_size(const int& level, 365 const PerTeamValue& per_team) { 366 m_team_scratch_size[level] = per_team.value; 367 return *this; 368 } 369 370 /** \brief set per thread scratch size for a specific level of the scratch 371 * hierarchy */ set_scratch_size(const int & level,const PerThreadValue & per_thread)372 inline TeamPolicyInternal& set_scratch_size( 373 const int& level, const PerThreadValue& per_thread) { 374 m_thread_scratch_size[level] = per_thread.value; 375 return *this; 376 } 377 378 /** \brief set per thread and per team scratch size for a specific level of 379 * the scratch hierarchy */ set_scratch_size(const int & level,const PerTeamValue & per_team,const PerThreadValue & per_thread)380 inline TeamPolicyInternal& set_scratch_size( 381 const int& level, const PerTeamValue& per_team, 382 const PerThreadValue& per_thread) { 383 m_team_scratch_size[level] = per_team.value; 384 m_thread_scratch_size[level] = per_thread.value; 385 return *this; 386 } 387 388 using member_type = Kokkos::Impl::CudaTeamMember; 389 390 protected: 391 template <class ClosureType, class FunctorType, class BlockSizeCallable> internal_team_size_common(const FunctorType & f,BlockSizeCallable && block_size_callable) const392 int internal_team_size_common(const FunctorType& f, 393 BlockSizeCallable&& block_size_callable) const { 394 using closure_type = ClosureType; 395 using functor_value_traits = 396 Impl::FunctorValueTraits<FunctorType, typename traits::work_tag>; 397 398 cudaFuncAttributes attr = 399 CudaParallelLaunch<closure_type, typename traits::launch_bounds>:: 400 get_cuda_func_attributes(); 401 const int block_size = std::forward<BlockSizeCallable>(block_size_callable)( 402 space().impl_internal_space_instance(), attr, f, 403 (size_t)impl_vector_length(), 404 (size_t)team_scratch_size(0) + 2 * sizeof(double), 405 (size_t)thread_scratch_size(0) + sizeof(double) + 406 ((functor_value_traits::StaticValueSize != 0) 407 ? 0 408 : functor_value_traits::value_size(f))); 409 KOKKOS_ASSERT(block_size > 0); 410 411 // Currently we require Power-of-2 team size for reductions. 412 int p2 = 1; 413 while (p2 <= block_size) p2 *= 2; 414 p2 /= 2; 415 return p2 / impl_vector_length(); 416 } 417 418 template <class ClosureType, class FunctorType> internal_team_size_max(const FunctorType & f) const419 int internal_team_size_max(const FunctorType& f) const { 420 return internal_team_size_common<ClosureType>( 421 f, 422 Kokkos::Impl::cuda_get_max_block_size<FunctorType, 423 typename traits::launch_bounds>); 424 } 425 426 template <class ClosureType, class FunctorType> internal_team_size_recommended(const FunctorType & f) const427 int internal_team_size_recommended(const FunctorType& f) const { 428 return internal_team_size_common<ClosureType>( 429 f, 430 Kokkos::Impl::cuda_get_opt_block_size<FunctorType, 431 typename traits::launch_bounds>); 432 } 433 }; 434 435 } // namespace Impl 436 } // namespace Kokkos 437 438 //---------------------------------------------------------------------------- 439 //---------------------------------------------------------------------------- 440 441 namespace Kokkos { 442 namespace Impl { 443 444 template <class FunctorType, class... Traits> 445 class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> { 446 public: 447 using Policy = Kokkos::RangePolicy<Traits...>; 448 449 private: 450 using Member = typename Policy::member_type; 451 using WorkTag = typename Policy::work_tag; 452 using LaunchBounds = typename Policy::launch_bounds; 453 454 const FunctorType m_functor; 455 const Policy m_policy; 456 457 ParallelFor() = delete; 458 ParallelFor& operator=(const ParallelFor&) = delete; 459 460 template <class TagType> 461 inline __device__ 462 typename std::enable_if<std::is_same<TagType, void>::value>::type exec_range(const Member i) const463 exec_range(const Member i) const { 464 m_functor(i); 465 } 466 467 template <class TagType> 468 inline __device__ 469 typename std::enable_if<!std::is_same<TagType, void>::value>::type exec_range(const Member i) const470 exec_range(const Member i) const { 471 m_functor(TagType(), i); 472 } 473 474 public: 475 using functor_type = FunctorType; 476 get_policy() const477 Policy const& get_policy() const { return m_policy; } 478 operator ()() const479 inline __device__ void operator()() const { 480 const Member work_stride = blockDim.y * gridDim.x; 481 const Member work_end = m_policy.end(); 482 483 for (Member iwork = 484 m_policy.begin() + threadIdx.y + blockDim.y * blockIdx.x; 485 iwork < work_end; 486 iwork = iwork < work_end - work_stride ? iwork + work_stride 487 : work_end) { 488 this->template exec_range<WorkTag>(iwork); 489 } 490 } 491 execute() const492 inline void execute() const { 493 const typename Policy::index_type nwork = m_policy.end() - m_policy.begin(); 494 495 cudaFuncAttributes attr = 496 CudaParallelLaunch<ParallelFor, 497 LaunchBounds>::get_cuda_func_attributes(); 498 const int block_size = 499 Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>( 500 m_policy.space().impl_internal_space_instance(), attr, m_functor, 1, 501 0, 0); 502 KOKKOS_ASSERT(block_size > 0); 503 dim3 block(1, block_size, 1); 504 dim3 grid( 505 std::min( 506 typename Policy::index_type((nwork + block.y - 1) / block.y), 507 typename Policy::index_type(cuda_internal_maximum_grid_count())), 508 1, 1); 509 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION 510 if (Kokkos::Impl::CudaInternal::cuda_use_serial_execution()) { 511 block = dim3(1, 1, 1); 512 grid = dim3(1, 1, 1); 513 } 514 #endif 515 516 CudaParallelLaunch<ParallelFor, LaunchBounds>( 517 *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), 518 false); 519 } 520 ParallelFor(const FunctorType & arg_functor,const Policy & arg_policy)521 ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) 522 : m_functor(arg_functor), m_policy(arg_policy) {} 523 }; 524 525 // MDRangePolicy impl 526 template <class FunctorType, class... Traits> 527 class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> { 528 public: 529 using Policy = Kokkos::MDRangePolicy<Traits...>; 530 using functor_type = FunctorType; 531 532 private: 533 using RP = Policy; 534 using array_index_type = typename Policy::array_index_type; 535 using index_type = typename Policy::index_type; 536 using LaunchBounds = typename Policy::launch_bounds; 537 538 const FunctorType m_functor; 539 const Policy m_rp; 540 541 public: 542 template <typename Policy, typename Functor> max_tile_size_product(const Policy & pol,const Functor &)543 static int max_tile_size_product(const Policy& pol, const Functor&) { 544 cudaFuncAttributes attr = 545 CudaParallelLaunch<ParallelFor, 546 LaunchBounds>::get_cuda_func_attributes(); 547 auto const& prop = pol.space().cuda_device_prop(); 548 // Limits due to registers/SM, MDRange doesn't have 549 // shared memory constraints 550 int const regs_per_sm = prop.regsPerMultiprocessor; 551 int const regs_per_thread = attr.numRegs; 552 int const max_threads_per_sm = regs_per_sm / regs_per_thread; 553 return std::min( 554 max_threads_per_sm, 555 static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism)); 556 } get_policy() const557 Policy const& get_policy() const { return m_rp; } operator ()() const558 inline __device__ void operator()() const { 559 Kokkos::Impl::DeviceIterateTile<Policy::rank, Policy, FunctorType, 560 typename Policy::work_tag>(m_rp, m_functor) 561 .exec_range(); 562 } 563 execute() const564 inline void execute() const { 565 using namespace std; 566 567 if (m_rp.m_num_tiles == 0) return; 568 const array_index_type maxblocks = static_cast<array_index_type>( 569 m_rp.space().impl_internal_space_instance()->m_maxBlock); 570 if (RP::rank == 2) { 571 const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], 1); 572 KOKKOS_ASSERT(block.x > 0); 573 KOKKOS_ASSERT(block.y > 0); 574 const dim3 grid( 575 min((m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x, 576 maxblocks), 577 min((m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1) / block.y, 578 maxblocks), 579 1); 580 CudaParallelLaunch<ParallelFor, LaunchBounds>( 581 *this, grid, block, 0, m_rp.space().impl_internal_space_instance(), 582 false); 583 } else if (RP::rank == 3) { 584 const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]); 585 KOKKOS_ASSERT(block.x > 0); 586 KOKKOS_ASSERT(block.y > 0); 587 KOKKOS_ASSERT(block.z > 0); 588 const dim3 grid( 589 min((m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x, 590 maxblocks), 591 min((m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1) / block.y, 592 maxblocks), 593 min((m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z, 594 maxblocks)); 595 CudaParallelLaunch<ParallelFor, LaunchBounds>( 596 *this, grid, block, 0, m_rp.space().impl_internal_space_instance(), 597 false); 598 } else if (RP::rank == 4) { 599 // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to 600 // threadIdx.z 601 const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], m_rp.m_tile[2], 602 m_rp.m_tile[3]); 603 KOKKOS_ASSERT(block.y > 0); 604 KOKKOS_ASSERT(block.z > 0); 605 const dim3 grid( 606 min(static_cast<index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1]), 607 static_cast<index_type>(maxblocks)), 608 min((m_rp.m_upper[2] - m_rp.m_lower[2] + block.y - 1) / block.y, 609 maxblocks), 610 min((m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z, 611 maxblocks)); 612 CudaParallelLaunch<ParallelFor, LaunchBounds>( 613 *this, grid, block, 0, m_rp.space().impl_internal_space_instance(), 614 false); 615 } else if (RP::rank == 5) { 616 // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to 617 // threadIdx.z 618 const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], 619 m_rp.m_tile[2] * m_rp.m_tile[3], m_rp.m_tile[4]); 620 KOKKOS_ASSERT(block.z > 0); 621 const dim3 grid( 622 min(static_cast<index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1]), 623 static_cast<index_type>(maxblocks)), 624 min(static_cast<index_type>(m_rp.m_tile_end[2] * m_rp.m_tile_end[3]), 625 static_cast<index_type>(maxblocks)), 626 min((m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z, 627 maxblocks)); 628 CudaParallelLaunch<ParallelFor, LaunchBounds>( 629 *this, grid, block, 0, m_rp.space().impl_internal_space_instance(), 630 false); 631 } else if (RP::rank == 6) { 632 // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to 633 // threadIdx.z 634 const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], 635 m_rp.m_tile[2] * m_rp.m_tile[3], 636 m_rp.m_tile[4] * m_rp.m_tile[5]); 637 const dim3 grid( 638 min(static_cast<index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1]), 639 static_cast<index_type>(maxblocks)), 640 min(static_cast<index_type>(m_rp.m_tile_end[2] * m_rp.m_tile_end[3]), 641 static_cast<index_type>(maxblocks)), 642 min(static_cast<index_type>(m_rp.m_tile_end[4] * m_rp.m_tile_end[5]), 643 static_cast<index_type>(maxblocks))); 644 CudaParallelLaunch<ParallelFor, LaunchBounds>( 645 *this, grid, block, 0, m_rp.space().impl_internal_space_instance(), 646 false); 647 } else { 648 Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n"); 649 } 650 651 } // end execute 652 653 // inline ParallelFor(const FunctorType & arg_functor,Policy arg_policy)654 ParallelFor(const FunctorType& arg_functor, Policy arg_policy) 655 : m_functor(arg_functor), m_rp(arg_policy) {} 656 }; 657 658 template <class FunctorType, class... Properties> 659 class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, 660 Kokkos::Cuda> { 661 public: 662 using Policy = TeamPolicy<Properties...>; 663 664 private: 665 using Member = typename Policy::member_type; 666 using WorkTag = typename Policy::work_tag; 667 using LaunchBounds = typename Policy::launch_bounds; 668 669 public: 670 using functor_type = FunctorType; 671 using size_type = Cuda::size_type; 672 673 private: 674 // Algorithmic constraints: blockDim.y is a power of two AND blockDim.y == 675 // blockDim.z == 1 shared memory utilization: 676 // 677 // [ team reduce space ] 678 // [ team shared space ] 679 // 680 681 const FunctorType m_functor; 682 const Policy m_policy; 683 const size_type m_league_size; 684 int m_team_size; 685 const size_type m_vector_size; 686 int m_shmem_begin; 687 int m_shmem_size; 688 void* m_scratch_ptr[2]; 689 int m_scratch_size[2]; 690 691 template <class TagType> 692 __device__ inline 693 typename std::enable_if<std::is_same<TagType, void>::value>::type exec_team(const Member & member) const694 exec_team(const Member& member) const { 695 m_functor(member); 696 } 697 698 template <class TagType> 699 __device__ inline 700 typename std::enable_if<!std::is_same<TagType, void>::value>::type exec_team(const Member & member) const701 exec_team(const Member& member) const { 702 m_functor(TagType(), member); 703 } 704 705 public: get_policy() const706 Policy const& get_policy() const { return m_policy; } 707 operator ()() const708 __device__ inline void operator()() const { 709 // Iterate this block through the league 710 int64_t threadid = 0; 711 if (m_scratch_size[1] > 0) { 712 __shared__ int64_t base_thread_id; 713 if (threadIdx.x == 0 && threadIdx.y == 0) { 714 threadid = (blockIdx.x * blockDim.z + threadIdx.z) % 715 (Kokkos::Impl::g_device_cuda_lock_arrays.n / 716 (blockDim.x * blockDim.y)); 717 threadid *= blockDim.x * blockDim.y; 718 int done = 0; 719 while (!done) { 720 done = 721 (0 == 722 atomicCAS( 723 &Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid], 724 0, 1)); 725 if (!done) { 726 threadid += blockDim.x * blockDim.y; 727 if (int64_t(threadid + blockDim.x * blockDim.y) >= 728 int64_t(Kokkos::Impl::g_device_cuda_lock_arrays.n)) 729 threadid = 0; 730 } 731 } 732 base_thread_id = threadid; 733 } 734 __syncthreads(); 735 threadid = base_thread_id; 736 } 737 738 const int int_league_size = (int)m_league_size; 739 for (int league_rank = blockIdx.x; league_rank < int_league_size; 740 league_rank += gridDim.x) { 741 this->template exec_team<WorkTag>(typename Policy::member_type( 742 kokkos_impl_cuda_shared_memory<void>(), m_shmem_begin, m_shmem_size, 743 (void*)(((char*)m_scratch_ptr[1]) + 744 ptrdiff_t(threadid / (blockDim.x * blockDim.y)) * 745 m_scratch_size[1]), 746 m_scratch_size[1], league_rank, m_league_size)); 747 } 748 if (m_scratch_size[1] > 0) { 749 __syncthreads(); 750 if (threadIdx.x == 0 && threadIdx.y == 0) 751 Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid] = 0; 752 } 753 } 754 execute() const755 inline void execute() const { 756 const int64_t shmem_size_total = m_shmem_begin + m_shmem_size; 757 dim3 grid(int(m_league_size), 1, 1); 758 const dim3 block(int(m_vector_size), int(m_team_size), 1); 759 760 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION 761 if (Kokkos::Impl::CudaInternal::cuda_use_serial_execution()) { 762 grid = dim3(1, 1, 1); 763 } 764 #endif 765 766 CudaParallelLaunch<ParallelFor, LaunchBounds>( 767 *this, grid, block, shmem_size_total, 768 m_policy.space().impl_internal_space_instance(), 769 true); // copy to device and execute 770 } 771 ParallelFor(const FunctorType & arg_functor,const Policy & arg_policy)772 ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) 773 : m_functor(arg_functor), 774 m_policy(arg_policy), 775 m_league_size(arg_policy.league_size()), 776 m_team_size(arg_policy.team_size()), 777 m_vector_size(arg_policy.impl_vector_length()) { 778 cudaFuncAttributes attr = 779 CudaParallelLaunch<ParallelFor, 780 LaunchBounds>::get_cuda_func_attributes(); 781 m_team_size = 782 m_team_size >= 0 783 ? m_team_size 784 : Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>( 785 m_policy.space().impl_internal_space_instance(), attr, 786 m_functor, m_vector_size, m_policy.team_scratch_size(0), 787 m_policy.thread_scratch_size(0)) / 788 m_vector_size; 789 790 m_shmem_begin = (sizeof(double) * (m_team_size + 2)); 791 m_shmem_size = 792 (m_policy.scratch_size(0, m_team_size) + 793 FunctorTeamShmemSize<FunctorType>::value(m_functor, m_team_size)); 794 m_scratch_size[0] = m_policy.scratch_size(0, m_team_size); 795 m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); 796 797 // Functor's reduce memory, team scan memory, and team shared memory depend 798 // upon team size. 799 m_scratch_ptr[0] = nullptr; 800 m_scratch_ptr[1] = 801 m_team_size <= 0 802 ? nullptr 803 : m_policy.space() 804 .impl_internal_space_instance() 805 ->resize_team_scratch_space( 806 static_cast<ptrdiff_t>(m_scratch_size[1]) * 807 static_cast<ptrdiff_t>(Cuda::concurrency() / 808 (m_team_size * m_vector_size))); 809 810 const int shmem_size_total = m_shmem_begin + m_shmem_size; 811 if (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < 812 shmem_size_total) { 813 printf( 814 "%i %i\n", 815 m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock, 816 shmem_size_total); 817 Kokkos::Impl::throw_runtime_exception(std::string( 818 "Kokkos::Impl::ParallelFor< Cuda > insufficient shared memory")); 819 } 820 821 if (int(m_team_size) > 822 int(Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>( 823 m_policy.space().impl_internal_space_instance(), attr, 824 arg_functor, arg_policy.impl_vector_length(), 825 arg_policy.team_scratch_size(0), 826 arg_policy.thread_scratch_size(0)) / 827 arg_policy.impl_vector_length())) { 828 Kokkos::Impl::throw_runtime_exception(std::string( 829 "Kokkos::Impl::ParallelFor< Cuda > requested too large team size.")); 830 } 831 } 832 }; 833 834 } // namespace Impl 835 } // namespace Kokkos 836 837 //---------------------------------------------------------------------------- 838 //---------------------------------------------------------------------------- 839 840 namespace Kokkos { 841 namespace Impl { 842 843 template <class FunctorType, class ReducerType, class... Traits> 844 class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType, 845 Kokkos::Cuda> { 846 public: 847 using Policy = Kokkos::RangePolicy<Traits...>; 848 849 private: 850 using WorkRange = typename Policy::WorkRange; 851 using WorkTag = typename Policy::work_tag; 852 using Member = typename Policy::member_type; 853 using LaunchBounds = typename Policy::launch_bounds; 854 855 using ReducerConditional = 856 Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value, 857 FunctorType, ReducerType>; 858 using ReducerTypeFwd = typename ReducerConditional::type; 859 using WorkTagFwd = 860 typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value, 861 WorkTag, void>::type; 862 863 using ValueTraits = 864 Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>; 865 using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>; 866 using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>; 867 868 public: 869 using pointer_type = typename ValueTraits::pointer_type; 870 using value_type = typename ValueTraits::value_type; 871 using reference_type = typename ValueTraits::reference_type; 872 using functor_type = FunctorType; 873 using size_type = Kokkos::Cuda::size_type; 874 using index_type = typename Policy::index_type; 875 using reducer_type = ReducerType; 876 877 // Algorithmic constraints: blockSize is a power of two AND blockDim.y == 878 // blockDim.z == 1 879 880 const FunctorType m_functor; 881 const Policy m_policy; 882 const ReducerType m_reducer; 883 const pointer_type m_result_ptr; 884 const bool m_result_ptr_device_accessible; 885 const bool m_result_ptr_host_accessible; 886 size_type* m_scratch_space; 887 size_type* m_scratch_flags; 888 size_type* m_unified_space; 889 890 // Shall we use the shfl based reduction or not (only use it for static sized 891 // types of more than 128bit) 892 enum { 893 UseShflReduction = false 894 }; //((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) 895 //}; 896 // Some crutch to do function overloading 897 private: 898 using DummyShflReductionType = double; 899 using DummySHMEMReductionType = int; 900 901 public: get_policy() const902 Policy const& get_policy() const { return m_policy; } 903 904 // Make the exec_range calls call to Reduce::DeviceIterateTile 905 template <class TagType> 906 __device__ inline 907 typename std::enable_if<std::is_same<TagType, void>::value>::type exec_range(const Member & i,reference_type update) const908 exec_range(const Member& i, reference_type update) const { 909 m_functor(i, update); 910 } 911 912 template <class TagType> 913 __device__ inline 914 typename std::enable_if<!std::is_same<TagType, void>::value>::type exec_range(const Member & i,reference_type update) const915 exec_range(const Member& i, reference_type update) const { 916 m_functor(TagType(), i, update); 917 } 918 operator ()() const919 __device__ inline void operator()() const { 920 /* run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, 921 DummySHMEMReductionType>::select(1,1.0) ); 922 } 923 924 __device__ inline 925 void run(const DummySHMEMReductionType& ) const 926 {*/ 927 const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize / 928 sizeof(size_type)> 929 word_count(ValueTraits::value_size( 930 ReducerConditional::select(m_functor, m_reducer)) / 931 sizeof(size_type)); 932 933 { 934 reference_type value = 935 ValueInit::init(ReducerConditional::select(m_functor, m_reducer), 936 kokkos_impl_cuda_shared_memory<size_type>() + 937 threadIdx.y * word_count.value); 938 939 // Number of blocks is bounded so that the reduction can be limited to two 940 // passes. Each thread block is given an approximately equal amount of 941 // work to perform. Accumulate the values for this block. The accumulation 942 // ordering does not match the final pass, but is arithmatically 943 // equivalent. 944 945 const WorkRange range(m_policy, blockIdx.x, gridDim.x); 946 947 for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end(); 948 iwork < iwork_end; iwork += blockDim.y) { 949 this->template exec_range<WorkTag>(iwork, value); 950 } 951 } 952 953 // Doing code duplication here to fix issue #3428 954 // Suspect optimizer bug?? 955 // Reduce with final value at blockDim.y - 1 location. 956 // Shortcut for length zero reduction 957 if (m_policy.begin() == m_policy.end()) { 958 // This is the final block with the final result at the final threads' 959 // location 960 961 size_type* const shared = kokkos_impl_cuda_shared_memory<size_type>() + 962 (blockDim.y - 1) * word_count.value; 963 size_type* const global = 964 m_result_ptr_device_accessible 965 ? reinterpret_cast<size_type*>(m_result_ptr) 966 : (m_unified_space ? m_unified_space : m_scratch_space); 967 968 if (threadIdx.y == 0) { 969 Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final( 970 ReducerConditional::select(m_functor, m_reducer), shared); 971 } 972 973 if (CudaTraits::WarpSize < word_count.value) { 974 __syncthreads(); 975 } 976 977 for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { 978 global[i] = shared[i]; 979 } 980 // return ; 981 } 982 983 if (m_policy.begin() != m_policy.end()) { 984 { 985 if (cuda_single_inter_block_reduce_scan<false, ReducerTypeFwd, 986 WorkTagFwd>( 987 ReducerConditional::select(m_functor, m_reducer), blockIdx.x, 988 gridDim.x, kokkos_impl_cuda_shared_memory<size_type>(), 989 m_scratch_space, m_scratch_flags)) { 990 // This is the final block with the final result at the final threads' 991 // location 992 993 size_type* const shared = 994 kokkos_impl_cuda_shared_memory<size_type>() + 995 (blockDim.y - 1) * word_count.value; 996 size_type* const global = 997 m_result_ptr_device_accessible 998 ? reinterpret_cast<size_type*>(m_result_ptr) 999 : (m_unified_space ? m_unified_space : m_scratch_space); 1000 1001 if (threadIdx.y == 0) { 1002 Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final( 1003 ReducerConditional::select(m_functor, m_reducer), shared); 1004 } 1005 1006 if (CudaTraits::WarpSize < word_count.value) { 1007 __syncthreads(); 1008 } 1009 1010 for (unsigned i = threadIdx.y; i < word_count.value; 1011 i += blockDim.y) { 1012 global[i] = shared[i]; 1013 } 1014 } 1015 } 1016 } 1017 } 1018 /* __device__ inline 1019 void run(const DummyShflReductionType&) const 1020 { 1021 value_type value; 1022 ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , 1023 &value); 1024 // Number of blocks is bounded so that the reduction can be limited to 1025 two passes. 1026 // Each thread block is given an approximately equal amount of work to 1027 perform. 1028 // Accumulate the values for this block. 1029 // The accumulation ordering does not match the final pass, but is 1030 arithmatically equivalent. 1031 1032 const WorkRange range( m_policy , blockIdx.x , gridDim.x ); 1033 1034 for ( Member iwork = range.begin() + threadIdx.y , iwork_end = 1035 range.end() ; iwork < iwork_end ; iwork += blockDim.y ) { this-> template 1036 exec_range< WorkTag >( iwork , value ); 1037 } 1038 1039 pointer_type const result = (pointer_type) (m_unified_space ? 1040 m_unified_space : m_scratch_space) ; 1041 1042 int max_active_thread = range.end()-range.begin() < blockDim.y ? 1043 range.end() - range.begin():blockDim.y; 1044 1045 max_active_thread = (max_active_thread == 1046 0)?blockDim.y:max_active_thread; 1047 1048 value_type init; 1049 ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , 1050 &init); 1051 if(Impl::cuda_inter_block_reduction<ReducerTypeFwd,ValueJoin,WorkTagFwd> 1052 (value,init,ValueJoin(ReducerConditional::select(m_functor , 1053 m_reducer)),m_scratch_space,result,m_scratch_flags,max_active_thread)) { 1054 const unsigned id = threadIdx.y*blockDim.x + threadIdx.x; 1055 if(id==0) { 1056 Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( 1057 ReducerConditional::select(m_functor , m_reducer) , (void*) &value ); 1058 *result = value; 1059 } 1060 } 1061 }*/ 1062 1063 // Determine block size constrained by shared memory: local_block_size(const FunctorType & f)1064 inline unsigned local_block_size(const FunctorType& f) { 1065 unsigned n = CudaTraits::WarpSize * 8; 1066 int shmem_size = 1067 cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>( 1068 f, n); 1069 using closure_type = Impl::ParallelReduce<FunctorType, Policy, ReducerType>; 1070 cudaFuncAttributes attr = 1071 CudaParallelLaunch<closure_type, 1072 LaunchBounds>::get_cuda_func_attributes(); 1073 while ( 1074 (n && 1075 (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < 1076 shmem_size)) || 1077 (n > 1078 static_cast<unsigned>( 1079 Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>( 1080 m_policy.space().impl_internal_space_instance(), attr, f, 1, 1081 shmem_size, 0)))) { 1082 n >>= 1; 1083 shmem_size = cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, 1084 WorkTag>(f, n); 1085 } 1086 return n; 1087 } 1088 execute()1089 inline void execute() { 1090 const index_type nwork = m_policy.end() - m_policy.begin(); 1091 const bool need_device_set = ReduceFunctorHasInit<FunctorType>::value || 1092 ReduceFunctorHasFinal<FunctorType>::value || 1093 !m_result_ptr_host_accessible || 1094 #ifdef KOKKOS_CUDA_ENABLE_GRAPHS 1095 Policy::is_graph_kernel::value || 1096 #endif 1097 !std::is_same<ReducerType, InvalidType>::value; 1098 if ((nwork > 0) || need_device_set) { 1099 const int block_size = local_block_size(m_functor); 1100 1101 KOKKOS_ASSERT(block_size > 0); 1102 1103 m_scratch_space = cuda_internal_scratch_space( 1104 m_policy.space(), ValueTraits::value_size(ReducerConditional::select( 1105 m_functor, m_reducer)) * 1106 block_size /* block_size == max block_count */); 1107 m_scratch_flags = 1108 cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type)); 1109 m_unified_space = cuda_internal_scratch_unified( 1110 m_policy.space(), ValueTraits::value_size(ReducerConditional::select( 1111 m_functor, m_reducer))); 1112 1113 // REQUIRED ( 1 , N , 1 ) 1114 dim3 block(1, block_size, 1); 1115 // Required grid.x <= block.y 1116 dim3 grid(std::min(int(block.y), int((nwork + block.y - 1) / block.y)), 1, 1117 1); 1118 1119 // TODO @graph We need to effectively insert this in to the graph 1120 const int shmem = 1121 UseShflReduction 1122 ? 0 1123 : cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, 1124 WorkTag>(m_functor, 1125 block.y); 1126 1127 if ((nwork == 0) 1128 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION 1129 || Kokkos::Impl::CudaInternal::cuda_use_serial_execution() 1130 #endif 1131 ) { 1132 block = dim3(1, 1, 1); 1133 grid = dim3(1, 1, 1); 1134 } 1135 1136 CudaParallelLaunch<ParallelReduce, LaunchBounds>( 1137 *this, grid, block, shmem, 1138 m_policy.space().impl_internal_space_instance(), 1139 false); // copy to device and execute 1140 1141 if (!m_result_ptr_device_accessible) { 1142 m_policy.space().fence(); 1143 1144 if (m_result_ptr) { 1145 if (m_unified_space) { 1146 const int count = ValueTraits::value_count( 1147 ReducerConditional::select(m_functor, m_reducer)); 1148 for (int i = 0; i < count; ++i) { 1149 m_result_ptr[i] = pointer_type(m_unified_space)[i]; 1150 } 1151 } else { 1152 const int size = ValueTraits::value_size( 1153 ReducerConditional::select(m_functor, m_reducer)); 1154 DeepCopy<HostSpace, CudaSpace>(m_result_ptr, m_scratch_space, size); 1155 } 1156 } 1157 } 1158 } else { 1159 if (m_result_ptr) { 1160 // TODO @graph We need to effectively insert this in to the graph 1161 ValueInit::init(ReducerConditional::select(m_functor, m_reducer), 1162 m_result_ptr); 1163 } 1164 } 1165 } 1166 1167 template <class ViewType> ParallelReduce(const FunctorType & arg_functor,const Policy & arg_policy,const ViewType & arg_result,typename std::enable_if<Kokkos::is_view<ViewType>::value,void * >::type=nullptr)1168 ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, 1169 const ViewType& arg_result, 1170 typename std::enable_if<Kokkos::is_view<ViewType>::value, 1171 void*>::type = nullptr) 1172 : m_functor(arg_functor), 1173 m_policy(arg_policy), 1174 m_reducer(InvalidType()), 1175 m_result_ptr(arg_result.data()), 1176 m_result_ptr_device_accessible( 1177 MemorySpaceAccess<Kokkos::CudaSpace, 1178 typename ViewType::memory_space>::accessible), 1179 m_result_ptr_host_accessible( 1180 MemorySpaceAccess<Kokkos::HostSpace, 1181 typename ViewType::memory_space>::accessible), 1182 m_scratch_space(nullptr), 1183 m_scratch_flags(nullptr), 1184 m_unified_space(nullptr) {} 1185 ParallelReduce(const FunctorType & arg_functor,const Policy & arg_policy,const ReducerType & reducer)1186 ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, 1187 const ReducerType& reducer) 1188 : m_functor(arg_functor), 1189 m_policy(arg_policy), 1190 m_reducer(reducer), 1191 m_result_ptr(reducer.view().data()), 1192 m_result_ptr_device_accessible( 1193 MemorySpaceAccess<Kokkos::CudaSpace, 1194 typename ReducerType::result_view_type:: 1195 memory_space>::accessible), 1196 m_result_ptr_host_accessible( 1197 MemorySpaceAccess<Kokkos::HostSpace, 1198 typename ReducerType::result_view_type:: 1199 memory_space>::accessible), 1200 m_scratch_space(nullptr), 1201 m_scratch_flags(nullptr), 1202 m_unified_space(nullptr) {} 1203 }; 1204 1205 // MDRangePolicy impl 1206 template <class FunctorType, class ReducerType, class... Traits> 1207 class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType, 1208 Kokkos::Cuda> { 1209 public: 1210 using Policy = Kokkos::MDRangePolicy<Traits...>; 1211 1212 private: 1213 using array_index_type = typename Policy::array_index_type; 1214 using index_type = typename Policy::index_type; 1215 1216 using WorkTag = typename Policy::work_tag; 1217 using Member = typename Policy::member_type; 1218 using LaunchBounds = typename Policy::launch_bounds; 1219 1220 using ReducerConditional = 1221 Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value, 1222 FunctorType, ReducerType>; 1223 using ReducerTypeFwd = typename ReducerConditional::type; 1224 using WorkTagFwd = 1225 typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value, 1226 WorkTag, void>::type; 1227 1228 using ValueTraits = 1229 Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>; 1230 using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>; 1231 using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>; 1232 1233 public: 1234 using pointer_type = typename ValueTraits::pointer_type; 1235 using value_type = typename ValueTraits::value_type; 1236 using reference_type = typename ValueTraits::reference_type; 1237 using functor_type = FunctorType; 1238 using size_type = Cuda::size_type; 1239 using reducer_type = ReducerType; 1240 1241 // Algorithmic constraints: blockSize is a power of two AND blockDim.y == 1242 // blockDim.z == 1 1243 1244 const FunctorType m_functor; 1245 const Policy m_policy; // used for workrange and nwork 1246 const ReducerType m_reducer; 1247 const pointer_type m_result_ptr; 1248 const bool m_result_ptr_device_accessible; 1249 size_type* m_scratch_space; 1250 size_type* m_scratch_flags; 1251 size_type* m_unified_space; 1252 1253 using DeviceIteratePattern = typename Kokkos::Impl::Reduce::DeviceIterateTile< 1254 Policy::rank, Policy, FunctorType, typename Policy::work_tag, 1255 reference_type>; 1256 1257 // Shall we use the shfl based reduction or not (only use it for static sized 1258 // types of more than 128bit 1259 static constexpr bool UseShflReduction = false; 1260 //((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) 1261 // Some crutch to do function overloading 1262 private: 1263 using DummyShflReductionType = double; 1264 using DummySHMEMReductionType = int; 1265 1266 public: 1267 template <typename Policy, typename Functor> max_tile_size_product(const Policy & pol,const Functor &)1268 static int max_tile_size_product(const Policy& pol, const Functor&) { 1269 cudaFuncAttributes attr = 1270 CudaParallelLaunch<ParallelReduce, 1271 LaunchBounds>::get_cuda_func_attributes(); 1272 auto const& prop = pol.space().cuda_device_prop(); 1273 // Limits due do registers/SM 1274 int const regs_per_sm = prop.regsPerMultiprocessor; 1275 int const regs_per_thread = attr.numRegs; 1276 int const max_threads_per_sm = regs_per_sm / regs_per_thread; 1277 return std::min( 1278 max_threads_per_sm, 1279 static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism)); 1280 } get_policy() const1281 Policy const& get_policy() const { return m_policy; } exec_range(reference_type update) const1282 inline __device__ void exec_range(reference_type update) const { 1283 Kokkos::Impl::Reduce::DeviceIterateTile<Policy::rank, Policy, FunctorType, 1284 typename Policy::work_tag, 1285 reference_type>(m_policy, m_functor, 1286 update) 1287 .exec_range(); 1288 } 1289 operator ()() const1290 inline __device__ void operator()() const { 1291 /* run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, 1292 DummySHMEMReductionType>::select(1,1.0) ); 1293 } 1294 1295 __device__ inline 1296 void run(const DummySHMEMReductionType& ) const 1297 {*/ 1298 const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize / 1299 sizeof(size_type)> 1300 word_count(ValueTraits::value_size( 1301 ReducerConditional::select(m_functor, m_reducer)) / 1302 sizeof(size_type)); 1303 1304 { 1305 reference_type value = 1306 ValueInit::init(ReducerConditional::select(m_functor, m_reducer), 1307 kokkos_impl_cuda_shared_memory<size_type>() + 1308 threadIdx.y * word_count.value); 1309 1310 // Number of blocks is bounded so that the reduction can be limited to two 1311 // passes. Each thread block is given an approximately equal amount of 1312 // work to perform. Accumulate the values for this block. The accumulation 1313 // ordering does not match the final pass, but is arithmatically 1314 // equivalent. 1315 1316 this->exec_range(value); 1317 } 1318 1319 // Reduce with final value at blockDim.y - 1 location. 1320 // Problem: non power-of-two blockDim 1321 if (cuda_single_inter_block_reduce_scan<false, ReducerTypeFwd, WorkTagFwd>( 1322 ReducerConditional::select(m_functor, m_reducer), blockIdx.x, 1323 gridDim.x, kokkos_impl_cuda_shared_memory<size_type>(), 1324 m_scratch_space, m_scratch_flags)) { 1325 // This is the final block with the final result at the final threads' 1326 // location 1327 size_type* const shared = kokkos_impl_cuda_shared_memory<size_type>() + 1328 (blockDim.y - 1) * word_count.value; 1329 size_type* const global = 1330 m_result_ptr_device_accessible 1331 ? reinterpret_cast<size_type*>(m_result_ptr) 1332 : (m_unified_space ? m_unified_space : m_scratch_space); 1333 1334 if (threadIdx.y == 0) { 1335 Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final( 1336 ReducerConditional::select(m_functor, m_reducer), shared); 1337 } 1338 1339 if (CudaTraits::WarpSize < word_count.value) { 1340 __syncthreads(); 1341 } 1342 1343 for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { 1344 global[i] = shared[i]; 1345 } 1346 } 1347 } 1348 1349 /* __device__ inline 1350 void run(const DummyShflReductionType&) const 1351 { 1352 1353 value_type value; 1354 ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , 1355 &value); 1356 // Number of blocks is bounded so that the reduction can be limited to 1357 two passes. 1358 // Each thread block is given an approximately equal amount of work to 1359 perform. 1360 // Accumulate the values for this block. 1361 // The accumulation ordering does not match the final pass, but is 1362 arithmatically equivalent. 1363 1364 const Member work_part = 1365 ( ( m_policy.m_num_tiles + ( gridDim.x - 1 ) ) / gridDim.x ); //portion 1366 of tiles handled by each block 1367 1368 this-> exec_range( value ); 1369 1370 pointer_type const result = (pointer_type) (m_unified_space ? 1371 m_unified_space : m_scratch_space) ; 1372 1373 int max_active_thread = work_part < blockDim.y ? work_part:blockDim.y; 1374 max_active_thread = (max_active_thread == 1375 0)?blockDim.y:max_active_thread; 1376 1377 value_type init; 1378 ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , 1379 &init); 1380 if(Impl::cuda_inter_block_reduction<ReducerTypeFwd,ValueJoin,WorkTagFwd> 1381 (value,init,ValueJoin(ReducerConditional::select(m_functor , 1382 m_reducer)),m_scratch_space,result,m_scratch_flags,max_active_thread)) { 1383 const unsigned id = threadIdx.y*blockDim.x + threadIdx.x; 1384 if(id==0) { 1385 Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( 1386 ReducerConditional::select(m_functor , m_reducer) , (void*) &value ); 1387 *result = value; 1388 } 1389 } 1390 } 1391 */ 1392 // Determine block size constrained by shared memory: local_block_size(const FunctorType & f)1393 inline unsigned local_block_size(const FunctorType& f) { 1394 unsigned n = CudaTraits::WarpSize * 8; 1395 int shmem_size = 1396 cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>( 1397 f, n); 1398 using closure_type = Impl::ParallelReduce<FunctorType, Policy, ReducerType>; 1399 cudaFuncAttributes attr = 1400 CudaParallelLaunch<closure_type, 1401 LaunchBounds>::get_cuda_func_attributes(); 1402 while ( 1403 (n && 1404 (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < 1405 shmem_size)) || 1406 (n > 1407 static_cast<unsigned>( 1408 Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>( 1409 m_policy.space().impl_internal_space_instance(), attr, f, 1, 1410 shmem_size, 0)))) { 1411 n >>= 1; 1412 shmem_size = cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, 1413 WorkTag>(f, n); 1414 } 1415 return n; 1416 } 1417 execute()1418 inline void execute() { 1419 const int nwork = m_policy.m_num_tiles; 1420 if (nwork) { 1421 int block_size = m_policy.m_prod_tile_dims; 1422 // CONSTRAINT: Algorithm requires block_size >= product of tile dimensions 1423 // Nearest power of two 1424 int exponent_pow_two = std::ceil(std::log2(block_size)); 1425 block_size = std::pow(2, exponent_pow_two); 1426 int suggested_blocksize = local_block_size(m_functor); 1427 1428 block_size = (block_size > suggested_blocksize) 1429 ? block_size 1430 : suggested_blocksize; // Note: block_size must be less 1431 // than or equal to 512 1432 1433 m_scratch_space = cuda_internal_scratch_space( 1434 m_policy.space(), ValueTraits::value_size(ReducerConditional::select( 1435 m_functor, m_reducer)) * 1436 block_size /* block_size == max block_count */); 1437 m_scratch_flags = 1438 cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type)); 1439 m_unified_space = cuda_internal_scratch_unified( 1440 m_policy.space(), ValueTraits::value_size(ReducerConditional::select( 1441 m_functor, m_reducer))); 1442 1443 // REQUIRED ( 1 , N , 1 ) 1444 const dim3 block(1, block_size, 1); 1445 // Required grid.x <= block.y 1446 const dim3 grid(std::min(int(block.y), int(nwork)), 1, 1); 1447 1448 // TODO @graph We need to effectively insert this in to the graph 1449 const int shmem = 1450 UseShflReduction 1451 ? 0 1452 : cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, 1453 WorkTag>(m_functor, 1454 block.y); 1455 1456 CudaParallelLaunch<ParallelReduce, LaunchBounds>( 1457 *this, grid, block, shmem, 1458 m_policy.space().impl_internal_space_instance(), 1459 false); // copy to device and execute 1460 1461 if (!m_result_ptr_device_accessible) { 1462 m_policy.space().fence(); 1463 1464 if (m_result_ptr) { 1465 if (m_unified_space) { 1466 const int count = ValueTraits::value_count( 1467 ReducerConditional::select(m_functor, m_reducer)); 1468 for (int i = 0; i < count; ++i) { 1469 m_result_ptr[i] = pointer_type(m_unified_space)[i]; 1470 } 1471 } else { 1472 const int size = ValueTraits::value_size( 1473 ReducerConditional::select(m_functor, m_reducer)); 1474 DeepCopy<HostSpace, CudaSpace>(m_result_ptr, m_scratch_space, size); 1475 } 1476 } 1477 } 1478 } else { 1479 if (m_result_ptr) { 1480 // TODO @graph We need to effectively insert this in to the graph 1481 ValueInit::init(ReducerConditional::select(m_functor, m_reducer), 1482 m_result_ptr); 1483 } 1484 } 1485 } 1486 1487 template <class ViewType> ParallelReduce(const FunctorType & arg_functor,const Policy & arg_policy,const ViewType & arg_result,typename std::enable_if<Kokkos::is_view<ViewType>::value,void * >::type=nullptr)1488 ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, 1489 const ViewType& arg_result, 1490 typename std::enable_if<Kokkos::is_view<ViewType>::value, 1491 void*>::type = nullptr) 1492 : m_functor(arg_functor), 1493 m_policy(arg_policy), 1494 m_reducer(InvalidType()), 1495 m_result_ptr(arg_result.data()), 1496 m_result_ptr_device_accessible( 1497 MemorySpaceAccess<Kokkos::CudaSpace, 1498 typename ViewType::memory_space>::accessible), 1499 m_scratch_space(nullptr), 1500 m_scratch_flags(nullptr), 1501 m_unified_space(nullptr) {} 1502 ParallelReduce(const FunctorType & arg_functor,const Policy & arg_policy,const ReducerType & reducer)1503 ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, 1504 const ReducerType& reducer) 1505 : m_functor(arg_functor), 1506 m_policy(arg_policy), 1507 m_reducer(reducer), 1508 m_result_ptr(reducer.view().data()), 1509 m_result_ptr_device_accessible( 1510 MemorySpaceAccess<Kokkos::CudaSpace, 1511 typename ReducerType::result_view_type:: 1512 memory_space>::accessible), 1513 m_scratch_space(nullptr), 1514 m_scratch_flags(nullptr), 1515 m_unified_space(nullptr) {} 1516 }; 1517 1518 //---------------------------------------------------------------------------- 1519 1520 template <class FunctorType, class ReducerType, class... Properties> 1521 class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>, 1522 ReducerType, Kokkos::Cuda> { 1523 public: 1524 using Policy = TeamPolicy<Properties...>; 1525 1526 private: 1527 using Member = typename Policy::member_type; 1528 using WorkTag = typename Policy::work_tag; 1529 using LaunchBounds = typename Policy::launch_bounds; 1530 1531 using ReducerConditional = 1532 Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value, 1533 FunctorType, ReducerType>; 1534 using ReducerTypeFwd = typename ReducerConditional::type; 1535 using WorkTagFwd = 1536 typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value, 1537 WorkTag, void>::type; 1538 1539 using ValueTraits = 1540 Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>; 1541 using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>; 1542 using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>; 1543 1544 using pointer_type = typename ValueTraits::pointer_type; 1545 using reference_type = typename ValueTraits::reference_type; 1546 using value_type = typename ValueTraits::value_type; 1547 1548 public: 1549 using functor_type = FunctorType; 1550 using size_type = Cuda::size_type; 1551 using reducer_type = ReducerType; 1552 1553 enum : bool { 1554 UseShflReduction = (true && (ValueTraits::StaticValueSize != 0)) 1555 }; 1556 1557 private: 1558 using DummyShflReductionType = double; 1559 using DummySHMEMReductionType = int; 1560 1561 // Algorithmic constraints: blockDim.y is a power of two AND blockDim.y == 1562 // blockDim.z == 1 shared memory utilization: 1563 // 1564 // [ global reduce space ] 1565 // [ team reduce space ] 1566 // [ team shared space ] 1567 // 1568 1569 const FunctorType m_functor; 1570 const Policy m_policy; 1571 const ReducerType m_reducer; 1572 const pointer_type m_result_ptr; 1573 const bool m_result_ptr_device_accessible; 1574 const bool m_result_ptr_host_accessible; 1575 size_type* m_scratch_space; 1576 size_type* m_scratch_flags; 1577 size_type* m_unified_space; 1578 size_type m_team_begin; 1579 size_type m_shmem_begin; 1580 size_type m_shmem_size; 1581 void* m_scratch_ptr[2]; 1582 int m_scratch_size[2]; 1583 const size_type m_league_size; 1584 int m_team_size; 1585 const size_type m_vector_size; 1586 1587 template <class TagType> 1588 __device__ inline 1589 typename std::enable_if<std::is_same<TagType, void>::value>::type exec_team(const Member & member,reference_type update) const1590 exec_team(const Member& member, reference_type update) const { 1591 m_functor(member, update); 1592 } 1593 1594 template <class TagType> 1595 __device__ inline 1596 typename std::enable_if<!std::is_same<TagType, void>::value>::type exec_team(const Member & member,reference_type update) const1597 exec_team(const Member& member, reference_type update) const { 1598 m_functor(TagType(), member, update); 1599 } 1600 1601 public: get_policy() const1602 Policy const& get_policy() const { return m_policy; } 1603 operator ()() const1604 __device__ inline void operator()() const { 1605 int64_t threadid = 0; 1606 if (m_scratch_size[1] > 0) { 1607 __shared__ int64_t base_thread_id; 1608 if (threadIdx.x == 0 && threadIdx.y == 0) { 1609 threadid = (blockIdx.x * blockDim.z + threadIdx.z) % 1610 (Kokkos::Impl::g_device_cuda_lock_arrays.n / 1611 (blockDim.x * blockDim.y)); 1612 threadid *= blockDim.x * blockDim.y; 1613 int done = 0; 1614 while (!done) { 1615 done = 1616 (0 == 1617 atomicCAS( 1618 &Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid], 1619 0, 1)); 1620 if (!done) { 1621 threadid += blockDim.x * blockDim.y; 1622 if (int64_t(threadid + blockDim.x * blockDim.y) >= 1623 int64_t(Kokkos::Impl::g_device_cuda_lock_arrays.n)) 1624 threadid = 0; 1625 } 1626 } 1627 base_thread_id = threadid; 1628 } 1629 __syncthreads(); 1630 threadid = base_thread_id; 1631 } 1632 1633 run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, 1634 DummySHMEMReductionType>::select(1, 1.0), 1635 threadid); 1636 if (m_scratch_size[1] > 0) { 1637 __syncthreads(); 1638 if (threadIdx.x == 0 && threadIdx.y == 0) 1639 Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid] = 0; 1640 } 1641 } 1642 run(const DummySHMEMReductionType &,const int & threadid) const1643 __device__ inline void run(const DummySHMEMReductionType&, 1644 const int& threadid) const { 1645 const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize / 1646 sizeof(size_type)> 1647 word_count(ValueTraits::value_size( 1648 ReducerConditional::select(m_functor, m_reducer)) / 1649 sizeof(size_type)); 1650 1651 reference_type value = 1652 ValueInit::init(ReducerConditional::select(m_functor, m_reducer), 1653 kokkos_impl_cuda_shared_memory<size_type>() + 1654 threadIdx.y * word_count.value); 1655 1656 // Iterate this block through the league 1657 const int int_league_size = (int)m_league_size; 1658 for (int league_rank = blockIdx.x; league_rank < int_league_size; 1659 league_rank += gridDim.x) { 1660 this->template exec_team<WorkTag>( 1661 Member(kokkos_impl_cuda_shared_memory<char>() + m_team_begin, 1662 m_shmem_begin, m_shmem_size, 1663 (void*)(((char*)m_scratch_ptr[1]) + 1664 ptrdiff_t(threadid / (blockDim.x * blockDim.y)) * 1665 m_scratch_size[1]), 1666 m_scratch_size[1], league_rank, m_league_size), 1667 value); 1668 } 1669 1670 // Reduce with final value at blockDim.y - 1 location. 1671 // Doing code duplication here to fix issue #3428 1672 // Suspect optimizer bug?? 1673 if (m_league_size == 0) { 1674 // This is the final block with the final result at the final threads' 1675 // location 1676 1677 size_type* const shared = kokkos_impl_cuda_shared_memory<size_type>() + 1678 (blockDim.y - 1) * word_count.value; 1679 size_type* const global = 1680 m_result_ptr_device_accessible 1681 ? reinterpret_cast<size_type*>(m_result_ptr) 1682 : (m_unified_space ? m_unified_space : m_scratch_space); 1683 1684 if (threadIdx.y == 0) { 1685 Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final( 1686 ReducerConditional::select(m_functor, m_reducer), shared); 1687 } 1688 1689 if (CudaTraits::WarpSize < word_count.value) { 1690 __syncthreads(); 1691 } 1692 1693 for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { 1694 global[i] = shared[i]; 1695 } 1696 } 1697 1698 if (m_league_size != 0) { 1699 if (cuda_single_inter_block_reduce_scan<false, FunctorType, WorkTag>( 1700 ReducerConditional::select(m_functor, m_reducer), blockIdx.x, 1701 gridDim.x, kokkos_impl_cuda_shared_memory<size_type>(), 1702 m_scratch_space, m_scratch_flags)) { 1703 // This is the final block with the final result at the final threads' 1704 // location 1705 1706 size_type* const shared = kokkos_impl_cuda_shared_memory<size_type>() + 1707 (blockDim.y - 1) * word_count.value; 1708 size_type* const global = 1709 m_result_ptr_device_accessible 1710 ? reinterpret_cast<size_type*>(m_result_ptr) 1711 : (m_unified_space ? m_unified_space : m_scratch_space); 1712 1713 if (threadIdx.y == 0) { 1714 Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final( 1715 ReducerConditional::select(m_functor, m_reducer), shared); 1716 } 1717 1718 if (CudaTraits::WarpSize < word_count.value) { 1719 __syncthreads(); 1720 } 1721 1722 for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { 1723 global[i] = shared[i]; 1724 } 1725 } 1726 } 1727 } 1728 run(const DummyShflReductionType &,const int & threadid) const1729 __device__ inline void run(const DummyShflReductionType&, 1730 const int& threadid) const { 1731 value_type value; 1732 ValueInit::init(ReducerConditional::select(m_functor, m_reducer), &value); 1733 1734 // Iterate this block through the league 1735 const int int_league_size = (int)m_league_size; 1736 for (int league_rank = blockIdx.x; league_rank < int_league_size; 1737 league_rank += gridDim.x) { 1738 this->template exec_team<WorkTag>( 1739 Member(kokkos_impl_cuda_shared_memory<char>() + m_team_begin, 1740 m_shmem_begin, m_shmem_size, 1741 (void*)(((char*)m_scratch_ptr[1]) + 1742 ptrdiff_t(threadid / (blockDim.x * blockDim.y)) * 1743 m_scratch_size[1]), 1744 m_scratch_size[1], league_rank, m_league_size), 1745 value); 1746 } 1747 1748 pointer_type const result = 1749 m_result_ptr_device_accessible 1750 ? m_result_ptr 1751 : (pointer_type)(m_unified_space ? m_unified_space 1752 : m_scratch_space); 1753 1754 value_type init; 1755 ValueInit::init(ReducerConditional::select(m_functor, m_reducer), &init); 1756 1757 if (int_league_size == 0) { 1758 Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final( 1759 ReducerConditional::select(m_functor, m_reducer), (void*)&value); 1760 *result = value; 1761 } else if ( 1762 Impl::cuda_inter_block_reduction<FunctorType, ValueJoin, WorkTag>( 1763 value, init, 1764 ValueJoin(ReducerConditional::select(m_functor, m_reducer)), 1765 m_scratch_space, result, m_scratch_flags, blockDim.y) 1766 // This breaks a test 1767 // Kokkos::Impl::CudaReductionsFunctor<FunctorType,WorkTag,false,true>::scalar_inter_block_reduction(ReducerConditional::select(m_functor 1768 // , m_reducer) , blockIdx.x , gridDim.x , 1769 // kokkos_impl_cuda_shared_memory<size_type>() , 1770 // m_scratch_space , m_scratch_flags) 1771 ) { 1772 const unsigned id = threadIdx.y * blockDim.x + threadIdx.x; 1773 if (id == 0) { 1774 Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final( 1775 ReducerConditional::select(m_functor, m_reducer), (void*)&value); 1776 *result = value; 1777 } 1778 } 1779 } 1780 execute()1781 inline void execute() { 1782 const int nwork = m_league_size * m_team_size; 1783 const bool need_device_set = ReduceFunctorHasInit<FunctorType>::value || 1784 ReduceFunctorHasFinal<FunctorType>::value || 1785 !m_result_ptr_host_accessible || 1786 #ifdef KOKKOS_CUDA_ENABLE_GRAPHS 1787 Policy::is_graph_kernel::value || 1788 #endif 1789 !std::is_same<ReducerType, InvalidType>::value; 1790 if ((nwork > 0) || need_device_set) { 1791 const int block_count = 1792 UseShflReduction ? std::min(m_league_size, size_type(1024 * 32)) 1793 : std::min(int(m_league_size), m_team_size); 1794 1795 m_scratch_space = cuda_internal_scratch_space( 1796 m_policy.space(), ValueTraits::value_size(ReducerConditional::select( 1797 m_functor, m_reducer)) * 1798 block_count); 1799 m_scratch_flags = 1800 cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type)); 1801 m_unified_space = cuda_internal_scratch_unified( 1802 m_policy.space(), ValueTraits::value_size(ReducerConditional::select( 1803 m_functor, m_reducer))); 1804 1805 dim3 block(m_vector_size, m_team_size, 1); 1806 dim3 grid(block_count, 1, 1); 1807 const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; 1808 1809 if ((nwork == 0) 1810 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION 1811 || Kokkos::Impl::CudaInternal::cuda_use_serial_execution() 1812 #endif 1813 ) { 1814 block = dim3(1, 1, 1); 1815 grid = dim3(1, 1, 1); 1816 } 1817 1818 CudaParallelLaunch<ParallelReduce, LaunchBounds>( 1819 *this, grid, block, shmem_size_total, 1820 m_policy.space().impl_internal_space_instance(), 1821 true); // copy to device and execute 1822 1823 if (!m_result_ptr_device_accessible) { 1824 m_policy.space().fence(); 1825 1826 if (m_result_ptr) { 1827 if (m_unified_space) { 1828 const int count = ValueTraits::value_count( 1829 ReducerConditional::select(m_functor, m_reducer)); 1830 for (int i = 0; i < count; ++i) { 1831 m_result_ptr[i] = pointer_type(m_unified_space)[i]; 1832 } 1833 } else { 1834 const int size = ValueTraits::value_size( 1835 ReducerConditional::select(m_functor, m_reducer)); 1836 DeepCopy<HostSpace, CudaSpace>(m_result_ptr, m_scratch_space, size); 1837 } 1838 } 1839 } 1840 } else { 1841 if (m_result_ptr) { 1842 // TODO @graph We need to effectively insert this in to the graph 1843 ValueInit::init(ReducerConditional::select(m_functor, m_reducer), 1844 m_result_ptr); 1845 } 1846 } 1847 } 1848 1849 template <class ViewType> ParallelReduce(const FunctorType & arg_functor,const Policy & arg_policy,const ViewType & arg_result,typename std::enable_if<Kokkos::is_view<ViewType>::value,void * >::type=nullptr)1850 ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, 1851 const ViewType& arg_result, 1852 typename std::enable_if<Kokkos::is_view<ViewType>::value, 1853 void*>::type = nullptr) 1854 : m_functor(arg_functor), 1855 m_policy(arg_policy), 1856 m_reducer(InvalidType()), 1857 m_result_ptr(arg_result.data()), 1858 m_result_ptr_device_accessible( 1859 MemorySpaceAccess<Kokkos::CudaSpace, 1860 typename ViewType::memory_space>::accessible), 1861 m_result_ptr_host_accessible( 1862 MemorySpaceAccess<Kokkos::HostSpace, 1863 typename ViewType::memory_space>::accessible), 1864 m_scratch_space(nullptr), 1865 m_scratch_flags(nullptr), 1866 m_unified_space(nullptr), 1867 m_team_begin(0), 1868 m_shmem_begin(0), 1869 m_shmem_size(0), 1870 m_scratch_ptr{nullptr, nullptr}, 1871 m_league_size(arg_policy.league_size()), 1872 m_team_size(arg_policy.team_size()), 1873 m_vector_size(arg_policy.impl_vector_length()) { 1874 cudaFuncAttributes attr = 1875 CudaParallelLaunch<ParallelReduce, 1876 LaunchBounds>::get_cuda_func_attributes(); 1877 m_team_size = 1878 m_team_size >= 0 1879 ? m_team_size 1880 : Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>( 1881 m_policy.space().impl_internal_space_instance(), attr, 1882 m_functor, m_vector_size, m_policy.team_scratch_size(0), 1883 m_policy.thread_scratch_size(0)) / 1884 m_vector_size; 1885 1886 m_team_begin = 1887 UseShflReduction 1888 ? 0 1889 : cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, 1890 WorkTag>(arg_functor, 1891 m_team_size); 1892 m_shmem_begin = sizeof(double) * (m_team_size + 2); 1893 m_shmem_size = 1894 m_policy.scratch_size(0, m_team_size) + 1895 FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size); 1896 m_scratch_size[0] = m_shmem_size; 1897 m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); 1898 m_scratch_ptr[1] = 1899 m_team_size <= 0 1900 ? nullptr 1901 : m_policy.space() 1902 .impl_internal_space_instance() 1903 ->resize_team_scratch_space( 1904 static_cast<std::int64_t>(m_scratch_size[1]) * 1905 (static_cast<std::int64_t>( 1906 Cuda::concurrency() / 1907 (m_team_size * m_vector_size)))); 1908 1909 // The global parallel_reduce does not support vector_length other than 1 at 1910 // the moment 1911 if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction) 1912 Impl::throw_runtime_exception( 1913 "Kokkos::parallel_reduce with a TeamPolicy using a vector length of " 1914 "greater than 1 is not currently supported for CUDA for dynamic " 1915 "sized reduction types."); 1916 1917 if ((m_team_size < 32) && !UseShflReduction) 1918 Impl::throw_runtime_exception( 1919 "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller " 1920 "than 32 is not currently supported with CUDA for dynamic sized " 1921 "reduction types."); 1922 1923 // Functor's reduce memory, team scan memory, and team shared memory depend 1924 // upon team size. 1925 1926 const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; 1927 1928 if (!Kokkos::Impl::is_integral_power_of_two(m_team_size) && 1929 !UseShflReduction) { 1930 Kokkos::Impl::throw_runtime_exception( 1931 std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size")); 1932 } 1933 1934 if (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < 1935 shmem_size_total) { 1936 Kokkos::Impl::throw_runtime_exception( 1937 std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too much " 1938 "L0 scratch memory")); 1939 } 1940 1941 if (int(m_team_size) > 1942 arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) { 1943 Kokkos::Impl::throw_runtime_exception( 1944 std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too " 1945 "large team size.")); 1946 } 1947 } 1948 ParallelReduce(const FunctorType & arg_functor,const Policy & arg_policy,const ReducerType & reducer)1949 ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, 1950 const ReducerType& reducer) 1951 : m_functor(arg_functor), 1952 m_policy(arg_policy), 1953 m_reducer(reducer), 1954 m_result_ptr(reducer.view().data()), 1955 m_result_ptr_device_accessible( 1956 MemorySpaceAccess<Kokkos::CudaSpace, 1957 typename ReducerType::result_view_type:: 1958 memory_space>::accessible), 1959 m_result_ptr_host_accessible( 1960 MemorySpaceAccess<Kokkos::HostSpace, 1961 typename ReducerType::result_view_type:: 1962 memory_space>::accessible), 1963 m_scratch_space(nullptr), 1964 m_scratch_flags(nullptr), 1965 m_unified_space(nullptr), 1966 m_team_begin(0), 1967 m_shmem_begin(0), 1968 m_shmem_size(0), 1969 m_scratch_ptr{nullptr, nullptr}, 1970 m_league_size(arg_policy.league_size()), 1971 m_team_size(arg_policy.team_size()), 1972 m_vector_size(arg_policy.impl_vector_length()) { 1973 cudaFuncAttributes attr = 1974 CudaParallelLaunch<ParallelReduce, 1975 LaunchBounds>::get_cuda_func_attributes(); 1976 m_team_size = 1977 m_team_size >= 0 1978 ? m_team_size 1979 : Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>( 1980 m_policy.space().impl_internal_space_instance(), attr, 1981 m_functor, m_vector_size, m_policy.team_scratch_size(0), 1982 m_policy.thread_scratch_size(0)) / 1983 m_vector_size; 1984 1985 m_team_begin = 1986 UseShflReduction 1987 ? 0 1988 : cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, 1989 WorkTag>(arg_functor, 1990 m_team_size); 1991 m_shmem_begin = sizeof(double) * (m_team_size + 2); 1992 m_shmem_size = 1993 m_policy.scratch_size(0, m_team_size) + 1994 FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size); 1995 m_scratch_size[0] = m_shmem_size; 1996 m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); 1997 m_scratch_ptr[1] = 1998 m_team_size <= 0 1999 ? nullptr 2000 : m_policy.space() 2001 .impl_internal_space_instance() 2002 ->resize_team_scratch_space( 2003 static_cast<ptrdiff_t>(m_scratch_size[1]) * 2004 static_cast<ptrdiff_t>(Cuda::concurrency() / 2005 (m_team_size * m_vector_size))); 2006 2007 // The global parallel_reduce does not support vector_length other than 1 at 2008 // the moment 2009 if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction) 2010 Impl::throw_runtime_exception( 2011 "Kokkos::parallel_reduce with a TeamPolicy using a vector length of " 2012 "greater than 1 is not currently supported for CUDA for dynamic " 2013 "sized reduction types."); 2014 2015 if ((m_team_size < 32) && !UseShflReduction) 2016 Impl::throw_runtime_exception( 2017 "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller " 2018 "than 32 is not currently supported with CUDA for dynamic sized " 2019 "reduction types."); 2020 2021 // Functor's reduce memory, team scan memory, and team shared memory depend 2022 // upon team size. 2023 2024 const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; 2025 2026 if ((!Kokkos::Impl::is_integral_power_of_two(m_team_size) && 2027 !UseShflReduction) || 2028 m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < 2029 shmem_size_total) { 2030 Kokkos::Impl::throw_runtime_exception( 2031 std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size")); 2032 } 2033 if (int(m_team_size) > 2034 arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) { 2035 Kokkos::Impl::throw_runtime_exception( 2036 std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too " 2037 "large team size.")); 2038 } 2039 } 2040 }; 2041 2042 } // namespace Impl 2043 } // namespace Kokkos 2044 2045 //---------------------------------------------------------------------------- 2046 //---------------------------------------------------------------------------- 2047 2048 namespace Kokkos { 2049 namespace Impl { 2050 2051 template <class FunctorType, class... Traits> 2052 class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> { 2053 public: 2054 using Policy = Kokkos::RangePolicy<Traits...>; 2055 2056 private: 2057 using Member = typename Policy::member_type; 2058 using WorkTag = typename Policy::work_tag; 2059 using WorkRange = typename Policy::WorkRange; 2060 using LaunchBounds = typename Policy::launch_bounds; 2061 2062 using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, WorkTag>; 2063 using ValueInit = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>; 2064 using ValueOps = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>; 2065 2066 public: 2067 using pointer_type = typename ValueTraits::pointer_type; 2068 using reference_type = typename ValueTraits::reference_type; 2069 using functor_type = FunctorType; 2070 using size_type = Cuda::size_type; 2071 2072 private: 2073 // Algorithmic constraints: 2074 // (a) blockDim.y is a power of two 2075 // (b) blockDim.y == blockDim.z == 1 2076 // (c) gridDim.x <= blockDim.y * blockDim.y 2077 // (d) gridDim.y == gridDim.z == 1 2078 2079 const FunctorType m_functor; 2080 const Policy m_policy; 2081 size_type* m_scratch_space; 2082 size_type* m_scratch_flags; 2083 size_type m_final; 2084 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION 2085 bool m_run_serial; 2086 #endif 2087 2088 template <class TagType> 2089 __device__ inline 2090 typename std::enable_if<std::is_same<TagType, void>::value>::type exec_range(const Member & i,reference_type update,const bool final_result) const2091 exec_range(const Member& i, reference_type update, 2092 const bool final_result) const { 2093 m_functor(i, update, final_result); 2094 } 2095 2096 template <class TagType> 2097 __device__ inline 2098 typename std::enable_if<!std::is_same<TagType, void>::value>::type exec_range(const Member & i,reference_type update,const bool final_result) const2099 exec_range(const Member& i, reference_type update, 2100 const bool final_result) const { 2101 m_functor(TagType(), i, update, final_result); 2102 } 2103 2104 //---------------------------------------- 2105 initial() const2106 __device__ inline void initial() const { 2107 const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize / 2108 sizeof(size_type)> 2109 word_count(ValueTraits::value_size(m_functor) / sizeof(size_type)); 2110 2111 size_type* const shared_value = 2112 kokkos_impl_cuda_shared_memory<size_type>() + 2113 word_count.value * threadIdx.y; 2114 2115 ValueInit::init(m_functor, shared_value); 2116 2117 // Number of blocks is bounded so that the reduction can be limited to two 2118 // passes. Each thread block is given an approximately equal amount of work 2119 // to perform. Accumulate the values for this block. The accumulation 2120 // ordering does not match the final pass, but is arithmatically equivalent. 2121 2122 const WorkRange range(m_policy, blockIdx.x, gridDim.x); 2123 2124 for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end(); 2125 iwork < iwork_end; iwork += blockDim.y) { 2126 this->template exec_range<WorkTag>( 2127 iwork, ValueOps::reference(shared_value), false); 2128 } 2129 2130 // Reduce and scan, writing out scan of blocks' totals and block-groups' 2131 // totals. Blocks' scan values are written to 'blockIdx.x' location. 2132 // Block-groups' scan values are at: i = ( j * blockDim.y - 1 ) for i < 2133 // gridDim.x 2134 cuda_single_inter_block_reduce_scan<true, FunctorType, WorkTag>( 2135 m_functor, blockIdx.x, gridDim.x, 2136 kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space, 2137 m_scratch_flags); 2138 } 2139 2140 //---------------------------------------- 2141 final() const2142 __device__ inline void final() const { 2143 const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize / 2144 sizeof(size_type)> 2145 word_count(ValueTraits::value_size(m_functor) / sizeof(size_type)); 2146 2147 // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] , 2148 // value[2] , ... } 2149 size_type* const shared_data = kokkos_impl_cuda_shared_memory<size_type>(); 2150 size_type* const shared_prefix = 2151 shared_data + word_count.value * threadIdx.y; 2152 size_type* const shared_accum = 2153 shared_data + word_count.value * (blockDim.y + 1); 2154 2155 // Starting value for this thread block is the previous block's total. 2156 if (blockIdx.x) { 2157 size_type* const block_total = 2158 m_scratch_space + word_count.value * (blockIdx.x - 1); 2159 for (unsigned i = threadIdx.y; i < word_count.value; ++i) { 2160 shared_accum[i] = block_total[i]; 2161 } 2162 } else if (0 == threadIdx.y) { 2163 ValueInit::init(m_functor, shared_accum); 2164 } 2165 2166 const WorkRange range(m_policy, blockIdx.x, gridDim.x); 2167 2168 for (typename Policy::member_type iwork_base = range.begin(); 2169 iwork_base < range.end(); iwork_base += blockDim.y) { 2170 #ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK 2171 unsigned MASK = KOKKOS_IMPL_CUDA_ACTIVEMASK; 2172 #endif 2173 const typename Policy::member_type iwork = iwork_base + threadIdx.y; 2174 2175 __syncthreads(); // Don't overwrite previous iteration values until they 2176 // are used 2177 2178 ValueInit::init(m_functor, shared_prefix + word_count.value); 2179 2180 // Copy previous block's accumulation total into thread[0] prefix and 2181 // inclusive scan value of this block 2182 for (unsigned i = threadIdx.y; i < word_count.value; ++i) { 2183 shared_data[i + word_count.value] = shared_data[i] = shared_accum[i]; 2184 } 2185 #ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK 2186 KOKKOS_IMPL_CUDA_SYNCWARP_MASK(MASK); 2187 #else 2188 KOKKOS_IMPL_CUDA_SYNCWARP; 2189 #endif 2190 if (CudaTraits::WarpSize < word_count.value) { 2191 __syncthreads(); 2192 } // Protect against large scan values. 2193 2194 // Call functor to accumulate inclusive scan value for this work item 2195 if (iwork < range.end()) { 2196 this->template exec_range<WorkTag>( 2197 iwork, ValueOps::reference(shared_prefix + word_count.value), 2198 false); 2199 } 2200 2201 // Scan block values into locations shared_data[1..blockDim.y] 2202 cuda_intra_block_reduce_scan<true, FunctorType, WorkTag>( 2203 m_functor, 2204 typename ValueTraits::pointer_type(shared_data + word_count.value)); 2205 2206 { 2207 size_type* const block_total = 2208 shared_data + word_count.value * blockDim.y; 2209 for (unsigned i = threadIdx.y; i < word_count.value; ++i) { 2210 shared_accum[i] = block_total[i]; 2211 } 2212 } 2213 2214 // Call functor with exclusive scan value 2215 if (iwork < range.end()) { 2216 this->template exec_range<WorkTag>( 2217 iwork, ValueOps::reference(shared_prefix), true); 2218 } 2219 } 2220 } 2221 2222 public: get_policy() const2223 Policy const& get_policy() const { return m_policy; } 2224 2225 //---------------------------------------- 2226 operator ()() const2227 __device__ inline void operator()() const { 2228 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION 2229 if (m_run_serial) { 2230 typename ValueTraits::value_type value; 2231 ValueInit::init(m_functor, (void*)&value); 2232 const WorkRange range(m_policy, blockIdx.x, gridDim.x); 2233 2234 for (typename Policy::member_type iwork_base = range.begin(); 2235 iwork_base < range.end(); iwork_base++) { 2236 this->template exec_range<WorkTag>(iwork_base, value, true); 2237 } 2238 } else { 2239 #endif 2240 if (!m_final) { 2241 initial(); 2242 } else { 2243 final(); 2244 } 2245 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION 2246 } 2247 #endif 2248 } 2249 2250 // Determine block size constrained by shared memory: local_block_size(const FunctorType & f)2251 inline unsigned local_block_size(const FunctorType& f) { 2252 // blockDim.y must be power of two = 128 (4 warps) or 256 (8 warps) or 512 2253 // (16 warps) gridDim.x <= blockDim.y * blockDim.y 2254 // 2255 // 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit 2256 // testing 2257 2258 unsigned n = CudaTraits::WarpSize * 4; 2259 while (n && 2260 unsigned(m_policy.space() 2261 .impl_internal_space_instance() 2262 ->m_maxShmemPerBlock) < 2263 cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, 2264 WorkTag>(f, n)) { 2265 n >>= 1; 2266 } 2267 return n; 2268 } 2269 execute()2270 inline void execute() { 2271 const int nwork = m_policy.end() - m_policy.begin(); 2272 if (nwork) { 2273 enum { GridMaxComputeCapability_2x = 0x0ffff }; 2274 2275 const int block_size = local_block_size(m_functor); 2276 KOKKOS_ASSERT(block_size > 0); 2277 2278 const int grid_max = 2279 (block_size * block_size) < GridMaxComputeCapability_2x 2280 ? (block_size * block_size) 2281 : GridMaxComputeCapability_2x; 2282 2283 // At most 'max_grid' blocks: 2284 const int max_grid = 2285 std::min(int(grid_max), int((nwork + block_size - 1) / block_size)); 2286 2287 // How much work per block: 2288 const int work_per_block = (nwork + max_grid - 1) / max_grid; 2289 2290 // How many block are really needed for this much work: 2291 const int grid_x = (nwork + work_per_block - 1) / work_per_block; 2292 2293 m_scratch_space = cuda_internal_scratch_space( 2294 m_policy.space(), ValueTraits::value_size(m_functor) * grid_x); 2295 m_scratch_flags = 2296 cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1); 2297 2298 dim3 grid(grid_x, 1, 1); 2299 dim3 block(1, block_size, 1); // REQUIRED DIMENSIONS ( 1 , N , 1 ) 2300 const int shmem = ValueTraits::value_size(m_functor) * (block_size + 2); 2301 2302 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION 2303 if (m_run_serial) { 2304 block = dim3(1, 1, 1); 2305 grid = dim3(1, 1, 1); 2306 } else { 2307 #endif 2308 m_final = false; 2309 CudaParallelLaunch<ParallelScan, LaunchBounds>( 2310 *this, grid, block, shmem, 2311 m_policy.space().impl_internal_space_instance(), 2312 false); // copy to device and execute 2313 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION 2314 } 2315 #endif 2316 m_final = true; 2317 CudaParallelLaunch<ParallelScan, LaunchBounds>( 2318 *this, grid, block, shmem, 2319 m_policy.space().impl_internal_space_instance(), 2320 false); // copy to device and execute 2321 } 2322 } 2323 ParallelScan(const FunctorType & arg_functor,const Policy & arg_policy)2324 ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy) 2325 : m_functor(arg_functor), 2326 m_policy(arg_policy), 2327 m_scratch_space(nullptr), 2328 m_scratch_flags(nullptr), 2329 m_final(false) 2330 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION 2331 , 2332 m_run_serial(Kokkos::Impl::CudaInternal::cuda_use_serial_execution()) 2333 #endif 2334 { 2335 } 2336 }; 2337 2338 //---------------------------------------------------------------------------- 2339 template <class FunctorType, class ReturnType, class... Traits> 2340 class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, 2341 ReturnType, Kokkos::Cuda> { 2342 public: 2343 using Policy = Kokkos::RangePolicy<Traits...>; 2344 2345 private: 2346 using Member = typename Policy::member_type; 2347 using WorkTag = typename Policy::work_tag; 2348 using WorkRange = typename Policy::WorkRange; 2349 using LaunchBounds = typename Policy::launch_bounds; 2350 2351 using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, WorkTag>; 2352 using ValueInit = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>; 2353 using ValueOps = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>; 2354 2355 public: 2356 using pointer_type = typename ValueTraits::pointer_type; 2357 using reference_type = typename ValueTraits::reference_type; 2358 using functor_type = FunctorType; 2359 using size_type = Cuda::size_type; 2360 2361 private: 2362 // Algorithmic constraints: 2363 // (a) blockDim.y is a power of two 2364 // (b) blockDim.y == blockDim.z == 1 2365 // (c) gridDim.x <= blockDim.y * blockDim.y 2366 // (d) gridDim.y == gridDim.z == 1 2367 2368 const FunctorType m_functor; 2369 const Policy m_policy; 2370 size_type* m_scratch_space; 2371 size_type* m_scratch_flags; 2372 size_type m_final; 2373 ReturnType& m_returnvalue; 2374 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION 2375 bool m_run_serial; 2376 #endif 2377 2378 template <class TagType> 2379 __device__ inline 2380 typename std::enable_if<std::is_same<TagType, void>::value>::type exec_range(const Member & i,reference_type update,const bool final_result) const2381 exec_range(const Member& i, reference_type update, 2382 const bool final_result) const { 2383 m_functor(i, update, final_result); 2384 } 2385 2386 template <class TagType> 2387 __device__ inline 2388 typename std::enable_if<!std::is_same<TagType, void>::value>::type exec_range(const Member & i,reference_type update,const bool final_result) const2389 exec_range(const Member& i, reference_type update, 2390 const bool final_result) const { 2391 m_functor(TagType(), i, update, final_result); 2392 } 2393 2394 //---------------------------------------- 2395 initial() const2396 __device__ inline void initial() const { 2397 const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize / 2398 sizeof(size_type)> 2399 word_count(ValueTraits::value_size(m_functor) / sizeof(size_type)); 2400 2401 size_type* const shared_value = 2402 kokkos_impl_cuda_shared_memory<size_type>() + 2403 word_count.value * threadIdx.y; 2404 2405 ValueInit::init(m_functor, shared_value); 2406 2407 // Number of blocks is bounded so that the reduction can be limited to two 2408 // passes. Each thread block is given an approximately equal amount of work 2409 // to perform. Accumulate the values for this block. The accumulation 2410 // ordering does not match the final pass, but is arithmatically equivalent. 2411 2412 const WorkRange range(m_policy, blockIdx.x, gridDim.x); 2413 2414 for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end(); 2415 iwork < iwork_end; iwork += blockDim.y) { 2416 this->template exec_range<WorkTag>( 2417 iwork, ValueOps::reference(shared_value), false); 2418 } 2419 2420 // Reduce and scan, writing out scan of blocks' totals and block-groups' 2421 // totals. Blocks' scan values are written to 'blockIdx.x' location. 2422 // Block-groups' scan values are at: i = ( j * blockDim.y - 1 ) for i < 2423 // gridDim.x 2424 cuda_single_inter_block_reduce_scan<true, FunctorType, WorkTag>( 2425 m_functor, blockIdx.x, gridDim.x, 2426 kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space, 2427 m_scratch_flags); 2428 } 2429 2430 //---------------------------------------- 2431 final() const2432 __device__ inline void final() const { 2433 const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize / 2434 sizeof(size_type)> 2435 word_count(ValueTraits::value_size(m_functor) / sizeof(size_type)); 2436 2437 // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] , 2438 // value[2] , ... } 2439 size_type* const shared_data = kokkos_impl_cuda_shared_memory<size_type>(); 2440 size_type* const shared_prefix = 2441 shared_data + word_count.value * threadIdx.y; 2442 size_type* const shared_accum = 2443 shared_data + word_count.value * (blockDim.y + 1); 2444 2445 // Starting value for this thread block is the previous block's total. 2446 if (blockIdx.x) { 2447 size_type* const block_total = 2448 m_scratch_space + word_count.value * (blockIdx.x - 1); 2449 for (unsigned i = threadIdx.y; i < word_count.value; ++i) { 2450 shared_accum[i] = block_total[i]; 2451 } 2452 } else if (0 == threadIdx.y) { 2453 ValueInit::init(m_functor, shared_accum); 2454 } 2455 2456 const WorkRange range(m_policy, blockIdx.x, gridDim.x); 2457 2458 for (typename Policy::member_type iwork_base = range.begin(); 2459 iwork_base < range.end(); iwork_base += blockDim.y) { 2460 #ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK 2461 unsigned MASK = KOKKOS_IMPL_CUDA_ACTIVEMASK; 2462 #endif 2463 2464 const typename Policy::member_type iwork = iwork_base + threadIdx.y; 2465 2466 __syncthreads(); // Don't overwrite previous iteration values until they 2467 // are used 2468 2469 ValueInit::init(m_functor, shared_prefix + word_count.value); 2470 2471 // Copy previous block's accumulation total into thread[0] prefix and 2472 // inclusive scan value of this block 2473 for (unsigned i = threadIdx.y; i < word_count.value; ++i) { 2474 shared_data[i + word_count.value] = shared_data[i] = shared_accum[i]; 2475 } 2476 2477 #ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK 2478 KOKKOS_IMPL_CUDA_SYNCWARP_MASK(MASK); 2479 #else 2480 KOKKOS_IMPL_CUDA_SYNCWARP; 2481 #endif 2482 if (CudaTraits::WarpSize < word_count.value) { 2483 __syncthreads(); 2484 } // Protect against large scan values. 2485 2486 // Call functor to accumulate inclusive scan value for this work item 2487 if (iwork < range.end()) { 2488 this->template exec_range<WorkTag>( 2489 iwork, ValueOps::reference(shared_prefix + word_count.value), 2490 false); 2491 } 2492 2493 // Scan block values into locations shared_data[1..blockDim.y] 2494 cuda_intra_block_reduce_scan<true, FunctorType, WorkTag>( 2495 m_functor, 2496 typename ValueTraits::pointer_type(shared_data + word_count.value)); 2497 2498 { 2499 size_type* const block_total = 2500 shared_data + word_count.value * blockDim.y; 2501 for (unsigned i = threadIdx.y; i < word_count.value; ++i) { 2502 shared_accum[i] = block_total[i]; 2503 } 2504 } 2505 2506 // Call functor with exclusive scan value 2507 if (iwork < range.end()) { 2508 this->template exec_range<WorkTag>( 2509 iwork, ValueOps::reference(shared_prefix), true); 2510 } 2511 } 2512 } 2513 2514 public: get_policy() const2515 Policy const& get_policy() const { return m_policy; } 2516 2517 //---------------------------------------- 2518 operator ()() const2519 __device__ inline void operator()() const { 2520 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION 2521 if (m_run_serial) { 2522 typename ValueTraits::value_type value; 2523 ValueInit::init(m_functor, (void*)&value); 2524 const WorkRange range(m_policy, blockIdx.x, gridDim.x); 2525 2526 for (typename Policy::member_type iwork_base = range.begin(); 2527 iwork_base < range.end(); iwork_base++) { 2528 this->template exec_range<WorkTag>(iwork_base, value, true); 2529 } 2530 *((typename ValueTraits::value_type*)m_scratch_space) = value; 2531 } else { 2532 #endif 2533 if (!m_final) { 2534 initial(); 2535 } else { 2536 final(); 2537 } 2538 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION 2539 } 2540 #endif 2541 } 2542 2543 // Determine block size constrained by shared memory: local_block_size(const FunctorType & f)2544 inline unsigned local_block_size(const FunctorType& f) { 2545 // blockDim.y must be power of two = 128 (4 warps) or 256 (8 warps) or 512 2546 // (16 warps) gridDim.x <= blockDim.y * blockDim.y 2547 // 2548 // 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit 2549 // testing 2550 2551 unsigned n = CudaTraits::WarpSize * 4; 2552 while (n && 2553 unsigned(m_policy.space() 2554 .impl_internal_space_instance() 2555 ->m_maxShmemPerBlock) < 2556 cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, 2557 WorkTag>(f, n)) { 2558 n >>= 1; 2559 } 2560 return n; 2561 } 2562 execute()2563 inline void execute() { 2564 const int nwork = m_policy.end() - m_policy.begin(); 2565 if (nwork) { 2566 enum { GridMaxComputeCapability_2x = 0x0ffff }; 2567 2568 const int block_size = local_block_size(m_functor); 2569 KOKKOS_ASSERT(block_size > 0); 2570 2571 const int grid_max = 2572 (block_size * block_size) < GridMaxComputeCapability_2x 2573 ? (block_size * block_size) 2574 : GridMaxComputeCapability_2x; 2575 2576 // At most 'max_grid' blocks: 2577 const int max_grid = 2578 std::min(int(grid_max), int((nwork + block_size - 1) / block_size)); 2579 2580 // How much work per block: 2581 const int work_per_block = (nwork + max_grid - 1) / max_grid; 2582 2583 // How many block are really needed for this much work: 2584 const int grid_x = (nwork + work_per_block - 1) / work_per_block; 2585 2586 m_scratch_space = cuda_internal_scratch_space( 2587 m_policy.space(), ValueTraits::value_size(m_functor) * grid_x); 2588 m_scratch_flags = 2589 cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1); 2590 2591 dim3 grid(grid_x, 1, 1); 2592 dim3 block(1, block_size, 1); // REQUIRED DIMENSIONS ( 1 , N , 1 ) 2593 const int shmem = ValueTraits::value_size(m_functor) * (block_size + 2); 2594 2595 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION 2596 if (m_run_serial) { 2597 block = dim3(1, 1, 1); 2598 grid = dim3(1, 1, 1); 2599 } else { 2600 #endif 2601 2602 m_final = false; 2603 CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>( 2604 *this, grid, block, shmem, 2605 m_policy.space().impl_internal_space_instance(), 2606 false); // copy to device and execute 2607 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION 2608 } 2609 #endif 2610 m_final = true; 2611 CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>( 2612 *this, grid, block, shmem, 2613 m_policy.space().impl_internal_space_instance(), 2614 false); // copy to device and execute 2615 2616 const int size = ValueTraits::value_size(m_functor); 2617 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION 2618 if (m_run_serial) 2619 DeepCopy<HostSpace, CudaSpace>(&m_returnvalue, m_scratch_space, size); 2620 else 2621 #endif 2622 DeepCopy<HostSpace, CudaSpace>( 2623 &m_returnvalue, m_scratch_space + (grid_x - 1) * size / sizeof(int), 2624 size); 2625 } 2626 } 2627 ParallelScanWithTotal(const FunctorType & arg_functor,const Policy & arg_policy,ReturnType & arg_returnvalue)2628 ParallelScanWithTotal(const FunctorType& arg_functor, 2629 const Policy& arg_policy, ReturnType& arg_returnvalue) 2630 : m_functor(arg_functor), 2631 m_policy(arg_policy), 2632 m_scratch_space(nullptr), 2633 m_scratch_flags(nullptr), 2634 m_final(false), 2635 m_returnvalue(arg_returnvalue) 2636 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION 2637 , 2638 m_run_serial(Kokkos::Impl::CudaInternal::cuda_use_serial_execution()) 2639 #endif 2640 { 2641 } 2642 }; 2643 2644 } // namespace Impl 2645 } // namespace Kokkos 2646 2647 //---------------------------------------------------------------------------- 2648 //---------------------------------------------------------------------------- 2649 2650 namespace Kokkos { 2651 2652 namespace Impl { 2653 template <class FunctorType, class ExecPolicy, class ValueType, 2654 class Tag = typename ExecPolicy::work_tag> 2655 struct CudaFunctorAdapter { 2656 const FunctorType f; 2657 using value_type = ValueType; CudaFunctorAdapterKokkos::Impl::CudaFunctorAdapter2658 CudaFunctorAdapter(const FunctorType& f_) : f(f_) {} 2659 operator ()Kokkos::Impl::CudaFunctorAdapter2660 __device__ inline void operator()(typename ExecPolicy::work_tag, 2661 const typename ExecPolicy::member_type& i, 2662 ValueType& val) const { 2663 // Insert Static Assert with decltype on ValueType equals third argument 2664 // type of FunctorType::operator() 2665 f(typename ExecPolicy::work_tag(), i, val); 2666 } 2667 operator ()Kokkos::Impl::CudaFunctorAdapter2668 __device__ inline void operator()(typename ExecPolicy::work_tag, 2669 const typename ExecPolicy::member_type& i, 2670 const typename ExecPolicy::member_type& j, 2671 ValueType& val) const { 2672 // Insert Static Assert with decltype on ValueType equals third argument 2673 // type of FunctorType::operator() 2674 f(typename ExecPolicy::work_tag(), i, j, val); 2675 } 2676 operator ()Kokkos::Impl::CudaFunctorAdapter2677 __device__ inline void operator()(typename ExecPolicy::work_tag, 2678 const typename ExecPolicy::member_type& i, 2679 const typename ExecPolicy::member_type& j, 2680 const typename ExecPolicy::member_type& k, 2681 ValueType& val) const { 2682 // Insert Static Assert with decltype on ValueType equals third argument 2683 // type of FunctorType::operator() 2684 f(typename ExecPolicy::work_tag(), i, j, k, val); 2685 } 2686 operator ()Kokkos::Impl::CudaFunctorAdapter2687 __device__ inline void operator()(typename ExecPolicy::work_tag, 2688 const typename ExecPolicy::member_type& i, 2689 const typename ExecPolicy::member_type& j, 2690 const typename ExecPolicy::member_type& k, 2691 const typename ExecPolicy::member_type& l, 2692 ValueType& val) const { 2693 // Insert Static Assert with decltype on ValueType equals third argument 2694 // type of FunctorType::operator() 2695 f(typename ExecPolicy::work_tag(), i, j, k, l, val); 2696 } 2697 operator ()Kokkos::Impl::CudaFunctorAdapter2698 __device__ inline void operator()(typename ExecPolicy::work_tag, 2699 const typename ExecPolicy::member_type& i, 2700 const typename ExecPolicy::member_type& j, 2701 const typename ExecPolicy::member_type& k, 2702 const typename ExecPolicy::member_type& l, 2703 const typename ExecPolicy::member_type& m, 2704 ValueType& val) const { 2705 // Insert Static Assert with decltype on ValueType equals third argument 2706 // type of FunctorType::operator() 2707 f(typename ExecPolicy::work_tag(), i, j, k, l, m, val); 2708 } 2709 operator ()Kokkos::Impl::CudaFunctorAdapter2710 __device__ inline void operator()(typename ExecPolicy::work_tag, 2711 const typename ExecPolicy::member_type& i, 2712 const typename ExecPolicy::member_type& j, 2713 const typename ExecPolicy::member_type& k, 2714 const typename ExecPolicy::member_type& l, 2715 const typename ExecPolicy::member_type& m, 2716 const typename ExecPolicy::member_type& n, 2717 ValueType& val) const { 2718 // Insert Static Assert with decltype on ValueType equals third argument 2719 // type of FunctorType::operator() 2720 f(typename ExecPolicy::work_tag(), i, j, k, l, m, n, val); 2721 } 2722 }; 2723 2724 template <class FunctorType, class ExecPolicy, class ValueType> 2725 struct CudaFunctorAdapter<FunctorType, ExecPolicy, ValueType, void> { 2726 const FunctorType f; 2727 using value_type = ValueType; CudaFunctorAdapterKokkos::Impl::CudaFunctorAdapter2728 CudaFunctorAdapter(const FunctorType& f_) : f(f_) {} 2729 operator ()Kokkos::Impl::CudaFunctorAdapter2730 __device__ inline void operator()(const typename ExecPolicy::member_type& i, 2731 ValueType& val) const { 2732 // Insert Static Assert with decltype on ValueType equals second argument 2733 // type of FunctorType::operator() 2734 f(i, val); 2735 } 2736 operator ()Kokkos::Impl::CudaFunctorAdapter2737 __device__ inline void operator()(const typename ExecPolicy::member_type& i, 2738 const typename ExecPolicy::member_type& j, 2739 ValueType& val) const { 2740 // Insert Static Assert with decltype on ValueType equals second argument 2741 // type of FunctorType::operator() 2742 f(i, j, val); 2743 } 2744 operator ()Kokkos::Impl::CudaFunctorAdapter2745 __device__ inline void operator()(const typename ExecPolicy::member_type& i, 2746 const typename ExecPolicy::member_type& j, 2747 const typename ExecPolicy::member_type& k, 2748 ValueType& val) const { 2749 // Insert Static Assert with decltype on ValueType equals second argument 2750 // type of FunctorType::operator() 2751 f(i, j, k, val); 2752 } 2753 operator ()Kokkos::Impl::CudaFunctorAdapter2754 __device__ inline void operator()(const typename ExecPolicy::member_type& i, 2755 const typename ExecPolicy::member_type& j, 2756 const typename ExecPolicy::member_type& k, 2757 const typename ExecPolicy::member_type& l, 2758 ValueType& val) const { 2759 // Insert Static Assert with decltype on ValueType equals second argument 2760 // type of FunctorType::operator() 2761 f(i, j, k, l, val); 2762 } 2763 operator ()Kokkos::Impl::CudaFunctorAdapter2764 __device__ inline void operator()(const typename ExecPolicy::member_type& i, 2765 const typename ExecPolicy::member_type& j, 2766 const typename ExecPolicy::member_type& k, 2767 const typename ExecPolicy::member_type& l, 2768 const typename ExecPolicy::member_type& m, 2769 ValueType& val) const { 2770 // Insert Static Assert with decltype on ValueType equals second argument 2771 // type of FunctorType::operator() 2772 f(i, j, k, l, m, val); 2773 } 2774 operator ()Kokkos::Impl::CudaFunctorAdapter2775 __device__ inline void operator()(const typename ExecPolicy::member_type& i, 2776 const typename ExecPolicy::member_type& j, 2777 const typename ExecPolicy::member_type& k, 2778 const typename ExecPolicy::member_type& l, 2779 const typename ExecPolicy::member_type& m, 2780 const typename ExecPolicy::member_type& n, 2781 ValueType& val) const { 2782 // Insert Static Assert with decltype on ValueType equals second argument 2783 // type of FunctorType::operator() 2784 f(i, j, k, l, m, n, val); 2785 } 2786 operator ()Kokkos::Impl::CudaFunctorAdapter2787 __device__ inline void operator()(typename ExecPolicy::member_type& i, 2788 ValueType& val) const { 2789 // Insert Static Assert with decltype on ValueType equals second argument 2790 // type of FunctorType::operator() 2791 f(i, val); 2792 } 2793 operator ()Kokkos::Impl::CudaFunctorAdapter2794 __device__ inline void operator()(typename ExecPolicy::member_type& i, 2795 typename ExecPolicy::member_type& j, 2796 ValueType& val) const { 2797 // Insert Static Assert with decltype on ValueType equals second argument 2798 // type of FunctorType::operator() 2799 f(i, j, val); 2800 } 2801 operator ()Kokkos::Impl::CudaFunctorAdapter2802 __device__ inline void operator()(typename ExecPolicy::member_type& i, 2803 typename ExecPolicy::member_type& j, 2804 typename ExecPolicy::member_type& k, 2805 ValueType& val) const { 2806 // Insert Static Assert with decltype on ValueType equals second argument 2807 // type of FunctorType::operator() 2808 f(i, j, k, val); 2809 } 2810 operator ()Kokkos::Impl::CudaFunctorAdapter2811 __device__ inline void operator()(typename ExecPolicy::member_type& i, 2812 typename ExecPolicy::member_type& j, 2813 typename ExecPolicy::member_type& k, 2814 typename ExecPolicy::member_type& l, 2815 ValueType& val) const { 2816 // Insert Static Assert with decltype on ValueType equals second argument 2817 // type of FunctorType::operator() 2818 f(i, j, k, l, val); 2819 } 2820 operator ()Kokkos::Impl::CudaFunctorAdapter2821 __device__ inline void operator()(typename ExecPolicy::member_type& i, 2822 typename ExecPolicy::member_type& j, 2823 typename ExecPolicy::member_type& k, 2824 typename ExecPolicy::member_type& l, 2825 typename ExecPolicy::member_type& m, 2826 ValueType& val) const { 2827 // Insert Static Assert with decltype on ValueType equals second argument 2828 // type of FunctorType::operator() 2829 f(i, j, k, l, m, val); 2830 } 2831 operator ()Kokkos::Impl::CudaFunctorAdapter2832 __device__ inline void operator()(typename ExecPolicy::member_type& i, 2833 typename ExecPolicy::member_type& j, 2834 typename ExecPolicy::member_type& k, 2835 typename ExecPolicy::member_type& l, 2836 typename ExecPolicy::member_type& m, 2837 typename ExecPolicy::member_type& n, 2838 ValueType& val) const { 2839 // Insert Static Assert with decltype on ValueType equals second argument 2840 // type of FunctorType::operator() 2841 f(i, j, k, l, m, n, val); 2842 } 2843 }; 2844 2845 template <class FunctorType, class ResultType, class Tag, 2846 bool Enable = IsNonTrivialReduceFunctor<FunctorType>::value> 2847 struct FunctorReferenceType { 2848 using reference_type = ResultType&; 2849 }; 2850 2851 template <class FunctorType, class ResultType, class Tag> 2852 struct FunctorReferenceType<FunctorType, ResultType, Tag, true> { 2853 using reference_type = 2854 typename Kokkos::Impl::FunctorValueTraits<FunctorType, 2855 Tag>::reference_type; 2856 }; 2857 2858 template <class FunctorTypeIn, class ExecPolicy, class ValueType> 2859 struct ParallelReduceFunctorType<FunctorTypeIn, ExecPolicy, ValueType, Cuda> { 2860 enum { 2861 FunctorHasValueType = IsNonTrivialReduceFunctor<FunctorTypeIn>::value 2862 }; 2863 using functor_type = typename Kokkos::Impl::if_c< 2864 FunctorHasValueType, FunctorTypeIn, 2865 Impl::CudaFunctorAdapter<FunctorTypeIn, ExecPolicy, ValueType>>::type; functorKokkos::Impl::ParallelReduceFunctorType2866 static functor_type functor(const FunctorTypeIn& functor_in) { 2867 return Impl::if_c<FunctorHasValueType, FunctorTypeIn, functor_type>::select( 2868 functor_in, functor_type(functor_in)); 2869 } 2870 }; 2871 2872 } // namespace Impl 2873 2874 } // namespace Kokkos 2875 2876 #endif /* defined(KOKKOS_ENABLE_CUDA) */ 2877 #endif /* #ifndef KOKKOS_CUDA_PARALLEL_HPP */ 2878