/*
//@HEADER
// ************************************************************************
//
//                        Kokkos v. 3.0
//       Copyright (2020) National Technology & Engineering
//               Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R.
Trott (crtrott@sandia.gov) 40 // 41 // ************************************************************************ 42 //@HEADER 43 */ 44 45 #ifndef KOKKO_HIP_PARALLEL_TEAM_HPP 46 #define KOKKO_HIP_PARALLEL_TEAM_HPP 47 48 #include <Kokkos_Parallel.hpp> 49 50 #if defined(__HIPCC__) 51 52 #include <HIP/Kokkos_HIP_KernelLaunch.hpp> 53 #include <HIP/Kokkos_HIP_Locks.hpp> 54 #include <HIP/Kokkos_HIP_Team.hpp> 55 #include <HIP/Kokkos_HIP_Instance.hpp> 56 57 namespace Kokkos { 58 namespace Impl { 59 template <typename... Properties> 60 class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...> 61 : public PolicyTraits<Properties...> { 62 public: 63 using execution_policy = TeamPolicyInternal; 64 65 using traits = PolicyTraits<Properties...>; 66 67 template <typename ExecSpace, typename... OtherProperties> 68 friend class TeamPolicyInternal; 69 70 private: 71 static int constexpr MAX_WARP = 8; 72 73 typename traits::execution_space m_space; 74 int m_league_size; 75 int m_team_size; 76 int m_vector_length; 77 int m_team_scratch_size[2]; 78 int m_thread_scratch_size[2]; 79 int m_chunk_size; 80 bool m_tune_team_size; 81 bool m_tune_vector_length; 82 83 public: 84 using execution_space = Kokkos::Experimental::HIP; 85 86 template <class... 
OtherProperties> TeamPolicyInternal(TeamPolicyInternal<OtherProperties...> const & p)87 TeamPolicyInternal(TeamPolicyInternal<OtherProperties...> const& p) { 88 m_league_size = p.m_league_size; 89 m_team_size = p.m_team_size; 90 m_vector_length = p.m_vector_length; 91 m_team_scratch_size[0] = p.m_team_scratch_size[0]; 92 m_team_scratch_size[1] = p.m_team_scratch_size[1]; 93 m_thread_scratch_size[0] = p.m_thread_scratch_size[0]; 94 m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; 95 m_chunk_size = p.m_chunk_size; 96 m_space = p.m_space; 97 m_tune_team_size = p.m_tune_team_size; 98 m_tune_vector_length = p.m_tune_vector_length; 99 } 100 101 template <typename FunctorType> team_size_max(FunctorType const & f,ParallelForTag const &) const102 int team_size_max(FunctorType const& f, ParallelForTag const&) const { 103 using closure_type = 104 Impl::ParallelFor<FunctorType, TeamPolicy<Properties...> >; 105 hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch< 106 closure_type, 107 typename traits::launch_bounds>::get_hip_func_attributes(); 108 int const block_size = ::Kokkos::Experimental::Impl::hip_get_max_block_size< 109 FunctorType, typename traits::launch_bounds>( 110 space().impl_internal_space_instance(), attr, f, 111 static_cast<size_t>(impl_vector_length()), 112 static_cast<size_t>(team_scratch_size(0)) + 2 * sizeof(double), 113 static_cast<size_t>(thread_scratch_size(0)) + sizeof(double)); 114 return block_size / impl_vector_length(); 115 } 116 117 template <class FunctorType> team_size_max(const FunctorType & f,const ParallelReduceTag &) const118 inline int team_size_max(const FunctorType& f, 119 const ParallelReduceTag&) const { 120 using functor_analysis_type = 121 Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, 122 TeamPolicyInternal, FunctorType>; 123 using reducer_type = typename Impl::ParallelReduceReturnValue< 124 void, typename functor_analysis_type::value_type, 125 FunctorType>::reducer_type; 126 using closure_type 
= 127 Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>, 128 reducer_type>; 129 return internal_team_size_max<closure_type>(f); 130 } 131 132 template <class FunctorType, class ReducerType> team_size_max(const FunctorType & f,const ReducerType &,const ParallelReduceTag &) const133 inline int team_size_max(const FunctorType& f, const ReducerType& /*r*/, 134 const ParallelReduceTag&) const { 135 using closure_type = 136 Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>, 137 ReducerType>; 138 return internal_team_size_max<closure_type>(f); 139 } 140 141 template <typename FunctorType> team_size_recommended(FunctorType const & f,ParallelForTag const &) const142 int team_size_recommended(FunctorType const& f, ParallelForTag const&) const { 143 using closure_type = 144 Impl::ParallelFor<FunctorType, TeamPolicy<Properties...> >; 145 hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch< 146 closure_type, 147 typename traits::launch_bounds>::get_hip_func_attributes(); 148 int const block_size = ::Kokkos::Experimental::Impl::hip_get_opt_block_size< 149 FunctorType, typename traits::launch_bounds>( 150 space().impl_internal_space_instance(), attr, f, 151 static_cast<size_t>(impl_vector_length()), 152 static_cast<size_t>(team_scratch_size(0)) + 2 * sizeof(double), 153 static_cast<size_t>(thread_scratch_size(0)) + sizeof(double)); 154 return block_size / impl_vector_length(); 155 } 156 157 template <typename FunctorType> team_size_recommended(FunctorType const & f,ParallelReduceTag const &) const158 inline int team_size_recommended(FunctorType const& f, 159 ParallelReduceTag const&) const { 160 using functor_analysis_type = 161 Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, 162 TeamPolicyInternal, FunctorType>; 163 using reducer_type = typename Impl::ParallelReduceReturnValue< 164 void, typename functor_analysis_type::value_type, 165 FunctorType>::reducer_type; 166 using closure_type = 167 Impl::ParallelReduce<FunctorType, 
TeamPolicy<Properties...>, 168 reducer_type>; 169 return internal_team_size_recommended<closure_type>(f); 170 } 171 172 template <class FunctorType, class ReducerType> team_size_recommended(FunctorType const & f,ReducerType const &,ParallelReduceTag const &) const173 int team_size_recommended(FunctorType const& f, ReducerType const&, 174 ParallelReduceTag const&) const { 175 using closure_type = 176 Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>, 177 ReducerType>; 178 return internal_team_size_recommended<closure_type>(f); 179 } impl_auto_vector_length() const180 inline bool impl_auto_vector_length() const { return m_tune_vector_length; } impl_auto_team_size() const181 inline bool impl_auto_team_size() const { return m_tune_team_size; } vector_length_max()182 static int vector_length_max() { 183 return ::Kokkos::Experimental::Impl::HIPTraits::WarpSize; 184 } 185 verify_requested_vector_length(int requested_vector_length)186 static int verify_requested_vector_length(int requested_vector_length) { 187 int test_vector_length = 188 std::min(requested_vector_length, vector_length_max()); 189 190 // Allow only power-of-two vector_length 191 if (!(is_integral_power_of_two(test_vector_length))) { 192 int test_pow2 = 1; 193 int constexpr warp_size = Experimental::Impl::HIPTraits::WarpSize; 194 while (test_pow2 < warp_size) { 195 test_pow2 <<= 1; 196 if (test_pow2 > test_vector_length) { 197 break; 198 } 199 } 200 test_vector_length = test_pow2 >> 1; 201 } 202 203 return test_vector_length; 204 } 205 scratch_size_max(int level)206 static int scratch_size_max(int level) { 207 return ( 208 level == 0 ? 
1024 * 40 : // FIXME_HIP arbitrarily setting this to 48kB 209 20 * 1024 * 1024); // FIXME_HIP arbitrarily setting this to 20MB 210 } impl_set_vector_length(size_t size)211 inline void impl_set_vector_length(size_t size) { m_vector_length = size; } impl_set_team_size(size_t size)212 inline void impl_set_team_size(size_t size) { m_team_size = size; } impl_vector_length() const213 int impl_vector_length() const { return m_vector_length; } vector_length() const214 KOKKOS_DEPRECATED int vector_length() const { return impl_vector_length(); } 215 team_size() const216 int team_size() const { return m_team_size; } 217 league_size() const218 int league_size() const { return m_league_size; } 219 scratch_size(int level,int team_size_=-1) const220 int scratch_size(int level, int team_size_ = -1) const { 221 if (team_size_ < 0) team_size_ = m_team_size; 222 return m_team_scratch_size[level] + 223 team_size_ * m_thread_scratch_size[level]; 224 } 225 team_scratch_size(int level) const226 int team_scratch_size(int level) const { return m_team_scratch_size[level]; } 227 thread_scratch_size(int level) const228 int thread_scratch_size(int level) const { 229 return m_thread_scratch_size[level]; 230 } 231 space() const232 typename traits::execution_space space() const { return m_space; } 233 TeamPolicyInternal()234 TeamPolicyInternal() 235 : m_space(typename traits::execution_space()), 236 m_league_size(0), 237 m_team_size(-1), 238 m_vector_length(0), 239 m_team_scratch_size{0, 0}, 240 m_thread_scratch_size{0, 0}, 241 m_chunk_size(::Kokkos::Experimental::Impl::HIPTraits::WarpSize), 242 m_tune_team_size(false), 243 m_tune_vector_length(false) {} 244 245 /** \brief Specify league size, request team size */ TeamPolicyInternal(const execution_space space_,int league_size_,int team_size_request,int vector_length_request=1)246 TeamPolicyInternal(const execution_space space_, int league_size_, 247 int team_size_request, int vector_length_request = 1) 248 : m_space(space_), 249 
m_league_size(league_size_), 250 m_team_size(team_size_request), 251 m_vector_length( 252 (vector_length_request > 0) 253 ? verify_requested_vector_length(vector_length_request) 254 : (verify_requested_vector_length(1))), 255 m_team_scratch_size{0, 0}, 256 m_thread_scratch_size{0, 0}, 257 m_chunk_size(::Kokkos::Experimental::Impl::HIPTraits::WarpSize), 258 m_tune_team_size(bool(team_size_request <= 0)), 259 m_tune_vector_length(bool(vector_length_request <= 0)) { 260 // Make sure league size is permissible 261 if (league_size_ >= 262 static_cast<int>( 263 ::Kokkos::Experimental::Impl::hip_internal_maximum_grid_count())) 264 Impl::throw_runtime_exception( 265 "Requested too large league_size for TeamPolicy on HIP execution " 266 "space."); 267 268 // Make sure total block size is permissible 269 if (m_team_size * m_vector_length > 1024) { 270 Impl::throw_runtime_exception( 271 std::string("Kokkos::TeamPolicy< HIP > the team size is too large. " 272 "Team size x vector length must be smaller than 1024.")); 273 } 274 } 275 276 /** \brief Specify league size, request team size */ TeamPolicyInternal(const execution_space space_,int league_size_,const Kokkos::AUTO_t &,int vector_length_request=1)277 TeamPolicyInternal(const execution_space space_, int league_size_, 278 const Kokkos::AUTO_t& /* team_size_request */, 279 int vector_length_request = 1) 280 : TeamPolicyInternal(space_, league_size_, -1, vector_length_request) {} 281 // FLAG 282 /** \brief Specify league size and team size, request vector length*/ TeamPolicyInternal(const execution_space space_,int league_size_,int team_size_request,const Kokkos::AUTO_t &)283 TeamPolicyInternal(const execution_space space_, int league_size_, 284 int team_size_request, 285 const Kokkos::AUTO_t& /* vector_length_request */ 286 ) 287 : TeamPolicyInternal(space_, league_size_, team_size_request, -1) 288 289 {} 290 291 /** \brief Specify league size, request team size and vector length*/ TeamPolicyInternal(const execution_space 
space_,int league_size_,const Kokkos::AUTO_t &,const Kokkos::AUTO_t &)292 TeamPolicyInternal(const execution_space space_, int league_size_, 293 const Kokkos::AUTO_t& /* team_size_request */, 294 const Kokkos::AUTO_t& /* vector_length_request */ 295 296 ) 297 : TeamPolicyInternal(space_, league_size_, -1, -1) 298 299 {} 300 TeamPolicyInternal(int league_size_,int team_size_request,int vector_length_request=1)301 TeamPolicyInternal(int league_size_, int team_size_request, 302 int vector_length_request = 1) 303 : TeamPolicyInternal(typename traits::execution_space(), league_size_, 304 team_size_request, vector_length_request) {} 305 TeamPolicyInternal(int league_size_,const Kokkos::AUTO_t &,int vector_length_request=1)306 TeamPolicyInternal(int league_size_, 307 const Kokkos::AUTO_t& /* team_size_request */, 308 int vector_length_request = 1) 309 : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1, 310 vector_length_request) {} 311 312 /** \brief Specify league size and team size, request vector length*/ TeamPolicyInternal(int league_size_,int team_size_request,const Kokkos::AUTO_t &)313 TeamPolicyInternal(int league_size_, int team_size_request, 314 const Kokkos::AUTO_t& /* vector_length_request */ 315 316 ) 317 : TeamPolicyInternal(typename traits::execution_space(), league_size_, 318 team_size_request, -1) 319 320 {} 321 322 /** \brief Specify league size, request team size and vector length*/ TeamPolicyInternal(int league_size_,const Kokkos::AUTO_t &,const Kokkos::AUTO_t &)323 TeamPolicyInternal(int league_size_, 324 const Kokkos::AUTO_t& /* team_size_request */, 325 const Kokkos::AUTO_t& /* vector_length_request */ 326 327 ) 328 : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1, 329 -1) {} 330 chunk_size() const331 int chunk_size() const { return m_chunk_size; } 332 set_chunk_size(typename traits::index_type chunk_size_)333 TeamPolicyInternal& set_chunk_size(typename traits::index_type chunk_size_) { 334 m_chunk_size 
= chunk_size_; 335 return *this; 336 } 337 338 /** \brief set per team scratch size for a specific level of the scratch 339 * hierarchy */ set_scratch_size(int level,PerTeamValue const & per_team)340 TeamPolicyInternal& set_scratch_size(int level, 341 PerTeamValue const& per_team) { 342 m_team_scratch_size[level] = per_team.value; 343 return *this; 344 } 345 346 /** \brief set per thread scratch size for a specific level of the scratch 347 * hierarchy */ set_scratch_size(int level,PerThreadValue const & per_thread)348 TeamPolicyInternal& set_scratch_size(int level, 349 PerThreadValue const& per_thread) { 350 m_thread_scratch_size[level] = per_thread.value; 351 return *this; 352 } 353 354 /** \brief set per thread and per team scratch size for a specific level of 355 * the scratch hierarchy */ set_scratch_size(int level,PerTeamValue const & per_team,PerThreadValue const & per_thread)356 TeamPolicyInternal& set_scratch_size(int level, PerTeamValue const& per_team, 357 PerThreadValue const& per_thread) { 358 m_team_scratch_size[level] = per_team.value; 359 m_thread_scratch_size[level] = per_thread.value; 360 return *this; 361 } 362 363 using member_type = Kokkos::Impl::HIPTeamMember; 364 365 protected: 366 template <class ClosureType, class FunctorType, class BlockSizeCallable> internal_team_size_common(const FunctorType & f,BlockSizeCallable && block_size_callable) const367 int internal_team_size_common(const FunctorType& f, 368 BlockSizeCallable&& block_size_callable) const { 369 using closure_type = ClosureType; 370 using functor_value_traits = 371 Impl::FunctorValueTraits<FunctorType, typename traits::work_tag>; 372 373 hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch< 374 closure_type, 375 typename traits::launch_bounds>::get_hip_func_attributes(); 376 const int block_size = std::forward<BlockSizeCallable>(block_size_callable)( 377 space().impl_internal_space_instance(), attr, f, 378 static_cast<size_t>(impl_vector_length()), 379 
static_cast<size_t>(team_scratch_size(0)) + 2 * sizeof(double), 380 static_cast<size_t>(thread_scratch_size(0)) + sizeof(double) + 381 ((functor_value_traits::StaticValueSize != 0) 382 ? 0 383 : functor_value_traits::value_size(f))); 384 KOKKOS_ASSERT(block_size > 0); 385 386 // Currently we require Power-of-2 team size for reductions. 387 int p2 = 1; 388 while (p2 <= block_size) p2 *= 2; 389 p2 /= 2; 390 return p2 / impl_vector_length(); 391 } 392 393 template <class ClosureType, class FunctorType> internal_team_size_max(const FunctorType & f) const394 int internal_team_size_max(const FunctorType& f) const { 395 return internal_team_size_common<ClosureType>( 396 f, ::Kokkos::Experimental::Impl::hip_get_max_block_size< 397 FunctorType, typename traits::launch_bounds>); 398 } 399 400 template <class ClosureType, class FunctorType> internal_team_size_recommended(const FunctorType & f) const401 int internal_team_size_recommended(const FunctorType& f) const { 402 return internal_team_size_common<ClosureType>( 403 f, ::Kokkos::Experimental::Impl::hip_get_opt_block_size< 404 FunctorType, typename traits::launch_bounds>); 405 } 406 }; 407 408 template <typename FunctorType, typename... 
Properties>
class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
                  Kokkos::Experimental::HIP> {
 public:
  using Policy       = TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>;
  using functor_type = FunctorType;
  using size_type    = ::Kokkos::Experimental::HIP::size_type;

 private:
  using member_type   = typename Policy::member_type;
  using work_tag      = typename Policy::work_tag;
  using launch_bounds = typename Policy::launch_bounds;

  // Algorithmic constraints: blockDim.y is a power of two AND
  // blockDim.y == blockDim.z == 1 shared memory utilization:
  //
  //  [ team reduce space ]
  //  [ team shared space ]

  FunctorType const m_functor;
  Policy const m_policy;
  size_type const m_league_size;
  int m_team_size;
  size_type const m_vector_size;
  int m_shmem_begin;
  int m_shmem_size;
  void* m_scratch_ptr[2];
  int m_scratch_size[2];
  // Only let one ParallelFor/Reduce modify the team scratch memory. The
  // constructor acquires the mutex which is released in the destructor.
  std::unique_lock<std::mutex> m_scratch_lock;

  // Untagged functor: invoke with the team member only.
  template <typename TagType>
  __device__ inline
      typename std::enable_if<std::is_same<TagType, void>::value>::type
      exec_team(const member_type& member) const {
    m_functor(member);
  }

  // Tagged functor: prepend the work tag.
  template <typename TagType>
  __device__ inline
      typename std::enable_if<!std::is_same<TagType, void>::value>::type
      exec_team(const member_type& member) const {
    m_functor(TagType(), member);
  }

 public:
  __device__ inline void operator()() const {
    // When level-1 scratch is in use, thread (0,0) of the block claims a
    // slot in the global lock array via atomicCAS so that concurrently
    // resident blocks never alias each other's scratch segment.
    int64_t scratch_tid = 0;
    if (m_scratch_size[1] > 0) {
      __shared__ int64_t s_base_tid;
      if (threadIdx.x == 0 && threadIdx.y == 0) {
        scratch_tid = (blockIdx.x * blockDim.z + threadIdx.z) %
                      (Kokkos::Impl::g_device_hip_lock_arrays.n /
                       (blockDim.x * blockDim.y));
        scratch_tid *= blockDim.x * blockDim.y;
        bool acquired = false;
        while (!acquired) {
          acquired =
              (0 ==
               atomicCAS(
                   &Kokkos::Impl::g_device_hip_lock_arrays.scratch[scratch_tid],
                   0, 1));
          if (!acquired) {
            // Slot taken: probe the next one, wrapping at the array end.
            scratch_tid += blockDim.x * blockDim.y;
            if (int64_t(scratch_tid + blockDim.x * blockDim.y) >=
                int64_t(Kokkos::Impl::g_device_hip_lock_arrays.n))
              scratch_tid = 0;
          }
        }
        s_base_tid = scratch_tid;
      }
      __syncthreads();
      scratch_tid = s_base_tid;
    }

    // Blocks stride through the league.
    int const league_size_int = static_cast<int>(m_league_size);
    for (int league_rank = blockIdx.x; league_rank < league_size_int;
         league_rank += gridDim.x) {
      this->template exec_team<work_tag>(typename Policy::member_type(
          ::Kokkos::Experimental::kokkos_impl_hip_shared_memory<void>(),
          m_shmem_begin, m_shmem_size,
          static_cast<void*>(
              static_cast<char*>(m_scratch_ptr[1]) +
              ptrdiff_t(scratch_tid / (blockDim.x * blockDim.y)) *
                  m_scratch_size[1]),
          m_scratch_size[1], league_rank, m_league_size));
    }

    // Release the claimed lock-array slot.
    if (m_scratch_size[1] > 0) {
      __syncthreads();
      if (threadIdx.x == 0 && threadIdx.y == 0)
        Kokkos::Impl::g_device_hip_lock_arrays.scratch[scratch_tid] = 0;
    }
  }

  inline void execute() const {
    int64_t const shmem_size_total = m_shmem_begin + m_shmem_size;
    dim3 const grid(static_cast<int>(m_league_size), 1, 1);
    dim3 const block(static_cast<int>(m_vector_size),
                     static_cast<int>(m_team_size), 1);

    ::Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, launch_bounds>(
        *this, grid, block, shmem_size_total,
        m_policy.space().impl_internal_space_instance(),
        true);  // copy to device and execute
  }

  ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy)
      : m_functor(arg_functor),
        m_policy(arg_policy),
        m_league_size(arg_policy.league_size()),
        m_team_size(arg_policy.team_size()),
        m_vector_size(arg_policy.impl_vector_length()),
        m_scratch_lock(m_policy.space()
                           .impl_internal_space_instance()
                           ->m_team_scratch_mutex) {
    hipFuncAttributes func_attr =
        ::Kokkos::Experimental::Impl::HIPParallelLaunch<
            ParallelFor, launch_bounds>::get_hip_func_attributes();
    // A negative team size means "auto": take the occupancy-optimal block
    // size and divide out the vector length.
    m_team_size =
        m_team_size >= 0
            ? m_team_size
            : ::Kokkos::Experimental::Impl::hip_get_opt_block_size<
                  FunctorType, launch_bounds>(
                  m_policy.space().impl_internal_space_instance(), func_attr,
                  m_functor, m_vector_size, m_policy.team_scratch_size(0),
                  m_policy.thread_scratch_size(0)) /
                  m_vector_size;

    // Functor's reduce memory, team scan memory, and team shared memory
    // depend upon team size.
    m_shmem_begin = (sizeof(double) * (m_team_size + 2));
    m_shmem_size =
        (m_policy.scratch_size(0, m_team_size) +
         FunctorTeamShmemSize<FunctorType>::value(m_functor, m_team_size));
    m_scratch_size[0] = m_policy.scratch_size(0, m_team_size);
    m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);

    m_scratch_ptr[0] = nullptr;
    m_scratch_ptr[1] =
        m_team_size <= 0
            ? nullptr
            : m_policy.space()
                  .impl_internal_space_instance()
                  ->resize_team_scratch_space(
                      static_cast<ptrdiff_t>(m_scratch_size[1]) *
                      static_cast<ptrdiff_t>(
                          ::Kokkos::Experimental::HIP::concurrency() /
                          (m_team_size * m_vector_size)));

    // Reject configurations that exceed the per-block shared-memory limit.
    int const shmem_size_total = m_shmem_begin + m_shmem_size;
    if (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
        shmem_size_total) {
      printf(
          "%i %i\n",
          m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock,
          shmem_size_total);
      Kokkos::Impl::throw_runtime_exception(std::string(
          "Kokkos::Impl::ParallelFor< HIP > insufficient shared memory"));
    }

    // Reject team sizes larger than the kernel can actually launch.
    if (static_cast<int>(m_team_size) >
        static_cast<int>(
            ::Kokkos::Experimental::Impl::hip_get_max_block_size<FunctorType,
                                                                 launch_bounds>(
                m_policy.space().impl_internal_space_instance(), func_attr,
                arg_functor, arg_policy.impl_vector_length(),
                arg_policy.team_scratch_size(0),
                arg_policy.thread_scratch_size(0)) /
            arg_policy.impl_vector_length())) {
      Kokkos::Impl::throw_runtime_exception(std::string(
          "Kokkos::Impl::ParallelFor< HIP > requested too large team size."));
    }
} 580 }; 581 582 //---------------------------------------------------------------------------- 583 //---------------------------------------------------------------------------- 584 585 template <class FunctorType, class ReducerType, class... Properties> 586 class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>, 587 ReducerType, Kokkos::Experimental::HIP> { 588 public: 589 using Policy = TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>; 590 591 private: 592 using member_type = typename Policy::member_type; 593 using work_tag = typename Policy::work_tag; 594 using launch_bounds = typename Policy::launch_bounds; 595 596 using reducer_conditional = 597 Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value, 598 FunctorType, ReducerType>; 599 using reducer_type_fwd = typename reducer_conditional::type; 600 using work_tag_fwd = 601 typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value, 602 work_tag, void>::type; 603 604 using value_traits = 605 Kokkos::Impl::FunctorValueTraits<reducer_type_fwd, work_tag_fwd>; 606 using value_init = 607 Kokkos::Impl::FunctorValueInit<reducer_type_fwd, work_tag_fwd>; 608 using value_join = 609 Kokkos::Impl::FunctorValueJoin<reducer_type_fwd, work_tag_fwd>; 610 611 using pointer_type = typename value_traits::pointer_type; 612 using reference_type = typename value_traits::reference_type; 613 using value_type = typename value_traits::value_type; 614 615 public: 616 using functor_type = FunctorType; 617 using size_type = Kokkos::Experimental::HIP::size_type; 618 619 static int constexpr UseShflReduction = (value_traits::StaticValueSize != 0); 620 621 private: 622 struct ShflReductionTag {}; 623 struct SHMEMReductionTag {}; 624 625 // Algorithmic constraints: blockDim.y is a power of two AND 626 // blockDim.y == blockDim.z == 1 shared memory utilization: 627 // 628 // [ global reduce space ] 629 // [ team reduce space ] 630 // [ team shared space ] 631 // 632 633 const FunctorType 
m_functor; 634 const Policy m_policy; 635 const ReducerType m_reducer; 636 const pointer_type m_result_ptr; 637 const bool m_result_ptr_device_accessible; 638 const bool m_result_ptr_host_accessible; 639 size_type* m_scratch_space; 640 size_type* m_scratch_flags; 641 size_type m_team_begin; 642 size_type m_shmem_begin; 643 size_type m_shmem_size; 644 void* m_scratch_ptr[2]; 645 int m_scratch_size[2]; 646 const size_type m_league_size; 647 int m_team_size; 648 const size_type m_vector_size; 649 // Only let one ParallelFor/Reduce modify the team scratch memory. The 650 // constructor acquires the mutex which is released in the destructor. 651 std::unique_lock<std::mutex> m_scratch_lock; 652 653 template <class TagType> 654 __device__ inline 655 typename std::enable_if<std::is_same<TagType, void>::value>::type exec_team(member_type const & member,reference_type update) const656 exec_team(member_type const& member, reference_type update) const { 657 m_functor(member, update); 658 } 659 660 template <class TagType> 661 __device__ inline 662 typename std::enable_if<!std::is_same<TagType, void>::value>::type exec_team(member_type const & member,reference_type update) const663 exec_team(member_type const& member, reference_type update) const { 664 m_functor(TagType(), member, update); 665 } 666 iterate_through_league(int const threadid,reference_type value) const667 __device__ inline void iterate_through_league(int const threadid, 668 reference_type value) const { 669 int const int_league_size = static_cast<int>(m_league_size); 670 for (int league_rank = blockIdx.x; league_rank < int_league_size; 671 league_rank += gridDim.x) { 672 this->template exec_team<work_tag>( 673 member_type( 674 Kokkos::Experimental::kokkos_impl_hip_shared_memory<char>() + 675 m_team_begin, 676 m_shmem_begin, m_shmem_size, 677 reinterpret_cast<void*>( 678 reinterpret_cast<char*>(m_scratch_ptr[1]) + 679 static_cast<ptrdiff_t>(threadid / (blockDim.x * blockDim.y)) * 680 m_scratch_size[1]), 681 
m_scratch_size[1], league_rank, m_league_size), 682 value); 683 } 684 } 685 686 public: operator ()() const687 __device__ inline void operator()() const { 688 int64_t threadid = 0; 689 if (m_scratch_size[1] > 0) { 690 __shared__ int64_t base_thread_id; 691 if (threadIdx.x == 0 && threadIdx.y == 0) { 692 threadid = (blockIdx.x * blockDim.z + threadIdx.z) % 693 (Kokkos::Impl::g_device_hip_lock_arrays.n / 694 (blockDim.x * blockDim.y)); 695 threadid *= blockDim.x * blockDim.y; 696 int done = 0; 697 while (!done) { 698 done = (0 == 699 atomicCAS( 700 &Kokkos::Impl::g_device_hip_lock_arrays.scratch[threadid], 701 0, 1)); 702 if (!done) { 703 threadid += blockDim.x * blockDim.y; 704 if (static_cast<int64_t>(threadid + blockDim.x * blockDim.y) >= 705 static_cast<int64_t>(Kokkos::Impl::g_device_hip_lock_arrays.n)) 706 threadid = 0; 707 } 708 } 709 base_thread_id = threadid; 710 } 711 __syncthreads(); 712 threadid = base_thread_id; 713 } 714 715 using ReductionTag = std::conditional_t<UseShflReduction, ShflReductionTag, 716 SHMEMReductionTag>; 717 run(ReductionTag{}, threadid); 718 719 if (m_scratch_size[1] > 0) { 720 __syncthreads(); 721 if (threadIdx.x == 0 && threadIdx.y == 0) { 722 Kokkos::Impl::g_device_hip_lock_arrays.scratch[threadid] = 0; 723 } 724 } 725 } 726 run(SHMEMReductionTag,int const threadid) const727 __device__ inline void run(SHMEMReductionTag, int const threadid) const { 728 integral_nonzero_constant<size_type, value_traits::StaticValueSize / 729 sizeof(size_type)> const 730 word_count(value_traits::value_size( 731 reducer_conditional::select(m_functor, m_reducer)) / 732 sizeof(size_type)); 733 734 reference_type value = value_init::init( 735 reducer_conditional::select(m_functor, m_reducer), 736 Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>() + 737 threadIdx.y * word_count.value); 738 739 // Iterate this block through the league 740 iterate_through_league(threadid, value); 741 742 // Reduce with final value at blockDim.y - 1 location. 
743 bool do_final_reduce = (m_league_size == 0); 744 if (!do_final_reduce) 745 do_final_reduce = 746 hip_single_inter_block_reduce_scan<false, FunctorType, work_tag>( 747 reducer_conditional::select(m_functor, m_reducer), blockIdx.x, 748 gridDim.x, 749 Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>(), 750 m_scratch_space, m_scratch_flags); 751 if (do_final_reduce) { 752 // This is the final block with the final result at the final threads' 753 // location 754 755 size_type* const shared = 756 Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>() + 757 (blockDim.y - 1) * word_count.value; 758 size_type* const global = m_result_ptr_device_accessible 759 ? reinterpret_cast<size_type*>(m_result_ptr) 760 : m_scratch_space; 761 762 if (threadIdx.y == 0) { 763 Kokkos::Impl::FunctorFinal<reducer_type_fwd, work_tag_fwd>::final( 764 reducer_conditional::select(m_functor, m_reducer), shared); 765 } 766 767 if (Kokkos::Experimental::Impl::HIPTraits::WarpSize < word_count.value) { 768 __syncthreads(); 769 } 770 771 for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { 772 global[i] = shared[i]; 773 } 774 } 775 } 776 run(ShflReductionTag,int const threadid) const777 __device__ inline void run(ShflReductionTag, int const threadid) const { 778 value_type value; 779 value_init::init(reducer_conditional::select(m_functor, m_reducer), &value); 780 781 // Iterate this block through the league 782 iterate_through_league(threadid, value); 783 784 pointer_type const result = 785 m_result_ptr_device_accessible 786 ? 
m_result_ptr 787 : reinterpret_cast<pointer_type>(m_scratch_space); 788 789 value_type init; 790 value_init::init(reducer_conditional::select(m_functor, m_reducer), &init); 791 if (m_league_size == 0) { 792 Kokkos::Impl::FunctorFinal<reducer_type_fwd, work_tag_fwd>::final( 793 reducer_conditional::select(m_functor, m_reducer), 794 reinterpret_cast<void*>(&value)); 795 *result = value; 796 } else if (Impl::hip_inter_block_shuffle_reduction<FunctorType, value_join, 797 work_tag>( 798 value, init, 799 value_join( 800 reducer_conditional::select(m_functor, m_reducer)), 801 m_scratch_space, result, m_scratch_flags, blockDim.y)) { 802 unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x; 803 if (id == 0) { 804 Kokkos::Impl::FunctorFinal<reducer_type_fwd, work_tag_fwd>::final( 805 reducer_conditional::select(m_functor, m_reducer), 806 reinterpret_cast<void*>(&value)); 807 *result = value; 808 } 809 } 810 } 811 execute()812 inline void execute() { 813 const int nwork = m_league_size * m_team_size; 814 const bool need_device_set = ReduceFunctorHasInit<FunctorType>::value || 815 ReduceFunctorHasFinal<FunctorType>::value || 816 !m_result_ptr_host_accessible || 817 !std::is_same<ReducerType, InvalidType>::value; 818 if ((nwork > 0) || need_device_set) { 819 const int block_count = 820 UseShflReduction 821 ? 
  // Host-side entry point: allocates device scratch, configures the launch
  // geometry, launches the reduction kernel, and (if needed) fences and
  // copies the result back to the host.
  inline void execute() {
    const int nwork = m_league_size * m_team_size;
    // A device launch is required even with zero work items when the
    // functor has init()/final() members, the result is not host
    // accessible, or an explicit reducer is used -- in those cases the
    // result must still be produced on the device.
    const bool need_device_set = ReduceFunctorHasInit<FunctorType>::value ||
                                 ReduceFunctorHasFinal<FunctorType>::value ||
                                 !m_result_ptr_host_accessible ||
                                 !std::is_same<ReducerType, InvalidType>::value;
    if ((nwork > 0) || need_device_set) {
      // Shuffle path: cap the grid at 1024 * WarpSize blocks.
      // Non-shuffle path: at most min(league_size, team_size) blocks.
      const int block_count =
          UseShflReduction
              ? std::min(
                    m_league_size,
                    size_type(1024 *
                              Kokkos::Experimental::Impl::HIPTraits::WarpSize))
              : std::min(static_cast<int>(m_league_size), m_team_size);

      // One reduction value slot per block, plus flags used for inter-block
      // coordination.
      m_scratch_space = Kokkos::Experimental::Impl::hip_internal_scratch_space(
          value_traits::value_size(
              reducer_conditional::select(m_functor, m_reducer)) *
          block_count);
      m_scratch_flags = Kokkos::Experimental::Impl::hip_internal_scratch_flags(
          sizeof(size_type));

      dim3 block(m_vector_size, m_team_size, 1);
      dim3 grid(block_count, 1, 1);
      if (nwork == 0) {
        // Degenerate single-thread launch, used only to run the device-side
        // init/final on the identity value.
        block = dim3(1, 1, 1);
        grid  = dim3(1, 1, 1);
      }
      // Total L0 (shared) memory: reduce/scan buffer + team internals +
      // user-requested team scratch.
      const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size;

      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelReduce,
                                                    launch_bounds>(
          *this, grid, block, shmem_size_total,
          m_policy.space().impl_internal_space_instance(),
          true);  // copy to device and execute

      if (!m_result_ptr_device_accessible) {
        // Kernel wrote into device scratch: wait for completion, then copy
        // the reduced value to the host-side result pointer.
        m_policy.space().impl_internal_space_instance()->fence();

        if (m_result_ptr) {
          const int size = value_traits::value_size(
              reducer_conditional::select(m_functor, m_reducer));
          DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace>(
              m_result_ptr, m_scratch_space, size);
        }
      }
    } else {
      // No work and no device-side requirements: the result is simply the
      // reduction identity, initialized directly on the host.
      if (m_result_ptr) {
        value_init::init(reducer_conditional::select(m_functor, m_reducer),
                         m_result_ptr);
      }
    }
  }
  // Construct from functor + policy + result view (no explicit reducer).
  // Deduces the team size when the policy did not fix one, computes the
  // shared-memory layout, allocates team scratch, and validates the
  // resulting configuration. Enabled only for Kokkos::View results.
  template <class ViewType>
  ParallelReduce(FunctorType const& arg_functor, Policy const& arg_policy,
                 ViewType const& arg_result,
                 typename std::enable_if<Kokkos::is_view<ViewType>::value,
                                         void*>::type = nullptr)
      : m_functor(arg_functor),
        m_policy(arg_policy),
        m_reducer(InvalidType()),
        m_result_ptr(arg_result.data()),
        // True when the kernel can write the result view directly from the
        // device (no host-side deep copy needed afterwards).
        m_result_ptr_device_accessible(
            MemorySpaceAccess<Kokkos::Experimental::HIPSpace,
                              typename ViewType::memory_space>::accessible),
        m_result_ptr_host_accessible(
            MemorySpaceAccess<Kokkos::HostSpace,
                              typename ViewType::memory_space>::accessible),
        m_scratch_space(nullptr),
        m_scratch_flags(nullptr),
        m_team_begin(0),
        m_shmem_begin(0),
        m_shmem_size(0),
        m_scratch_ptr{nullptr, nullptr},
        m_league_size(arg_policy.league_size()),
        m_team_size(arg_policy.team_size()),
        m_vector_size(arg_policy.impl_vector_length()),
        // Guards the shared per-instance team scratch allocation.
        m_scratch_lock(m_policy.space()
                           .impl_internal_space_instance()
                           ->m_team_scratch_mutex) {
    hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch<
        ParallelReduce, launch_bounds>::get_hip_func_attributes();
    // A negative team size means "auto": use the occupancy-optimal block
    // size for this kernel and divide out the vector length.
    m_team_size =
        m_team_size >= 0
            ? m_team_size
            : Kokkos::Experimental::Impl::hip_get_opt_block_size<FunctorType,
                                                                 launch_bounds>(
                  m_policy.space().impl_internal_space_instance(), attr,
                  m_functor, m_vector_size, m_policy.team_scratch_size(0),
                  m_policy.thread_scratch_size(0)) /
              m_vector_size;

    // Shared-memory layout: [inter-block reduce/scan buffer | team
    // internals | user L0 scratch]. The shuffle path keeps the reduction in
    // registers, so it needs no reduce/scan buffer.
    m_team_begin =
        UseShflReduction
            ? 0
            : hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
                                                       work_tag>(arg_functor,
                                                                 m_team_size);
    m_shmem_begin = sizeof(double) * (m_team_size + 2);
    m_shmem_size =
        m_policy.scratch_size(0, m_team_size) +
        FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size);
    m_scratch_size[0] = m_shmem_size;
    m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
    // Level-1 scratch is sized for the maximum number of concurrently
    // resident teams (device concurrency / threads-per-team).
    m_scratch_ptr[1] =
        m_team_size <= 0
            ? nullptr
            : m_policy.space()
                  .impl_internal_space_instance()
                  ->resize_team_scratch_space(
                      static_cast<std::int64_t>(m_scratch_size[1]) *
                      (static_cast<std::int64_t>(
                          Kokkos::Experimental::HIP::concurrency() /
                          (m_team_size * m_vector_size))));

    // The global parallel_reduce does not support vector_length other than 1 at
    // the moment
    if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction)
      Impl::throw_runtime_exception(
          "Kokkos::parallel_reduce with a TeamPolicy using a vector length of "
          "greater than 1 is not currently supported for HIP for dynamic "
          "sized reduction types.");

    if ((m_team_size < Kokkos::Experimental::Impl::HIPTraits::WarpSize) &&
        !UseShflReduction)
      Impl::throw_runtime_exception(
          "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller "
          "than 64 is not currently supported with HIP for dynamic sized "
          "reduction types.");

    // Functor's reduce memory, team scan memory, and team shared memory depend
    // upon team size.

    const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size;

    // The non-shuffle reduce/scan algorithm requires a power-of-two team
    // size.
    if (!Kokkos::Impl::is_integral_power_of_two(m_team_size) &&
        !UseShflReduction) {
      Kokkos::Impl::throw_runtime_exception(
          std::string("Kokkos::Impl::ParallelReduce< HIP > bad team size"));
    }

    if (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
        shmem_size_total) {
      Kokkos::Impl::throw_runtime_exception(
          std::string("Kokkos::Impl::ParallelReduce< HIP > requested too much "
                      "L0 scratch memory"));
    }

    if (static_cast<int>(m_team_size) >
        arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) {
      Kokkos::Impl::throw_runtime_exception(
          std::string("Kokkos::Impl::ParallelReduce< HIP > requested too "
                      "large team size."));
    }
  }
945 946 const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; 947 948 if (!Kokkos::Impl::is_integral_power_of_two(m_team_size) && 949 !UseShflReduction) { 950 Kokkos::Impl::throw_runtime_exception( 951 std::string("Kokkos::Impl::ParallelReduce< HIP > bad team size")); 952 } 953 954 if (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < 955 shmem_size_total) { 956 Kokkos::Impl::throw_runtime_exception( 957 std::string("Kokkos::Impl::ParallelReduce< HIP > requested too much " 958 "L0 scratch memory")); 959 } 960 961 if (static_cast<int>(m_team_size) > 962 arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) { 963 Kokkos::Impl::throw_runtime_exception( 964 std::string("Kokkos::Impl::ParallelReduce< HIP > requested too " 965 "large team size.")); 966 } 967 } 968 ParallelReduce(FunctorType const & arg_functor,Policy const & arg_policy,ReducerType const & reducer)969 ParallelReduce(FunctorType const& arg_functor, Policy const& arg_policy, 970 ReducerType const& reducer) 971 : m_functor(arg_functor), 972 m_policy(arg_policy), 973 m_reducer(reducer), 974 m_result_ptr(reducer.view().data()), 975 m_result_ptr_device_accessible( 976 MemorySpaceAccess<Kokkos::Experimental::HIPSpace, 977 typename ReducerType::result_view_type:: 978 memory_space>::accessible), 979 m_result_ptr_host_accessible( 980 MemorySpaceAccess<Kokkos::HostSpace, 981 typename ReducerType::result_view_type:: 982 memory_space>::accessible), 983 m_scratch_space(nullptr), 984 m_scratch_flags(nullptr), 985 m_team_begin(0), 986 m_shmem_begin(0), 987 m_shmem_size(0), 988 m_scratch_ptr{nullptr, nullptr}, 989 m_league_size(arg_policy.league_size()), 990 m_team_size(arg_policy.team_size()), 991 m_vector_size(arg_policy.impl_vector_length()), 992 m_scratch_lock(m_policy.space() 993 .impl_internal_space_instance() 994 ->m_team_scratch_mutex) { 995 hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch< 996 ParallelReduce, 
launch_bounds>::get_hip_func_attributes(); 997 m_team_size = 998 m_team_size >= 0 999 ? m_team_size 1000 : Kokkos::Experimental::Impl::hip_get_opt_block_size<FunctorType, 1001 launch_bounds>( 1002 m_policy.space().impl_internal_space_instance(), attr, 1003 m_functor, m_vector_size, m_policy.team_scratch_size(0), 1004 m_policy.thread_scratch_size(0)) / 1005 m_vector_size; 1006 1007 m_team_begin = 1008 UseShflReduction 1009 ? 0 1010 : hip_single_inter_block_reduce_scan_shmem<false, FunctorType, 1011 work_tag>(arg_functor, 1012 m_team_size); 1013 m_shmem_begin = sizeof(double) * (m_team_size + 2); 1014 m_shmem_size = 1015 m_policy.scratch_size(0, m_team_size) + 1016 FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size); 1017 m_scratch_size[0] = m_shmem_size; 1018 m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); 1019 m_scratch_ptr[1] = 1020 m_team_size <= 0 1021 ? nullptr 1022 : m_policy.space() 1023 .impl_internal_space_instance() 1024 ->resize_team_scratch_space( 1025 static_cast<ptrdiff_t>(m_scratch_size[1]) * 1026 static_cast<ptrdiff_t>( 1027 Kokkos::Experimental::HIP::concurrency() / 1028 (m_team_size * m_vector_size))); 1029 1030 // The global parallel_reduce does not support vector_length other than 1 at 1031 // the moment 1032 if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction) 1033 Impl::throw_runtime_exception( 1034 "Kokkos::parallel_reduce with a TeamPolicy using a vector length of " 1035 "greater than 1 is not currently supported for HIP for dynamic " 1036 "sized reduction types."); 1037 1038 if ((m_team_size < Kokkos::Experimental::Impl::HIPTraits::WarpSize) && 1039 !UseShflReduction) 1040 Impl::throw_runtime_exception( 1041 "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller " 1042 "than 64 is not currently supported with HIP for dynamic sized " 1043 "reduction types."); 1044 1045 // Functor's reduce memory, team scan memory, and team shared memory depend 1046 // upon team size. 
1047 1048 const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; 1049 1050 if ((!Kokkos::Impl::is_integral_power_of_two(m_team_size) && 1051 !UseShflReduction) || 1052 m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < 1053 shmem_size_total) { 1054 Kokkos::Impl::throw_runtime_exception( 1055 std::string("Kokkos::Impl::ParallelReduce< HIP > bad team size")); 1056 } 1057 if (static_cast<int>(m_team_size) > 1058 arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) { 1059 Kokkos::Impl::throw_runtime_exception( 1060 std::string("Kokkos::Impl::ParallelReduce< HIP > requested too " 1061 "large team size.")); 1062 } 1063 } 1064 }; 1065 } // namespace Impl 1066 } // namespace Kokkos 1067 1068 #endif 1069 1070 #endif 1071