1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 //                        Kokkos v. 3.0
6 //       Copyright (2020) National Technology & Engineering
7 //               Solutions of Sandia, LLC (NTESS).
8 //
9 // Under the terms of Contract DE-NA0003525 with NTESS,
10 // the U.S. Government retains certain rights in this software.
11 //
12 // Redistribution and use in source and binary forms, with or without
13 // modification, are permitted provided that the following conditions are
14 // met:
15 //
16 // 1. Redistributions of source code must retain the above copyright
17 // notice, this list of conditions and the following disclaimer.
18 //
19 // 2. Redistributions in binary form must reproduce the above copyright
20 // notice, this list of conditions and the following disclaimer in the
21 // documentation and/or other materials provided with the distribution.
22 //
23 // 3. Neither the name of the Corporation nor the names of the
24 // contributors may be used to endorse or promote products derived from
25 // this software without specific prior written permission.
26 //
27 // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
28 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
31 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 //
39 // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
40 //
41 // ************************************************************************
42 //@HEADER
43 */
44 
45 #ifndef KOKKO_HIP_PARALLEL_TEAM_HPP
46 #define KOKKO_HIP_PARALLEL_TEAM_HPP
47 
48 #include <Kokkos_Parallel.hpp>
49 
50 #if defined(__HIPCC__)
51 
52 #include <HIP/Kokkos_HIP_KernelLaunch.hpp>
53 #include <HIP/Kokkos_HIP_Locks.hpp>
54 #include <HIP/Kokkos_HIP_Team.hpp>
55 #include <HIP/Kokkos_HIP_Instance.hpp>
56 
57 namespace Kokkos {
58 namespace Impl {
59 template <typename... Properties>
60 class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
61     : public PolicyTraits<Properties...> {
62  public:
63   using execution_policy = TeamPolicyInternal;
64 
65   using traits = PolicyTraits<Properties...>;
66 
67   template <typename ExecSpace, typename... OtherProperties>
68   friend class TeamPolicyInternal;
69 
70  private:
71   static int constexpr MAX_WARP = 8;
72 
73   typename traits::execution_space m_space;
74   int m_league_size;
75   int m_team_size;
76   int m_vector_length;
77   int m_team_scratch_size[2];
78   int m_thread_scratch_size[2];
79   int m_chunk_size;
80   bool m_tune_team_size;
81   bool m_tune_vector_length;
82 
83  public:
84   using execution_space = Kokkos::Experimental::HIP;
85 
86   template <class... OtherProperties>
TeamPolicyInternal(TeamPolicyInternal<OtherProperties...> const & p)87   TeamPolicyInternal(TeamPolicyInternal<OtherProperties...> const& p) {
88     m_league_size            = p.m_league_size;
89     m_team_size              = p.m_team_size;
90     m_vector_length          = p.m_vector_length;
91     m_team_scratch_size[0]   = p.m_team_scratch_size[0];
92     m_team_scratch_size[1]   = p.m_team_scratch_size[1];
93     m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
94     m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
95     m_chunk_size             = p.m_chunk_size;
96     m_space                  = p.m_space;
97     m_tune_team_size         = p.m_tune_team_size;
98     m_tune_vector_length     = p.m_tune_vector_length;
99   }
100 
101   template <typename FunctorType>
team_size_max(FunctorType const & f,ParallelForTag const &) const102   int team_size_max(FunctorType const& f, ParallelForTag const&) const {
103     using closure_type =
104         Impl::ParallelFor<FunctorType, TeamPolicy<Properties...> >;
105     hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
106         closure_type,
107         typename traits::launch_bounds>::get_hip_func_attributes();
108     int const block_size = ::Kokkos::Experimental::Impl::hip_get_max_block_size<
109         FunctorType, typename traits::launch_bounds>(
110         space().impl_internal_space_instance(), attr, f,
111         static_cast<size_t>(impl_vector_length()),
112         static_cast<size_t>(team_scratch_size(0)) + 2 * sizeof(double),
113         static_cast<size_t>(thread_scratch_size(0)) + sizeof(double));
114     return block_size / impl_vector_length();
115   }
116 
117   template <class FunctorType>
team_size_max(const FunctorType & f,const ParallelReduceTag &) const118   inline int team_size_max(const FunctorType& f,
119                            const ParallelReduceTag&) const {
120     using functor_analysis_type =
121         Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
122                               TeamPolicyInternal, FunctorType>;
123     using reducer_type = typename Impl::ParallelReduceReturnValue<
124         void, typename functor_analysis_type::value_type,
125         FunctorType>::reducer_type;
126     using closure_type =
127         Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
128                              reducer_type>;
129     return internal_team_size_max<closure_type>(f);
130   }
131 
132   template <class FunctorType, class ReducerType>
team_size_max(const FunctorType & f,const ReducerType &,const ParallelReduceTag &) const133   inline int team_size_max(const FunctorType& f, const ReducerType& /*r*/,
134                            const ParallelReduceTag&) const {
135     using closure_type =
136         Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
137                              ReducerType>;
138     return internal_team_size_max<closure_type>(f);
139   }
140 
141   template <typename FunctorType>
team_size_recommended(FunctorType const & f,ParallelForTag const &) const142   int team_size_recommended(FunctorType const& f, ParallelForTag const&) const {
143     using closure_type =
144         Impl::ParallelFor<FunctorType, TeamPolicy<Properties...> >;
145     hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
146         closure_type,
147         typename traits::launch_bounds>::get_hip_func_attributes();
148     int const block_size = ::Kokkos::Experimental::Impl::hip_get_opt_block_size<
149         FunctorType, typename traits::launch_bounds>(
150         space().impl_internal_space_instance(), attr, f,
151         static_cast<size_t>(impl_vector_length()),
152         static_cast<size_t>(team_scratch_size(0)) + 2 * sizeof(double),
153         static_cast<size_t>(thread_scratch_size(0)) + sizeof(double));
154     return block_size / impl_vector_length();
155   }
156 
157   template <typename FunctorType>
team_size_recommended(FunctorType const & f,ParallelReduceTag const &) const158   inline int team_size_recommended(FunctorType const& f,
159                                    ParallelReduceTag const&) const {
160     using functor_analysis_type =
161         Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
162                               TeamPolicyInternal, FunctorType>;
163     using reducer_type = typename Impl::ParallelReduceReturnValue<
164         void, typename functor_analysis_type::value_type,
165         FunctorType>::reducer_type;
166     using closure_type =
167         Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
168                              reducer_type>;
169     return internal_team_size_recommended<closure_type>(f);
170   }
171 
172   template <class FunctorType, class ReducerType>
team_size_recommended(FunctorType const & f,ReducerType const &,ParallelReduceTag const &) const173   int team_size_recommended(FunctorType const& f, ReducerType const&,
174                             ParallelReduceTag const&) const {
175     using closure_type =
176         Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
177                              ReducerType>;
178     return internal_team_size_recommended<closure_type>(f);
179   }
impl_auto_vector_length() const180   inline bool impl_auto_vector_length() const { return m_tune_vector_length; }
impl_auto_team_size() const181   inline bool impl_auto_team_size() const { return m_tune_team_size; }
vector_length_max()182   static int vector_length_max() {
183     return ::Kokkos::Experimental::Impl::HIPTraits::WarpSize;
184   }
185 
verify_requested_vector_length(int requested_vector_length)186   static int verify_requested_vector_length(int requested_vector_length) {
187     int test_vector_length =
188         std::min(requested_vector_length, vector_length_max());
189 
190     // Allow only power-of-two vector_length
191     if (!(is_integral_power_of_two(test_vector_length))) {
192       int test_pow2           = 1;
193       int constexpr warp_size = Experimental::Impl::HIPTraits::WarpSize;
194       while (test_pow2 < warp_size) {
195         test_pow2 <<= 1;
196         if (test_pow2 > test_vector_length) {
197           break;
198         }
199       }
200       test_vector_length = test_pow2 >> 1;
201     }
202 
203     return test_vector_length;
204   }
205 
scratch_size_max(int level)206   static int scratch_size_max(int level) {
207     return (
208         level == 0 ? 1024 * 40 :  // FIXME_HIP arbitrarily setting this to 48kB
209             20 * 1024 * 1024);    // FIXME_HIP arbitrarily setting this to 20MB
210   }
impl_set_vector_length(size_t size)211   inline void impl_set_vector_length(size_t size) { m_vector_length = size; }
impl_set_team_size(size_t size)212   inline void impl_set_team_size(size_t size) { m_team_size = size; }
impl_vector_length() const213   int impl_vector_length() const { return m_vector_length; }
vector_length() const214   KOKKOS_DEPRECATED int vector_length() const { return impl_vector_length(); }
215 
team_size() const216   int team_size() const { return m_team_size; }
217 
league_size() const218   int league_size() const { return m_league_size; }
219 
scratch_size(int level,int team_size_=-1) const220   int scratch_size(int level, int team_size_ = -1) const {
221     if (team_size_ < 0) team_size_ = m_team_size;
222     return m_team_scratch_size[level] +
223            team_size_ * m_thread_scratch_size[level];
224   }
225 
team_scratch_size(int level) const226   int team_scratch_size(int level) const { return m_team_scratch_size[level]; }
227 
thread_scratch_size(int level) const228   int thread_scratch_size(int level) const {
229     return m_thread_scratch_size[level];
230   }
231 
space() const232   typename traits::execution_space space() const { return m_space; }
233 
TeamPolicyInternal()234   TeamPolicyInternal()
235       : m_space(typename traits::execution_space()),
236         m_league_size(0),
237         m_team_size(-1),
238         m_vector_length(0),
239         m_team_scratch_size{0, 0},
240         m_thread_scratch_size{0, 0},
241         m_chunk_size(::Kokkos::Experimental::Impl::HIPTraits::WarpSize),
242         m_tune_team_size(false),
243         m_tune_vector_length(false) {}
244 
245   /** \brief  Specify league size, request team size */
TeamPolicyInternal(const execution_space space_,int league_size_,int team_size_request,int vector_length_request=1)246   TeamPolicyInternal(const execution_space space_, int league_size_,
247                      int team_size_request, int vector_length_request = 1)
248       : m_space(space_),
249         m_league_size(league_size_),
250         m_team_size(team_size_request),
251         m_vector_length(
252             (vector_length_request > 0)
253                 ? verify_requested_vector_length(vector_length_request)
254                 : (verify_requested_vector_length(1))),
255         m_team_scratch_size{0, 0},
256         m_thread_scratch_size{0, 0},
257         m_chunk_size(::Kokkos::Experimental::Impl::HIPTraits::WarpSize),
258         m_tune_team_size(bool(team_size_request <= 0)),
259         m_tune_vector_length(bool(vector_length_request <= 0)) {
260     // Make sure league size is permissible
261     if (league_size_ >=
262         static_cast<int>(
263             ::Kokkos::Experimental::Impl::hip_internal_maximum_grid_count()))
264       Impl::throw_runtime_exception(
265           "Requested too large league_size for TeamPolicy on HIP execution "
266           "space.");
267 
268     // Make sure total block size is permissible
269     if (m_team_size * m_vector_length > 1024) {
270       Impl::throw_runtime_exception(
271           std::string("Kokkos::TeamPolicy< HIP > the team size is too large. "
272                       "Team size x vector length must be smaller than 1024."));
273     }
274   }
275 
276   /** \brief  Specify league size, request team size */
TeamPolicyInternal(const execution_space space_,int league_size_,const Kokkos::AUTO_t &,int vector_length_request=1)277   TeamPolicyInternal(const execution_space space_, int league_size_,
278                      const Kokkos::AUTO_t& /* team_size_request */,
279                      int vector_length_request = 1)
280       : TeamPolicyInternal(space_, league_size_, -1, vector_length_request) {}
281   // FLAG
282   /** \brief  Specify league size and team size, request vector length*/
TeamPolicyInternal(const execution_space space_,int league_size_,int team_size_request,const Kokkos::AUTO_t &)283   TeamPolicyInternal(const execution_space space_, int league_size_,
284                      int team_size_request,
285                      const Kokkos::AUTO_t& /* vector_length_request */
286                      )
287       : TeamPolicyInternal(space_, league_size_, team_size_request, -1)
288 
289   {}
290 
291   /** \brief  Specify league size, request team size and vector length*/
TeamPolicyInternal(const execution_space space_,int league_size_,const Kokkos::AUTO_t &,const Kokkos::AUTO_t &)292   TeamPolicyInternal(const execution_space space_, int league_size_,
293                      const Kokkos::AUTO_t& /* team_size_request */,
294                      const Kokkos::AUTO_t& /* vector_length_request */
295 
296                      )
297       : TeamPolicyInternal(space_, league_size_, -1, -1)
298 
299   {}
300 
TeamPolicyInternal(int league_size_,int team_size_request,int vector_length_request=1)301   TeamPolicyInternal(int league_size_, int team_size_request,
302                      int vector_length_request = 1)
303       : TeamPolicyInternal(typename traits::execution_space(), league_size_,
304                            team_size_request, vector_length_request) {}
305 
TeamPolicyInternal(int league_size_,const Kokkos::AUTO_t &,int vector_length_request=1)306   TeamPolicyInternal(int league_size_,
307                      const Kokkos::AUTO_t& /* team_size_request */,
308                      int vector_length_request = 1)
309       : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1,
310                            vector_length_request) {}
311 
312   /** \brief  Specify league size and team size, request vector length*/
TeamPolicyInternal(int league_size_,int team_size_request,const Kokkos::AUTO_t &)313   TeamPolicyInternal(int league_size_, int team_size_request,
314                      const Kokkos::AUTO_t& /* vector_length_request */
315 
316                      )
317       : TeamPolicyInternal(typename traits::execution_space(), league_size_,
318                            team_size_request, -1)
319 
320   {}
321 
322   /** \brief  Specify league size, request team size and vector length*/
TeamPolicyInternal(int league_size_,const Kokkos::AUTO_t &,const Kokkos::AUTO_t &)323   TeamPolicyInternal(int league_size_,
324                      const Kokkos::AUTO_t& /* team_size_request */,
325                      const Kokkos::AUTO_t& /* vector_length_request */
326 
327                      )
328       : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1,
329                            -1) {}
330 
chunk_size() const331   int chunk_size() const { return m_chunk_size; }
332 
set_chunk_size(typename traits::index_type chunk_size_)333   TeamPolicyInternal& set_chunk_size(typename traits::index_type chunk_size_) {
334     m_chunk_size = chunk_size_;
335     return *this;
336   }
337 
338   /** \brief set per team scratch size for a specific level of the scratch
339    * hierarchy */
set_scratch_size(int level,PerTeamValue const & per_team)340   TeamPolicyInternal& set_scratch_size(int level,
341                                        PerTeamValue const& per_team) {
342     m_team_scratch_size[level] = per_team.value;
343     return *this;
344   }
345 
346   /** \brief set per thread scratch size for a specific level of the scratch
347    * hierarchy */
set_scratch_size(int level,PerThreadValue const & per_thread)348   TeamPolicyInternal& set_scratch_size(int level,
349                                        PerThreadValue const& per_thread) {
350     m_thread_scratch_size[level] = per_thread.value;
351     return *this;
352   }
353 
354   /** \brief set per thread and per team scratch size for a specific level of
355    * the scratch hierarchy */
set_scratch_size(int level,PerTeamValue const & per_team,PerThreadValue const & per_thread)356   TeamPolicyInternal& set_scratch_size(int level, PerTeamValue const& per_team,
357                                        PerThreadValue const& per_thread) {
358     m_team_scratch_size[level]   = per_team.value;
359     m_thread_scratch_size[level] = per_thread.value;
360     return *this;
361   }
362 
363   using member_type = Kokkos::Impl::HIPTeamMember;
364 
365  protected:
366   template <class ClosureType, class FunctorType, class BlockSizeCallable>
internal_team_size_common(const FunctorType & f,BlockSizeCallable && block_size_callable) const367   int internal_team_size_common(const FunctorType& f,
368                                 BlockSizeCallable&& block_size_callable) const {
369     using closure_type = ClosureType;
370     using functor_value_traits =
371         Impl::FunctorValueTraits<FunctorType, typename traits::work_tag>;
372 
373     hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
374         closure_type,
375         typename traits::launch_bounds>::get_hip_func_attributes();
376     const int block_size = std::forward<BlockSizeCallable>(block_size_callable)(
377         space().impl_internal_space_instance(), attr, f,
378         static_cast<size_t>(impl_vector_length()),
379         static_cast<size_t>(team_scratch_size(0)) + 2 * sizeof(double),
380         static_cast<size_t>(thread_scratch_size(0)) + sizeof(double) +
381             ((functor_value_traits::StaticValueSize != 0)
382                  ? 0
383                  : functor_value_traits::value_size(f)));
384     KOKKOS_ASSERT(block_size > 0);
385 
386     // Currently we require Power-of-2 team size for reductions.
387     int p2 = 1;
388     while (p2 <= block_size) p2 *= 2;
389     p2 /= 2;
390     return p2 / impl_vector_length();
391   }
392 
393   template <class ClosureType, class FunctorType>
internal_team_size_max(const FunctorType & f) const394   int internal_team_size_max(const FunctorType& f) const {
395     return internal_team_size_common<ClosureType>(
396         f, ::Kokkos::Experimental::Impl::hip_get_max_block_size<
397                FunctorType, typename traits::launch_bounds>);
398   }
399 
400   template <class ClosureType, class FunctorType>
internal_team_size_recommended(const FunctorType & f) const401   int internal_team_size_recommended(const FunctorType& f) const {
402     return internal_team_size_common<ClosureType>(
403         f, ::Kokkos::Experimental::Impl::hip_get_opt_block_size<
404                FunctorType, typename traits::launch_bounds>);
405   }
406 };
407 
408 template <typename FunctorType, typename... Properties>
409 class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
410                   Kokkos::Experimental::HIP> {
411  public:
412   using Policy = TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>;
413   using functor_type = FunctorType;
414   using size_type    = ::Kokkos::Experimental::HIP::size_type;
415 
416  private:
417   using member_type   = typename Policy::member_type;
418   using work_tag      = typename Policy::work_tag;
419   using launch_bounds = typename Policy::launch_bounds;
420 
421   // Algorithmic constraints: blockDim.y is a power of two AND
422   // blockDim.y  == blockDim.z == 1 shared memory utilization:
423   //
424   //  [ team   reduce space ]
425   //  [ team   shared space ]
426 
427   FunctorType const m_functor;
428   Policy const m_policy;
429   size_type const m_league_size;
430   int m_team_size;
431   size_type const m_vector_size;
432   int m_shmem_begin;
433   int m_shmem_size;
434   void* m_scratch_ptr[2];
435   int m_scratch_size[2];
436   // Only let one ParallelFor/Reduce modify the team scratch memory. The
437   // constructor acquires the mutex which is released in the destructor.
438   std::unique_lock<std::mutex> m_scratch_lock;
439 
440   template <typename TagType>
441   __device__ inline
442       typename std::enable_if<std::is_same<TagType, void>::value>::type
exec_team(const member_type & member) const443       exec_team(const member_type& member) const {
444     m_functor(member);
445   }
446 
447   template <typename TagType>
448   __device__ inline
449       typename std::enable_if<!std::is_same<TagType, void>::value>::type
exec_team(const member_type & member) const450       exec_team(const member_type& member) const {
451     m_functor(TagType(), member);
452   }
453 
454  public:
operator ()() const455   __device__ inline void operator()() const {
456     // Iterate this block through the league
457     int64_t threadid = 0;
458     if (m_scratch_size[1] > 0) {
459       __shared__ int64_t base_thread_id;
460       if (threadIdx.x == 0 && threadIdx.y == 0) {
461         threadid = (blockIdx.x * blockDim.z + threadIdx.z) %
462                    (Kokkos::Impl::g_device_hip_lock_arrays.n /
463                     (blockDim.x * blockDim.y));
464         threadid *= blockDim.x * blockDim.y;
465         int done = 0;
466         while (!done) {
467           done = (0 ==
468                   atomicCAS(
469                       &Kokkos::Impl::g_device_hip_lock_arrays.scratch[threadid],
470                       0, 1));
471           if (!done) {
472             threadid += blockDim.x * blockDim.y;
473             if (int64_t(threadid + blockDim.x * blockDim.y) >=
474                 int64_t(Kokkos::Impl::g_device_hip_lock_arrays.n))
475               threadid = 0;
476           }
477         }
478         base_thread_id = threadid;
479       }
480       __syncthreads();
481       threadid = base_thread_id;
482     }
483 
484     int const int_league_size = static_cast<int>(m_league_size);
485     for (int league_rank = blockIdx.x; league_rank < int_league_size;
486          league_rank += gridDim.x) {
487       this->template exec_team<work_tag>(typename Policy::member_type(
488           ::Kokkos::Experimental::kokkos_impl_hip_shared_memory<void>(),
489           m_shmem_begin, m_shmem_size,
490           static_cast<void*>(static_cast<char*>(m_scratch_ptr[1]) +
491                              ptrdiff_t(threadid / (blockDim.x * blockDim.y)) *
492                                  m_scratch_size[1]),
493           m_scratch_size[1], league_rank, m_league_size));
494     }
495     if (m_scratch_size[1] > 0) {
496       __syncthreads();
497       if (threadIdx.x == 0 && threadIdx.y == 0)
498         Kokkos::Impl::g_device_hip_lock_arrays.scratch[threadid] = 0;
499     }
500   }
501 
execute() const502   inline void execute() const {
503     int64_t const shmem_size_total = m_shmem_begin + m_shmem_size;
504     dim3 const grid(static_cast<int>(m_league_size), 1, 1);
505     dim3 const block(static_cast<int>(m_vector_size),
506                      static_cast<int>(m_team_size), 1);
507 
508     ::Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, launch_bounds>(
509         *this, grid, block, shmem_size_total,
510         m_policy.space().impl_internal_space_instance(),
511         true);  // copy to device and execute
512   }
513 
ParallelFor(FunctorType const & arg_functor,Policy const & arg_policy)514   ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy)
515       : m_functor(arg_functor),
516         m_policy(arg_policy),
517         m_league_size(arg_policy.league_size()),
518         m_team_size(arg_policy.team_size()),
519         m_vector_size(arg_policy.impl_vector_length()),
520         m_scratch_lock(m_policy.space()
521                            .impl_internal_space_instance()
522                            ->m_team_scratch_mutex) {
523     hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
524         ParallelFor, launch_bounds>::get_hip_func_attributes();
525     m_team_size =
526         m_team_size >= 0
527             ? m_team_size
528             : ::Kokkos::Experimental::Impl::hip_get_opt_block_size<
529                   FunctorType, launch_bounds>(
530                   m_policy.space().impl_internal_space_instance(), attr,
531                   m_functor, m_vector_size, m_policy.team_scratch_size(0),
532                   m_policy.thread_scratch_size(0)) /
533                   m_vector_size;
534 
535     m_shmem_begin = (sizeof(double) * (m_team_size + 2));
536     m_shmem_size =
537         (m_policy.scratch_size(0, m_team_size) +
538          FunctorTeamShmemSize<FunctorType>::value(m_functor, m_team_size));
539     m_scratch_size[0] = m_policy.scratch_size(0, m_team_size);
540     m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
541 
542     // Functor's reduce memory, team scan memory, and team shared memory depend
543     // upon team size.
544     m_scratch_ptr[0] = nullptr;
545     m_scratch_ptr[1] =
546         m_team_size <= 0
547             ? nullptr
548             : m_policy.space()
549                   .impl_internal_space_instance()
550                   ->resize_team_scratch_space(
551                       static_cast<ptrdiff_t>(m_scratch_size[1]) *
552                       static_cast<ptrdiff_t>(
553                           ::Kokkos::Experimental::HIP::concurrency() /
554                           (m_team_size * m_vector_size)));
555 
556     int const shmem_size_total = m_shmem_begin + m_shmem_size;
557     if (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
558         shmem_size_total) {
559       printf(
560           "%i %i\n",
561           m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock,
562           shmem_size_total);
563       Kokkos::Impl::throw_runtime_exception(std::string(
564           "Kokkos::Impl::ParallelFor< HIP > insufficient shared memory"));
565     }
566 
567     if (static_cast<int>(m_team_size) >
568         static_cast<int>(
569             ::Kokkos::Experimental::Impl::hip_get_max_block_size<FunctorType,
570                                                                  launch_bounds>(
571                 m_policy.space().impl_internal_space_instance(), attr,
572                 arg_functor, arg_policy.impl_vector_length(),
573                 arg_policy.team_scratch_size(0),
574                 arg_policy.thread_scratch_size(0)) /
575             arg_policy.impl_vector_length())) {
576       Kokkos::Impl::throw_runtime_exception(std::string(
577           "Kokkos::Impl::ParallelFor< HIP > requested too large team size."));
578     }
579   }
580 };
581 
582 //----------------------------------------------------------------------------
583 //----------------------------------------------------------------------------
584 
585 template <class FunctorType, class ReducerType, class... Properties>
586 class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
587                      ReducerType, Kokkos::Experimental::HIP> {
588  public:
589   using Policy = TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>;
590 
591  private:
592   using member_type   = typename Policy::member_type;
593   using work_tag      = typename Policy::work_tag;
594   using launch_bounds = typename Policy::launch_bounds;
595 
596   using reducer_conditional =
597       Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
598                          FunctorType, ReducerType>;
599   using reducer_type_fwd = typename reducer_conditional::type;
600   using work_tag_fwd =
601       typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
602                                   work_tag, void>::type;
603 
604   using value_traits =
605       Kokkos::Impl::FunctorValueTraits<reducer_type_fwd, work_tag_fwd>;
606   using value_init =
607       Kokkos::Impl::FunctorValueInit<reducer_type_fwd, work_tag_fwd>;
608   using value_join =
609       Kokkos::Impl::FunctorValueJoin<reducer_type_fwd, work_tag_fwd>;
610 
611   using pointer_type   = typename value_traits::pointer_type;
612   using reference_type = typename value_traits::reference_type;
613   using value_type     = typename value_traits::value_type;
614 
615  public:
616   using functor_type = FunctorType;
617   using size_type    = Kokkos::Experimental::HIP::size_type;
618 
619   static int constexpr UseShflReduction = (value_traits::StaticValueSize != 0);
620 
621  private:
622   struct ShflReductionTag {};
623   struct SHMEMReductionTag {};
624 
625   // Algorithmic constraints: blockDim.y is a power of two AND
626   // blockDim.y == blockDim.z == 1 shared memory utilization:
627   //
628   //  [ global reduce space ]
629   //  [ team   reduce space ]
630   //  [ team   shared space ]
631   //
632 
633   const FunctorType m_functor;
634   const Policy m_policy;
635   const ReducerType m_reducer;
636   const pointer_type m_result_ptr;
637   const bool m_result_ptr_device_accessible;
638   const bool m_result_ptr_host_accessible;
639   size_type* m_scratch_space;
640   size_type* m_scratch_flags;
641   size_type m_team_begin;
642   size_type m_shmem_begin;
643   size_type m_shmem_size;
644   void* m_scratch_ptr[2];
645   int m_scratch_size[2];
646   const size_type m_league_size;
647   int m_team_size;
648   const size_type m_vector_size;
649   // Only let one ParallelFor/Reduce modify the team scratch memory. The
650   // constructor acquires the mutex which is released in the destructor.
651   std::unique_lock<std::mutex> m_scratch_lock;
652 
653   template <class TagType>
654   __device__ inline
655       typename std::enable_if<std::is_same<TagType, void>::value>::type
exec_team(member_type const & member,reference_type update) const656       exec_team(member_type const& member, reference_type update) const {
657     m_functor(member, update);
658   }
659 
660   template <class TagType>
661   __device__ inline
662       typename std::enable_if<!std::is_same<TagType, void>::value>::type
exec_team(member_type const & member,reference_type update) const663       exec_team(member_type const& member, reference_type update) const {
664     m_functor(TagType(), member, update);
665   }
666 
iterate_through_league(int const threadid,reference_type value) const667   __device__ inline void iterate_through_league(int const threadid,
668                                                 reference_type value) const {
669     int const int_league_size = static_cast<int>(m_league_size);
670     for (int league_rank = blockIdx.x; league_rank < int_league_size;
671          league_rank += gridDim.x) {
672       this->template exec_team<work_tag>(
673           member_type(
674               Kokkos::Experimental::kokkos_impl_hip_shared_memory<char>() +
675                   m_team_begin,
676               m_shmem_begin, m_shmem_size,
677               reinterpret_cast<void*>(
678                   reinterpret_cast<char*>(m_scratch_ptr[1]) +
679                   static_cast<ptrdiff_t>(threadid / (blockDim.x * blockDim.y)) *
680                       m_scratch_size[1]),
681               m_scratch_size[1], league_rank, m_league_size),
682           value);
683     }
684   }
685 
686  public:
operator ()() const687   __device__ inline void operator()() const {
688     int64_t threadid = 0;
689     if (m_scratch_size[1] > 0) {
690       __shared__ int64_t base_thread_id;
691       if (threadIdx.x == 0 && threadIdx.y == 0) {
692         threadid = (blockIdx.x * blockDim.z + threadIdx.z) %
693                    (Kokkos::Impl::g_device_hip_lock_arrays.n /
694                     (blockDim.x * blockDim.y));
695         threadid *= blockDim.x * blockDim.y;
696         int done = 0;
697         while (!done) {
698           done = (0 ==
699                   atomicCAS(
700                       &Kokkos::Impl::g_device_hip_lock_arrays.scratch[threadid],
701                       0, 1));
702           if (!done) {
703             threadid += blockDim.x * blockDim.y;
704             if (static_cast<int64_t>(threadid + blockDim.x * blockDim.y) >=
705                 static_cast<int64_t>(Kokkos::Impl::g_device_hip_lock_arrays.n))
706               threadid = 0;
707           }
708         }
709         base_thread_id = threadid;
710       }
711       __syncthreads();
712       threadid = base_thread_id;
713     }
714 
715     using ReductionTag = std::conditional_t<UseShflReduction, ShflReductionTag,
716                                             SHMEMReductionTag>;
717     run(ReductionTag{}, threadid);
718 
719     if (m_scratch_size[1] > 0) {
720       __syncthreads();
721       if (threadIdx.x == 0 && threadIdx.y == 0) {
722         Kokkos::Impl::g_device_hip_lock_arrays.scratch[threadid] = 0;
723       }
724     }
725   }
726 
run(SHMEMReductionTag,int const threadid) const727   __device__ inline void run(SHMEMReductionTag, int const threadid) const {
728     integral_nonzero_constant<size_type, value_traits::StaticValueSize /
729                                              sizeof(size_type)> const
730         word_count(value_traits::value_size(
731                        reducer_conditional::select(m_functor, m_reducer)) /
732                    sizeof(size_type));
733 
734     reference_type value = value_init::init(
735         reducer_conditional::select(m_functor, m_reducer),
736         Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>() +
737             threadIdx.y * word_count.value);
738 
739     // Iterate this block through the league
740     iterate_through_league(threadid, value);
741 
742     // Reduce with final value at blockDim.y - 1 location.
743     bool do_final_reduce = (m_league_size == 0);
744     if (!do_final_reduce)
745       do_final_reduce =
746           hip_single_inter_block_reduce_scan<false, FunctorType, work_tag>(
747               reducer_conditional::select(m_functor, m_reducer), blockIdx.x,
748               gridDim.x,
749               Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>(),
750               m_scratch_space, m_scratch_flags);
751     if (do_final_reduce) {
752       // This is the final block with the final result at the final threads'
753       // location
754 
755       size_type* const shared =
756           Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>() +
757           (blockDim.y - 1) * word_count.value;
758       size_type* const global = m_result_ptr_device_accessible
759                                     ? reinterpret_cast<size_type*>(m_result_ptr)
760                                     : m_scratch_space;
761 
762       if (threadIdx.y == 0) {
763         Kokkos::Impl::FunctorFinal<reducer_type_fwd, work_tag_fwd>::final(
764             reducer_conditional::select(m_functor, m_reducer), shared);
765       }
766 
767       if (Kokkos::Experimental::Impl::HIPTraits::WarpSize < word_count.value) {
768         __syncthreads();
769       }
770 
771       for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) {
772         global[i] = shared[i];
773       }
774     }
775   }
776 
run(ShflReductionTag,int const threadid) const777   __device__ inline void run(ShflReductionTag, int const threadid) const {
778     value_type value;
779     value_init::init(reducer_conditional::select(m_functor, m_reducer), &value);
780 
781     // Iterate this block through the league
782     iterate_through_league(threadid, value);
783 
784     pointer_type const result =
785         m_result_ptr_device_accessible
786             ? m_result_ptr
787             : reinterpret_cast<pointer_type>(m_scratch_space);
788 
789     value_type init;
790     value_init::init(reducer_conditional::select(m_functor, m_reducer), &init);
791     if (m_league_size == 0) {
792       Kokkos::Impl::FunctorFinal<reducer_type_fwd, work_tag_fwd>::final(
793           reducer_conditional::select(m_functor, m_reducer),
794           reinterpret_cast<void*>(&value));
795       *result = value;
796     } else if (Impl::hip_inter_block_shuffle_reduction<FunctorType, value_join,
797                                                        work_tag>(
798                    value, init,
799                    value_join(
800                        reducer_conditional::select(m_functor, m_reducer)),
801                    m_scratch_space, result, m_scratch_flags, blockDim.y)) {
802       unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x;
803       if (id == 0) {
804         Kokkos::Impl::FunctorFinal<reducer_type_fwd, work_tag_fwd>::final(
805             reducer_conditional::select(m_functor, m_reducer),
806             reinterpret_cast<void*>(&value));
807         *result = value;
808       }
809     }
810   }
811 
execute()812   inline void execute() {
813     const int nwork            = m_league_size * m_team_size;
814     const bool need_device_set = ReduceFunctorHasInit<FunctorType>::value ||
815                                  ReduceFunctorHasFinal<FunctorType>::value ||
816                                  !m_result_ptr_host_accessible ||
817                                  !std::is_same<ReducerType, InvalidType>::value;
818     if ((nwork > 0) || need_device_set) {
819       const int block_count =
820           UseShflReduction
821               ? std::min(
822                     m_league_size,
823                     size_type(1024 *
824                               Kokkos::Experimental::Impl::HIPTraits::WarpSize))
825               : std::min(static_cast<int>(m_league_size), m_team_size);
826 
827       m_scratch_space = Kokkos::Experimental::Impl::hip_internal_scratch_space(
828           value_traits::value_size(
829               reducer_conditional::select(m_functor, m_reducer)) *
830           block_count);
831       m_scratch_flags = Kokkos::Experimental::Impl::hip_internal_scratch_flags(
832           sizeof(size_type));
833 
834       dim3 block(m_vector_size, m_team_size, 1);
835       dim3 grid(block_count, 1, 1);
836       if (nwork == 0) {
837         block = dim3(1, 1, 1);
838         grid  = dim3(1, 1, 1);
839       }
840       const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size;
841 
842       Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelReduce,
843                                                     launch_bounds>(
844           *this, grid, block, shmem_size_total,
845           m_policy.space().impl_internal_space_instance(),
846           true);  // copy to device and execute
847 
848       if (!m_result_ptr_device_accessible) {
849         m_policy.space().impl_internal_space_instance()->fence();
850 
851         if (m_result_ptr) {
852           const int size = value_traits::value_size(
853               reducer_conditional::select(m_functor, m_reducer));
854           DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace>(
855               m_result_ptr, m_scratch_space, size);
856         }
857       }
858     } else {
859       if (m_result_ptr) {
860         value_init::init(reducer_conditional::select(m_functor, m_reducer),
861                          m_result_ptr);
862       }
863     }
864   }
865 
866   template <class ViewType>
ParallelReduce(FunctorType const & arg_functor,Policy const & arg_policy,ViewType const & arg_result,typename std::enable_if<Kokkos::is_view<ViewType>::value,void * >::type=nullptr)867   ParallelReduce(FunctorType const& arg_functor, Policy const& arg_policy,
868                  ViewType const& arg_result,
869                  typename std::enable_if<Kokkos::is_view<ViewType>::value,
870                                          void*>::type = nullptr)
871       : m_functor(arg_functor),
872         m_policy(arg_policy),
873         m_reducer(InvalidType()),
874         m_result_ptr(arg_result.data()),
875         m_result_ptr_device_accessible(
876             MemorySpaceAccess<Kokkos::Experimental::HIPSpace,
877                               typename ViewType::memory_space>::accessible),
878         m_result_ptr_host_accessible(
879             MemorySpaceAccess<Kokkos::HostSpace,
880                               typename ViewType::memory_space>::accessible),
881         m_scratch_space(nullptr),
882         m_scratch_flags(nullptr),
883         m_team_begin(0),
884         m_shmem_begin(0),
885         m_shmem_size(0),
886         m_scratch_ptr{nullptr, nullptr},
887         m_league_size(arg_policy.league_size()),
888         m_team_size(arg_policy.team_size()),
889         m_vector_size(arg_policy.impl_vector_length()),
890         m_scratch_lock(m_policy.space()
891                            .impl_internal_space_instance()
892                            ->m_team_scratch_mutex) {
893     hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch<
894         ParallelReduce, launch_bounds>::get_hip_func_attributes();
895     m_team_size =
896         m_team_size >= 0
897             ? m_team_size
898             : Kokkos::Experimental::Impl::hip_get_opt_block_size<FunctorType,
899                                                                  launch_bounds>(
900                   m_policy.space().impl_internal_space_instance(), attr,
901                   m_functor, m_vector_size, m_policy.team_scratch_size(0),
902                   m_policy.thread_scratch_size(0)) /
903                   m_vector_size;
904 
905     m_team_begin =
906         UseShflReduction
907             ? 0
908             : hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
909                                                        work_tag>(arg_functor,
910                                                                  m_team_size);
911     m_shmem_begin = sizeof(double) * (m_team_size + 2);
912     m_shmem_size =
913         m_policy.scratch_size(0, m_team_size) +
914         FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size);
915     m_scratch_size[0] = m_shmem_size;
916     m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
917     m_scratch_ptr[1] =
918         m_team_size <= 0
919             ? nullptr
920             : m_policy.space()
921                   .impl_internal_space_instance()
922                   ->resize_team_scratch_space(
923                       static_cast<std::int64_t>(m_scratch_size[1]) *
924                       (static_cast<std::int64_t>(
925                           Kokkos::Experimental::HIP::concurrency() /
926                           (m_team_size * m_vector_size))));
927 
928     // The global parallel_reduce does not support vector_length other than 1 at
929     // the moment
930     if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction)
931       Impl::throw_runtime_exception(
932           "Kokkos::parallel_reduce with a TeamPolicy using a vector length of "
933           "greater than 1 is not currently supported for HIP for dynamic "
934           "sized reduction types.");
935 
936     if ((m_team_size < Kokkos::Experimental::Impl::HIPTraits::WarpSize) &&
937         !UseShflReduction)
938       Impl::throw_runtime_exception(
939           "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller "
940           "than 64 is not currently supported with HIP for dynamic sized "
941           "reduction types.");
942 
943     // Functor's reduce memory, team scan memory, and team shared memory depend
944     // upon team size.
945 
946     const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size;
947 
948     if (!Kokkos::Impl::is_integral_power_of_two(m_team_size) &&
949         !UseShflReduction) {
950       Kokkos::Impl::throw_runtime_exception(
951           std::string("Kokkos::Impl::ParallelReduce< HIP > bad team size"));
952     }
953 
954     if (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
955         shmem_size_total) {
956       Kokkos::Impl::throw_runtime_exception(
957           std::string("Kokkos::Impl::ParallelReduce< HIP > requested too much "
958                       "L0 scratch memory"));
959     }
960 
961     if (static_cast<int>(m_team_size) >
962         arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) {
963       Kokkos::Impl::throw_runtime_exception(
964           std::string("Kokkos::Impl::ParallelReduce< HIP > requested too "
965                       "large team size."));
966     }
967   }
968 
ParallelReduce(FunctorType const & arg_functor,Policy const & arg_policy,ReducerType const & reducer)969   ParallelReduce(FunctorType const& arg_functor, Policy const& arg_policy,
970                  ReducerType const& reducer)
971       : m_functor(arg_functor),
972         m_policy(arg_policy),
973         m_reducer(reducer),
974         m_result_ptr(reducer.view().data()),
975         m_result_ptr_device_accessible(
976             MemorySpaceAccess<Kokkos::Experimental::HIPSpace,
977                               typename ReducerType::result_view_type::
978                                   memory_space>::accessible),
979         m_result_ptr_host_accessible(
980             MemorySpaceAccess<Kokkos::HostSpace,
981                               typename ReducerType::result_view_type::
982                                   memory_space>::accessible),
983         m_scratch_space(nullptr),
984         m_scratch_flags(nullptr),
985         m_team_begin(0),
986         m_shmem_begin(0),
987         m_shmem_size(0),
988         m_scratch_ptr{nullptr, nullptr},
989         m_league_size(arg_policy.league_size()),
990         m_team_size(arg_policy.team_size()),
991         m_vector_size(arg_policy.impl_vector_length()),
992         m_scratch_lock(m_policy.space()
993                            .impl_internal_space_instance()
994                            ->m_team_scratch_mutex) {
995     hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch<
996         ParallelReduce, launch_bounds>::get_hip_func_attributes();
997     m_team_size =
998         m_team_size >= 0
999             ? m_team_size
1000             : Kokkos::Experimental::Impl::hip_get_opt_block_size<FunctorType,
1001                                                                  launch_bounds>(
1002                   m_policy.space().impl_internal_space_instance(), attr,
1003                   m_functor, m_vector_size, m_policy.team_scratch_size(0),
1004                   m_policy.thread_scratch_size(0)) /
1005                   m_vector_size;
1006 
1007     m_team_begin =
1008         UseShflReduction
1009             ? 0
1010             : hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
1011                                                        work_tag>(arg_functor,
1012                                                                  m_team_size);
1013     m_shmem_begin = sizeof(double) * (m_team_size + 2);
1014     m_shmem_size =
1015         m_policy.scratch_size(0, m_team_size) +
1016         FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size);
1017     m_scratch_size[0] = m_shmem_size;
1018     m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
1019     m_scratch_ptr[1] =
1020         m_team_size <= 0
1021             ? nullptr
1022             : m_policy.space()
1023                   .impl_internal_space_instance()
1024                   ->resize_team_scratch_space(
1025                       static_cast<ptrdiff_t>(m_scratch_size[1]) *
1026                       static_cast<ptrdiff_t>(
1027                           Kokkos::Experimental::HIP::concurrency() /
1028                           (m_team_size * m_vector_size)));
1029 
1030     // The global parallel_reduce does not support vector_length other than 1 at
1031     // the moment
1032     if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction)
1033       Impl::throw_runtime_exception(
1034           "Kokkos::parallel_reduce with a TeamPolicy using a vector length of "
1035           "greater than 1 is not currently supported for HIP for dynamic "
1036           "sized reduction types.");
1037 
1038     if ((m_team_size < Kokkos::Experimental::Impl::HIPTraits::WarpSize) &&
1039         !UseShflReduction)
1040       Impl::throw_runtime_exception(
1041           "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller "
1042           "than 64 is not currently supported with HIP for dynamic sized "
1043           "reduction types.");
1044 
1045     // Functor's reduce memory, team scan memory, and team shared memory depend
1046     // upon team size.
1047 
1048     const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size;
1049 
1050     if ((!Kokkos::Impl::is_integral_power_of_two(m_team_size) &&
1051          !UseShflReduction) ||
1052         m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
1053             shmem_size_total) {
1054       Kokkos::Impl::throw_runtime_exception(
1055           std::string("Kokkos::Impl::ParallelReduce< HIP > bad team size"));
1056     }
1057     if (static_cast<int>(m_team_size) >
1058         arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) {
1059       Kokkos::Impl::throw_runtime_exception(
1060           std::string("Kokkos::Impl::ParallelReduce< HIP > requested too "
1061                       "large team size."));
1062     }
1063   }
1064 };
1065 }  // namespace Impl
1066 }  // namespace Kokkos
1067 
1068 #endif
1069 
1070 #endif
1071