/*
//@HEADER
// ************************************************************************
//
//                        Kokkos v. 3.0
//       Copyright (2020) National Technology & Engineering
//               Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_CUDA_PARALLEL_HPP
#define KOKKOS_CUDA_PARALLEL_HPP

#include <Kokkos_Macros.hpp>
#if defined(KOKKOS_ENABLE_CUDA)

#include <algorithm>
#include <string>
#include <cstdio>
#include <cstdint>

#include <utility>
#include <Kokkos_Parallel.hpp>

#include <Cuda/Kokkos_Cuda_KernelLaunch.hpp>
#include <Cuda/Kokkos_Cuda_ReduceScan.hpp>
#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
#include <Cuda/Kokkos_Cuda_Locks.hpp>
#include <Cuda/Kokkos_Cuda_Team.hpp>
#include <Kokkos_Vectorization.hpp>
#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>

#include <impl/Kokkos_Tools.hpp>
#include <typeinfo>

#include <KokkosExp_MDRangePolicy.hpp>
#include <impl/KokkosExp_IterateTileGPU.hpp>

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {

extern bool show_warnings() noexcept;

namespace Impl {

template <class... Properties>
class TeamPolicyInternal<Kokkos::Cuda, Properties...>
    : public PolicyTraits<Properties...> {
 public:
  //! Tag this class as a kokkos execution policy
  using execution_policy = TeamPolicyInternal;

  using traits = PolicyTraits<Properties...>;

  template <class ExecSpace, class... OtherProperties>
  friend class TeamPolicyInternal;

 private:
  enum { MAX_WARP = 8 };

  typename traits::execution_space m_space;
  int m_league_size;
  int m_team_size;
  int m_vector_length;
  int m_team_scratch_size[2];
  int m_thread_scratch_size[2];
  int m_chunk_size;
  bool m_tune_team;
  bool m_tune_vector;

 public:
  //! Execution space of this execution policy
  using execution_space = Kokkos::Cuda;

  template <class... OtherProperties>
  TeamPolicyInternal(const TeamPolicyInternal<OtherProperties...>& p) {
    m_league_size            = p.m_league_size;
    m_team_size              = p.m_team_size;
    m_vector_length          = p.m_vector_length;
    m_team_scratch_size[0]   = p.m_team_scratch_size[0];
    m_team_scratch_size[1]   = p.m_team_scratch_size[1];
    m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
    m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
    m_chunk_size             = p.m_chunk_size;
    m_space                  = p.m_space;
    m_tune_team              = p.m_tune_team;
    m_tune_vector            = p.m_tune_vector;
  }

  //----------------------------------------

  template <class FunctorType>
  int team_size_max(const FunctorType& f, const ParallelForTag&) const {
    using closure_type =
        Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>;
    cudaFuncAttributes attr =
        CudaParallelLaunch<closure_type, typename traits::launch_bounds>::
            get_cuda_func_attributes();
    int block_size =
        Kokkos::Impl::cuda_get_max_block_size<FunctorType,
                                              typename traits::launch_bounds>(
            space().impl_internal_space_instance(), attr, f,
            (size_t)impl_vector_length(),
            (size_t)team_scratch_size(0) + 2 * sizeof(double),
            (size_t)thread_scratch_size(0) + sizeof(double));
    return block_size / impl_vector_length();
  }

  template <class FunctorType>
  inline int team_size_max(const FunctorType& f,
                           const ParallelReduceTag&) const {
    using functor_analysis_type =
        Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
                              TeamPolicyInternal, FunctorType>;
    using reducer_type = typename Impl::ParallelReduceReturnValue<
        void, typename functor_analysis_type::value_type,
        FunctorType>::reducer_type;
    using closure_type =
        Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
                             reducer_type>;
    return internal_team_size_max<closure_type>(f);
  }

  template <class FunctorType, class ReducerType>
  inline int team_size_max(const FunctorType& f, const ReducerType& /*r*/,
                           const ParallelReduceTag&) const {
    using closure_type =
        Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
                             ReducerType>;
    return internal_team_size_max<closure_type>(f);
  }

  template <class FunctorType>
  int team_size_recommended(const FunctorType& f, const ParallelForTag&) const {
    using closure_type =
        Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>;
    cudaFuncAttributes attr =
        CudaParallelLaunch<closure_type, typename traits::launch_bounds>::
            get_cuda_func_attributes();
    const int block_size =
        Kokkos::Impl::cuda_get_opt_block_size<FunctorType,
                                              typename traits::launch_bounds>(
            space().impl_internal_space_instance(), attr, f,
            (size_t)impl_vector_length(),
            (size_t)team_scratch_size(0) + 2 * sizeof(double),
            (size_t)thread_scratch_size(0) + sizeof(double));
    return block_size / impl_vector_length();
  }

  template <class FunctorType>
  inline int team_size_recommended(const FunctorType& f,
                                   const ParallelReduceTag&) const {
    using functor_analysis_type =
        Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
                              TeamPolicyInternal, FunctorType>;
    using reducer_type = typename Impl::ParallelReduceReturnValue<
        void, typename functor_analysis_type::value_type,
        FunctorType>::reducer_type;
    using closure_type =
        Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
                             reducer_type>;
    return internal_team_size_recommended<closure_type>(f);
  }

  template <class FunctorType, class ReducerType>
  int team_size_recommended(const FunctorType& f, const ReducerType&,
                            const ParallelReduceTag&) const {
    using closure_type =
        Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
                             ReducerType>;
    return internal_team_size_recommended<closure_type>(f);
  }

  inline static int vector_length_max() { return Impl::CudaTraits::WarpSize; }

  inline static int verify_requested_vector_length(
      int requested_vector_length) {
    int test_vector_length =
        std::min(requested_vector_length, vector_length_max());

    // Allow only power-of-two vector_length
    if (!(is_integral_power_of_two(test_vector_length))) {
      int test_pow2 = 1;
      for (int i = 0; i < 5; i++) {
        test_pow2 = test_pow2 << 1;
        if (test_pow2 > test_vector_length) {
          break;
        }
      }
      test_vector_length = test_pow2 >> 1;
    }

    return test_vector_length;
  }
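
  // Example (illustrative, not in the original source): a requested vector
  // length of 7 is rounded down to 4 by the loop above, while a request of 48
  // is first clamped to the warp size (32), which is already a power of two
  // and is returned unchanged.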

  inline static int scratch_size_max(int level) {
    return (
        level == 0 ? 1024 * 40 :  // 48kB is the max for CUDA, but we need some
                                  // for team_member.reduce etc.
            20 * 1024 *
                1024);  // arbitrarily setting this to 20MB, for a Volta V100
                        // that would give us about 3.2GB for 2 teams per SM
  }

  //----------------------------------------

  KOKKOS_DEPRECATED inline int vector_length() const {
    return impl_vector_length();
  }
  inline int impl_vector_length() const { return m_vector_length; }
  inline int team_size() const { return m_team_size; }
  inline int league_size() const { return m_league_size; }
  inline bool impl_auto_team_size() const { return m_tune_team; }
  inline bool impl_auto_vector_length() const { return m_tune_vector; }
  inline void impl_set_team_size(size_t team_size) { m_team_size = team_size; }
  inline void impl_set_vector_length(size_t vector_length) {
    m_vector_length = vector_length;
  }
  inline int scratch_size(int level, int team_size_ = -1) const {
    if (team_size_ < 0) team_size_ = m_team_size;
    return m_team_scratch_size[level] +
           team_size_ * m_thread_scratch_size[level];
  }
  inline int team_scratch_size(int level) const {
    return m_team_scratch_size[level];
  }
  inline int thread_scratch_size(int level) const {
    return m_thread_scratch_size[level];
  }

  const typename traits::execution_space& space() const { return m_space; }

  TeamPolicyInternal()
      : m_space(typename traits::execution_space()),
        m_league_size(0),
        m_team_size(-1),
        m_vector_length(0),
        m_team_scratch_size{0, 0},
        m_thread_scratch_size{0, 0},
        m_chunk_size(Impl::CudaTraits::WarpSize),
        m_tune_team(false),
        m_tune_vector(false) {}

  /** \brief  Specify league size, specify team size, specify vector length */
  TeamPolicyInternal(const execution_space space_, int league_size_,
                     int team_size_request, int vector_length_request = 1)
      : m_space(space_),
        m_league_size(league_size_),
        m_team_size(team_size_request),
        m_vector_length(
            (vector_length_request > 0)
                ? verify_requested_vector_length(vector_length_request)
                : verify_requested_vector_length(1)),
        m_team_scratch_size{0, 0},
        m_thread_scratch_size{0, 0},
        m_chunk_size(Impl::CudaTraits::WarpSize),
        m_tune_team(bool(team_size_request <= 0)),
        m_tune_vector(bool(vector_length_request <= 0)) {
    // Make sure league size is permissible
    if (league_size_ >= int(Impl::cuda_internal_maximum_grid_count()))
      Impl::throw_runtime_exception(
          "Requested too large league_size for TeamPolicy on Cuda execution "
          "space.");

    // Make sure total block size is permissible
    if (m_team_size * m_vector_length >
        int(Impl::CudaTraits::MaxHierarchicalParallelism)) {
      Impl::throw_runtime_exception(
          std::string("Kokkos::TeamPolicy< Cuda > the team size is too large. "
                      "Team size x vector length must be smaller than 1024."));
    }
  }

  /** \brief  Specify league size, request team size, specify vector length */
  TeamPolicyInternal(const execution_space space_, int league_size_,
                     const Kokkos::AUTO_t& /* team_size_request */
                     ,
                     int vector_length_request = 1)
      : TeamPolicyInternal(space_, league_size_, -1, vector_length_request) {}

  /** \brief  Specify league size, request team size and vector length */
  TeamPolicyInternal(const execution_space space_, int league_size_,
                     const Kokkos::AUTO_t& /* team_size_request */,
                     const Kokkos::AUTO_t& /* vector_length_request */
                     )
      : TeamPolicyInternal(space_, league_size_, -1, -1) {}

  /** \brief  Specify league size, specify team size, request vector length */
  TeamPolicyInternal(const execution_space space_, int league_size_,
                     int team_size_request, const Kokkos::AUTO_t&)
      : TeamPolicyInternal(space_, league_size_, team_size_request, -1) {}

  TeamPolicyInternal(int league_size_, int team_size_request,
                     int vector_length_request = 1)
      : TeamPolicyInternal(typename traits::execution_space(), league_size_,
                           team_size_request, vector_length_request) {}

  TeamPolicyInternal(int league_size_, const Kokkos::AUTO_t& team_size_request,
                     int vector_length_request = 1)
      : TeamPolicyInternal(typename traits::execution_space(), league_size_,
                           team_size_request, vector_length_request)

  {}

  /** \brief  Specify league size, request team size */
  TeamPolicyInternal(int league_size_, const Kokkos::AUTO_t& team_size_request,
                     const Kokkos::AUTO_t& vector_length_request)
      : TeamPolicyInternal(typename traits::execution_space(), league_size_,
                           team_size_request, vector_length_request) {}

  /** \brief  Specify league size, request team size */
  TeamPolicyInternal(int league_size_, int team_size_request,
                     const Kokkos::AUTO_t& vector_length_request)
      : TeamPolicyInternal(typename traits::execution_space(), league_size_,
                           team_size_request, vector_length_request) {}

  inline int chunk_size() const { return m_chunk_size; }

  /** \brief set chunk_size to a discrete value*/
  inline TeamPolicyInternal& set_chunk_size(
      typename traits::index_type chunk_size_) {
    m_chunk_size = chunk_size_;
    return *this;
  }

  /** \brief set per team scratch size for a specific level of the scratch
   * hierarchy */
  inline TeamPolicyInternal& set_scratch_size(const int& level,
                                              const PerTeamValue& per_team) {
    m_team_scratch_size[level] = per_team.value;
    return *this;
  }

  /** \brief set per thread scratch size for a specific level of the scratch
   * hierarchy */
  inline TeamPolicyInternal& set_scratch_size(
      const int& level, const PerThreadValue& per_thread) {
    m_thread_scratch_size[level] = per_thread.value;
    return *this;
  }

  /** \brief set per thread and per team scratch size for a specific level of
   * the scratch hierarchy */
  inline TeamPolicyInternal& set_scratch_size(
      const int& level, const PerTeamValue& per_team,
      const PerThreadValue& per_thread) {
    m_team_scratch_size[level]   = per_team.value;
    m_thread_scratch_size[level] = per_thread.value;
    return *this;
  }

  using member_type = Kokkos::Impl::CudaTeamMember;

 protected:
  template <class ClosureType, class FunctorType, class BlockSizeCallable>
  int internal_team_size_common(const FunctorType& f,
                                BlockSizeCallable&& block_size_callable) const {
    using closure_type = ClosureType;
    using functor_value_traits =
        Impl::FunctorValueTraits<FunctorType, typename traits::work_tag>;

    cudaFuncAttributes attr =
        CudaParallelLaunch<closure_type, typename traits::launch_bounds>::
            get_cuda_func_attributes();
    const int block_size = std::forward<BlockSizeCallable>(block_size_callable)(
        space().impl_internal_space_instance(), attr, f,
        (size_t)impl_vector_length(),
        (size_t)team_scratch_size(0) + 2 * sizeof(double),
        (size_t)thread_scratch_size(0) + sizeof(double) +
            ((functor_value_traits::StaticValueSize != 0)
                 ? 0
                 : functor_value_traits::value_size(f)));
    KOKKOS_ASSERT(block_size > 0);

    // Currently we require Power-of-2 team size for reductions.
    int p2 = 1;
    while (p2 <= block_size) p2 *= 2;
    p2 /= 2;
    return p2 / impl_vector_length();
  }

  template <class ClosureType, class FunctorType>
  int internal_team_size_max(const FunctorType& f) const {
    return internal_team_size_common<ClosureType>(
        f,
        Kokkos::Impl::cuda_get_max_block_size<FunctorType,
                                              typename traits::launch_bounds>);
  }

  template <class ClosureType, class FunctorType>
  int internal_team_size_recommended(const FunctorType& f) const {
    return internal_team_size_common<ClosureType>(
        f,
        Kokkos::Impl::cuda_get_opt_block_size<FunctorType,
                                              typename traits::launch_bounds>);
  }
};
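
// Illustrative usage sketch (not part of the original header): constructing a
// CUDA TeamPolicy that exercises the constructors and set_scratch_size
// overloads above. `league_size` and `my_functor` are placeholders.
//
//   Kokkos::TeamPolicy<Kokkos::Cuda> policy(league_size, Kokkos::AUTO, 4);
//   policy.set_scratch_size(0, Kokkos::PerTeam(1024), Kokkos::PerThread(32));
//   Kokkos::parallel_for("team_loop", policy, my_functor);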

}  // namespace Impl
}  // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

template <class FunctorType, class... Traits>
class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
 public:
  using Policy = Kokkos::RangePolicy<Traits...>;

 private:
  using Member       = typename Policy::member_type;
  using WorkTag      = typename Policy::work_tag;
  using LaunchBounds = typename Policy::launch_bounds;

  const FunctorType m_functor;
  const Policy m_policy;

  ParallelFor()        = delete;
  ParallelFor& operator=(const ParallelFor&) = delete;

  template <class TagType>
  inline __device__
      typename std::enable_if<std::is_same<TagType, void>::value>::type
      exec_range(const Member i) const {
    m_functor(i);
  }

  template <class TagType>
  inline __device__
      typename std::enable_if<!std::is_same<TagType, void>::value>::type
      exec_range(const Member i) const {
    m_functor(TagType(), i);
  }

 public:
  using functor_type = FunctorType;

  Policy const& get_policy() const { return m_policy; }

  inline __device__ void operator()() const {
    const Member work_stride = blockDim.y * gridDim.x;
    const Member work_end    = m_policy.end();

    for (Member iwork =
             m_policy.begin() + threadIdx.y + blockDim.y * blockIdx.x;
         iwork < work_end;
         iwork = iwork < work_end - work_stride ? iwork + work_stride
                                                : work_end) {
      this->template exec_range<WorkTag>(iwork);
    }
  }

  inline void execute() const {
    const typename Policy::index_type nwork = m_policy.end() - m_policy.begin();

    cudaFuncAttributes attr =
        CudaParallelLaunch<ParallelFor,
                           LaunchBounds>::get_cuda_func_attributes();
    const int block_size =
        Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>(
            m_policy.space().impl_internal_space_instance(), attr, m_functor, 1,
            0, 0);
    KOKKOS_ASSERT(block_size > 0);
    dim3 block(1, block_size, 1);
    dim3 grid(
        std::min(
            typename Policy::index_type((nwork + block.y - 1) / block.y),
            typename Policy::index_type(cuda_internal_maximum_grid_count())),
        1, 1);
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
    if (Kokkos::Impl::CudaInternal::cuda_use_serial_execution()) {
      block = dim3(1, 1, 1);
      grid  = dim3(1, 1, 1);
    }
#endif

    CudaParallelLaunch<ParallelFor, LaunchBounds>(
        *this, grid, block, 0, m_policy.space().impl_internal_space_instance(),
        false);
  }

  ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
      : m_functor(arg_functor), m_policy(arg_policy) {}
};
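
// Illustrative usage sketch (not part of the original header): a RangePolicy
// parallel_for that the specialization above executes. `n`, `a`, and the
// Views `x` and `y` are placeholders.
//
//   Kokkos::parallel_for(
//       "axpy", Kokkos::RangePolicy<Kokkos::Cuda>(0, n),
//       KOKKOS_LAMBDA(const int i) { y(i) = a * x(i) + y(i); });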

// MDRangePolicy impl
template <class FunctorType, class... Traits>
class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
 public:
  using Policy       = Kokkos::MDRangePolicy<Traits...>;
  using functor_type = FunctorType;

 private:
  using RP               = Policy;
  using array_index_type = typename Policy::array_index_type;
  using index_type       = typename Policy::index_type;
  using LaunchBounds     = typename Policy::launch_bounds;

  const FunctorType m_functor;
  const Policy m_rp;

 public:
  template <typename Policy, typename Functor>
  static int max_tile_size_product(const Policy& pol, const Functor&) {
    cudaFuncAttributes attr =
        CudaParallelLaunch<ParallelFor,
                           LaunchBounds>::get_cuda_func_attributes();
    auto const& prop = pol.space().cuda_device_prop();
    // Limits due to registers/SM, MDRange doesn't have
    // shared memory constraints
    int const regs_per_sm        = prop.regsPerMultiprocessor;
    int const regs_per_thread    = attr.numRegs;
    int const max_threads_per_sm = regs_per_sm / regs_per_thread;
    return std::min(
        max_threads_per_sm,
        static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
  }
  Policy const& get_policy() const { return m_rp; }
  inline __device__ void operator()() const {
    Kokkos::Impl::DeviceIterateTile<Policy::rank, Policy, FunctorType,
                                    typename Policy::work_tag>(m_rp, m_functor)
        .exec_range();
  }

  inline void execute() const {
    using namespace std;

    if (m_rp.m_num_tiles == 0) return;
    const array_index_type maxblocks = static_cast<array_index_type>(
        m_rp.space().impl_internal_space_instance()->m_maxBlock);
    if (RP::rank == 2) {
      const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], 1);
      KOKKOS_ASSERT(block.x > 0);
      KOKKOS_ASSERT(block.y > 0);
      const dim3 grid(
          min((m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x,
              maxblocks),
          min((m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1) / block.y,
              maxblocks),
          1);
      CudaParallelLaunch<ParallelFor, LaunchBounds>(
          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
          false);
    } else if (RP::rank == 3) {
      const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]);
      KOKKOS_ASSERT(block.x > 0);
      KOKKOS_ASSERT(block.y > 0);
      KOKKOS_ASSERT(block.z > 0);
      const dim3 grid(
          min((m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x,
              maxblocks),
          min((m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1) / block.y,
              maxblocks),
          min((m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z,
              maxblocks));
      CudaParallelLaunch<ParallelFor, LaunchBounds>(
          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
          false);
    } else if (RP::rank == 4) {
      // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to
      // threadIdx.z
      const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], m_rp.m_tile[2],
                       m_rp.m_tile[3]);
      KOKKOS_ASSERT(block.y > 0);
      KOKKOS_ASSERT(block.z > 0);
      const dim3 grid(
          min(static_cast<index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1]),
              static_cast<index_type>(maxblocks)),
          min((m_rp.m_upper[2] - m_rp.m_lower[2] + block.y - 1) / block.y,
              maxblocks),
          min((m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z,
              maxblocks));
      CudaParallelLaunch<ParallelFor, LaunchBounds>(
          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
          false);
    } else if (RP::rank == 5) {
      // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to
      // threadIdx.z
      const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1],
                       m_rp.m_tile[2] * m_rp.m_tile[3], m_rp.m_tile[4]);
      KOKKOS_ASSERT(block.z > 0);
      const dim3 grid(
          min(static_cast<index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1]),
              static_cast<index_type>(maxblocks)),
          min(static_cast<index_type>(m_rp.m_tile_end[2] * m_rp.m_tile_end[3]),
              static_cast<index_type>(maxblocks)),
          min((m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z,
              maxblocks));
      CudaParallelLaunch<ParallelFor, LaunchBounds>(
          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
          false);
    } else if (RP::rank == 6) {
      // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to
      // threadIdx.z
      const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1],
                       m_rp.m_tile[2] * m_rp.m_tile[3],
                       m_rp.m_tile[4] * m_rp.m_tile[5]);
      const dim3 grid(
          min(static_cast<index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1]),
              static_cast<index_type>(maxblocks)),
          min(static_cast<index_type>(m_rp.m_tile_end[2] * m_rp.m_tile_end[3]),
              static_cast<index_type>(maxblocks)),
          min(static_cast<index_type>(m_rp.m_tile_end[4] * m_rp.m_tile_end[5]),
              static_cast<index_type>(maxblocks)));
      CudaParallelLaunch<ParallelFor, LaunchBounds>(
          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
          false);
    } else {
      Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
    }

  }  // end execute

  //  inline
  ParallelFor(const FunctorType& arg_functor, Policy arg_policy)
      : m_functor(arg_functor), m_rp(arg_policy) {}
};
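
// Illustrative usage sketch (not part of the original header): a rank-2
// MDRangePolicy parallel_for handled by the specialization above; the tile
// sizes {16, 16} become the CUDA block dimensions. `m`, `n`, and the View `A`
// are placeholders.
//
//   Kokkos::parallel_for(
//       "mdrange_loop",
//       Kokkos::MDRangePolicy<Kokkos::Cuda, Kokkos::Rank<2>>({0, 0}, {m, n},
//                                                            {16, 16}),
//       KOKKOS_LAMBDA(const int i, const int j) { A(i, j) = i + j; });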

template <class FunctorType, class... Properties>
class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
                  Kokkos::Cuda> {
 public:
  using Policy = TeamPolicy<Properties...>;

 private:
  using Member       = typename Policy::member_type;
  using WorkTag      = typename Policy::work_tag;
  using LaunchBounds = typename Policy::launch_bounds;

 public:
  using functor_type = FunctorType;
  using size_type    = Cuda::size_type;

 private:
  // Algorithmic constraints: blockDim.y is a power of two AND blockDim.y ==
  // blockDim.z == 1 shared memory utilization:
  //
  //  [ team   reduce space ]
  //  [ team   shared space ]
  //

  const FunctorType m_functor;
  const Policy m_policy;
  const size_type m_league_size;
  int m_team_size;
  const size_type m_vector_size;
  int m_shmem_begin;
  int m_shmem_size;
  void* m_scratch_ptr[2];
  int m_scratch_size[2];

  template <class TagType>
  __device__ inline
      typename std::enable_if<std::is_same<TagType, void>::value>::type
      exec_team(const Member& member) const {
    m_functor(member);
  }

  template <class TagType>
  __device__ inline
      typename std::enable_if<!std::is_same<TagType, void>::value>::type
      exec_team(const Member& member) const {
    m_functor(TagType(), member);
  }

 public:
  Policy const& get_policy() const { return m_policy; }

  __device__ inline void operator()() const {
    // Iterate this block through the league
    int64_t threadid = 0;
    if (m_scratch_size[1] > 0) {
      __shared__ int64_t base_thread_id;
      if (threadIdx.x == 0 && threadIdx.y == 0) {
        threadid = (blockIdx.x * blockDim.z + threadIdx.z) %
                   (Kokkos::Impl::g_device_cuda_lock_arrays.n /
                    (blockDim.x * blockDim.y));
        threadid *= blockDim.x * blockDim.y;
        int done = 0;
        while (!done) {
          done =
              (0 ==
               atomicCAS(
                   &Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid],
                   0, 1));
          if (!done) {
            threadid += blockDim.x * blockDim.y;
            if (int64_t(threadid + blockDim.x * blockDim.y) >=
                int64_t(Kokkos::Impl::g_device_cuda_lock_arrays.n))
              threadid = 0;
          }
        }
        base_thread_id = threadid;
      }
      __syncthreads();
      threadid = base_thread_id;
    }

    const int int_league_size = (int)m_league_size;
    for (int league_rank = blockIdx.x; league_rank < int_league_size;
         league_rank += gridDim.x) {
      this->template exec_team<WorkTag>(typename Policy::member_type(
          kokkos_impl_cuda_shared_memory<void>(), m_shmem_begin, m_shmem_size,
          (void*)(((char*)m_scratch_ptr[1]) +
                  ptrdiff_t(threadid / (blockDim.x * blockDim.y)) *
                      m_scratch_size[1]),
          m_scratch_size[1], league_rank, m_league_size));
    }
    if (m_scratch_size[1] > 0) {
      __syncthreads();
      if (threadIdx.x == 0 && threadIdx.y == 0)
        Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid] = 0;
    }
  }

  inline void execute() const {
    const int64_t shmem_size_total = m_shmem_begin + m_shmem_size;
    dim3 grid(int(m_league_size), 1, 1);
    const dim3 block(int(m_vector_size), int(m_team_size), 1);

#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
    if (Kokkos::Impl::CudaInternal::cuda_use_serial_execution()) {
      grid = dim3(1, 1, 1);
    }
#endif

    CudaParallelLaunch<ParallelFor, LaunchBounds>(
        *this, grid, block, shmem_size_total,
        m_policy.space().impl_internal_space_instance(),
        true);  // copy to device and execute
  }

  ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
      : m_functor(arg_functor),
        m_policy(arg_policy),
        m_league_size(arg_policy.league_size()),
        m_team_size(arg_policy.team_size()),
        m_vector_size(arg_policy.impl_vector_length()) {
    cudaFuncAttributes attr =
        CudaParallelLaunch<ParallelFor,
                           LaunchBounds>::get_cuda_func_attributes();
    m_team_size =
        m_team_size >= 0
            ? m_team_size
            : Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>(
                  m_policy.space().impl_internal_space_instance(), attr,
                  m_functor, m_vector_size, m_policy.team_scratch_size(0),
                  m_policy.thread_scratch_size(0)) /
                  m_vector_size;

    m_shmem_begin = (sizeof(double) * (m_team_size + 2));
    m_shmem_size =
        (m_policy.scratch_size(0, m_team_size) +
         FunctorTeamShmemSize<FunctorType>::value(m_functor, m_team_size));
    m_scratch_size[0] = m_policy.scratch_size(0, m_team_size);
    m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);

    // Functor's reduce memory, team scan memory, and team shared memory depend
    // upon team size.
    m_scratch_ptr[0] = nullptr;
    m_scratch_ptr[1] =
        m_team_size <= 0
            ? nullptr
            : m_policy.space()
                  .impl_internal_space_instance()
                  ->resize_team_scratch_space(
                      static_cast<ptrdiff_t>(m_scratch_size[1]) *
                      static_cast<ptrdiff_t>(Cuda::concurrency() /
                                             (m_team_size * m_vector_size)));

    const int shmem_size_total = m_shmem_begin + m_shmem_size;
    if (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
        shmem_size_total) {
      printf(
          "%i %i\n",
          m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock,
          shmem_size_total);
      Kokkos::Impl::throw_runtime_exception(std::string(
          "Kokkos::Impl::ParallelFor< Cuda > insufficient shared memory"));
    }

    if (int(m_team_size) >
        int(Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>(
                m_policy.space().impl_internal_space_instance(), attr,
                arg_functor, arg_policy.impl_vector_length(),
                arg_policy.team_scratch_size(0),
                arg_policy.thread_scratch_size(0)) /
            arg_policy.impl_vector_length())) {
      Kokkos::Impl::throw_runtime_exception(std::string(
          "Kokkos::Impl::ParallelFor< Cuda > requested too large team size."));
    }
  }
};
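
// Illustrative usage sketch (not part of the original header): a hierarchical
// parallel_for that the specialization above launches; `league_size` and `n`
// are placeholders and `member_type` resolves to Impl::CudaTeamMember.
//
//   using policy_t = Kokkos::TeamPolicy<Kokkos::Cuda>;
//   Kokkos::parallel_for(
//       "team_loop", policy_t(league_size, Kokkos::AUTO),
//       KOKKOS_LAMBDA(const policy_t::member_type& team) {
//         Kokkos::parallel_for(Kokkos::TeamThreadRange(team, n),
//                              [&](const int i) { /* per-thread work */ });
//       });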

}  // namespace Impl
}  // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

template <class FunctorType, class ReducerType, class... Traits>
class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
                     Kokkos::Cuda> {
 public:
  using Policy = Kokkos::RangePolicy<Traits...>;

 private:
  using WorkRange    = typename Policy::WorkRange;
  using WorkTag      = typename Policy::work_tag;
  using Member       = typename Policy::member_type;
  using LaunchBounds = typename Policy::launch_bounds;

  using ReducerConditional =
      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                         FunctorType, ReducerType>;
  using ReducerTypeFwd = typename ReducerConditional::type;
  using WorkTagFwd =
      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                                  WorkTag, void>::type;

  using ValueTraits =
      Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
  using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;

 public:
  using pointer_type   = typename ValueTraits::pointer_type;
  using value_type     = typename ValueTraits::value_type;
  using reference_type = typename ValueTraits::reference_type;
  using functor_type   = FunctorType;
  using size_type      = Kokkos::Cuda::size_type;
  using index_type     = typename Policy::index_type;
  using reducer_type   = ReducerType;

  // Algorithmic constraints: blockSize is a power of two AND blockDim.y ==
  // blockDim.z == 1

  const FunctorType m_functor;
  const Policy m_policy;
  const ReducerType m_reducer;
  const pointer_type m_result_ptr;
  const bool m_result_ptr_device_accessible;
  const bool m_result_ptr_host_accessible;
  size_type* m_scratch_space;
  size_type* m_scratch_flags;
  size_type* m_unified_space;

  // Shall we use the shfl based reduction or not (only use it for static sized
  // types of more than 128bit)
  enum {
    UseShflReduction = false
  };  //((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize)
      //};
      // Some crutch to do function overloading
 private:
  using DummyShflReductionType  = double;
  using DummySHMEMReductionType = int;

 public:
  Policy const& get_policy() const { return m_policy; }

  // Make the exec_range calls call to Reduce::DeviceIterateTile
  template <class TagType>
  __device__ inline
      typename std::enable_if<std::is_same<TagType, void>::value>::type
      exec_range(const Member& i, reference_type update) const {
    m_functor(i, update);
  }

  template <class TagType>
  __device__ inline
      typename std::enable_if<!std::is_same<TagType, void>::value>::type
      exec_range(const Member& i, reference_type update) const {
    m_functor(TagType(), i, update);
  }

  __device__ inline void operator()() const {
    /*    run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType,
      DummySHMEMReductionType>::select(1,1.0) );
      }

      __device__ inline
      void run(const DummySHMEMReductionType& ) const
      {*/
    const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
                                                   sizeof(size_type)>
        word_count(ValueTraits::value_size(
                       ReducerConditional::select(m_functor, m_reducer)) /
                   sizeof(size_type));

    {
      reference_type value =
          ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
                          kokkos_impl_cuda_shared_memory<size_type>() +
                              threadIdx.y * word_count.value);

      // Number of blocks is bounded so that the reduction can be limited to two
      // passes. Each thread block is given an approximately equal amount of
      // work to perform. Accumulate the values for this block. The accumulation
      // ordering does not match the final pass, but is arithmetically
      // equivalent.

      const WorkRange range(m_policy, blockIdx.x, gridDim.x);

      for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end();
           iwork < iwork_end; iwork += blockDim.y) {
        this->template exec_range<WorkTag>(iwork, value);
      }
    }

    // Doing code duplication here to fix issue #3428
    // Suspect optimizer bug??
    // Reduce with final value at blockDim.y - 1 location.
    // Shortcut for length zero reduction
    if (m_policy.begin() == m_policy.end()) {
      // This is the final block with the final result at the final threads'
      // location

      size_type* const shared = kokkos_impl_cuda_shared_memory<size_type>() +
                                (blockDim.y - 1) * word_count.value;
      size_type* const global =
          m_result_ptr_device_accessible
              ? reinterpret_cast<size_type*>(m_result_ptr)
              : (m_unified_space ? m_unified_space : m_scratch_space);

      if (threadIdx.y == 0) {
        Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
            ReducerConditional::select(m_functor, m_reducer), shared);
      }

      if (CudaTraits::WarpSize < word_count.value) {
        __syncthreads();
      }

      for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) {
        global[i] = shared[i];
      }
      // return ;
    }

    if (m_policy.begin() != m_policy.end()) {
      {
        if (cuda_single_inter_block_reduce_scan<false, ReducerTypeFwd,
                                                WorkTagFwd>(
                ReducerConditional::select(m_functor, m_reducer), blockIdx.x,
                gridDim.x, kokkos_impl_cuda_shared_memory<size_type>(),
                m_scratch_space, m_scratch_flags)) {
          // This is the final block with the final result at the final threads'
          // location

          size_type* const shared =
              kokkos_impl_cuda_shared_memory<size_type>() +
              (blockDim.y - 1) * word_count.value;
          size_type* const global =
              m_result_ptr_device_accessible
                  ? reinterpret_cast<size_type*>(m_result_ptr)
                  : (m_unified_space ? m_unified_space : m_scratch_space);

          if (threadIdx.y == 0) {
            Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
                ReducerConditional::select(m_functor, m_reducer), shared);
          }

          if (CudaTraits::WarpSize < word_count.value) {
            __syncthreads();
          }

          for (unsigned i = threadIdx.y; i < word_count.value;
               i += blockDim.y) {
            global[i] = shared[i];
          }
        }
      }
    }
  }
  /*  __device__ inline
     void run(const DummyShflReductionType&) const
     {
       value_type value;
       ValueInit::init( ReducerConditional::select(m_functor , m_reducer) ,
     &value);
       // Number of blocks is bounded so that the reduction can be limited to
     two passes.
       // Each thread block is given an approximately equal amount of work to
     perform.
       // Accumulate the values for this block.
       // The accumulation ordering does not match the final pass, but is
     arithmetically equivalent.

       const WorkRange range( m_policy , blockIdx.x , gridDim.x );

       for ( Member iwork = range.begin() + threadIdx.y , iwork_end =
     range.end() ; iwork < iwork_end ; iwork += blockDim.y ) { this-> template
     exec_range< WorkTag >( iwork , value );
       }

       pointer_type const result = (pointer_type) (m_unified_space ?
     m_unified_space : m_scratch_space) ;

       int max_active_thread = range.end()-range.begin() < blockDim.y ?
     range.end() - range.begin():blockDim.y;

       max_active_thread = (max_active_thread ==
     0)?blockDim.y:max_active_thread;

      value_type init;
      ValueInit::init( ReducerConditional::select(m_functor , m_reducer) ,
     &init);
       if(Impl::cuda_inter_block_reduction<ReducerTypeFwd,ValueJoin,WorkTagFwd>
              (value,init,ValueJoin(ReducerConditional::select(m_functor ,
     m_reducer)),m_scratch_space,result,m_scratch_flags,max_active_thread)) {
         const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
         if(id==0) {
           Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final(
     ReducerConditional::select(m_functor , m_reducer) , (void*) &value );
           *result = value;
         }
       }
     }*/

  // Determine block size constrained by shared memory:
  inline unsigned local_block_size(const FunctorType& f) {
    unsigned n = CudaTraits::WarpSize * 8;
    int shmem_size =
        cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>(
            f, n);
    using closure_type = Impl::ParallelReduce<FunctorType, Policy, ReducerType>;
    cudaFuncAttributes attr =
        CudaParallelLaunch<closure_type,
                           LaunchBounds>::get_cuda_func_attributes();
    while (
        (n &&
         (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
          shmem_size)) ||
        (n >
         static_cast<unsigned>(
             Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>(
                 m_policy.space().impl_internal_space_instance(), attr, f, 1,
                 shmem_size, 0)))) {
      n >>= 1;
      shmem_size = cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
                                                             WorkTag>(f, n);
    }
    return n;
  }

  inline void execute() {
    const index_type nwork     = m_policy.end() - m_policy.begin();
    const bool need_device_set = ReduceFunctorHasInit<FunctorType>::value ||
                                 ReduceFunctorHasFinal<FunctorType>::value ||
                                 !m_result_ptr_host_accessible ||
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
                                 Policy::is_graph_kernel::value ||
#endif
                                 !std::is_same<ReducerType, InvalidType>::value;
    if ((nwork > 0) || need_device_set) {
      const int block_size = local_block_size(m_functor);

      KOKKOS_ASSERT(block_size > 0);

      m_scratch_space = cuda_internal_scratch_space(
          m_policy.space(), ValueTraits::value_size(ReducerConditional::select(
                                m_functor, m_reducer)) *
                                block_size /* block_size == max block_count */);
      m_scratch_flags =
          cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type));
      m_unified_space = cuda_internal_scratch_unified(
          m_policy.space(), ValueTraits::value_size(ReducerConditional::select(
                                m_functor, m_reducer)));

      // REQUIRED ( 1 , N , 1 )
      dim3 block(1, block_size, 1);
      // Required grid.x <= block.y
      dim3 grid(std::min(int(block.y), int((nwork + block.y - 1) / block.y)), 1,
                1);

      // TODO @graph We need to effectively insert this in to the graph
      const int shmem =
          UseShflReduction
              ? 0
              : cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
                                                          WorkTag>(m_functor,
                                                                   block.y);

      if ((nwork == 0)
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
          || Kokkos::Impl::CudaInternal::cuda_use_serial_execution()
#endif
      ) {
        block = dim3(1, 1, 1);
        grid  = dim3(1, 1, 1);
      }

      CudaParallelLaunch<ParallelReduce, LaunchBounds>(
          *this, grid, block, shmem,
          m_policy.space().impl_internal_space_instance(),
          false);  // copy to device and execute

      if (!m_result_ptr_device_accessible) {
        m_policy.space().fence();

        if (m_result_ptr) {
          if (m_unified_space) {
            const int count = ValueTraits::value_count(
                ReducerConditional::select(m_functor, m_reducer));
            for (int i = 0; i < count; ++i) {
              m_result_ptr[i] = pointer_type(m_unified_space)[i];
            }
          } else {
            const int size = ValueTraits::value_size(
                ReducerConditional::select(m_functor, m_reducer));
            DeepCopy<HostSpace, CudaSpace>(m_result_ptr, m_scratch_space, size);
          }
        }
      }
    } else {
      if (m_result_ptr) {
        // TODO @graph We need to effectively insert this in to the graph
        ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
                        m_result_ptr);
      }
    }
  }

  template <class ViewType>
  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
                 const ViewType& arg_result,
                 typename std::enable_if<Kokkos::is_view<ViewType>::value,
                                         void*>::type = nullptr)
      : m_functor(arg_functor),
        m_policy(arg_policy),
        m_reducer(InvalidType()),
        m_result_ptr(arg_result.data()),
        m_result_ptr_device_accessible(
            MemorySpaceAccess<Kokkos::CudaSpace,
                              typename ViewType::memory_space>::accessible),
        m_result_ptr_host_accessible(
            MemorySpaceAccess<Kokkos::HostSpace,
                              typename ViewType::memory_space>::accessible),
        m_scratch_space(nullptr),
        m_scratch_flags(nullptr),
        m_unified_space(nullptr) {}

  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
                 const ReducerType& reducer)
      : m_functor(arg_functor),
        m_policy(arg_policy),
        m_reducer(reducer),
        m_result_ptr(reducer.view().data()),
        m_result_ptr_device_accessible(
            MemorySpaceAccess<Kokkos::CudaSpace,
                              typename ReducerType::result_view_type::
                                  memory_space>::accessible),
        m_result_ptr_host_accessible(
            MemorySpaceAccess<Kokkos::HostSpace,
                              typename ReducerType::result_view_type::
                                  memory_space>::accessible),
        m_scratch_space(nullptr),
        m_scratch_flags(nullptr),
        m_unified_space(nullptr) {}
};
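
// Illustrative usage sketch (not part of the original header): RangePolicy
// parallel_reduce calls that the specialization above executes, once with a
// plain scalar result and once with a built-in reducer. `n` and the View `x`
// are placeholders.
//
//   double sum = 0.0;
//   Kokkos::parallel_reduce(
//       "range_sum", Kokkos::RangePolicy<Kokkos::Cuda>(0, n),
//       KOKKOS_LAMBDA(const int i, double& lsum) { lsum += x(i); }, sum);
//
//   double mx = 0.0;
//   Kokkos::parallel_reduce(
//       "range_max", Kokkos::RangePolicy<Kokkos::Cuda>(0, n),
//       KOKKOS_LAMBDA(const int i, double& lmax) {
//         if (x(i) > lmax) lmax = x(i);
//       },
//       Kokkos::Max<double>(mx));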
1204 
1205 // MDRangePolicy impl
1206 template <class FunctorType, class ReducerType, class... Traits>
1207 class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
1208                      Kokkos::Cuda> {
1209  public:
1210   using Policy = Kokkos::MDRangePolicy<Traits...>;
1211 
1212  private:
1213   using array_index_type = typename Policy::array_index_type;
1214   using index_type       = typename Policy::index_type;
1215 
1216   using WorkTag      = typename Policy::work_tag;
1217   using Member       = typename Policy::member_type;
1218   using LaunchBounds = typename Policy::launch_bounds;
1219 
1220   using ReducerConditional =
1221       Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
1222                          FunctorType, ReducerType>;
1223   using ReducerTypeFwd = typename ReducerConditional::type;
1224   using WorkTagFwd =
1225       typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
1226                                   WorkTag, void>::type;
1227 
1228   using ValueTraits =
1229       Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
1230   using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
1231   using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
1232 
1233  public:
1234   using pointer_type   = typename ValueTraits::pointer_type;
1235   using value_type     = typename ValueTraits::value_type;
1236   using reference_type = typename ValueTraits::reference_type;
1237   using functor_type   = FunctorType;
1238   using size_type      = Cuda::size_type;
1239   using reducer_type   = ReducerType;
1240 
1241   // Algorithmic constraints: blockSize is a power of two AND blockDim.y ==
1242   // blockDim.z == 1
1243 
1244   const FunctorType m_functor;
1245   const Policy m_policy;  // used for workrange and nwork
1246   const ReducerType m_reducer;
1247   const pointer_type m_result_ptr;
1248   const bool m_result_ptr_device_accessible;
1249   size_type* m_scratch_space;
1250   size_type* m_scratch_flags;
1251   size_type* m_unified_space;
1252 
1253   using DeviceIteratePattern = typename Kokkos::Impl::Reduce::DeviceIterateTile<
1254       Policy::rank, Policy, FunctorType, typename Policy::work_tag,
1255       reference_type>;
1256 
  // Shall we use the shfl-based reduction or not? (Only use it for statically
  // sized types of more than 128 bits.)
1259   static constexpr bool UseShflReduction = false;
1260   //((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize)
1261   // Some crutch to do function overloading
1262  private:
1263   using DummyShflReductionType  = double;
1264   using DummySHMEMReductionType = int;
1265 
1266  public:
1267   template <typename Policy, typename Functor>
  static int max_tile_size_product(const Policy& pol, const Functor&) {
1269     cudaFuncAttributes attr =
1270         CudaParallelLaunch<ParallelReduce,
1271                            LaunchBounds>::get_cuda_func_attributes();
1272     auto const& prop = pol.space().cuda_device_prop();
    // Limits due to registers/SM
1274     int const regs_per_sm        = prop.regsPerMultiprocessor;
1275     int const regs_per_thread    = attr.numRegs;
1276     int const max_threads_per_sm = regs_per_sm / regs_per_thread;
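    // Illustrative numbers (hypothetical device): 65536 registers per SM and
    // 32 registers per thread give 65536 / 32 = 2048 threads per SM, which the
    // std::min below then caps at MaxHierarchicalParallelism.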
1277     return std::min(
1278         max_threads_per_sm,
1279         static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
1280   }
  Policy const& get_policy() const { return m_policy; }
  inline __device__ void exec_range(reference_type update) const {
1283     Kokkos::Impl::Reduce::DeviceIterateTile<Policy::rank, Policy, FunctorType,
1284                                             typename Policy::work_tag,
1285                                             reference_type>(m_policy, m_functor,
1286                                                             update)
1287         .exec_range();
1288   }
1289 
  inline __device__ void operator()() const {
1291     /*    run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType,
1292       DummySHMEMReductionType>::select(1,1.0) );
1293       }
1294 
1295       __device__ inline
1296       void run(const DummySHMEMReductionType& ) const
1297       {*/
1298     const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
1299                                                    sizeof(size_type)>
1300         word_count(ValueTraits::value_size(
1301                        ReducerConditional::select(m_functor, m_reducer)) /
1302                    sizeof(size_type));
1303 
1304     {
1305       reference_type value =
1306           ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
1307                           kokkos_impl_cuda_shared_memory<size_type>() +
1308                               threadIdx.y * word_count.value);
1309 
      // Number of blocks is bounded so that the reduction can be limited to two
      // passes. Each thread block is given an approximately equal amount of
      // work to perform. Accumulate the values for this block. The accumulation
      // ordering does not match the final pass, but is arithmetically
      // equivalent.
1315 
1316       this->exec_range(value);
1317     }
1318 
1319     // Reduce with final value at blockDim.y - 1 location.
1320     // Problem: non power-of-two blockDim
1321     if (cuda_single_inter_block_reduce_scan<false, ReducerTypeFwd, WorkTagFwd>(
1322             ReducerConditional::select(m_functor, m_reducer), blockIdx.x,
1323             gridDim.x, kokkos_impl_cuda_shared_memory<size_type>(),
1324             m_scratch_space, m_scratch_flags)) {
1325       // This is the final block with the final result at the final threads'
1326       // location
1327       size_type* const shared = kokkos_impl_cuda_shared_memory<size_type>() +
1328                                 (blockDim.y - 1) * word_count.value;
1329       size_type* const global =
1330           m_result_ptr_device_accessible
1331               ? reinterpret_cast<size_type*>(m_result_ptr)
1332               : (m_unified_space ? m_unified_space : m_scratch_space);
1333 
1334       if (threadIdx.y == 0) {
1335         Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
1336             ReducerConditional::select(m_functor, m_reducer), shared);
1337       }
1338 
1339       if (CudaTraits::WarpSize < word_count.value) {
1340         __syncthreads();
1341       }
1342 
1343       for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) {
1344         global[i] = shared[i];
1345       }
1346     }
1347   }
1348 
1349   /*  __device__ inline
1350      void run(const DummyShflReductionType&) const
1351      {
1352 
1353        value_type value;
1354        ValueInit::init( ReducerConditional::select(m_functor , m_reducer) ,
1355      &value);
1356        // Number of blocks is bounded so that the reduction can be limited to
1357      two passes.
1358        // Each thread block is given an approximately equal amount of work to
1359      perform.
1360        // Accumulate the values for this block.
1361        // The accumulation ordering does not match the final pass, but is
1362      arithmatically equivalent.
1363 
1364        const Member work_part =
1365          ( ( m_policy.m_num_tiles + ( gridDim.x - 1 ) ) / gridDim.x ); //portion
1366      of tiles handled by each block
1367 
1368        this-> exec_range( value );
1369 
1370        pointer_type const result = (pointer_type) (m_unified_space ?
1371      m_unified_space : m_scratch_space) ;
1372 
1373        int max_active_thread = work_part < blockDim.y ? work_part:blockDim.y;
1374        max_active_thread = (max_active_thread ==
1375      0)?blockDim.y:max_active_thread;
1376 
1377        value_type init;
1378        ValueInit::init( ReducerConditional::select(m_functor , m_reducer) ,
1379      &init);
1380        if(Impl::cuda_inter_block_reduction<ReducerTypeFwd,ValueJoin,WorkTagFwd>
1381            (value,init,ValueJoin(ReducerConditional::select(m_functor ,
1382      m_reducer)),m_scratch_space,result,m_scratch_flags,max_active_thread)) {
1383          const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
1384          if(id==0) {
1385            Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final(
1386      ReducerConditional::select(m_functor , m_reducer) , (void*) &value );
1387            *result = value;
1388          }
1389        }
1390      }
1391   */
1392   // Determine block size constrained by shared memory:
  inline unsigned local_block_size(const FunctorType& f) {
1394     unsigned n = CudaTraits::WarpSize * 8;
1395     int shmem_size =
1396         cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>(
1397             f, n);
1398     using closure_type = Impl::ParallelReduce<FunctorType, Policy, ReducerType>;
1399     cudaFuncAttributes attr =
1400         CudaParallelLaunch<closure_type,
1401                            LaunchBounds>::get_cuda_func_attributes();
1402     while (
1403         (n &&
1404          (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
1405           shmem_size)) ||
1406         (n >
1407          static_cast<unsigned>(
1408              Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>(
1409                  m_policy.space().impl_internal_space_instance(), attr, f, 1,
1410                  shmem_size, 0)))) {
1411       n >>= 1;
1412       shmem_size = cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
1413                                                              WorkTag>(f, n);
1414     }
1415     return n;
1416   }
1417 
  inline void execute() {
1419     const int nwork = m_policy.m_num_tiles;
1420     if (nwork) {
1421       int block_size = m_policy.m_prod_tile_dims;
1422       // CONSTRAINT: Algorithm requires block_size >= product of tile dimensions
1423       // Nearest power of two
1424       int exponent_pow_two    = std::ceil(std::log2(block_size));
1425       block_size              = std::pow(2, exponent_pow_two);
1426       int suggested_blocksize = local_block_size(m_functor);
1427 
1428       block_size = (block_size > suggested_blocksize)
1429                        ? block_size
1430                        : suggested_blocksize;  // Note: block_size must be less
1431                                                // than or equal to 512
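      // Illustrative numbers (hypothetical): tile dimensions 4 x 8 x 2 give
      // m_prod_tile_dims = 64, ceil(log2(64)) = 6, so block_size starts at
      // 2^6 = 64 and is then raised to the suggested block size if that value
      // is larger.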
1432 
1433       m_scratch_space = cuda_internal_scratch_space(
1434           m_policy.space(), ValueTraits::value_size(ReducerConditional::select(
1435                                 m_functor, m_reducer)) *
1436                                 block_size /* block_size == max block_count */);
1437       m_scratch_flags =
1438           cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type));
1439       m_unified_space = cuda_internal_scratch_unified(
1440           m_policy.space(), ValueTraits::value_size(ReducerConditional::select(
1441                                 m_functor, m_reducer)));
1442 
1443       // REQUIRED ( 1 , N , 1 )
1444       const dim3 block(1, block_size, 1);
1445       // Required grid.x <= block.y
1446       const dim3 grid(std::min(int(block.y), int(nwork)), 1, 1);
1447 
1448       // TODO @graph We need to effectively insert this in to the graph
1449       const int shmem =
1450           UseShflReduction
1451               ? 0
1452               : cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
1453                                                           WorkTag>(m_functor,
1454                                                                    block.y);
1455 
1456       CudaParallelLaunch<ParallelReduce, LaunchBounds>(
1457           *this, grid, block, shmem,
1458           m_policy.space().impl_internal_space_instance(),
1459           false);  // copy to device and execute
1460 
1461       if (!m_result_ptr_device_accessible) {
1462         m_policy.space().fence();
1463 
1464         if (m_result_ptr) {
1465           if (m_unified_space) {
1466             const int count = ValueTraits::value_count(
1467                 ReducerConditional::select(m_functor, m_reducer));
1468             for (int i = 0; i < count; ++i) {
1469               m_result_ptr[i] = pointer_type(m_unified_space)[i];
1470             }
1471           } else {
1472             const int size = ValueTraits::value_size(
1473                 ReducerConditional::select(m_functor, m_reducer));
1474             DeepCopy<HostSpace, CudaSpace>(m_result_ptr, m_scratch_space, size);
1475           }
1476         }
1477       }
1478     } else {
1479       if (m_result_ptr) {
1480         // TODO @graph We need to effectively insert this in to the graph
1481         ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
1482                         m_result_ptr);
1483       }
1484     }
1485   }
1486 
1487   template <class ViewType>
  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
1489                  const ViewType& arg_result,
1490                  typename std::enable_if<Kokkos::is_view<ViewType>::value,
1491                                          void*>::type = nullptr)
1492       : m_functor(arg_functor),
1493         m_policy(arg_policy),
1494         m_reducer(InvalidType()),
1495         m_result_ptr(arg_result.data()),
1496         m_result_ptr_device_accessible(
1497             MemorySpaceAccess<Kokkos::CudaSpace,
1498                               typename ViewType::memory_space>::accessible),
1499         m_scratch_space(nullptr),
1500         m_scratch_flags(nullptr),
1501         m_unified_space(nullptr) {}
1502 
  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
1504                  const ReducerType& reducer)
1505       : m_functor(arg_functor),
1506         m_policy(arg_policy),
1507         m_reducer(reducer),
1508         m_result_ptr(reducer.view().data()),
1509         m_result_ptr_device_accessible(
1510             MemorySpaceAccess<Kokkos::CudaSpace,
1511                               typename ReducerType::result_view_type::
1512                                   memory_space>::accessible),
1513         m_scratch_space(nullptr),
1514         m_scratch_flags(nullptr),
1515         m_unified_space(nullptr) {}
1516 };
1517 
1518 //----------------------------------------------------------------------------
1519 
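// Illustrative call that reaches this TeamPolicy specialization (a sketch
// only, not part of this header), here with an explicit Max reducer and a
// hypothetical league size of 64:
//
//   double team_max = 0.0;
//   Kokkos::parallel_reduce(
//       Kokkos::TeamPolicy<Kokkos::Cuda>(64, Kokkos::AUTO),
//       KOKKOS_LAMBDA(const Kokkos::TeamPolicy<Kokkos::Cuda>::member_type& team,
//                     double& update) {
//         const double contribution = static_cast<double>(team.league_rank());
//         if (contribution > update) update = contribution;
//       },
//       Kokkos::Max<double>(team_max));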
1520 template <class FunctorType, class ReducerType, class... Properties>
1521 class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
1522                      ReducerType, Kokkos::Cuda> {
1523  public:
1524   using Policy = TeamPolicy<Properties...>;
1525 
1526  private:
1527   using Member       = typename Policy::member_type;
1528   using WorkTag      = typename Policy::work_tag;
1529   using LaunchBounds = typename Policy::launch_bounds;
1530 
1531   using ReducerConditional =
1532       Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
1533                          FunctorType, ReducerType>;
1534   using ReducerTypeFwd = typename ReducerConditional::type;
1535   using WorkTagFwd =
1536       typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
1537                                   WorkTag, void>::type;
1538 
1539   using ValueTraits =
1540       Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
1541   using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
1542   using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
1543 
1544   using pointer_type   = typename ValueTraits::pointer_type;
1545   using reference_type = typename ValueTraits::reference_type;
1546   using value_type     = typename ValueTraits::value_type;
1547 
1548  public:
1549   using functor_type = FunctorType;
1550   using size_type    = Cuda::size_type;
1551   using reducer_type = ReducerType;
1552 
1553   enum : bool {
1554     UseShflReduction = (true && (ValueTraits::StaticValueSize != 0))
1555   };
1556 
1557  private:
1558   using DummyShflReductionType  = double;
1559   using DummySHMEMReductionType = int;
1560 
  // Algorithmic constraints: blockDim.y is a power of two AND blockDim.z == 1.
  // Shared memory utilization:
1563   //
1564   //  [ global reduce space ]
1565   //  [ team   reduce space ]
1566   //  [ team   shared space ]
1567   //
1568 
1569   const FunctorType m_functor;
1570   const Policy m_policy;
1571   const ReducerType m_reducer;
1572   const pointer_type m_result_ptr;
1573   const bool m_result_ptr_device_accessible;
1574   const bool m_result_ptr_host_accessible;
1575   size_type* m_scratch_space;
1576   size_type* m_scratch_flags;
1577   size_type* m_unified_space;
1578   size_type m_team_begin;
1579   size_type m_shmem_begin;
1580   size_type m_shmem_size;
1581   void* m_scratch_ptr[2];
1582   int m_scratch_size[2];
1583   const size_type m_league_size;
1584   int m_team_size;
1585   const size_type m_vector_size;
1586 
1587   template <class TagType>
1588   __device__ inline
1589       typename std::enable_if<std::is_same<TagType, void>::value>::type
      exec_team(const Member& member, reference_type update) const {
1591     m_functor(member, update);
1592   }
1593 
1594   template <class TagType>
1595   __device__ inline
1596       typename std::enable_if<!std::is_same<TagType, void>::value>::type
      exec_team(const Member& member, reference_type update) const {
1598     m_functor(TagType(), member, update);
1599   }
1600 
1601  public:
  Policy const& get_policy() const { return m_policy; }
1603 
  __device__ inline void operator()() const {
1605     int64_t threadid = 0;
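    // When level-1 (per-team) scratch is requested, thread (0,0) of each block
    // claims a slot in the global lock array via atomicCAS below; the slot
    // index selects a disjoint piece of the team scratch allocation for this
    // block and is released again at the end of operator().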
1606     if (m_scratch_size[1] > 0) {
1607       __shared__ int64_t base_thread_id;
1608       if (threadIdx.x == 0 && threadIdx.y == 0) {
1609         threadid = (blockIdx.x * blockDim.z + threadIdx.z) %
1610                    (Kokkos::Impl::g_device_cuda_lock_arrays.n /
1611                     (blockDim.x * blockDim.y));
1612         threadid *= blockDim.x * blockDim.y;
1613         int done = 0;
1614         while (!done) {
1615           done =
1616               (0 ==
1617                atomicCAS(
1618                    &Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid],
1619                    0, 1));
1620           if (!done) {
1621             threadid += blockDim.x * blockDim.y;
1622             if (int64_t(threadid + blockDim.x * blockDim.y) >=
1623                 int64_t(Kokkos::Impl::g_device_cuda_lock_arrays.n))
1624               threadid = 0;
1625           }
1626         }
1627         base_thread_id = threadid;
1628       }
1629       __syncthreads();
1630       threadid = base_thread_id;
1631     }
1632 
1633     run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType,
1634                            DummySHMEMReductionType>::select(1, 1.0),
1635         threadid);
1636     if (m_scratch_size[1] > 0) {
1637       __syncthreads();
1638       if (threadIdx.x == 0 && threadIdx.y == 0)
1639         Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid] = 0;
1640     }
1641   }
1642 
  __device__ inline void run(const DummySHMEMReductionType&,
1644                              const int& threadid) const {
1645     const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
1646                                                    sizeof(size_type)>
1647         word_count(ValueTraits::value_size(
1648                        ReducerConditional::select(m_functor, m_reducer)) /
1649                    sizeof(size_type));
1650 
1651     reference_type value =
1652         ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
1653                         kokkos_impl_cuda_shared_memory<size_type>() +
1654                             threadIdx.y * word_count.value);
1655 
1656     // Iterate this block through the league
1657     const int int_league_size = (int)m_league_size;
1658     for (int league_rank = blockIdx.x; league_rank < int_league_size;
1659          league_rank += gridDim.x) {
1660       this->template exec_team<WorkTag>(
1661           Member(kokkos_impl_cuda_shared_memory<char>() + m_team_begin,
1662                  m_shmem_begin, m_shmem_size,
1663                  (void*)(((char*)m_scratch_ptr[1]) +
1664                          ptrdiff_t(threadid / (blockDim.x * blockDim.y)) *
1665                              m_scratch_size[1]),
1666                  m_scratch_size[1], league_rank, m_league_size),
1667           value);
1668     }
1669 
1670     // Reduce with final value at blockDim.y - 1 location.
1671     // Doing code duplication here to fix issue #3428
1672     // Suspect optimizer bug??
1673     if (m_league_size == 0) {
1674       // This is the final block with the final result at the final threads'
1675       // location
1676 
1677       size_type* const shared = kokkos_impl_cuda_shared_memory<size_type>() +
1678                                 (blockDim.y - 1) * word_count.value;
1679       size_type* const global =
1680           m_result_ptr_device_accessible
1681               ? reinterpret_cast<size_type*>(m_result_ptr)
1682               : (m_unified_space ? m_unified_space : m_scratch_space);
1683 
1684       if (threadIdx.y == 0) {
1685         Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
1686             ReducerConditional::select(m_functor, m_reducer), shared);
1687       }
1688 
1689       if (CudaTraits::WarpSize < word_count.value) {
1690         __syncthreads();
1691       }
1692 
1693       for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) {
1694         global[i] = shared[i];
1695       }
1696     }
1697 
1698     if (m_league_size != 0) {
1699       if (cuda_single_inter_block_reduce_scan<false, FunctorType, WorkTag>(
1700               ReducerConditional::select(m_functor, m_reducer), blockIdx.x,
1701               gridDim.x, kokkos_impl_cuda_shared_memory<size_type>(),
1702               m_scratch_space, m_scratch_flags)) {
1703         // This is the final block with the final result at the final threads'
1704         // location
1705 
1706         size_type* const shared = kokkos_impl_cuda_shared_memory<size_type>() +
1707                                   (blockDim.y - 1) * word_count.value;
1708         size_type* const global =
1709             m_result_ptr_device_accessible
1710                 ? reinterpret_cast<size_type*>(m_result_ptr)
1711                 : (m_unified_space ? m_unified_space : m_scratch_space);
1712 
1713         if (threadIdx.y == 0) {
1714           Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
1715               ReducerConditional::select(m_functor, m_reducer), shared);
1716         }
1717 
1718         if (CudaTraits::WarpSize < word_count.value) {
1719           __syncthreads();
1720         }
1721 
1722         for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) {
1723           global[i] = shared[i];
1724         }
1725       }
1726     }
1727   }
1728 
  __device__ inline void run(const DummyShflReductionType&,
1730                              const int& threadid) const {
1731     value_type value;
1732     ValueInit::init(ReducerConditional::select(m_functor, m_reducer), &value);
1733 
1734     // Iterate this block through the league
1735     const int int_league_size = (int)m_league_size;
1736     for (int league_rank = blockIdx.x; league_rank < int_league_size;
1737          league_rank += gridDim.x) {
1738       this->template exec_team<WorkTag>(
1739           Member(kokkos_impl_cuda_shared_memory<char>() + m_team_begin,
1740                  m_shmem_begin, m_shmem_size,
1741                  (void*)(((char*)m_scratch_ptr[1]) +
1742                          ptrdiff_t(threadid / (blockDim.x * blockDim.y)) *
1743                              m_scratch_size[1]),
1744                  m_scratch_size[1], league_rank, m_league_size),
1745           value);
1746     }
1747 
1748     pointer_type const result =
1749         m_result_ptr_device_accessible
1750             ? m_result_ptr
1751             : (pointer_type)(m_unified_space ? m_unified_space
1752                                              : m_scratch_space);
1753 
1754     value_type init;
1755     ValueInit::init(ReducerConditional::select(m_functor, m_reducer), &init);
1756 
1757     if (int_league_size == 0) {
1758       Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
1759           ReducerConditional::select(m_functor, m_reducer), (void*)&value);
1760       *result = value;
1761     } else if (
1762         Impl::cuda_inter_block_reduction<FunctorType, ValueJoin, WorkTag>(
1763             value, init,
1764             ValueJoin(ReducerConditional::select(m_functor, m_reducer)),
1765             m_scratch_space, result, m_scratch_flags, blockDim.y)
1766         // This breaks a test
1767         //   Kokkos::Impl::CudaReductionsFunctor<FunctorType,WorkTag,false,true>::scalar_inter_block_reduction(ReducerConditional::select(m_functor
1768         //   , m_reducer) , blockIdx.x , gridDim.x ,
1769         //              kokkos_impl_cuda_shared_memory<size_type>() ,
1770         //              m_scratch_space , m_scratch_flags)
1771     ) {
1772       const unsigned id = threadIdx.y * blockDim.x + threadIdx.x;
1773       if (id == 0) {
1774         Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
1775             ReducerConditional::select(m_functor, m_reducer), (void*)&value);
1776         *result = value;
1777       }
1778     }
1779   }
1780 
  inline void execute() {
1782     const int nwork            = m_league_size * m_team_size;
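    // Even with no work (nwork == 0), a device-side launch is still required
    // when the functor has init()/final() callbacks, the result is not host
    // accessible, a graph kernel is being recorded, or an explicit reducer is
    // used.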
1783     const bool need_device_set = ReduceFunctorHasInit<FunctorType>::value ||
1784                                  ReduceFunctorHasFinal<FunctorType>::value ||
1785                                  !m_result_ptr_host_accessible ||
1786 #ifdef KOKKOS_CUDA_ENABLE_GRAPHS
1787                                  Policy::is_graph_kernel::value ||
1788 #endif
1789                                  !std::is_same<ReducerType, InvalidType>::value;
1790     if ((nwork > 0) || need_device_set) {
1791       const int block_count =
1792           UseShflReduction ? std::min(m_league_size, size_type(1024 * 32))
1793                            : std::min(int(m_league_size), m_team_size);
1794 
1795       m_scratch_space = cuda_internal_scratch_space(
1796           m_policy.space(), ValueTraits::value_size(ReducerConditional::select(
1797                                 m_functor, m_reducer)) *
1798                                 block_count);
1799       m_scratch_flags =
1800           cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type));
1801       m_unified_space = cuda_internal_scratch_unified(
1802           m_policy.space(), ValueTraits::value_size(ReducerConditional::select(
1803                                 m_functor, m_reducer)));
1804 
1805       dim3 block(m_vector_size, m_team_size, 1);
1806       dim3 grid(block_count, 1, 1);
1807       const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size;
1808 
1809       if ((nwork == 0)
1810 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
1811           || Kokkos::Impl::CudaInternal::cuda_use_serial_execution()
1812 #endif
1813       ) {
1814         block = dim3(1, 1, 1);
1815         grid  = dim3(1, 1, 1);
1816       }
1817 
1818       CudaParallelLaunch<ParallelReduce, LaunchBounds>(
1819           *this, grid, block, shmem_size_total,
1820           m_policy.space().impl_internal_space_instance(),
1821           true);  // copy to device and execute
1822 
1823       if (!m_result_ptr_device_accessible) {
1824         m_policy.space().fence();
1825 
1826         if (m_result_ptr) {
1827           if (m_unified_space) {
1828             const int count = ValueTraits::value_count(
1829                 ReducerConditional::select(m_functor, m_reducer));
1830             for (int i = 0; i < count; ++i) {
1831               m_result_ptr[i] = pointer_type(m_unified_space)[i];
1832             }
1833           } else {
1834             const int size = ValueTraits::value_size(
1835                 ReducerConditional::select(m_functor, m_reducer));
1836             DeepCopy<HostSpace, CudaSpace>(m_result_ptr, m_scratch_space, size);
1837           }
1838         }
1839       }
1840     } else {
1841       if (m_result_ptr) {
1842         // TODO @graph We need to effectively insert this in to the graph
1843         ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
1844                         m_result_ptr);
1845       }
1846     }
1847   }
1848 
1849   template <class ViewType>
  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
1851                  const ViewType& arg_result,
1852                  typename std::enable_if<Kokkos::is_view<ViewType>::value,
1853                                          void*>::type = nullptr)
1854       : m_functor(arg_functor),
1855         m_policy(arg_policy),
1856         m_reducer(InvalidType()),
1857         m_result_ptr(arg_result.data()),
1858         m_result_ptr_device_accessible(
1859             MemorySpaceAccess<Kokkos::CudaSpace,
1860                               typename ViewType::memory_space>::accessible),
1861         m_result_ptr_host_accessible(
1862             MemorySpaceAccess<Kokkos::HostSpace,
1863                               typename ViewType::memory_space>::accessible),
1864         m_scratch_space(nullptr),
1865         m_scratch_flags(nullptr),
1866         m_unified_space(nullptr),
1867         m_team_begin(0),
1868         m_shmem_begin(0),
1869         m_shmem_size(0),
1870         m_scratch_ptr{nullptr, nullptr},
1871         m_league_size(arg_policy.league_size()),
1872         m_team_size(arg_policy.team_size()),
1873         m_vector_size(arg_policy.impl_vector_length()) {
1874     cudaFuncAttributes attr =
1875         CudaParallelLaunch<ParallelReduce,
1876                            LaunchBounds>::get_cuda_func_attributes();
1877     m_team_size =
1878         m_team_size >= 0
1879             ? m_team_size
1880             : Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>(
1881                   m_policy.space().impl_internal_space_instance(), attr,
1882                   m_functor, m_vector_size, m_policy.team_scratch_size(0),
1883                   m_policy.thread_scratch_size(0)) /
1884                   m_vector_size;
1885 
1886     m_team_begin =
1887         UseShflReduction
1888             ? 0
1889             : cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
1890                                                         WorkTag>(arg_functor,
1891                                                                  m_team_size);
1892     m_shmem_begin = sizeof(double) * (m_team_size + 2);
1893     m_shmem_size =
1894         m_policy.scratch_size(0, m_team_size) +
1895         FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size);
1896     m_scratch_size[0] = m_shmem_size;
1897     m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
1898     m_scratch_ptr[1] =
1899         m_team_size <= 0
1900             ? nullptr
1901             : m_policy.space()
1902                   .impl_internal_space_instance()
1903                   ->resize_team_scratch_space(
1904                       static_cast<std::int64_t>(m_scratch_size[1]) *
1905                       (static_cast<std::int64_t>(
1906                           Cuda::concurrency() /
1907                           (m_team_size * m_vector_size))));
1908 
1909     // The global parallel_reduce does not support vector_length other than 1 at
1910     // the moment
1911     if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction)
1912       Impl::throw_runtime_exception(
1913           "Kokkos::parallel_reduce with a TeamPolicy using a vector length of "
1914           "greater than 1 is not currently supported for CUDA for dynamic "
1915           "sized reduction types.");
1916 
1917     if ((m_team_size < 32) && !UseShflReduction)
1918       Impl::throw_runtime_exception(
1919           "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller "
1920           "than 32 is not currently supported with CUDA for dynamic sized "
1921           "reduction types.");
1922 
1923     // Functor's reduce memory, team scan memory, and team shared memory depend
1924     // upon team size.
1925 
1926     const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size;
1927 
1928     if (!Kokkos::Impl::is_integral_power_of_two(m_team_size) &&
1929         !UseShflReduction) {
1930       Kokkos::Impl::throw_runtime_exception(
1931           std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size"));
1932     }
1933 
1934     if (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
1935         shmem_size_total) {
1936       Kokkos::Impl::throw_runtime_exception(
1937           std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too much "
1938                       "L0 scratch memory"));
1939     }
1940 
1941     if (int(m_team_size) >
1942         arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) {
1943       Kokkos::Impl::throw_runtime_exception(
1944           std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too "
1945                       "large team size."));
1946     }
1947   }
1948 
  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
1950                  const ReducerType& reducer)
1951       : m_functor(arg_functor),
1952         m_policy(arg_policy),
1953         m_reducer(reducer),
1954         m_result_ptr(reducer.view().data()),
1955         m_result_ptr_device_accessible(
1956             MemorySpaceAccess<Kokkos::CudaSpace,
1957                               typename ReducerType::result_view_type::
1958                                   memory_space>::accessible),
1959         m_result_ptr_host_accessible(
1960             MemorySpaceAccess<Kokkos::HostSpace,
1961                               typename ReducerType::result_view_type::
1962                                   memory_space>::accessible),
1963         m_scratch_space(nullptr),
1964         m_scratch_flags(nullptr),
1965         m_unified_space(nullptr),
1966         m_team_begin(0),
1967         m_shmem_begin(0),
1968         m_shmem_size(0),
1969         m_scratch_ptr{nullptr, nullptr},
1970         m_league_size(arg_policy.league_size()),
1971         m_team_size(arg_policy.team_size()),
1972         m_vector_size(arg_policy.impl_vector_length()) {
1973     cudaFuncAttributes attr =
1974         CudaParallelLaunch<ParallelReduce,
1975                            LaunchBounds>::get_cuda_func_attributes();
1976     m_team_size =
1977         m_team_size >= 0
1978             ? m_team_size
1979             : Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>(
1980                   m_policy.space().impl_internal_space_instance(), attr,
1981                   m_functor, m_vector_size, m_policy.team_scratch_size(0),
1982                   m_policy.thread_scratch_size(0)) /
1983                   m_vector_size;
1984 
1985     m_team_begin =
1986         UseShflReduction
1987             ? 0
1988             : cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
1989                                                         WorkTag>(arg_functor,
1990                                                                  m_team_size);
1991     m_shmem_begin = sizeof(double) * (m_team_size + 2);
1992     m_shmem_size =
1993         m_policy.scratch_size(0, m_team_size) +
1994         FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size);
1995     m_scratch_size[0] = m_shmem_size;
1996     m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
1997     m_scratch_ptr[1] =
1998         m_team_size <= 0
1999             ? nullptr
2000             : m_policy.space()
2001                   .impl_internal_space_instance()
2002                   ->resize_team_scratch_space(
2003                       static_cast<ptrdiff_t>(m_scratch_size[1]) *
2004                       static_cast<ptrdiff_t>(Cuda::concurrency() /
2005                                              (m_team_size * m_vector_size)));
2006 
2007     // The global parallel_reduce does not support vector_length other than 1 at
2008     // the moment
2009     if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction)
2010       Impl::throw_runtime_exception(
2011           "Kokkos::parallel_reduce with a TeamPolicy using a vector length of "
2012           "greater than 1 is not currently supported for CUDA for dynamic "
2013           "sized reduction types.");
2014 
2015     if ((m_team_size < 32) && !UseShflReduction)
2016       Impl::throw_runtime_exception(
2017           "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller "
2018           "than 32 is not currently supported with CUDA for dynamic sized "
2019           "reduction types.");
2020 
2021     // Functor's reduce memory, team scan memory, and team shared memory depend
2022     // upon team size.
2023 
2024     const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size;
2025 
2026     if ((!Kokkos::Impl::is_integral_power_of_two(m_team_size) &&
2027          !UseShflReduction) ||
2028         m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
2029             shmem_size_total) {
2030       Kokkos::Impl::throw_runtime_exception(
2031           std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size"));
2032     }
2033     if (int(m_team_size) >
2034         arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) {
2035       Kokkos::Impl::throw_runtime_exception(
2036           std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too "
2037                       "large team size."));
2038     }
2039   }
2040 };
2041 
2042 }  // namespace Impl
2043 }  // namespace Kokkos
2044 
2045 //----------------------------------------------------------------------------
2046 //----------------------------------------------------------------------------
2047 
2048 namespace Kokkos {
2049 namespace Impl {
2050 
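// Illustrative call that reaches this ParallelScan specialization (a sketch
// only, not part of this header; 'counts' is an assumed device View of
// length N):
//
//   Kokkos::View<int*, Kokkos::CudaSpace> offsets("offsets", N);
//   Kokkos::parallel_scan(
//       Kokkos::RangePolicy<Kokkos::Cuda>(0, N),
//       KOKKOS_LAMBDA(const int i, int& update, const bool final_pass) {
//         if (final_pass) offsets(i) = update;  // exclusive prefix sum
//         update += counts(i);
//       });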
2051 template <class FunctorType, class... Traits>
2052 class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
2053  public:
2054   using Policy = Kokkos::RangePolicy<Traits...>;
2055 
2056  private:
2057   using Member       = typename Policy::member_type;
2058   using WorkTag      = typename Policy::work_tag;
2059   using WorkRange    = typename Policy::WorkRange;
2060   using LaunchBounds = typename Policy::launch_bounds;
2061 
2062   using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, WorkTag>;
2063   using ValueInit   = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>;
2064   using ValueOps    = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>;
2065 
2066  public:
2067   using pointer_type   = typename ValueTraits::pointer_type;
2068   using reference_type = typename ValueTraits::reference_type;
2069   using functor_type   = FunctorType;
2070   using size_type      = Cuda::size_type;
2071 
2072  private:
2073   // Algorithmic constraints:
2074   //  (a) blockDim.y is a power of two
2075   //  (b) blockDim.y == blockDim.z == 1
2076   //  (c) gridDim.x  <= blockDim.y * blockDim.y
2077   //  (d) gridDim.y  == gridDim.z == 1
2078 
2079   const FunctorType m_functor;
2080   const Policy m_policy;
2081   size_type* m_scratch_space;
2082   size_type* m_scratch_flags;
2083   size_type m_final;
2084 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
2085   bool m_run_serial;
2086 #endif
2087 
2088   template <class TagType>
2089   __device__ inline
2090       typename std::enable_if<std::is_same<TagType, void>::value>::type
      exec_range(const Member& i, reference_type update,
2092                  const bool final_result) const {
2093     m_functor(i, update, final_result);
2094   }
2095 
2096   template <class TagType>
2097   __device__ inline
2098       typename std::enable_if<!std::is_same<TagType, void>::value>::type
      exec_range(const Member& i, reference_type update,
2100                  const bool final_result) const {
2101     m_functor(TagType(), i, update, final_result);
2102   }
2103 
2104   //----------------------------------------
2105 
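  // The scan is performed with two launches of this same closure (see
  // execute() below): the first launch, with m_final == false, runs initial()
  // and writes per-block totals into m_scratch_space; the second launch, with
  // m_final == true, runs final(), which combines the preceding blocks' totals
  // with an intra-block scan and calls the functor with final scan values.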
  __device__ inline void initial() const {
2107     const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
2108                                                    sizeof(size_type)>
2109         word_count(ValueTraits::value_size(m_functor) / sizeof(size_type));
2110 
2111     size_type* const shared_value =
2112         kokkos_impl_cuda_shared_memory<size_type>() +
2113         word_count.value * threadIdx.y;
2114 
2115     ValueInit::init(m_functor, shared_value);
2116 
    // Number of blocks is bounded so that the reduction can be limited to two
    // passes. Each thread block is given an approximately equal amount of work
    // to perform. Accumulate the values for this block. The accumulation
    // ordering does not match the final pass, but is arithmetically equivalent.
2121 
2122     const WorkRange range(m_policy, blockIdx.x, gridDim.x);
2123 
2124     for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end();
2125          iwork < iwork_end; iwork += blockDim.y) {
2126       this->template exec_range<WorkTag>(
2127           iwork, ValueOps::reference(shared_value), false);
2128     }
2129 
2130     // Reduce and scan, writing out scan of blocks' totals and block-groups'
2131     // totals. Blocks' scan values are written to 'blockIdx.x' location.
2132     // Block-groups' scan values are at: i = ( j * blockDim.y - 1 ) for i <
2133     // gridDim.x
2134     cuda_single_inter_block_reduce_scan<true, FunctorType, WorkTag>(
2135         m_functor, blockIdx.x, gridDim.x,
2136         kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space,
2137         m_scratch_flags);
2138   }
2139 
2140   //----------------------------------------
2141 
  __device__ inline void final() const {
2143     const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
2144                                                    sizeof(size_type)>
2145         word_count(ValueTraits::value_size(m_functor) / sizeof(size_type));
2146 
2147     // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
2148     // value[2] , ... }
2149     size_type* const shared_data = kokkos_impl_cuda_shared_memory<size_type>();
2150     size_type* const shared_prefix =
2151         shared_data + word_count.value * threadIdx.y;
2152     size_type* const shared_accum =
2153         shared_data + word_count.value * (blockDim.y + 1);
2154 
2155     // Starting value for this thread block is the previous block's total.
2156     if (blockIdx.x) {
2157       size_type* const block_total =
2158           m_scratch_space + word_count.value * (blockIdx.x - 1);
2159       for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
2160         shared_accum[i] = block_total[i];
2161       }
2162     } else if (0 == threadIdx.y) {
2163       ValueInit::init(m_functor, shared_accum);
2164     }
2165 
2166     const WorkRange range(m_policy, blockIdx.x, gridDim.x);
2167 
2168     for (typename Policy::member_type iwork_base = range.begin();
2169          iwork_base < range.end(); iwork_base += blockDim.y) {
2170 #ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
2171       unsigned MASK = KOKKOS_IMPL_CUDA_ACTIVEMASK;
2172 #endif
2173       const typename Policy::member_type iwork = iwork_base + threadIdx.y;
2174 
2175       __syncthreads();  // Don't overwrite previous iteration values until they
2176                         // are used
2177 
2178       ValueInit::init(m_functor, shared_prefix + word_count.value);
2179 
2180       // Copy previous block's accumulation total into thread[0] prefix and
2181       // inclusive scan value of this block
2182       for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
2183         shared_data[i + word_count.value] = shared_data[i] = shared_accum[i];
2184       }
2185 #ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
2186       KOKKOS_IMPL_CUDA_SYNCWARP_MASK(MASK);
2187 #else
2188       KOKKOS_IMPL_CUDA_SYNCWARP;
2189 #endif
2190       if (CudaTraits::WarpSize < word_count.value) {
2191         __syncthreads();
2192       }  // Protect against large scan values.
2193 
2194       // Call functor to accumulate inclusive scan value for this work item
2195       if (iwork < range.end()) {
2196         this->template exec_range<WorkTag>(
2197             iwork, ValueOps::reference(shared_prefix + word_count.value),
2198             false);
2199       }
2200 
2201       // Scan block values into locations shared_data[1..blockDim.y]
2202       cuda_intra_block_reduce_scan<true, FunctorType, WorkTag>(
2203           m_functor,
2204           typename ValueTraits::pointer_type(shared_data + word_count.value));
2205 
2206       {
2207         size_type* const block_total =
2208             shared_data + word_count.value * blockDim.y;
2209         for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
2210           shared_accum[i] = block_total[i];
2211         }
2212       }
2213 
2214       // Call functor with exclusive scan value
2215       if (iwork < range.end()) {
2216         this->template exec_range<WorkTag>(
2217             iwork, ValueOps::reference(shared_prefix), true);
2218       }
2219     }
2220   }
2221 
2222  public:
  Policy const& get_policy() const { return m_policy; }
2224 
2225   //----------------------------------------
2226 
  __device__ inline void operator()() const {
2228 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
2229     if (m_run_serial) {
2230       typename ValueTraits::value_type value;
2231       ValueInit::init(m_functor, (void*)&value);
2232       const WorkRange range(m_policy, blockIdx.x, gridDim.x);
2233 
2234       for (typename Policy::member_type iwork_base = range.begin();
2235            iwork_base < range.end(); iwork_base++) {
2236         this->template exec_range<WorkTag>(iwork_base, value, true);
2237       }
2238     } else {
2239 #endif
2240       if (!m_final) {
2241         initial();
2242       } else {
2243         final();
2244       }
2245 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
2246     }
2247 #endif
2248   }
2249 
2250   // Determine block size constrained by shared memory:
  inline unsigned local_block_size(const FunctorType& f) {
    // blockDim.y must be a power of two: 128 (4 warps), 256 (8 warps), or 512
    // (16 warps); gridDim.x <= blockDim.y * blockDim.y
2254     //
2255     // 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit
2256     // testing
2257 
2258     unsigned n = CudaTraits::WarpSize * 4;
2259     while (n &&
2260            unsigned(m_policy.space()
2261                         .impl_internal_space_instance()
2262                         ->m_maxShmemPerBlock) <
2263                cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
2264                                                          WorkTag>(f, n)) {
2265       n >>= 1;
2266     }
2267     return n;
2268   }
2269 
  inline void execute() {
2271     const int nwork = m_policy.end() - m_policy.begin();
2272     if (nwork) {
2273       enum { GridMaxComputeCapability_2x = 0x0ffff };
2274 
2275       const int block_size = local_block_size(m_functor);
2276       KOKKOS_ASSERT(block_size > 0);
2277 
2278       const int grid_max =
2279           (block_size * block_size) < GridMaxComputeCapability_2x
2280               ? (block_size * block_size)
2281               : GridMaxComputeCapability_2x;
2282 
2283       // At most 'max_grid' blocks:
2284       const int max_grid =
2285           std::min(int(grid_max), int((nwork + block_size - 1) / block_size));
2286 
2287       // How much work per block:
2288       const int work_per_block = (nwork + max_grid - 1) / max_grid;
2289 
      // How many blocks are really needed for this much work:
2291       const int grid_x = (nwork + work_per_block - 1) / work_per_block;
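      // Illustrative numbers (hypothetical): nwork = 10000 and block_size =
      // 128 give grid_max = min(128 * 128, 0xffff) = 16384,
      // max_grid = min(16384, ceil(10000 / 128)) = 79,
      // work_per_block = ceil(10000 / 79) = 127, and
      // grid_x = ceil(10000 / 127) = 79.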
2292 
2293       m_scratch_space = cuda_internal_scratch_space(
2294           m_policy.space(), ValueTraits::value_size(m_functor) * grid_x);
2295       m_scratch_flags =
2296           cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1);
2297 
2298       dim3 grid(grid_x, 1, 1);
2299       dim3 block(1, block_size, 1);  // REQUIRED DIMENSIONS ( 1 , N , 1 )
2300       const int shmem = ValueTraits::value_size(m_functor) * (block_size + 2);
2301 
2302 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
2303       if (m_run_serial) {
2304         block = dim3(1, 1, 1);
2305         grid  = dim3(1, 1, 1);
2306       } else {
2307 #endif
2308         m_final = false;
2309         CudaParallelLaunch<ParallelScan, LaunchBounds>(
2310             *this, grid, block, shmem,
2311             m_policy.space().impl_internal_space_instance(),
2312             false);  // copy to device and execute
2313 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
2314       }
2315 #endif
2316       m_final = true;
2317       CudaParallelLaunch<ParallelScan, LaunchBounds>(
2318           *this, grid, block, shmem,
2319           m_policy.space().impl_internal_space_instance(),
2320           false);  // copy to device and execute
2321     }
2322   }
2323 
  ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy)
2325       : m_functor(arg_functor),
2326         m_policy(arg_policy),
2327         m_scratch_space(nullptr),
2328         m_scratch_flags(nullptr),
2329         m_final(false)
2330 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
2331         ,
2332         m_run_serial(Kokkos::Impl::CudaInternal::cuda_use_serial_execution())
2333 #endif
2334   {
2335   }
2336 };
2337 
2338 //----------------------------------------------------------------------------
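// Illustrative call that reaches the ParallelScanWithTotal specialization
// below: the same scan as above, but with a trailing argument that receives
// the grand total (a sketch only; 'offsets' and 'counts' are assumed Views):
//
//   int total = 0;
//   Kokkos::parallel_scan(
//       Kokkos::RangePolicy<Kokkos::Cuda>(0, N),
//       KOKKOS_LAMBDA(const int i, int& update, const bool final_pass) {
//         if (final_pass) offsets(i) = update;
//         update += counts(i);
//       },
//       total);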
2339 template <class FunctorType, class ReturnType, class... Traits>
2340 class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
2341                             ReturnType, Kokkos::Cuda> {
2342  public:
2343   using Policy = Kokkos::RangePolicy<Traits...>;
2344 
2345  private:
2346   using Member       = typename Policy::member_type;
2347   using WorkTag      = typename Policy::work_tag;
2348   using WorkRange    = typename Policy::WorkRange;
2349   using LaunchBounds = typename Policy::launch_bounds;
2350 
2351   using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, WorkTag>;
2352   using ValueInit   = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>;
2353   using ValueOps    = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>;
2354 
2355  public:
2356   using pointer_type   = typename ValueTraits::pointer_type;
2357   using reference_type = typename ValueTraits::reference_type;
2358   using functor_type   = FunctorType;
2359   using size_type      = Cuda::size_type;
2360 
2361  private:
2362   // Algorithmic constraints:
2363   //  (a) blockDim.y is a power of two
2364   //  (b) blockDim.y == blockDim.z == 1
2365   //  (c) gridDim.x  <= blockDim.y * blockDim.y
2366   //  (d) gridDim.y  == gridDim.z == 1
2367 
2368   const FunctorType m_functor;
2369   const Policy m_policy;
2370   size_type* m_scratch_space;
2371   size_type* m_scratch_flags;
2372   size_type m_final;
2373   ReturnType& m_returnvalue;
2374 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
2375   bool m_run_serial;
2376 #endif
2377 
2378   template <class TagType>
2379   __device__ inline
2380       typename std::enable_if<std::is_same<TagType, void>::value>::type
      exec_range(const Member& i, reference_type update,
2382                  const bool final_result) const {
2383     m_functor(i, update, final_result);
2384   }
2385 
2386   template <class TagType>
2387   __device__ inline
2388       typename std::enable_if<!std::is_same<TagType, void>::value>::type
      exec_range(const Member& i, reference_type update,
2390                  const bool final_result) const {
2391     m_functor(TagType(), i, update, final_result);
2392   }
2393 
2394   //----------------------------------------
2395 
  __device__ inline void initial() const {
2397     const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
2398                                                    sizeof(size_type)>
2399         word_count(ValueTraits::value_size(m_functor) / sizeof(size_type));
2400 
2401     size_type* const shared_value =
2402         kokkos_impl_cuda_shared_memory<size_type>() +
2403         word_count.value * threadIdx.y;
2404 
2405     ValueInit::init(m_functor, shared_value);
2406 
    // Number of blocks is bounded so that the reduction can be limited to two
    // passes. Each thread block is given an approximately equal amount of work
    // to perform. Accumulate the values for this block. The accumulation
    // ordering does not match the final pass, but is arithmetically equivalent.
2411 
2412     const WorkRange range(m_policy, blockIdx.x, gridDim.x);
2413 
2414     for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end();
2415          iwork < iwork_end; iwork += blockDim.y) {
2416       this->template exec_range<WorkTag>(
2417           iwork, ValueOps::reference(shared_value), false);
2418     }
2419 
2420     // Reduce and scan, writing out scan of blocks' totals and block-groups'
2421     // totals. Blocks' scan values are written to 'blockIdx.x' location.
2422     // Block-groups' scan values are at: i = ( j * blockDim.y - 1 ) for i <
2423     // gridDim.x
2424     cuda_single_inter_block_reduce_scan<true, FunctorType, WorkTag>(
2425         m_functor, blockIdx.x, gridDim.x,
2426         kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space,
2427         m_scratch_flags);
2428   }
2429 
2430   //----------------------------------------
2431 
  __device__ inline void final() const {
2433     const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
2434                                                    sizeof(size_type)>
2435         word_count(ValueTraits::value_size(m_functor) / sizeof(size_type));
2436 
2437     // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
2438     // value[2] , ... }
2439     size_type* const shared_data = kokkos_impl_cuda_shared_memory<size_type>();
2440     size_type* const shared_prefix =
2441         shared_data + word_count.value * threadIdx.y;
2442     size_type* const shared_accum =
2443         shared_data + word_count.value * (blockDim.y + 1);

    // Starting value for this thread block is the previous block's total.
    if (blockIdx.x) {
      size_type* const block_total =
          m_scratch_space + word_count.value * (blockIdx.x - 1);
      for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
        shared_accum[i] = block_total[i];
      }
    } else if (0 == threadIdx.y) {
      ValueInit::init(m_functor, shared_accum);
    }

    const WorkRange range(m_policy, blockIdx.x, gridDim.x);

    for (typename Policy::member_type iwork_base = range.begin();
         iwork_base < range.end(); iwork_base += blockDim.y) {
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
      unsigned MASK = KOKKOS_IMPL_CUDA_ACTIVEMASK;
#endif

      const typename Policy::member_type iwork = iwork_base + threadIdx.y;

      __syncthreads();  // Don't overwrite previous iteration values until they
                        // are used

      ValueInit::init(m_functor, shared_prefix + word_count.value);

      // Copy previous block's accumulation total into thread[0] prefix and
      // inclusive scan value of this block
      for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
        shared_data[i + word_count.value] = shared_data[i] = shared_accum[i];
      }

#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
      KOKKOS_IMPL_CUDA_SYNCWARP_MASK(MASK);
#else
      KOKKOS_IMPL_CUDA_SYNCWARP;
#endif
      if (CudaTraits::WarpSize < word_count.value) {
        __syncthreads();
      }  // Protect against large scan values.

      // Call functor to accumulate inclusive scan value for this work item
      if (iwork < range.end()) {
        this->template exec_range<WorkTag>(
            iwork, ValueOps::reference(shared_prefix + word_count.value),
            false);
      }

      // Scan block values into locations shared_data[1..blockDim.y]
      cuda_intra_block_reduce_scan<true, FunctorType, WorkTag>(
          m_functor,
          typename ValueTraits::pointer_type(shared_data + word_count.value));

      {
        size_type* const block_total =
            shared_data + word_count.value * blockDim.y;
        for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
          shared_accum[i] = block_total[i];
        }
      }

      // Call functor with exclusive scan value
      if (iwork < range.end()) {
        this->template exec_range<WorkTag>(
            iwork, ValueOps::reference(shared_prefix), true);
      }
    }
  }
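
  // In outline: the scan runs as two launches of this same kernel (see
  // execute() below). The first pass executes initial(), which reduces each
  // block's chunk of the range and scans the per-block totals into
  // m_scratch_space; the second pass executes final(), which seeds every
  // block with the preceding block's accumulated total and replays the
  // functor with final == true so each work item observes its scan value.
  // The grand total is left in the last block's scratch slot, from where
  // execute() copies it into m_returnvalue.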

 public:
  Policy const& get_policy() const { return m_policy; }

  //----------------------------------------

  __device__ inline void operator()() const {
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
    if (m_run_serial) {
      typename ValueTraits::value_type value;
      ValueInit::init(m_functor, (void*)&value);
      const WorkRange range(m_policy, blockIdx.x, gridDim.x);

      for (typename Policy::member_type iwork_base = range.begin();
           iwork_base < range.end(); iwork_base++) {
        this->template exec_range<WorkTag>(iwork_base, value, true);
      }
      *((typename ValueTraits::value_type*)m_scratch_space) = value;
    } else {
#endif
      if (!m_final) {
        initial();
      } else {
        final();
      }
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
    }
#endif
  }

  // Determine block size constrained by shared memory:
  inline unsigned local_block_size(const FunctorType& f) {
    // blockDim.y must be a power of two: 128 (4 warps), 256 (8 warps), or
    // 512 (16 warps); gridDim.x <= blockDim.y * blockDim.y
    //
    // 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit
    // testing

    unsigned n = CudaTraits::WarpSize * 4;
    while (n &&
           unsigned(m_policy.space()
                        .impl_internal_space_instance()
                        ->m_maxShmemPerBlock) <
               cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
                                                         WorkTag>(f, n)) {
      n >>= 1;
    }
    return n;
  }
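
  // Illustrative walk-through with hypothetical numbers: starting from
  // n = 32 * 4 = 128 threads, if the scan scratch required for a 128-thread
  // block exceeded a (say) 48 KiB per-block shared-memory limit, the loop
  // would retry with 64, then 32, ... threads until the requirement fits,
  // returning 0 only if it never does -- a case execute() then rejects with
  // KOKKOS_ASSERT.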

  inline void execute() {
    const int nwork = m_policy.end() - m_policy.begin();
    if (nwork) {
      enum { GridMaxComputeCapability_2x = 0x0ffff };

      const int block_size = local_block_size(m_functor);
      KOKKOS_ASSERT(block_size > 0);

      const int grid_max =
          (block_size * block_size) < GridMaxComputeCapability_2x
              ? (block_size * block_size)
              : GridMaxComputeCapability_2x;

      // At most 'max_grid' blocks:
      const int max_grid =
          std::min(int(grid_max), int((nwork + block_size - 1) / block_size));

      // How much work per block:
      const int work_per_block = (nwork + max_grid - 1) / max_grid;

      // How many blocks are really needed for this much work:
      const int grid_x = (nwork + work_per_block - 1) / work_per_block;
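
      // Worked example with hypothetical numbers: for nwork = 10000 and
      // block_size = 128, grid_max = 128 * 128 = 16384 (below the 0xffff
      // cap), max_grid = min(16384, ceil(10000 / 128)) = 79,
      // work_per_block = ceil(10000 / 79) = 127, and finally
      // grid_x = ceil(10000 / 127) = 79 blocks are launched.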

      m_scratch_space = cuda_internal_scratch_space(
          m_policy.space(), ValueTraits::value_size(m_functor) * grid_x);
      m_scratch_flags =
          cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1);

      dim3 grid(grid_x, 1, 1);
      dim3 block(1, block_size, 1);  // REQUIRED DIMENSIONS ( 1 , N , 1 )
      const int shmem = ValueTraits::value_size(m_functor) * (block_size + 2);
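
      // The '+ 2' provides one value slot per thread plus one slot for the
      // block total and one for the accumulation carried in from preceding
      // blocks, matching the shared_prefix / shared_accum layout in final().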

#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
      if (m_run_serial) {
        block = dim3(1, 1, 1);
        grid  = dim3(1, 1, 1);
      } else {
#endif

        m_final = false;
        CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
            *this, grid, block, shmem,
            m_policy.space().impl_internal_space_instance(),
            false);  // copy to device and execute
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
      }
#endif
      m_final = true;
      CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
          *this, grid, block, shmem,
          m_policy.space().impl_internal_space_instance(),
          false);  // copy to device and execute
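
      // After the final pass the inclusive total over the whole range sits
      // in the last block's slot of m_scratch_space; copy it back into
      // m_returnvalue below.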

      const int size = ValueTraits::value_size(m_functor);
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
      if (m_run_serial)
        DeepCopy<HostSpace, CudaSpace>(&m_returnvalue, m_scratch_space, size);
      else
#endif
        DeepCopy<HostSpace, CudaSpace>(
            &m_returnvalue, m_scratch_space + (grid_x - 1) * size / sizeof(int),
            size);
    }
  }

  ParallelScanWithTotal(const FunctorType& arg_functor,
                        const Policy& arg_policy, ReturnType& arg_returnvalue)
      : m_functor(arg_functor),
        m_policy(arg_policy),
        m_scratch_space(nullptr),
        m_scratch_flags(nullptr),
        m_final(false),
        m_returnvalue(arg_returnvalue)
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
        ,
        m_run_serial(Kokkos::Impl::CudaInternal::cuda_use_serial_execution())
#endif
  {
  }
};

}  // namespace Impl
}  // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {

namespace Impl {
template <class FunctorType, class ExecPolicy, class ValueType,
          class Tag = typename ExecPolicy::work_tag>
struct CudaFunctorAdapter {
  const FunctorType f;
  using value_type = ValueType;
  CudaFunctorAdapter(const FunctorType& f_) : f(f_) {}

  __device__ inline void operator()(typename ExecPolicy::work_tag,
                                    const typename ExecPolicy::member_type& i,
                                    ValueType& val) const {
    // Insert Static Assert with decltype on ValueType equals third argument
    // type of FunctorType::operator()
    f(typename ExecPolicy::work_tag(), i, val);
  }
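
  // One possible form of the assertion the comment above asks for (a sketch
  // only, not enabled here; 'nth_argument_t' is a hypothetical trait that
  // would extract the N-th parameter type of FunctorType::operator()):
  //
  //   static_assert(
  //       std::is_same<ValueType,
  //                    std::remove_cv_t<std::remove_reference_t<
  //                        nth_argument_t<FunctorType, 2>>>>::value,
  //       "ValueType must match the functor's reduction argument type");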

  __device__ inline void operator()(typename ExecPolicy::work_tag,
                                    const typename ExecPolicy::member_type& i,
                                    const typename ExecPolicy::member_type& j,
                                    ValueType& val) const {
    // Insert Static Assert with decltype on ValueType equals third argument
    // type of FunctorType::operator()
    f(typename ExecPolicy::work_tag(), i, j, val);
  }

  __device__ inline void operator()(typename ExecPolicy::work_tag,
                                    const typename ExecPolicy::member_type& i,
                                    const typename ExecPolicy::member_type& j,
                                    const typename ExecPolicy::member_type& k,
                                    ValueType& val) const {
    // Insert Static Assert with decltype on ValueType equals third argument
    // type of FunctorType::operator()
    f(typename ExecPolicy::work_tag(), i, j, k, val);
  }

  __device__ inline void operator()(typename ExecPolicy::work_tag,
                                    const typename ExecPolicy::member_type& i,
                                    const typename ExecPolicy::member_type& j,
                                    const typename ExecPolicy::member_type& k,
                                    const typename ExecPolicy::member_type& l,
                                    ValueType& val) const {
    // Insert Static Assert with decltype on ValueType equals third argument
    // type of FunctorType::operator()
    f(typename ExecPolicy::work_tag(), i, j, k, l, val);
  }

  __device__ inline void operator()(typename ExecPolicy::work_tag,
                                    const typename ExecPolicy::member_type& i,
                                    const typename ExecPolicy::member_type& j,
                                    const typename ExecPolicy::member_type& k,
                                    const typename ExecPolicy::member_type& l,
                                    const typename ExecPolicy::member_type& m,
                                    ValueType& val) const {
    // Insert Static Assert with decltype on ValueType equals third argument
    // type of FunctorType::operator()
    f(typename ExecPolicy::work_tag(), i, j, k, l, m, val);
  }

  __device__ inline void operator()(typename ExecPolicy::work_tag,
                                    const typename ExecPolicy::member_type& i,
                                    const typename ExecPolicy::member_type& j,
                                    const typename ExecPolicy::member_type& k,
                                    const typename ExecPolicy::member_type& l,
                                    const typename ExecPolicy::member_type& m,
                                    const typename ExecPolicy::member_type& n,
                                    ValueType& val) const {
    // Insert Static Assert with decltype on ValueType equals third argument
    // type of FunctorType::operator()
    f(typename ExecPolicy::work_tag(), i, j, k, l, m, n, val);
  }
};
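
// CudaFunctorAdapter wraps a user functor that does not declare a nested
// value_type: it publishes ValueType as value_type and forwards the work tag
// plus up to six indices to the wrapped call operator. A usage sketch
// (hypothetical functor, not part of this header):
//
//   struct MyTag {};
//   struct TaggedSum {  // no value_type member of its own
//     KOKKOS_INLINE_FUNCTION
//     void operator()(MyTag, const int i, double& update) const {
//       update += i;
//     }
//   };
//   // A parallel_reduce over Policy = RangePolicy<Cuda, MyTag> would then
//   // see this functor through CudaFunctorAdapter<TaggedSum, Policy, double>.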

template <class FunctorType, class ExecPolicy, class ValueType>
struct CudaFunctorAdapter<FunctorType, ExecPolicy, ValueType, void> {
  const FunctorType f;
  using value_type = ValueType;
  CudaFunctorAdapter(const FunctorType& f_) : f(f_) {}

  __device__ inline void operator()(const typename ExecPolicy::member_type& i,
                                    ValueType& val) const {
    // Insert Static Assert with decltype on ValueType equals second argument
    // type of FunctorType::operator()
    f(i, val);
  }

  __device__ inline void operator()(const typename ExecPolicy::member_type& i,
                                    const typename ExecPolicy::member_type& j,
                                    ValueType& val) const {
    // Insert Static Assert with decltype on ValueType equals second argument
    // type of FunctorType::operator()
    f(i, j, val);
  }

  __device__ inline void operator()(const typename ExecPolicy::member_type& i,
                                    const typename ExecPolicy::member_type& j,
                                    const typename ExecPolicy::member_type& k,
                                    ValueType& val) const {
    // Insert Static Assert with decltype on ValueType equals second argument
    // type of FunctorType::operator()
    f(i, j, k, val);
  }

  __device__ inline void operator()(const typename ExecPolicy::member_type& i,
                                    const typename ExecPolicy::member_type& j,
                                    const typename ExecPolicy::member_type& k,
                                    const typename ExecPolicy::member_type& l,
                                    ValueType& val) const {
    // Insert Static Assert with decltype on ValueType equals second argument
    // type of FunctorType::operator()
    f(i, j, k, l, val);
  }

  __device__ inline void operator()(const typename ExecPolicy::member_type& i,
                                    const typename ExecPolicy::member_type& j,
                                    const typename ExecPolicy::member_type& k,
                                    const typename ExecPolicy::member_type& l,
                                    const typename ExecPolicy::member_type& m,
                                    ValueType& val) const {
    // Insert Static Assert with decltype on ValueType equals second argument
    // type of FunctorType::operator()
    f(i, j, k, l, m, val);
  }

  __device__ inline void operator()(const typename ExecPolicy::member_type& i,
                                    const typename ExecPolicy::member_type& j,
                                    const typename ExecPolicy::member_type& k,
                                    const typename ExecPolicy::member_type& l,
                                    const typename ExecPolicy::member_type& m,
                                    const typename ExecPolicy::member_type& n,
                                    ValueType& val) const {
    // Insert Static Assert with decltype on ValueType equals second argument
    // type of FunctorType::operator()
    f(i, j, k, l, m, n, val);
  }

  __device__ inline void operator()(typename ExecPolicy::member_type& i,
                                    ValueType& val) const {
    // Insert Static Assert with decltype on ValueType equals second argument
    // type of FunctorType::operator()
    f(i, val);
  }

  __device__ inline void operator()(typename ExecPolicy::member_type& i,
                                    typename ExecPolicy::member_type& j,
                                    ValueType& val) const {
    // Insert Static Assert with decltype on ValueType equals second argument
    // type of FunctorType::operator()
    f(i, j, val);
  }

  __device__ inline void operator()(typename ExecPolicy::member_type& i,
                                    typename ExecPolicy::member_type& j,
                                    typename ExecPolicy::member_type& k,
                                    ValueType& val) const {
    // Insert Static Assert with decltype on ValueType equals second argument
    // type of FunctorType::operator()
    f(i, j, k, val);
  }

  __device__ inline void operator()(typename ExecPolicy::member_type& i,
                                    typename ExecPolicy::member_type& j,
                                    typename ExecPolicy::member_type& k,
                                    typename ExecPolicy::member_type& l,
                                    ValueType& val) const {
    // Insert Static Assert with decltype on ValueType equals second argument
    // type of FunctorType::operator()
    f(i, j, k, l, val);
  }

  __device__ inline void operator()(typename ExecPolicy::member_type& i,
                                    typename ExecPolicy::member_type& j,
                                    typename ExecPolicy::member_type& k,
                                    typename ExecPolicy::member_type& l,
                                    typename ExecPolicy::member_type& m,
                                    ValueType& val) const {
    // Insert Static Assert with decltype on ValueType equals second argument
    // type of FunctorType::operator()
    f(i, j, k, l, m, val);
  }

  __device__ inline void operator()(typename ExecPolicy::member_type& i,
                                    typename ExecPolicy::member_type& j,
                                    typename ExecPolicy::member_type& k,
                                    typename ExecPolicy::member_type& l,
                                    typename ExecPolicy::member_type& m,
                                    typename ExecPolicy::member_type& n,
                                    ValueType& val) const {
    // Insert Static Assert with decltype on ValueType equals second argument
    // type of FunctorType::operator()
    f(i, j, k, l, m, n, val);
  }
};

template <class FunctorType, class ResultType, class Tag,
          bool Enable = IsNonTrivialReduceFunctor<FunctorType>::value>
struct FunctorReferenceType {
  using reference_type = ResultType&;
};

template <class FunctorType, class ResultType, class Tag>
struct FunctorReferenceType<FunctorType, ResultType, Tag, true> {
  using reference_type =
      typename Kokkos::Impl::FunctorValueTraits<FunctorType,
                                                Tag>::reference_type;
};

template <class FunctorTypeIn, class ExecPolicy, class ValueType>
struct ParallelReduceFunctorType<FunctorTypeIn, ExecPolicy, ValueType, Cuda> {
  enum {
    FunctorHasValueType = IsNonTrivialReduceFunctor<FunctorTypeIn>::value
  };
  using functor_type = typename Kokkos::Impl::if_c<
      FunctorHasValueType, FunctorTypeIn,
      Impl::CudaFunctorAdapter<FunctorTypeIn, ExecPolicy, ValueType>>::type;
  static functor_type functor(const FunctorTypeIn& functor_in) {
    return Impl::if_c<FunctorHasValueType, FunctorTypeIn, functor_type>::select(
        functor_in, functor_type(functor_in));
  }
};
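
// Selection note: a functor that already qualifies as a non-trivial reduce
// functor (i.e. it supplies its own reduction value type) is passed through
// unchanged, while a plain functor is wrapped in CudaFunctorAdapter so that
// ValueType is exposed as value_type. For illustration (using the
// hypothetical 'TaggedSum' from the sketch above):
//
//   using adapted_t = ParallelReduceFunctorType<
//       TaggedSum, Kokkos::RangePolicy<Kokkos::Cuda, MyTag>, double,
//       Kokkos::Cuda>::functor_type;
//   // adapted_t is a CudaFunctorAdapter specialization when TaggedSum does
//   // not qualify as a non-trivial reduce functor.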

}  // namespace Impl

}  // namespace Kokkos

#endif /* defined(KOKKOS_ENABLE_CUDA) */
#endif /* #ifndef KOKKOS_CUDA_PARALLEL_HPP */