/*
//@HEADER
// ************************************************************************
//
//                        Kokkos v. 3.0
//       Copyright (2020) National Technology & Engineering
//               Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_SYCL_PARALLEL_TEAM_HPP
#define KOKKOS_SYCL_PARALLEL_TEAM_HPP

#include <Kokkos_Parallel.hpp>

#include <SYCL/Kokkos_SYCL_Team.hpp>

namespace Kokkos {
namespace Impl {
54 template <typename... Properties>
55 class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
56     : public PolicyTraits<Properties...> {
57  public:
58   using execution_policy = TeamPolicyInternal;
59 
60   using traits = PolicyTraits<Properties...>;
61 
62   template <typename ExecSpace, typename... OtherProperties>
63   friend class TeamPolicyInternal;
64 
65  private:
66   static int constexpr MAX_WARP = 8;
67 
68   typename traits::execution_space m_space;
69   int m_league_size;
70   int m_team_size;
71   int m_vector_length;
72   int m_team_scratch_size[2];
73   int m_thread_scratch_size[2];
74   int m_chunk_size;
75   bool m_tune_team_size;
76   bool m_tune_vector_length;
77 
78  public:
79   using execution_space = Kokkos::Experimental::SYCL;
80 
81   template <class... OtherProperties>
TeamPolicyInternal(TeamPolicyInternal<OtherProperties...> const & p)82   TeamPolicyInternal(TeamPolicyInternal<OtherProperties...> const& p) {
83     m_league_size            = p.m_league_size;
84     m_team_size              = p.m_team_size;
85     m_vector_length          = p.m_vector_length;
86     m_team_scratch_size[0]   = p.m_team_scratch_size[0];
87     m_team_scratch_size[1]   = p.m_team_scratch_size[1];
88     m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
89     m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
90     m_chunk_size             = p.m_chunk_size;
91     m_space                  = p.m_space;
92     m_tune_team_size         = p.m_tune_team_size;
93     m_tune_vector_length     = p.m_tune_vector_length;
94   }
95 
96   template <typename FunctorType>
team_size_max(FunctorType const & f,ParallelForTag const &) const97   int team_size_max(FunctorType const& f, ParallelForTag const&) const {
98     return internal_team_size_max_for(f);
99   }
100 
101   template <class FunctorType>
team_size_max(const FunctorType & f,const ParallelReduceTag &) const102   inline int team_size_max(const FunctorType& f,
103                            const ParallelReduceTag&) const {
104     return internal_team_size_max_reduce(f);
105   }
106 
107   template <class FunctorType, class ReducerType>
team_size_max(const FunctorType & f,const ReducerType &,const ParallelReduceTag &) const108   inline int team_size_max(const FunctorType& f, const ReducerType& /*r*/,
109                            const ParallelReduceTag&) const {
110     return internal_team_size_max_reduce(f);
111   }
112 
113   template <typename FunctorType>
team_size_recommended(FunctorType const & f,ParallelForTag const &) const114   int team_size_recommended(FunctorType const& f, ParallelForTag const&) const {
115     return internal_team_size_max_for(f);
116   }
117 
118   template <typename FunctorType>
team_size_recommended(FunctorType const & f,ParallelReduceTag const &) const119   inline int team_size_recommended(FunctorType const& f,
120                                    ParallelReduceTag const&) const {
121     return internal_team_size_recommended_reduce(f);
122   }
123 
124   template <class FunctorType, class ReducerType>
team_size_recommended(FunctorType const & f,ReducerType const &,ParallelReduceTag const &) const125   int team_size_recommended(FunctorType const& f, ReducerType const&,
126                             ParallelReduceTag const&) const {
127     return internal_team_size_recommended_reduce(f);
128   }
impl_auto_vector_length() const129   inline bool impl_auto_vector_length() const { return m_tune_vector_length; }
impl_auto_team_size() const130   inline bool impl_auto_team_size() const { return m_tune_team_size; }
vector_length_max()131   static int vector_length_max() {
132     // FIXME_SYCL provide a reasonable value
133     return 1;
134   }
135 
verify_requested_vector_length(int requested_vector_length)136   static int verify_requested_vector_length(int requested_vector_length) {
137     int test_vector_length =
138         std::min(requested_vector_length, vector_length_max());
139 
140     // Allow only power-of-two vector_length
141     if (!(is_integral_power_of_two(test_vector_length))) {
142       int test_pow2 = 1;
143       for (int i = 0; i < 5; i++) {
144         test_pow2 = test_pow2 << 1;
145         if (test_pow2 > test_vector_length) {
146           break;
147         }
148       }
149       test_vector_length = test_pow2 >> 1;
150     }
151 
152     return test_vector_length;
153   }
154 
scratch_size_max(int level)155   static int scratch_size_max(int level) {
156     return level == 0 ? 1024 * 32
157                       :           // FIXME_SYCL arbitrarily setting this to 32kB
158                20 * 1024 * 1024;  // FIXME_SYCL arbitrarily setting this to 20MB
159   }
impl_set_vector_length(size_t size)160   inline void impl_set_vector_length(size_t size) { m_vector_length = size; }
impl_set_team_size(size_t size)161   inline void impl_set_team_size(size_t size) { m_team_size = size; }
impl_vector_length() const162   int impl_vector_length() const { return m_vector_length; }
vector_length() const163   KOKKOS_DEPRECATED int vector_length() const { return impl_vector_length(); }
164 
team_size() const165   int team_size() const { return m_team_size; }
166 
league_size() const167   int league_size() const { return m_league_size; }
168 
scratch_size(int level,int team_size_=-1) const169   int scratch_size(int level, int team_size_ = -1) const {
170     if (team_size_ < 0) team_size_ = m_team_size;
171     return m_team_scratch_size[level] +
172            team_size_ * m_thread_scratch_size[level];
173   }
174 
team_scratch_size(int level) const175   int team_scratch_size(int level) const { return m_team_scratch_size[level]; }
176 
thread_scratch_size(int level) const177   int thread_scratch_size(int level) const {
178     return m_thread_scratch_size[level];
179   }
180 
space() const181   typename traits::execution_space space() const { return m_space; }
182 
TeamPolicyInternal()183   TeamPolicyInternal()
184       : m_space(typename traits::execution_space()),
185         m_league_size(0),
186         m_team_size(-1),
187         m_vector_length(0),
188         m_team_scratch_size{0, 0},
189         m_thread_scratch_size{0, 0},
190         m_chunk_size(0),
191         m_tune_team_size(false),
192         m_tune_vector_length(false) {}
193 
194   /** \brief  Specify league size, request team size */
TeamPolicyInternal(const execution_space space_,int league_size_,int team_size_request,int vector_length_request=1)195   TeamPolicyInternal(const execution_space space_, int league_size_,
196                      int team_size_request, int vector_length_request = 1)
197       : m_space(space_),
198         m_league_size(league_size_),
199         m_team_size(team_size_request),
200         m_vector_length(
201             (vector_length_request > 0)
202                 ? verify_requested_vector_length(vector_length_request)
203                 : (verify_requested_vector_length(1))),
204         m_team_scratch_size{0, 0},
205         m_thread_scratch_size{0, 0},
206         m_chunk_size(0),
207         m_tune_team_size(bool(team_size_request <= 0)),
208         m_tune_vector_length(bool(vector_length_request <= 0)) {
209     // FIXME_SYCL check paramters
210   }
211 
212   /** \brief  Specify league size, request team size */
TeamPolicyInternal(const execution_space space_,int league_size_,const Kokkos::AUTO_t &,int vector_length_request=1)213   TeamPolicyInternal(const execution_space space_, int league_size_,
214                      const Kokkos::AUTO_t& /* team_size_request */,
215                      int vector_length_request = 1)
216       : TeamPolicyInternal(space_, league_size_, -1, vector_length_request) {}
217   // FLAG
218   /** \brief  Specify league size and team size, request vector length*/
TeamPolicyInternal(const execution_space space_,int league_size_,int team_size_request,const Kokkos::AUTO_t &)219   TeamPolicyInternal(const execution_space space_, int league_size_,
220                      int team_size_request,
221                      const Kokkos::AUTO_t& /* vector_length_request */
222                      )
223       : TeamPolicyInternal(space_, league_size_, team_size_request, -1)
224 
225   {}
226 
227   /** \brief  Specify league size, request team size and vector length*/
TeamPolicyInternal(const execution_space space_,int league_size_,const Kokkos::AUTO_t &,const Kokkos::AUTO_t &)228   TeamPolicyInternal(const execution_space space_, int league_size_,
229                      const Kokkos::AUTO_t& /* team_size_request */,
230                      const Kokkos::AUTO_t& /* vector_length_request */
231 
232                      )
233       : TeamPolicyInternal(space_, league_size_, -1, -1)
234 
235   {}
236 
TeamPolicyInternal(int league_size_,int team_size_request,int vector_length_request=1)237   TeamPolicyInternal(int league_size_, int team_size_request,
238                      int vector_length_request = 1)
239       : TeamPolicyInternal(typename traits::execution_space(), league_size_,
240                            team_size_request, vector_length_request) {}
241 
TeamPolicyInternal(int league_size_,const Kokkos::AUTO_t &,int vector_length_request=1)242   TeamPolicyInternal(int league_size_,
243                      const Kokkos::AUTO_t& /* team_size_request */,
244                      int vector_length_request = 1)
245       : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1,
246                            vector_length_request) {}
247 
248   /** \brief  Specify league size and team size, request vector length*/
TeamPolicyInternal(int league_size_,int team_size_request,const Kokkos::AUTO_t &)249   TeamPolicyInternal(int league_size_, int team_size_request,
250                      const Kokkos::AUTO_t& /* vector_length_request */
251 
252                      )
253       : TeamPolicyInternal(typename traits::execution_space(), league_size_,
254                            team_size_request, -1)
255 
256   {}
257 
258   /** \brief  Specify league size, request team size and vector length*/
TeamPolicyInternal(int league_size_,const Kokkos::AUTO_t &,const Kokkos::AUTO_t &)259   TeamPolicyInternal(int league_size_,
260                      const Kokkos::AUTO_t& /* team_size_request */,
261                      const Kokkos::AUTO_t& /* vector_length_request */
262 
263                      )
264       : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1,
265                            -1) {}
266 
chunk_size() const267   int chunk_size() const { return m_chunk_size; }
268 
set_chunk_size(typename traits::index_type chunk_size_)269   TeamPolicyInternal& set_chunk_size(typename traits::index_type chunk_size_) {
270     m_chunk_size = chunk_size_;
271     return *this;
272   }
273 
274   /** \brief set per team scratch size for a specific level of the scratch
275    * hierarchy */
set_scratch_size(int level,PerTeamValue const & per_team)276   TeamPolicyInternal& set_scratch_size(int level,
277                                        PerTeamValue const& per_team) {
278     m_team_scratch_size[level] = per_team.value;
279     return *this;
280   }
281 
282   /** \brief set per thread scratch size for a specific level of the scratch
283    * hierarchy */
set_scratch_size(int level,PerThreadValue const & per_thread)284   TeamPolicyInternal& set_scratch_size(int level,
285                                        PerThreadValue const& per_thread) {
286     m_thread_scratch_size[level] = per_thread.value;
287     return *this;
288   }
289 
290   /** \brief set per thread and per team scratch size for a specific level of
291    * the scratch hierarchy */
set_scratch_size(int level,PerTeamValue const & per_team,PerThreadValue const & per_thread)292   TeamPolicyInternal& set_scratch_size(int level, PerTeamValue const& per_team,
293                                        PerThreadValue const& per_thread) {
294     m_team_scratch_size[level]   = per_team.value;
295     m_thread_scratch_size[level] = per_thread.value;
296     return *this;
297   }
298 
299   using member_type = Kokkos::Impl::SYCLTeamMember;
300 
301  protected:
302   template <class FunctorType>
internal_team_size_max_for(const FunctorType &) const303   int internal_team_size_max_for(const FunctorType& /*f*/) const {
304     // nested_reducer_memsize = (sizeof(double) * (m_team_size + 2)
305     // custom: m_team_scratch_size[0] + m_thread_scratch_size[0] * m_team_size
306     // total:
307     // 2*sizeof(double)+m_team_scratch_size[0]
308     // + m_team_size(sizeof(double)+m_thread_scratch_size[0])
309     const int max_threads_for_memory =
310         (space().impl_internal_space_instance()->m_maxShmemPerBlock -
311          2 * sizeof(double) - m_team_scratch_size[0]) /
312         (sizeof(double) + m_thread_scratch_size[0]);
313     return std::min<int>(
314         m_space.impl_internal_space_instance()->m_maxWorkgroupSize,
315         max_threads_for_memory);
316   }
317 
318   template <class FunctorType>
internal_team_size_max_reduce(const FunctorType & f) const319   int internal_team_size_max_reduce(const FunctorType& f) const {
320     using Analysis        = FunctorAnalysis<FunctorPatternInterface::REDUCE,
321                                      TeamPolicyInternal, FunctorType>;
322     using value_type      = typename Analysis::value_type;
323     const int value_count = Analysis::value_count(f);
324 
325     // nested_reducer_memsize = (sizeof(double) * (m_team_size + 2)
326     // reducer_memsize = sizeof(value_type) * m_team_size * value_count
327     // custom: m_team_scratch_size[0] + m_thread_scratch_size[0] * m_team_size
328     // total:
329     // 2*sizeof(double)+m_team_scratch_size[0]
330     // + m_team_size(sizeof(double)+sizeof(value_type)*value_count
331     //               +m_thread_scratch_size[0])
332     const int max_threads_for_memory =
333         (space().impl_internal_space_instance()->m_maxShmemPerBlock -
334          2 * sizeof(double) - m_team_scratch_size[0]) /
335         (sizeof(double) + sizeof(value_type) * value_count +
336          m_thread_scratch_size[0]);
337     return std::min<int>(
338         m_space.impl_internal_space_instance()->m_maxWorkgroupSize,
339         max_threads_for_memory);
340   }
341 
342   template <class FunctorType>
internal_team_size_recommended_for(const FunctorType & f) const343   int internal_team_size_recommended_for(const FunctorType& f) const {
344     // FIXME_SYCL improve
345     return internal_team_size_max_for(f);
346   }
347 
348   template <class FunctorType>
internal_team_size_recommended_reduce(const FunctorType & f) const349   int internal_team_size_recommended_reduce(const FunctorType& f) const {
350     // FIXME_SYCL improve
351     return internal_team_size_max_reduce(f);
352   }
353 };
355 template <typename FunctorType, typename... Properties>
356 class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
357                   Kokkos::Experimental::SYCL> {
358  public:
359   using Policy = TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>;
360   using functor_type = FunctorType;
361   using size_type    = ::Kokkos::Experimental::SYCL::size_type;
362 
363  private:
364   using member_type   = typename Policy::member_type;
365   using work_tag      = typename Policy::work_tag;
366   using launch_bounds = typename Policy::launch_bounds;
367 
368   FunctorType const m_functor;
369   Policy const m_policy;
370   size_type const m_league_size;
371   int m_team_size;
372   size_type const m_vector_size;
373   int m_shmem_begin;
374   int m_shmem_size;
375   void* m_scratch_ptr[2];
376   int m_scratch_size[2];
377 
378   template <typename Functor>
sycl_direct_launch(const Policy & policy,const Functor & functor) const379   void sycl_direct_launch(const Policy& policy, const Functor& functor) const {
380     // Convenience references
381     const Kokkos::Experimental::SYCL& space = policy.space();
382     Kokkos::Experimental::Impl::SYCLInternal& instance =
383         *space.impl_internal_space_instance();
384     sycl::queue& q = *instance.m_queue;
385 
386     q.submit([&](sycl::handler& cgh) {
387       // FIXME_SYCL accessors seem to need a size greater than zero at least for
388       // host queues
389       sycl::accessor<char, 1, sycl::access::mode::read_write,
390                      sycl::access::target::local>
391           team_scratch_memory_L0(
392               sycl::range<1>(std::max(m_scratch_size[0] + m_shmem_begin, 1)),
393               cgh);
394 
395       // Avoid capturing *this since it might not be trivially copyable
396       const auto shmem_begin     = m_shmem_begin;
397       const int scratch_size[2]  = {m_scratch_size[0], m_scratch_size[1]};
398       void* const scratch_ptr[2] = {m_scratch_ptr[0], m_scratch_ptr[1]};
399 
400       cgh.parallel_for(
401           sycl::nd_range<2>(
402               sycl::range<2>(m_league_size * m_team_size, m_vector_size),
403               sycl::range<2>(m_team_size, m_vector_size)),
404           [=](sycl::nd_item<2> item) {
405             const member_type team_member(
406                 team_scratch_memory_L0.get_pointer(), shmem_begin,
407                 scratch_size[0],
408                 static_cast<char*>(scratch_ptr[1]) +
409                     item.get_group(0) * scratch_size[1],
410                 scratch_size[1], item);
411             if constexpr (std::is_same<work_tag, void>::value)
412               functor(team_member);
413             else
414               functor(work_tag(), team_member);
415           });
416     });
417     space.fence();
418   }
419 
420  public:
execute() const421   inline void execute() const {
422     if (m_league_size == 0) return;
423 
424     Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem&
425         indirectKernelMem = m_policy.space()
426                                 .impl_internal_space_instance()
427                                 ->m_indirectKernelMem;
428 
429     const auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper(
430         m_functor, indirectKernelMem);
431 
432     sycl_direct_launch(m_policy, functor_wrapper.get_functor());
433   }
434 
ParallelFor(FunctorType const & arg_functor,Policy const & arg_policy)435   ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy)
436       : m_functor(arg_functor),
437         m_policy(arg_policy),
438         m_league_size(arg_policy.league_size()),
439         m_team_size(arg_policy.team_size()),
440         m_vector_size(arg_policy.impl_vector_length()) {
441     // FIXME_SYCL optimize
442     if (m_team_size < 0) m_team_size = 32;
443 
444     m_shmem_begin = (sizeof(double) * (m_team_size + 2));
445     m_shmem_size =
446         (m_policy.scratch_size(0, m_team_size) +
447          FunctorTeamShmemSize<FunctorType>::value(m_functor, m_team_size));
448     m_scratch_size[0] = m_shmem_size;
449     m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
450 
451     // FIXME_SYCL so far accessors used instead of these pointers
452     // Functor's reduce memory, team scan memory, and team shared memory depend
453     // upon team size.
454     const auto& space    = *m_policy.space().impl_internal_space_instance();
455     const sycl::queue& q = *space.m_queue;
456     m_scratch_ptr[0]     = nullptr;
457     m_scratch_ptr[1]     = sycl::malloc_device(
458         sizeof(char) * m_scratch_size[1] * m_league_size, q);
459 
460     if (static_cast<int>(space.m_maxShmemPerBlock) <
461         m_shmem_size - m_shmem_begin) {
462       std::stringstream out;
463       out << "Kokkos::Impl::ParallelFor<SYCL> insufficient shared memory! "
464              "Requested "
465           << m_shmem_size - m_shmem_begin << " bytes but maximum is "
466           << m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock
467           << '\n';
468       Kokkos::Impl::throw_runtime_exception(out.str());
469     }
470 
471     if (m_team_size > m_policy.team_size_max(arg_functor, ParallelForTag{}))
472       Kokkos::Impl::throw_runtime_exception(
473           "Kokkos::Impl::ParallelFor<SYCL> requested too large team size.");
474   }
475 
476   // FIXME_SYCL remove when managing m_scratch_ptr[1] in the execution space
477   // instance
478   ParallelFor(const ParallelFor&) = delete;
479   ParallelFor& operator=(const ParallelFor&) = delete;
480 
~ParallelFor()481   ~ParallelFor() {
482     const Kokkos::Experimental::SYCL& space = m_policy.space();
483     Kokkos::Experimental::Impl::SYCLInternal& instance =
484         *space.impl_internal_space_instance();
485     sycl::queue& q = *instance.m_queue;
486     sycl::free(m_scratch_ptr[1], q);
487   }
488 };

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

493 template <class FunctorType, class ReducerType, class... Properties>
494 class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
495                      ReducerType, Kokkos::Experimental::SYCL> {
496  public:
497   using Policy = TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>;
498 
499  private:
500   using Analysis =
501       FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, FunctorType>;
502   using member_type   = typename Policy::member_type;
503   using WorkTag       = typename Policy::work_tag;
504   using launch_bounds = typename Policy::launch_bounds;
505 
506   using pointer_type   = typename Analysis::pointer_type;
507   using reference_type = typename Analysis::reference_type;
508   using value_type     = typename Analysis::value_type;
509 
510  public:
511   using functor_type = FunctorType;
512   using size_type    = Kokkos::Experimental::SYCL::size_type;
513 
514  private:
515   const FunctorType m_functor;
516   const Policy m_policy;
517   const ReducerType m_reducer;
518   const pointer_type m_result_ptr;
519   // FIXME_SYCL avoid reallocating memory for reductions
520   /*  size_type* m_scratch_space;
521     size_type* m_scratch_flags;
522     size_type m_team_begin;*/
523   size_type m_shmem_begin;
524   size_type m_shmem_size;
525   void* m_scratch_ptr[2];
526   int m_scratch_size[2];
527   const size_type m_league_size;
528   int m_team_size;
529   const size_type m_vector_size;
530 
531   template <typename PolicyType, typename Functor, typename Reducer>
sycl_direct_launch(const PolicyType & policy,const Functor & functor,const Reducer & reducer) const532   void sycl_direct_launch(const PolicyType& policy, const Functor& functor,
533                           const Reducer& reducer) const {
534     using ReducerConditional =
535         Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
536                            FunctorType, ReducerType>;
537     using ReducerTypeFwd = typename ReducerConditional::type;
538     using WorkTagFwd =
539         std::conditional_t<std::is_same<InvalidType, ReducerType>::value,
540                            WorkTag, void>;
541     using ValueInit =
542         Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
543     using ValueJoin =
544         Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
545     using ValueOps = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>;
546 
547     auto selected_reducer = ReducerConditional::select(functor, reducer);
548 
549     // Convenience references
550     const Kokkos::Experimental::SYCL& space = policy.space();
551     Kokkos::Experimental::Impl::SYCLInternal& instance =
552         *space.impl_internal_space_instance();
553     sycl::queue& q = *instance.m_queue;
554 
555     // FIXME_SYCL optimize
556     const size_t wgroup_size = m_team_size;
557     std::size_t size         = m_league_size * m_team_size;
558     const auto init_size =
559         std::max<std::size_t>((size + wgroup_size - 1) / wgroup_size, 1);
560     const unsigned int value_count =
561         FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>::value_count(
562             selected_reducer);
563     // FIXME_SYCL only use the first half
564     const auto results_ptr = static_cast<pointer_type>(instance.scratch_space(
565         sizeof(value_type) * std::max(value_count, 1u) * init_size * 2));
566     // FIXME_SYCL without this we are running into a race condition
567     const auto results_ptr2 =
568         results_ptr + std::max(value_count, 1u) * init_size;
569 
570     // If size<=1 we only call init(), the functor and possibly final once
571     // working with the global scratch memory but don't copy back to
572     // m_result_ptr yet.
573     if (size <= 1) {
574       q.submit([&](sycl::handler& cgh) {
575         // FIXME_SYCL accessors seem to need a size greater than zero at least
576         // for host queues
577         sycl::accessor<char, 1, sycl::access::mode::read_write,
578                        sycl::access::target::local>
579             team_scratch_memory_L0(
580                 sycl::range<1>(std::max(m_scratch_size[0] + m_shmem_begin, 1)),
581                 cgh);
582 
583         // Avoid capturing *this since it might not be trivially copyable
584         const auto shmem_begin     = m_shmem_begin;
585         const int scratch_size[2]  = {m_scratch_size[0], m_scratch_size[1]};
586         void* const scratch_ptr[2] = {m_scratch_ptr[0], m_scratch_ptr[1]};
587 
588         cgh.parallel_for(
589             sycl::nd_range<2>(sycl::range<2>(1, 1), sycl::range<2>(1, 1)),
590             [=](sycl::nd_item<2> item) {
591               const auto& selected_reducer = ReducerConditional::select(
592                   static_cast<const FunctorType&>(functor),
593                   static_cast<const ReducerType&>(reducer));
594               reference_type update =
595                   ValueInit::init(selected_reducer, results_ptr);
596               if (size == 1) {
597                 const member_type team_member(
598                     team_scratch_memory_L0.get_pointer(), shmem_begin,
599                     scratch_size[0], static_cast<char*>(scratch_ptr[1]),
600                     scratch_size[1], item);
601                 if constexpr (std::is_same<WorkTag, void>::value)
602                   functor(team_member, update);
603                 else
604                   functor(WorkTag(), team_member, update);
605               }
606               if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
607                 FunctorFinal<FunctorType, WorkTag>::final(
608                     static_cast<const FunctorType&>(functor), results_ptr);
609             });
610       });
611       space.fence();
612     }
613 
614     // Otherwise, we perform a reduction on the values in all workgroups
615     // separately, write the workgroup results back to global memory and recurse
616     // until only one workgroup does the reduction and thus gets the final
617     // value.
618     bool first_run = true;
619     while (size > 1) {
620       auto n_wgroups = (size + wgroup_size - 1) / wgroup_size;
621       q.submit([&](sycl::handler& cgh) {
622         sycl::accessor<value_type, 1, sycl::access::mode::read_write,
623                        sycl::access::target::local>
624             local_mem(sycl::range<1>(wgroup_size) * std::max(value_count, 1u),
625                       cgh);
626         // FIXME_SYCL accessors seem to need a size greater than zero at least
627         // for host queues
628         sycl::accessor<char, 1, sycl::access::mode::read_write,
629                        sycl::access::target::local>
630             team_scratch_memory_L0(
631                 sycl::range<1>(std::max(m_scratch_size[0] + m_shmem_begin, 1)),
632                 cgh);
633 
634         // Avoid capturing *this since it might not be trivially copyable
635         const auto shmem_begin     = m_shmem_begin;
636         const int scratch_size[2]  = {m_scratch_size[0], m_scratch_size[1]};
637         void* const scratch_ptr[2] = {m_scratch_ptr[0], m_scratch_ptr[1]};
638 
639         cgh.parallel_for(
640             sycl::nd_range<2>(
641                 sycl::range<2>(m_league_size * m_team_size, m_vector_size),
642                 sycl::range<2>(m_team_size, m_vector_size)),
643             [=](sycl::nd_item<2> item) {
644               const auto local_id = item.get_local_linear_id();
645               const auto global_id =
646                   wgroup_size * item.get_group_linear_id() + local_id;
647               const auto& selected_reducer = ReducerConditional::select(
648                   static_cast<const FunctorType&>(functor),
649                   static_cast<const ReducerType&>(reducer));
650 
651               // In the first iteration, we call functor to initialize the local
652               // memory. Otherwise, the local memory is initialized with the
653               // results from the previous iteration that are stored in global
654               // memory. Note that we load values_per_thread values per thread
655               // and immediately combine them to avoid too many threads being
656               // idle in the actual workgroup reduction.
657               if (first_run) {
658                 reference_type update = ValueInit::init(
659                     selected_reducer, &local_mem[local_id * value_count]);
660                 const member_type team_member(
661                     team_scratch_memory_L0.get_pointer(), shmem_begin,
662                     scratch_size[0],
663                     static_cast<char*>(scratch_ptr[1]) +
664                         item.get_group(0) * scratch_size[1],
665                     scratch_size[1], item);
666                 if constexpr (std::is_same<WorkTag, void>::value)
667                   functor(team_member, update);
668                 else
669                   functor(WorkTag(), team_member, update);
670               } else {
671                 if (global_id >= size)
672                   ValueInit::init(selected_reducer,
673                                   &local_mem[local_id * value_count]);
674                 else {
675                   ValueOps::copy(functor, &local_mem[local_id * value_count],
676                                  &results_ptr[global_id * value_count]);
677                 }
678               }
679               item.barrier(sycl::access::fence_space::local_space);
680 
681               // Perform the actual workgroup reduction. To achieve a better
682               // memory access pattern, we use sequential addressing and a
683               // reversed loop. If the workgroup size is 8, the first element
684               // contains all the values with index%4==0, after the second one
685               // the values with index%2==0 and after the third one index%1==0,
686               // i.e., all values.
687               for (unsigned int stride = wgroup_size / 2; stride > 0;
688                    stride >>= 1) {
689                 const auto idx = local_id;
690                 if (idx < stride) {
691                   ValueJoin::join(selected_reducer,
692                                   &local_mem[idx * value_count],
693                                   &local_mem[(idx + stride) * value_count]);
694                 }
695                 item.barrier(sycl::access::fence_space::local_space);
696               }
697 
698               // Finally, we copy the workgroup results back to global memory to
699               // be used in the next iteration. If this is the last iteration,
700               // i.e., there is only one workgroup also call final() if
701               // necessary.
702               if (local_id == 0) {
703                 ValueOps::copy(
704                     functor,
705                     &results_ptr2[(item.get_group_linear_id()) * value_count],
706                     &local_mem[0]);
707                 if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
708                   if (n_wgroups <= 1 && item.get_group_linear_id() == 0) {
709                     FunctorFinal<FunctorType, WorkTag>::final(
710                         static_cast<const FunctorType&>(functor),
711                         &results_ptr2[(item.get_group_linear_id()) *
712                                       value_count]);
713                   }
714               }
715             });
716       });
717       space.fence();
718 
719       // FIXME_SYCL this is likely not necessary, see above
720       Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
721                              Kokkos::Experimental::SYCLDeviceUSMSpace>(
722           space, results_ptr, results_ptr2,
723           sizeof(*m_result_ptr) * value_count * n_wgroups);
724       space.fence();
725 
726       first_run = false;
727       size      = n_wgroups;
728     }
729 
730     // At this point, the reduced value is written to the entry in results_ptr
731     // and all that is left is to copy it back to the given result pointer if
732     // necessary.
733     if (m_result_ptr) {
734       Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
735                              Kokkos::Experimental::SYCLDeviceUSMSpace>(
736           space, m_result_ptr, results_ptr,
737           sizeof(*m_result_ptr) * value_count);
738       space.fence();
739     }
740   }
741 
742  public:
execute()743   inline void execute() {
744     Kokkos::Experimental::Impl::SYCLInternal& instance =
745         *m_policy.space().impl_internal_space_instance();
746     using IndirectKernelMem =
747         Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem;
748     IndirectKernelMem& indirectKernelMem  = instance.m_indirectKernelMem;
749     IndirectKernelMem& indirectReducerMem = instance.m_indirectReducerMem;
750 
751     const auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper(
752         m_functor, indirectKernelMem);
753     const auto reducer_wrapper = Experimental::Impl::make_sycl_function_wrapper(
754         m_reducer, indirectReducerMem);
755 
756     sycl_direct_launch(m_policy, functor_wrapper.get_functor(),
757                        reducer_wrapper.get_functor());
758   }
759 
760  private:
initialize()761   void initialize() {
762     // FIXME_SYCL optimize
763     if (m_team_size < 0) m_team_size = 32;
764     // Must be a power of two greater than two, get the one not bigger than the
765     // requested one.
766     if ((m_team_size & m_team_size - 1) || m_team_size < 2) {
767       int temp_team_size = 2;
768       while ((temp_team_size << 1) < m_team_size) temp_team_size <<= 1;
769       m_team_size = temp_team_size;
770     }
771 
772     m_shmem_begin = (sizeof(double) * (m_team_size + 2));
773     m_shmem_size =
774         (m_policy.scratch_size(0, m_team_size) +
775          FunctorTeamShmemSize<FunctorType>::value(m_functor, m_team_size));
776     m_scratch_size[0] = m_shmem_size;
777     m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
778 
779     // FIXME_SYCL so far accessors used instead of these pointers
780     // Functor's reduce memory, team scan memory, and team shared memory depend
781     // upon team size.
782     const auto& space    = *m_policy.space().impl_internal_space_instance();
783     const sycl::queue& q = *space.m_queue;
784     m_scratch_ptr[0]     = nullptr;
785     m_scratch_ptr[1]     = sycl::malloc_device(
786         sizeof(char) * m_scratch_size[1] * m_league_size, q);
787 
788     if (static_cast<int>(space.m_maxShmemPerBlock) <
789         m_shmem_size - m_shmem_begin) {
790       std::stringstream out;
791       out << "Kokkos::Impl::ParallelFor<SYCL> insufficient shared memory! "
792              "Requested "
793           << m_shmem_size - m_shmem_begin << " bytes but maximum is "
794           << m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock
795           << '\n';
796       Kokkos::Impl::throw_runtime_exception(out.str());
797     }
798 
799     if (m_team_size > m_policy.team_size_max(m_functor, ParallelForTag{}))
800       Kokkos::Impl::throw_runtime_exception(
801           "Kokkos::Impl::ParallelFor<SYCL> requested too large team size.");
802   }
803 
804  public:
  // Construct a ParallelReduce whose result is written into a Kokkos::View.
  // Enabled only for View result arguments (SFINAE); the reducer member is
  // set to InvalidType so ReducerConditional selects the functor for
  // init/join. Team/league/vector sizes are taken from the policy;
  // initialize() then derives team size, shmem layout, and scratch
  // allocations.
  template <class ViewType>
  ParallelReduce(FunctorType const& arg_functor, Policy const& arg_policy,
                 ViewType const& arg_result,
                 typename std::enable_if<Kokkos::is_view<ViewType>::value,
                                         void*>::type = nullptr)
      : m_functor(arg_functor),
        m_policy(arg_policy),
        m_reducer(InvalidType()),
        m_result_ptr(arg_result.data()),
        m_league_size(arg_policy.league_size()),
        m_team_size(arg_policy.team_size()),
        m_vector_size(arg_policy.impl_vector_length()) {
    initialize();
  }
819 
ParallelReduce(FunctorType const & arg_functor,Policy const & arg_policy,ReducerType const & reducer)820   ParallelReduce(FunctorType const& arg_functor, Policy const& arg_policy,
821                  ReducerType const& reducer)
822       : m_functor(arg_functor),
823         m_policy(arg_policy),
824         m_reducer(reducer),
825         m_result_ptr(reducer.view().data()),
826         m_league_size(arg_policy.league_size()),
827         m_team_size(arg_policy.team_size()),
828         m_vector_size(arg_policy.impl_vector_length()) {
829     initialize();
830   }
831 };
832 }  // namespace Impl
833 }  // namespace Kokkos
834 
835 #endif
836