/*
//@HEADER
// ************************************************************************
//
//                        Kokkos v. 3.0
//       Copyright (2020) National Technology & Engineering
//               Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_OPENMPTARGET_PARALLEL_HPP
#define KOKKOS_OPENMPTARGET_PARALLEL_HPP

#include <omp.h>
#include <sstream>
#include <Kokkos_Parallel.hpp>
#include <OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>

#define KOKKOS_IMPL_LOCK_FREE_HIERARCHICAL
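// When KOKKOS_IMPL_LOCK_FREE_HIERARCHICAL is defined, the hierarchical
// (TeamPolicy) kernels below schedule the league themselves inside a single
// `target teams` region; otherwise the older fallback paths use `distribute`
// plus a lock array to hand out scratch blocks to teams.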

namespace Kokkos {
namespace Impl {

template <class FunctorType, class... Traits>
class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
                  Kokkos::Experimental::OpenMPTarget> {
 private:
  using Policy    = Kokkos::RangePolicy<Traits...>;
  using WorkTag   = typename Policy::work_tag;
  using WorkRange = typename Policy::WorkRange;
  using Member    = typename Policy::member_type;

  const FunctorType m_functor;
  const Policy m_policy;

 public:
  inline void execute() const { execute_impl<WorkTag>(); }
  /*
    template <class TagType>
    inline typename std::enable_if<std::is_same<TagType, void>::value>::type
    execute_impl() const {
      OpenMPTargetExec::verify_is_process(
          "Kokkos::Experimental::OpenMPTarget parallel_for");
      OpenMPTargetExec::verify_initialized(
          "Kokkos::Experimental::OpenMPTarget parallel_for");
      const typename Policy::member_type begin = m_policy.begin();
      const typename Policy::member_type end   = m_policy.end();

  #pragma omp target teams distribute parallel for map(to: this->m_functor)
      for (int i = begin; i < end; i++) m_functor(i);
    }
  */
  template <class TagType>
  inline void execute_impl() const {
    OpenMPTargetExec::verify_is_process(
        "Kokkos::Experimental::OpenMPTarget parallel_for");
    OpenMPTargetExec::verify_initialized(
        "Kokkos::Experimental::OpenMPTarget parallel_for");
    const auto begin = m_policy.begin();
    const auto end   = m_policy.end();

    if (end <= begin) return;

    FunctorType a_functor(m_functor);

    if constexpr (std::is_same<TagType, void>::value) {
#pragma omp target teams distribute parallel for map(to : a_functor)
      for (auto i = begin; i < end; i++) a_functor(i);
    } else {
#pragma omp target teams distribute parallel for map(to : a_functor)
      for (auto i = begin; i < end; i++) a_functor(TagType(), i);
    }
  }

  inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy)
      : m_functor(arg_functor), m_policy(arg_policy) {}
};
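
// Illustrative usage (not part of this header): with the OpenMPTarget backend
// enabled, a plain range parallel_for dispatches to the specialization above.
// The view `a` and extent `n` below are hypothetical.
//
//   Kokkos::View<double*, Kokkos::Experimental::OpenMPTargetSpace> a("a", n);
//   Kokkos::parallel_for(
//       Kokkos::RangePolicy<Kokkos::Experimental::OpenMPTarget>(0, n),
//       KOKKOS_LAMBDA(const int i) { a(i) = 2.0 * i; });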

}  // namespace Impl
}  // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

template <class FunctorType, class PolicyType, class ReducerType,
          class PointerType, class ValueType, bool FunctorHasJoin,
          bool UseReducerType>
struct ParallelReduceSpecialize {
  static inline void execute(const FunctorType& /*f*/, const PolicyType& /*p*/,
                             PointerType /*result_ptr*/) {
    std::stringstream error_message;
    error_message << "Error: Invalid Specialization " << FunctorHasJoin << ' '
                  << UseReducerType << '\n';
    // FIXME_OPENMPTARGET
    OpenMPTarget_abort(error_message.str().c_str());
  }
};
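
// Dispatch of the specializations on <FunctorHasJoin, UseReducerType>:
//   <false, false> : scalar reduction through OpenMP's built-in `+` operator;
//   <false, true>  : Kokkos reducer mapped onto an `omp declare reduction`.
// Any other combination falls through to the primary template above, which
// aborts at run time.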

template <class FunctorType, class ReducerType, class PointerType,
          class ValueType, class... PolicyArgs>
struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>,
                                ReducerType, PointerType, ValueType, false,
                                false> {
  using PolicyType = Kokkos::RangePolicy<PolicyArgs...>;
  template <class TagType>
  inline static void execute_impl(const FunctorType& f, const PolicyType& p,
                                  PointerType result_ptr) {
    OpenMPTargetExec::verify_is_process(
        "Kokkos::Experimental::OpenMPTarget parallel_reduce");
    OpenMPTargetExec::verify_initialized(
        "Kokkos::Experimental::OpenMPTarget parallel_reduce");
    const auto begin = p.begin();
    const auto end   = p.end();

    if (end <= begin) return;

    ValueType result = ValueType();
    if constexpr (std::is_same<TagType, void>::value) {
#pragma omp target teams distribute parallel for num_teams(512) \
                map(to:f) map(tofrom:result) reduction(+: result)
      for (auto i = begin; i < end; i++) f(i, result);
    } else {
#pragma omp target teams distribute parallel for num_teams(512) \
                map(to:f) map(tofrom:result) reduction(+: result)
      for (auto i = begin; i < end; i++) f(TagType(), i, result);
    }

    *result_ptr = result;
  }

  inline static void execute(const FunctorType& f, const PolicyType& p,
                             PointerType ptr) {
    execute_impl<typename PolicyType::work_tag>(f, p, ptr);
  }
};

template <class FunctorType, class PolicyType, class ReducerType,
          class PointerType, class ValueType>
struct ParallelReduceSpecialize<FunctorType, PolicyType, ReducerType,
                                PointerType, ValueType, false, true> {
#pragma omp declare reduction(                                         \
    custom:ValueType                                                   \
    : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
    initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
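  // The `custom` identifier declared above maps the reducer's join() onto
  // OpenMP's combiner and its init() onto the initializer, so the
  // `reduction(custom : result)` clauses below follow the reducer's
  // semantics (e.g. min, max, or sum) instead of defaulting to operator+.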

  template <class TagType>
  inline static void execute_impl(const FunctorType& f, const PolicyType& p,
                                  PointerType result_ptr) {
    OpenMPTargetExec::verify_is_process(
        "Kokkos::Experimental::OpenMPTarget parallel_reduce");
    OpenMPTargetExec::verify_initialized(
        "Kokkos::Experimental::OpenMPTarget parallel_reduce");
    const typename PolicyType::member_type begin = p.begin();
    const typename PolicyType::member_type end   = p.end();

    if (end <= begin) return;

    ValueType result = ValueType();
    OpenMPTargetReducerWrapper<ReducerType>::init(result);

    if constexpr (std::is_same<TagType, void>::value) {
#pragma omp target teams distribute parallel for num_teams(512) map(to   \
                                                                    : f) \
    reduction(custom                                                     \
              : result)
      for (auto i = begin; i < end; i++) f(i, result);
      *result_ptr = result;
    } else {
#pragma omp target teams distribute parallel for num_teams(512) map(to   \
                                                                    : f) \
    reduction(custom                                                     \
              : result)
      for (auto i = begin; i < end; i++) f(TagType(), i, result);
      *result_ptr = result;
    }
  }

  inline static void execute(const FunctorType& f, const PolicyType& p,
                             PointerType ptr) {
    execute_impl<typename PolicyType::work_tag>(f, p, ptr);
  }
};

template <class FunctorType, class ReducerType, class... Traits>
class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
                     Kokkos::Experimental::OpenMPTarget> {
 private:
  using Policy = Kokkos::RangePolicy<Traits...>;

  using WorkTag   = typename Policy::work_tag;
  using WorkRange = typename Policy::WorkRange;
  using Member    = typename Policy::member_type;

  using ReducerConditional =
      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                         FunctorType, ReducerType>;
  using ReducerTypeFwd = typename ReducerConditional::type;
  using WorkTagFwd =
      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
                         void>;

  // TODO: static_assert that WorkTag is void when ReducerType is not
  // InvalidType.

  using ValueTraits =
      Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
  using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;

  enum { HasJoin = ReduceFunctorHasJoin<FunctorType>::value };
  enum { UseReducer = is_reducer_type<ReducerType>::value };

  using pointer_type   = typename ValueTraits::pointer_type;
  using reference_type = typename ValueTraits::reference_type;

  using ParReduceSpecialize =
      ParallelReduceSpecialize<FunctorType, Policy, ReducerType, pointer_type,
                               typename ValueTraits::value_type, HasJoin,
                               UseReducer>;

  const FunctorType m_functor;
  const Policy m_policy;
  const ReducerType m_reducer;
  const pointer_type m_result_ptr;

 public:
  inline void execute() const {
    ParReduceSpecialize::execute(m_functor, m_policy, m_result_ptr);
  }

  template <class ViewType>
  inline ParallelReduce(
      const FunctorType& arg_functor, Policy arg_policy,
      const ViewType& arg_result_view,
      typename std::enable_if<Kokkos::is_view<ViewType>::value &&
                                  !Kokkos::is_reducer_type<ReducerType>::value,
                              void*>::type = nullptr)
      : m_functor(arg_functor),
        m_policy(arg_policy),
        m_reducer(InvalidType()),
        m_result_ptr(arg_result_view.data()) {}

  inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy,
                        const ReducerType& reducer)
      : m_functor(arg_functor),
        m_policy(arg_policy),
        m_reducer(reducer),
        m_result_ptr(reducer.view().data()) {}
};
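
// Illustrative usage (hypothetical view `a` and extent `n`): a built-in
// reducer such as Kokkos::Max routes through the <false, true> specialization
// above.
//
//   double max_val;
//   Kokkos::parallel_reduce(
//       Kokkos::RangePolicy<Kokkos::Experimental::OpenMPTarget>(0, n),
//       KOKKOS_LAMBDA(const int i, double& lmax) {
//         if (a(i) > lmax) lmax = a(i);
//       },
//       Kokkos::Max<double>(max_val));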

}  // namespace Impl
}  // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

template <class FunctorType, class... Traits>
class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
                   Kokkos::Experimental::OpenMPTarget> {
 protected:
  using Policy = Kokkos::RangePolicy<Traits...>;

  using WorkTag   = typename Policy::work_tag;
  using WorkRange = typename Policy::WorkRange;
  using Member    = typename Policy::member_type;
  using idx_type  = typename Policy::index_type;

  using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, WorkTag>;
  using ValueInit   = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>;
  using ValueJoin   = Kokkos::Impl::FunctorValueJoin<FunctorType, WorkTag>;
  using ValueOps    = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>;

  using value_type     = typename ValueTraits::value_type;
  using pointer_type   = typename ValueTraits::pointer_type;
  using reference_type = typename ValueTraits::reference_type;

  const FunctorType m_functor;
  const Policy m_policy;

  template <class TagType>
  inline typename std::enable_if<std::is_same<TagType, void>::value>::type
  call_with_tag(const FunctorType& f, const idx_type& idx, value_type& val,
                const bool& is_final) const {
    f(idx, val, is_final);
  }
  template <class TagType>
  inline typename std::enable_if<!std::is_same<TagType, void>::value>::type
  call_with_tag(const FunctorType& f, const idx_type& idx, value_type& val,
                const bool& is_final) const {
    f(WorkTag(), idx, val, is_final);
  }

 public:
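  // impl_execute performs a chunked scan in two target regions:
  //   1. Each team scans its 128-element chunk into `element_values` and
  //      records the chunk total in `chunk_values`; the last team to finish
  //      (tracked by atomically incrementing `count`) then scans the chunk
  //      totals in place.
  //   2. Each team combines every element's intra-chunk prefix with the
  //      preceding chunk's total and replays the functor with
  //      is_final == true.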
  inline void impl_execute(
      Kokkos::View<value_type**, Kokkos::LayoutRight,
                   Kokkos::Experimental::OpenMPTargetSpace>
          element_values,
      Kokkos::View<value_type*, Kokkos::Experimental::OpenMPTargetSpace>
          chunk_values,
      Kokkos::View<int64_t, Kokkos::Experimental::OpenMPTargetSpace> count)
      const {
    const idx_type N          = m_policy.end() - m_policy.begin();
    const idx_type chunk_size = 128;
    const idx_type n_chunks   = (N + chunk_size - 1) / chunk_size;
    idx_type nteams           = n_chunks > 512 ? 512 : n_chunks;
    idx_type team_size        = 128;

    FunctorType a_functor(m_functor);
#pragma omp target teams distribute map(to                             \
                                        : a_functor) num_teams(nteams) \
    thread_limit(team_size)
    for (idx_type team_id = 0; team_id < n_chunks; team_id++) {
#pragma omp parallel num_threads(team_size)
      {
        const idx_type local_offset = team_id * chunk_size;

#pragma omp for
        for (idx_type i = 0; i < chunk_size; i++) {
          const idx_type idx = local_offset + i;
          value_type val;
          ValueInit::init(a_functor, &val);
          if (idx < N) call_with_tag<WorkTag>(a_functor, idx, val, false);
          element_values(team_id, i) = val;
        }
#pragma omp barrier
        if (omp_get_thread_num() == 0) {
          value_type sum;
          ValueInit::init(a_functor, &sum);
          for (idx_type i = 0; i < chunk_size; i++) {
            ValueJoin::join(a_functor, &sum, &element_values(team_id, i));
            element_values(team_id, i) = sum;
          }
          chunk_values(team_id) = sum;
        }
#pragma omp barrier
        if (omp_get_thread_num() == 0) {
          if (Kokkos::atomic_fetch_add(&count(), 1) == n_chunks - 1) {
            value_type sum;
            ValueInit::init(a_functor, &sum);
            for (idx_type i = 0; i < n_chunks; i++) {
              ValueJoin::join(a_functor, &sum, &chunk_values(i));
              chunk_values(i) = sum;
            }
          }
        }
      }
    }

#pragma omp target teams distribute map(to                             \
                                        : a_functor) num_teams(nteams) \
    thread_limit(team_size)
    for (idx_type team_id = 0; team_id < n_chunks; team_id++) {
#pragma omp parallel num_threads(team_size)
      {
        const idx_type local_offset = team_id * chunk_size;
        value_type offset_value;
        if (team_id > 0)
          offset_value = chunk_values(team_id - 1);
        else
          ValueInit::init(a_functor, &offset_value);

#pragma omp for
        for (idx_type i = 0; i < chunk_size; i++) {
          const idx_type idx = local_offset + i;
          value_type local_offset_value;
          if (i > 0) {
            local_offset_value = element_values(team_id, i - 1);
            ValueJoin::join(a_functor, &local_offset_value, &offset_value);
          } else
            local_offset_value = offset_value;
          if (idx < N)
            call_with_tag<WorkTag>(a_functor, idx, local_offset_value, true);
        }
      }
    }
  }

  inline void execute() const {
    OpenMPTargetExec::verify_is_process(
        "Kokkos::Experimental::OpenMPTarget parallel_scan");
    OpenMPTargetExec::verify_initialized(
        "Kokkos::Experimental::OpenMPTarget parallel_scan");
    const idx_type N          = m_policy.end() - m_policy.begin();
    const idx_type chunk_size = 128;
    const idx_type n_chunks   = (N + chunk_size - 1) / chunk_size;

    // This could be scratch memory per team
    Kokkos::View<value_type**, Kokkos::LayoutRight,
                 Kokkos::Experimental::OpenMPTargetSpace>
        element_values("element_values", n_chunks, chunk_size);
    Kokkos::View<value_type*, Kokkos::Experimental::OpenMPTargetSpace>
        chunk_values("chunk_values", n_chunks);
    Kokkos::View<int64_t, Kokkos::Experimental::OpenMPTargetSpace> count(
        "Count");

    impl_execute(element_values, chunk_values, count);
  }

  //----------------------------------------

  inline ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy)
      : m_functor(arg_functor), m_policy(arg_policy) {}

  //----------------------------------------
};
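
// Illustrative usage (hypothetical views `in` and `out` of length `n`): an
// exclusive prefix sum over a device view.
//
//   Kokkos::parallel_scan(
//       Kokkos::RangePolicy<Kokkos::Experimental::OpenMPTarget>(0, n),
//       KOKKOS_LAMBDA(const int i, double& partial, const bool final) {
//         if (final) out(i) = partial;
//         partial += in(i);
//       });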

template <class FunctorType, class ReturnType, class... Traits>
class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
                            ReturnType, Kokkos::Experimental::OpenMPTarget>
    : public ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
                          Kokkos::Experimental::OpenMPTarget> {
  using base_t     = ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
                              Kokkos::Experimental::OpenMPTarget>;
  using value_type = typename base_t::value_type;
  value_type& m_returnvalue;

 public:
  inline void execute() const {
    OpenMPTargetExec::verify_is_process(
        "Kokkos::Experimental::OpenMPTarget parallel_scan");
    OpenMPTargetExec::verify_initialized(
        "Kokkos::Experimental::OpenMPTarget parallel_scan");
    const int64_t N        = base_t::m_policy.end() - base_t::m_policy.begin();
    const int chunk_size   = 128;
    const int64_t n_chunks = (N + chunk_size - 1) / chunk_size;

    if (N > 0) {
      // This could be scratch memory per team
      Kokkos::View<value_type**, Kokkos::LayoutRight,
                   Kokkos::Experimental::OpenMPTargetSpace>
          element_values("element_values", n_chunks, chunk_size);
      Kokkos::View<value_type*, Kokkos::Experimental::OpenMPTargetSpace>
          chunk_values("chunk_values", n_chunks);
      Kokkos::View<int64_t, Kokkos::Experimental::OpenMPTargetSpace> count(
          "Count");

      base_t::impl_execute(element_values, chunk_values, count);

      const int size = base_t::ValueTraits::value_size(base_t::m_functor);
      DeepCopy<HostSpace, Kokkos::Experimental::OpenMPTargetSpace>(
          &m_returnvalue, chunk_values.data() + (n_chunks - 1), size);
    } else {
      m_returnvalue = 0;
    }
  }

  ParallelScanWithTotal(const FunctorType& arg_functor,
                        const typename base_t::Policy& arg_policy,
                        ReturnType& arg_returnvalue)
      : base_t(arg_functor, arg_policy), m_returnvalue(arg_returnvalue) {}
};
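
// Passing an additional result argument, e.g.
// Kokkos::parallel_scan(policy, functor, total), routes through
// ParallelScanWithTotal; the last entry of the scanned chunk totals is copied
// back to the host as the grand total.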
}  // namespace Impl
}  // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

template <class FunctorType, class... Properties>
class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
                  Kokkos::Experimental::OpenMPTarget> {
 private:
  using Policy =
      Kokkos::Impl::TeamPolicyInternal<Kokkos::Experimental::OpenMPTarget,
                                       Properties...>;
  using WorkTag = typename Policy::work_tag;
  using Member  = typename Policy::member_type;

  const FunctorType m_functor;
  const Policy m_policy;
  const int m_shmem_size;

 public:
  inline void execute() const {
    OpenMPTargetExec::verify_is_process(
        "Kokkos::Experimental::OpenMPTarget parallel_for");
    OpenMPTargetExec::verify_initialized(
        "Kokkos::Experimental::OpenMPTarget parallel_for");
    execute_impl<WorkTag>();
  }

 private:
  template <class TagType>
  inline void execute_impl() const {
    OpenMPTargetExec::verify_is_process(
        "Kokkos::Experimental::OpenMPTarget parallel_for");
    OpenMPTargetExec::verify_initialized(
        "Kokkos::Experimental::OpenMPTarget parallel_for");
    const auto league_size   = m_policy.league_size();
    const auto team_size     = m_policy.team_size();
    const auto vector_length = m_policy.impl_vector_length();

    const size_t shmem_size_L0 = m_policy.scratch_size(0, team_size);
    const size_t shmem_size_L1 = m_policy.scratch_size(1, team_size);
    OpenMPTargetExec::resize_scratch(team_size, shmem_size_L0, shmem_size_L1);

    void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
    FunctorType a_functor(m_functor);

    // FIXME_OPENMPTARGET - If team_size is not a multiple of 32, the scratch
    // implementation does not work in Release or RelWithDebInfo builds but
    // works in Debug builds.

    // Maximum number of teams that can be active at once.
    int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size;
    // nteams must not exceed the maximum number of in-flight teams.
    const auto nteams =
        league_size < max_active_teams ? league_size : max_active_teams;

#ifdef KOKKOS_IMPL_LOCK_FREE_HIERARCHICAL
// We schedule the teams over the league ourselves to avoid splitting the code
// between a teams-distribute and a parallel region; this gave a 2x performance
// boost in test cases with the clang compiler. The atomic_compare_exchange of
// the fallback path can be avoided because the standard guarantees that the
// number of teams launched is less than or equal to the value requested in
// the `num_teams` clause.
#pragma omp target teams num_teams(nteams) thread_limit(team_size) \
    map(to                                                         \
        : a_functor) is_device_ptr(scratch_ptr)
#pragma omp parallel
    {
      const int blockIdx = omp_get_team_num();
      const int gridDim  = omp_get_num_teams();

      // Stride over the league in steps of the actual team count, assigning
      // league ids round-robin.
      // Verify that the compiler respected the `num_teams` clause.
      if (gridDim <= nteams) {
        for (int league_id = blockIdx; league_id < league_size;
             league_id += gridDim) {
          typename Policy::member_type team(
              league_id, league_size, team_size, vector_length, scratch_ptr,
              blockIdx, shmem_size_L0, shmem_size_L1);
          if constexpr (std::is_same<TagType, void>::value)
            m_functor(team);
          else
            m_functor(TagType(), team);
        }
      } else
        Kokkos::abort("`num_teams` clause was not respected.\n");
    }

#else
// The older implementation is kept here: it uses `atomic_compare_exchange` to
// claim a shared-memory block index and the `distribute` clause to distribute
// teams.
#pragma omp target teams distribute map(to                   \
                                        : a_functor)         \
    is_device_ptr(scratch_ptr, lock_array) num_teams(nteams) \
        thread_limit(team_size)
    for (int i = 0; i < league_size; i++) {
      int shmem_block_index = -1, lock_team = 99999, iter = -1;
      iter = (omp_get_team_num() % max_active_teams);

      // Loop until a shmem_block_index is found.
      while (shmem_block_index == -1) {
        // Try to acquire a lock on the index.
        lock_team = atomic_compare_exchange(&lock_array[iter], 0, 1);

        // If the lock was acquired, assign the index to the block index.
        // lock_team == 0 implies the atomic_compare_exchange was successful.
        if (lock_team == 0)
          shmem_block_index = iter;
        else
          iter = (iter + 1) % max_active_teams;
      }

#pragma omp parallel num_threads(team_size)
      {
        typename Policy::member_type team(
            i, league_size, team_size, vector_length, scratch_ptr,
            shmem_block_index, shmem_size_L0, shmem_size_L1);
        m_functor(team);
      }

      // Release the locked block, making it available to other teams again.
      lock_team = atomic_compare_exchange(&lock_array[shmem_block_index], 1, 0);
    }
#endif
  }

 public:
  inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
      : m_functor(arg_functor),
        m_policy(arg_policy),
        m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
                     FunctorTeamShmemSize<FunctorType>::value(
                         arg_functor, arg_policy.team_size())) {}
};
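
// Illustrative usage (hypothetical extents `league_size` and `m`):
//
//   using team_policy = Kokkos::TeamPolicy<Kokkos::Experimental::OpenMPTarget>;
//   Kokkos::parallel_for(
//       team_policy(league_size, Kokkos::AUTO),
//       KOKKOS_LAMBDA(const team_policy::member_type& team) {
//         Kokkos::parallel_for(Kokkos::TeamThreadRange(team, m),
//                              [&](const int j) { /* per-thread work */ });
//       });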

template <class FunctorType, class ReducerType, class PointerType,
          class ValueType, class... PolicyArgs>
struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
                                ReducerType, PointerType, ValueType, false,
                                false> {
  using PolicyType = TeamPolicyInternal<PolicyArgs...>;

  template <class TagType>
  inline static void execute_impl(const FunctorType& f, const PolicyType& p,
                                  PointerType result_ptr) {
    OpenMPTargetExec::verify_is_process(
        "Kokkos::Experimental::OpenMPTarget parallel_reduce");
    OpenMPTargetExec::verify_initialized(
        "Kokkos::Experimental::OpenMPTarget parallel_reduce");

    const int league_size   = p.league_size();
    const int team_size     = p.team_size();
    const int vector_length = p.impl_vector_length();

    const size_t shmem_size_L0 = p.scratch_size(0, team_size);
    const size_t shmem_size_L1 = p.scratch_size(1, team_size);
    OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE,
                                     shmem_size_L0, shmem_size_L1);
    void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();

    ValueType result = ValueType();

    // Maximum number of teams that can be active at once.
    int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size;
    const auto nteams =
        league_size < max_active_teams ? league_size : max_active_teams;

#ifdef KOKKOS_IMPL_LOCK_FREE_HIERARCHICAL
#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to   \
                                                                       : f) \
    is_device_ptr(scratch_ptr) reduction(+: result)
#pragma omp parallel reduction(+ : result)
    {
      const int blockIdx = omp_get_team_num();
      const int gridDim  = omp_get_num_teams();

      // Verify that the compiler respected the `num_teams` clause.
      if (gridDim <= nteams) {
        for (int league_id = blockIdx; league_id < league_size;
             league_id += gridDim) {
          typename PolicyType::member_type team(
              league_id, league_size, team_size, vector_length, scratch_ptr,
              blockIdx, shmem_size_L0, shmem_size_L1);
          if constexpr (std::is_same<TagType, void>::value)
            f(team, result);
          else
            f(TagType(), team, result);
        }
      } else
        Kokkos::abort("`num_teams` clause was not respected.\n");
    }

    *result_ptr = result;
#else
// The older implementation is kept here: it uses `atomic_compare_exchange` to
// claim a shared-memory block index and the `distribute` clause to distribute
// teams.
#pragma omp target teams distribute num_teams(nteams) thread_limit(team_size) \
         map(to:f) map(tofrom:result) reduction(+: result) \
    is_device_ptr(scratch_ptr, lock_array)
    for (int i = 0; i < league_size; i++) {
      ValueType inner_result = ValueType();
      int shmem_block_index = -1, lock_team = 99999, iter = -1;
      iter = (omp_get_team_num() % max_active_teams);

      // Loop until a shmem_block_index is found.
      while (shmem_block_index == -1) {
        // Try to acquire a lock on the index.
        lock_team = atomic_compare_exchange(&lock_array[iter], 0, 1);

        // If the lock was acquired, assign the index to the block index.
        // lock_team == 0 implies the atomic_compare_exchange was successful.
        if (lock_team == 0)
          shmem_block_index = iter;
        else
          iter = (iter + 1) % max_active_teams;
      }
#pragma omp parallel num_threads(team_size) reduction(+ : inner_result)
      {
        typename PolicyType::member_type team(
            i, league_size, team_size, vector_length, scratch_ptr,
            shmem_block_index, shmem_size_L0, shmem_size_L1);
        f(team, inner_result);
      }
      result = inner_result;

      // Release the locked block, making it available to other teams again.
      lock_team = atomic_compare_exchange(&lock_array[shmem_block_index], 1, 0);
    }

    *result_ptr = result;
#endif
  }

  inline static void execute(const FunctorType& f, const PolicyType& p,
                             PointerType ptr) {
    execute_impl<typename PolicyType::work_tag>(f, p, ptr);
  }
};

template <class FunctorType, class ReducerType, class PointerType,
          class ValueType, class... PolicyArgs>
struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
                                ReducerType, PointerType, ValueType, false,
                                true> {
  using PolicyType = TeamPolicyInternal<PolicyArgs...>;
  template <class TagType>
  inline static void execute_impl(const FunctorType& f, const PolicyType& p,
                                  PointerType result_ptr) {
    OpenMPTargetExec::verify_is_process(
        "Kokkos::Experimental::OpenMPTarget parallel_reduce");
    OpenMPTargetExec::verify_initialized(
        "Kokkos::Experimental::OpenMPTarget parallel_reduce");

#pragma omp declare reduction(                                         \
    custom:ValueType                                                   \
    : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
    initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
    const int league_size      = p.league_size();
    const int team_size        = p.team_size();
    const int vector_length    = p.impl_vector_length();
    const size_t shmem_size_L0 = p.scratch_size(0, team_size);
    const size_t shmem_size_L1 = p.scratch_size(1, team_size);
    OpenMPTargetExec::resize_scratch(team_size, shmem_size_L0, shmem_size_L1);
    void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();

    ValueType result = ValueType();

    // Maximum number of teams that can be active at once.
    int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size;
    const auto nteams =
        league_size < max_active_teams ? league_size : max_active_teams;

#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to   \
                                                                       : f) \
    is_device_ptr(scratch_ptr) reduction(custom                             \
                                         : result)
#pragma omp parallel reduction(custom : result)
    {
      const int blockIdx = omp_get_team_num();
      const int gridDim  = omp_get_num_teams();

      // Verify that the compiler respected the `num_teams` clause.
      if (gridDim <= nteams) {
        for (int league_id = blockIdx; league_id < league_size;
             league_id += gridDim) {
          typename PolicyType::member_type team(
              league_id, league_size, team_size, vector_length, scratch_ptr,
              blockIdx, shmem_size_L0, shmem_size_L1);
          if constexpr (std::is_same<TagType, void>::value)
            f(team, result);
          else
            f(TagType(), team, result);
        }
      } else
        Kokkos::abort("`num_teams` clause was not respected.\n");
    }

    *result_ptr = result;
  }

  inline static void execute(const FunctorType& f, const PolicyType& p,
                             PointerType ptr) {
    execute_impl<typename PolicyType::work_tag>(f, p, ptr);
  }
};

template <class FunctorType, class ReducerType, class... Properties>
class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
                     ReducerType, Kokkos::Experimental::OpenMPTarget> {
 private:
  using Policy =
      Kokkos::Impl::TeamPolicyInternal<Kokkos::Experimental::OpenMPTarget,
                                       Properties...>;

  using WorkTag = typename Policy::work_tag;
  using Member  = typename Policy::member_type;

  using ReducerConditional =
      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                         FunctorType, ReducerType>;
  using ReducerTypeFwd = typename ReducerConditional::type;
  using WorkTagFwd =
      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
                         void>;

  using ValueTraits =
      Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
  using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;

  using pointer_type   = typename ValueTraits::pointer_type;
  using reference_type = typename ValueTraits::reference_type;
  using value_type     = typename ValueTraits::value_type;

  enum { HasJoin = ReduceFunctorHasJoin<FunctorType>::value };
  enum { UseReducer = is_reducer_type<ReducerType>::value };

  using ParReduceSpecialize =
      ParallelReduceSpecialize<FunctorType, Policy, ReducerType, pointer_type,
                               typename ValueTraits::value_type, HasJoin,
                               UseReducer>;

  const FunctorType m_functor;
  const Policy m_policy;
  const ReducerType m_reducer;
  const pointer_type m_result_ptr;
  const int m_shmem_size;

 public:
  inline void execute() const {
    ParReduceSpecialize::execute(m_functor, m_policy, m_result_ptr);
  }

  template <class ViewType>
  inline ParallelReduce(
      const FunctorType& arg_functor, const Policy& arg_policy,
      const ViewType& arg_result,
      typename std::enable_if<Kokkos::is_view<ViewType>::value &&
                                  !Kokkos::is_reducer_type<ReducerType>::value,
                              void*>::type = nullptr)
      : m_functor(arg_functor),
        m_policy(arg_policy),
        m_reducer(InvalidType()),
        m_result_ptr(arg_result.data()),
        m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
                     FunctorTeamShmemSize<FunctorType>::value(
                         arg_functor, arg_policy.team_size())) {}

  inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy,
                        const ReducerType& reducer)
      : m_functor(arg_functor),
        m_policy(arg_policy),
        m_reducer(reducer),
        m_result_ptr(reducer.view().data()),
        m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
                     FunctorTeamShmemSize<FunctorType>::value(
                         arg_functor, arg_policy.team_size())) {}
};
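
// Illustrative usage (hypothetical extent `league_size`): a team-level
// reduction accumulating one value per league entry.
//
//   using team_policy = Kokkos::TeamPolicy<Kokkos::Experimental::OpenMPTarget>;
//   double result;
//   Kokkos::parallel_reduce(
//       team_policy(league_size, Kokkos::AUTO),
//       KOKKOS_LAMBDA(const team_policy::member_type& team, double& update) {
//         update += static_cast<double>(team.league_rank());
//       },
//       result);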

}  // namespace Impl
}  // namespace Kokkos

namespace Kokkos {
namespace Impl {

template <typename iType>
struct TeamThreadRangeBoundariesStruct<iType, OpenMPTargetExecTeamMember> {
  using index_type = iType;
  const iType start;
  const iType end;
  const OpenMPTargetExecTeamMember& team;

  inline TeamThreadRangeBoundariesStruct(
      const OpenMPTargetExecTeamMember& thread_, iType count)
      : start(0), end(count), team(thread_) {}
  inline TeamThreadRangeBoundariesStruct(
      const OpenMPTargetExecTeamMember& thread_, iType begin_, iType end_)
      : start(begin_), end(end_), team(thread_) {}
};

template <typename iType>
struct ThreadVectorRangeBoundariesStruct<iType, OpenMPTargetExecTeamMember> {
  using index_type = iType;
  const index_type start;
  const index_type end;
  const OpenMPTargetExecTeamMember& team;

  inline ThreadVectorRangeBoundariesStruct(
      const OpenMPTargetExecTeamMember& thread_, index_type count)
      : start(0), end(count), team(thread_) {}
  inline ThreadVectorRangeBoundariesStruct(
      const OpenMPTargetExecTeamMember& thread_, index_type begin_,
      index_type end_)
      : start(begin_), end(end_), team(thread_) {}
};

template <typename iType>
struct TeamVectorRangeBoundariesStruct<iType, OpenMPTargetExecTeamMember> {
  using index_type = iType;
  const index_type start;
  const index_type end;
  const OpenMPTargetExecTeamMember& team;

  inline TeamVectorRangeBoundariesStruct(
      const OpenMPTargetExecTeamMember& thread_, index_type count)
      : start(0), end(count), team(thread_) {}
  inline TeamVectorRangeBoundariesStruct(
      const OpenMPTargetExecTeamMember& thread_, index_type begin_,
      index_type end_)
      : start(begin_), end(end_), team(thread_) {}
};
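
// These structs carry the half-open [start, end) bounds consumed by the
// nested execution constructs inside a team lambda, e.g.:
//
//   Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 0, m),
//                        [&](const int j) { /* per-thread work */ });
//   Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, v),
//                        [&](const int k) { /* per-vector-lane work */ });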

}  // namespace Impl

}  // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

#undef KOKKOS_IMPL_LOCK_FREE_HIERARCHICAL
#endif /* KOKKOS_OPENMPTARGET_PARALLEL_HPP */