/*
//@HEADER
// ************************************************************************
//
//                        Kokkos v. 3.0
//       Copyright (2020) National Technology & Engineering
//               Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_OPENMPTARGETEXEC_HPP
#define KOKKOS_OPENMPTARGETEXEC_HPP

#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Spinwait.hpp>

#include <Kokkos_Atomic.hpp>
#include "Kokkos_OpenMPTarget_Abort.hpp"

// FIXME_OPENMPTARGET - Using this macro to implement a workaround for
// hierarchical reducers. It avoids hitting the code path which we wanted to
// write but doesn't work. undef'ed at the end.
#define KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

template <class Reducer>
struct OpenMPTargetReducerWrapper {
  using value_type = typename Reducer::value_type;

// WORKAROUND OPENMPTARGET
// This pragma omp declare target should not be necessary, but Intel compiler
// fails without it
#pragma omp declare target
  KOKKOS_INLINE_FUNCTION
  static void join(value_type&, const value_type&) {
    printf(
        "Using a generic unknown Reducer for the OpenMPTarget backend is not "
        "implemented.");
  }

  KOKKOS_INLINE_FUNCTION
  static void join(volatile value_type&, const volatile value_type&) {
    printf(
        "Using a generic unknown Reducer for the OpenMPTarget backend is not "
        "implemented.");
  }

  KOKKOS_INLINE_FUNCTION
  static void init(value_type&) {
    printf(
        "Using a generic unknown Reducer for the OpenMPTarget backend is not "
        "implemented.");
  }
#pragma omp end declare target
};

template <class Scalar, class Space>
struct OpenMPTargetReducerWrapper<Sum<Scalar, Space>> {
 public:
  // Required
  using value_type = typename std::remove_cv<Scalar>::type;

// WORKAROUND OPENMPTARGET
// This pragma omp declare target should not be necessary, but Intel compiler
// fails without it
#pragma omp declare target
  // Required
  KOKKOS_INLINE_FUNCTION
  static void join(value_type& dest, const value_type& src) { dest += src; }

  KOKKOS_INLINE_FUNCTION
  static void join(volatile value_type& dest, const volatile value_type& src) {
    dest += src;
  }

  KOKKOS_INLINE_FUNCTION
  static void init(value_type& val) {
    val = reduction_identity<value_type>::sum();
  }
#pragma omp end declare target
};

template <class Scalar, class Space>
struct OpenMPTargetReducerWrapper<Prod<Scalar, Space>> {
 public:
  // Required
  using value_type = typename std::remove_cv<Scalar>::type;

// WORKAROUND OPENMPTARGET
// This pragma omp declare target should not be necessary, but Intel compiler
// fails without it
#pragma omp declare target
  // Required
  KOKKOS_INLINE_FUNCTION
  static void join(value_type& dest, const value_type& src) { dest *= src; }

  KOKKOS_INLINE_FUNCTION
  static void join(volatile value_type& dest, const volatile value_type& src) {
    dest *= src;
  }

  KOKKOS_INLINE_FUNCTION
  static void init(value_type& val) {
    val = reduction_identity<value_type>::prod();
  }
#pragma omp end declare target
};

template <class Scalar, class Space>
struct OpenMPTargetReducerWrapper<Min<Scalar, Space>> {
 public:
  // Required
  using value_type = typename std::remove_cv<Scalar>::type;

// WORKAROUND OPENMPTARGET
// This pragma omp declare target should not be necessary, but Intel compiler
// fails without it
#pragma omp declare target
  // Required
  KOKKOS_INLINE_FUNCTION
  static void join(value_type& dest, const value_type& src) {
    if (src < dest) dest = src;
  }

  KOKKOS_INLINE_FUNCTION
  static void join(volatile value_type& dest, const volatile value_type& src) {
    if (src < dest) dest = src;
  }

  KOKKOS_INLINE_FUNCTION
  static void init(value_type& val) {
    val = reduction_identity<value_type>::min();
  }
#pragma omp end declare target
};

template <class Scalar, class Space>
struct OpenMPTargetReducerWrapper<Max<Scalar, Space>> {
 public:
  // Required
  using value_type = typename std::remove_cv<Scalar>::type;

// WORKAROUND OPENMPTARGET
// This pragma omp declare target should not be necessary, but Intel compiler
// fails without it
#pragma omp declare target
  // Required
  KOKKOS_INLINE_FUNCTION
  static void join(value_type& dest, const value_type& src) {
    if (src > dest) dest = src;
  }

  KOKKOS_INLINE_FUNCTION
  static void join(volatile value_type& dest, const volatile value_type& src) {
    if (src > dest) dest = src;
  }

  // Required
  KOKKOS_INLINE_FUNCTION
  static void init(value_type& val) {
    val = reduction_identity<value_type>::max();
  }
#pragma omp end declare target
};

template <class Scalar, class Space>
struct OpenMPTargetReducerWrapper<LAnd<Scalar, Space>> {
 public:
  // Required
  using value_type = typename std::remove_cv<Scalar>::type;

// WORKAROUND OPENMPTARGET
// This pragma omp declare target should not be necessary, but Intel compiler
// fails without it
#pragma omp declare target
  KOKKOS_INLINE_FUNCTION
  static void join(value_type& dest, const value_type& src) {
    dest = dest && src;
  }

  KOKKOS_INLINE_FUNCTION
  static void join(volatile value_type& dest, const volatile value_type& src) {
    dest = dest && src;
  }

  KOKKOS_INLINE_FUNCTION
  static void init(value_type& val) {
    val = reduction_identity<value_type>::land();
  }
#pragma omp end declare target
};

template <class Scalar, class Space>
struct OpenMPTargetReducerWrapper<LOr<Scalar, Space>> {
 public:
  // Required
  using value_type = typename std::remove_cv<Scalar>::type;

  using result_view_type = Kokkos::View<value_type, Space>;

// WORKAROUND OPENMPTARGET
// This pragma omp declare target should not be necessary, but Intel compiler
// fails without it
#pragma omp declare target
  // Required
  KOKKOS_INLINE_FUNCTION
  static void join(value_type& dest, const value_type& src) {
    dest = dest || src;
  }

  KOKKOS_INLINE_FUNCTION
  static void join(volatile value_type& dest, const volatile value_type& src) {
    dest = dest || src;
  }

  KOKKOS_INLINE_FUNCTION
  static void init(value_type& val) {
    val = reduction_identity<value_type>::lor();
  }
#pragma omp end declare target
};

template <class Scalar, class Space>
struct OpenMPTargetReducerWrapper<BAnd<Scalar, Space>> {
 public:
  // Required
  using value_type = typename std::remove_cv<Scalar>::type;

// WORKAROUND OPENMPTARGET
// This pragma omp declare target should not be necessary, but Intel compiler
// fails without it
#pragma omp declare target
  // Required
  KOKKOS_INLINE_FUNCTION
  static void join(value_type& dest, const value_type& src) {
    dest = dest & src;
  }

  KOKKOS_INLINE_FUNCTION
  static void join(volatile value_type& dest, const volatile value_type& src) {
    dest = dest & src;
  }

  KOKKOS_INLINE_FUNCTION
  static void init(value_type& val) {
    val = reduction_identity<value_type>::band();
  }
#pragma omp end declare target
};

template <class Scalar, class Space>
struct OpenMPTargetReducerWrapper<BOr<Scalar, Space>> {
 public:
  // Required
  using value_type = typename std::remove_cv<Scalar>::type;

// WORKAROUND OPENMPTARGET
// This pragma omp declare target should not be necessary, but Intel compiler
// fails without it
#pragma omp declare target
  // Required
  KOKKOS_INLINE_FUNCTION
  static void join(value_type& dest, const value_type& src) {
    dest = dest | src;
  }

  KOKKOS_INLINE_FUNCTION
  static void join(volatile value_type& dest, const volatile value_type& src) {
    dest = dest | src;
  }

  KOKKOS_INLINE_FUNCTION
  static void init(value_type& val) {
    val = reduction_identity<value_type>::bor();
  }
#pragma omp end declare target
};

template <class Scalar, class Index, class Space>
struct OpenMPTargetReducerWrapper<MinLoc<Scalar, Index, Space>> {
 private:
  using scalar_type = typename std::remove_cv<Scalar>::type;
  using index_type  = typename std::remove_cv<Index>::type;

 public:
  // Required
  using value_type = ValLocScalar<scalar_type, index_type>;

// WORKAROUND OPENMPTARGET
// This pragma omp declare target should not be necessary, but Intel compiler
// fails without it
#pragma omp declare target
  // Required
  KOKKOS_INLINE_FUNCTION
  static void join(value_type& dest, const value_type& src) {
    if (src.val < dest.val) dest = src;
  }

  KOKKOS_INLINE_FUNCTION
  static void join(volatile value_type& dest, const volatile value_type& src) {
    if (src.val < dest.val) dest = src;
  }

  KOKKOS_INLINE_FUNCTION
  static void init(value_type& val) {
    val.val = reduction_identity<scalar_type>::min();
    val.loc = reduction_identity<index_type>::min();
  }
#pragma omp end declare target
};

template <class Scalar, class Index, class Space>
struct OpenMPTargetReducerWrapper<MaxLoc<Scalar, Index, Space>> {
 private:
  using scalar_type = typename std::remove_cv<Scalar>::type;
  using index_type  = typename std::remove_cv<Index>::type;

 public:
  // Required
  using value_type = ValLocScalar<scalar_type, index_type>;

// WORKAROUND OPENMPTARGET
// This pragma omp declare target should not be necessary, but Intel compiler
// fails without it
#pragma omp declare target
  KOKKOS_INLINE_FUNCTION
  static void join(value_type& dest, const value_type& src) {
    if (src.val > dest.val) dest = src;
  }

  KOKKOS_INLINE_FUNCTION
  static void join(volatile value_type& dest, const volatile value_type& src) {
    if (src.val > dest.val) dest = src;
  }

  KOKKOS_INLINE_FUNCTION
  static void init(value_type& val) {
    val.val = reduction_identity<scalar_type>::max();
    val.loc = reduction_identity<index_type>::min();
  }
#pragma omp end declare target
};

template <class Scalar, class Space>
struct OpenMPTargetReducerWrapper<MinMax<Scalar, Space>> {
 private:
  using scalar_type = typename std::remove_cv<Scalar>::type;

 public:
  // Required
  using value_type = MinMaxScalar<scalar_type>;

// WORKAROUND OPENMPTARGET
// This pragma omp declare target should not be necessary, but Intel compiler
// fails without it
#pragma omp declare target
  // Required
  KOKKOS_INLINE_FUNCTION
  static void join(value_type& dest, const value_type& src) {
    if (src.min_val < dest.min_val) {
      dest.min_val = src.min_val;
    }
    if (src.max_val > dest.max_val) {
      dest.max_val = src.max_val;
    }
  }

  KOKKOS_INLINE_FUNCTION
  static void join(volatile value_type& dest, const volatile value_type& src) {
    if (src.min_val < dest.min_val) {
      dest.min_val = src.min_val;
    }
    if (src.max_val > dest.max_val) {
      dest.max_val = src.max_val;
    }
  }

  KOKKOS_INLINE_FUNCTION
  static void init(value_type& val) {
    val.max_val = reduction_identity<scalar_type>::max();
    val.min_val = reduction_identity<scalar_type>::min();
  }
#pragma omp end declare target
};

template <class Scalar, class Index, class Space>
struct OpenMPTargetReducerWrapper<MinMaxLoc<Scalar, Index, Space>> {
 private:
  using scalar_type = typename std::remove_cv<Scalar>::type;
  using index_type  = typename std::remove_cv<Index>::type;

 public:
  // Required
  using value_type = MinMaxLocScalar<scalar_type, index_type>;

// WORKAROUND OPENMPTARGET
// This pragma omp declare target should not be necessary, but Intel compiler
// fails without it
#pragma omp declare target
  // Required
  KOKKOS_INLINE_FUNCTION
  static void join(value_type& dest, const value_type& src) {
    if (src.min_val < dest.min_val) {
      dest.min_val = src.min_val;
      dest.min_loc = src.min_loc;
    }
    if (src.max_val > dest.max_val) {
      dest.max_val = src.max_val;
      dest.max_loc = src.max_loc;
    }
  }

  KOKKOS_INLINE_FUNCTION
  static void join(volatile value_type& dest, const volatile value_type& src) {
    if (src.min_val < dest.min_val) {
      dest.min_val = src.min_val;
      dest.min_loc = src.min_loc;
    }
    if (src.max_val > dest.max_val) {
      dest.max_val = src.max_val;
      dest.max_loc = src.max_loc;
    }
  }

  KOKKOS_INLINE_FUNCTION
  static void init(value_type& val) {
    val.max_val = reduction_identity<scalar_type>::max();
    val.min_val = reduction_identity<scalar_type>::min();
    val.max_loc = reduction_identity<index_type>::min();
    val.min_loc = reduction_identity<index_type>::min();
  }
#pragma omp end declare target
};
/*
template<class ReducerType>
class OpenMPTargetReducerWrapper {
  public:
    const ReducerType& reducer;
    using value_type = typename ReducerType::value_type;
    value_type& value;

    KOKKOS_INLINE_FUNCTION
    void join(const value_type& upd) {
      reducer.join(value,upd);
    }

    KOKKOS_INLINE_FUNCTION
    void init(const value_type& upd) {
      reducer.init(value,upd);
    }
};*/

}  // namespace Impl
}  // namespace Kokkos

namespace Kokkos {
namespace Impl {

//----------------------------------------------------------------------------
/** \brief  Data for OpenMPTarget thread execution */

class OpenMPTargetExec {
 public:
  // FIXME_OPENMPTARGET - Currently the maximum number of
  // teams possible is calculated based on NVIDIA's Volta GPU. In
  // future this value should be based on the chosen architecture for the
  // OpenMPTarget backend.
  enum { MAX_ACTIVE_THREADS = 2080 * 80 };
  enum { MAX_ACTIVE_TEAMS = MAX_ACTIVE_THREADS / 32 };
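  // For reference: with these Volta-based constants, MAX_ACTIVE_THREADS is
  // 2080 * 80 = 166400 and MAX_ACTIVE_TEAMS is 166400 / 32 = 5200.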

 private:
  static void* scratch_ptr;

 public:
  static void verify_is_process(const char* const);
  static void verify_initialized(const char* const);

  static int* get_lock_array(int num_teams);
  static void* get_scratch_ptr();
  static void clear_scratch();
  static void clear_lock_array();
  static void resize_scratch(int64_t team_reduce_bytes,
                             int64_t team_shared_bytes,
                             int64_t thread_local_bytes);

  static void* m_scratch_ptr;
  static int64_t m_scratch_size;
  static int* m_lock_array;
  static int64_t m_lock_size;
  static uint32_t* m_uniquetoken_ptr;
};

}  // namespace Impl
}  // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

class OpenMPTargetExecTeamMember {
 public:
  enum { TEAM_REDUCE_SIZE = 512 };

  /** \brief  Thread states for team synchronization */
  enum { Active = 0, Rendezvous = 1 };

  using execution_space      = Kokkos::Experimental::OpenMPTarget;
  using scratch_memory_space = execution_space::scratch_memory_space;

  scratch_memory_space m_team_shared;
  int m_team_scratch_size[2];
  int m_team_rank;
  int m_team_size;
  int m_league_rank;
  int m_league_size;
  int m_vector_length;
  int m_vector_lane;
  int m_shmem_block_index;
  void* m_glb_scratch;
  void* m_reduce_scratch;

  /*
  // Fan-in team threads, root of the fan-in which does not block returns true
  inline
  bool team_fan_in() const
    {
      memory_fence();
      for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! (
  m_team_rank_rev & n ) ; n <<= 1 ) {

        m_exec.pool_rev( m_team_base_rev + j )->state_wait( Active );
      }

      if ( m_team_rank_rev ) {
        m_exec.state_set( Rendezvous );
        memory_fence();
        m_exec.state_wait( Rendezvous );
      }

      return 0 == m_team_rank_rev ;
    }

  inline
  void team_fan_out() const
    {
      memory_fence();
      for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! (
  m_team_rank_rev & n ) ; n <<= 1 ) { m_exec.pool_rev( m_team_base_rev + j
  )->state_set( Active ); memory_fence();
      }
    }
  */
 public:
  KOKKOS_INLINE_FUNCTION
  const execution_space::scratch_memory_space& team_shmem() const {
    return m_team_shared.set_team_thread_mode(0, 1, 0);
  }

  KOKKOS_INLINE_FUNCTION
  const execution_space::scratch_memory_space& team_scratch(int level) const {
    return m_team_shared.set_team_thread_mode(level, 1,
                                              m_team_scratch_size[level]);
  }

  KOKKOS_INLINE_FUNCTION
  const execution_space::scratch_memory_space& thread_scratch(int level) const {
    return m_team_shared.set_team_thread_mode(level, team_size(), team_rank());
  }

  KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank; }
  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size; }
  KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank; }
  KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size; }
  KOKKOS_INLINE_FUNCTION void* impl_reduce_scratch() const {
    return m_reduce_scratch;
  }

  KOKKOS_INLINE_FUNCTION void team_barrier() const {
#pragma omp barrier
  }

  template <class ValueType>
  KOKKOS_INLINE_FUNCTION void team_broadcast(ValueType& value,
                                             int thread_id) const {
    // Make sure there is enough scratch space:
    using type =
        typename std::conditional<(sizeof(ValueType) < TEAM_REDUCE_SIZE),
                                  ValueType, void>::type;
    type* team_scratch = reinterpret_cast<type*>(
        ((char*)(m_glb_scratch) + TEAM_REDUCE_SIZE * omp_get_team_num()));
#pragma omp barrier
    if (team_rank() == thread_id) *team_scratch = value;
#pragma omp barrier
    value = *team_scratch;
  }

  template <class Closure, class ValueType>
  KOKKOS_INLINE_FUNCTION void team_broadcast(const Closure& f, ValueType& value,
                                             const int& thread_id) const {
    f(value);
    team_broadcast(value, thread_id);
  }
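  // A minimal usage sketch for team_broadcast (illustrative only; "member" and
  // compute_on_rank_0() are assumed names, not part of this header):
  //
  //   int val = 0;
  //   if (member.team_rank() == 0) val = compute_on_rank_0();
  //   member.team_broadcast(val, 0);  // afterwards every thread sees rank 0's val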

  template <class ValueType, class JoinOp>
  KOKKOS_INLINE_FUNCTION ValueType team_reduce(const ValueType& value,
                                               const JoinOp& op_in) const {
#pragma omp barrier

    using value_type = ValueType;
    const JoinLambdaAdapter<value_type, JoinOp> op(op_in);

    // Make sure there is enough scratch space:
    using type = std::conditional_t<(sizeof(value_type) < TEAM_REDUCE_SIZE),
                                    value_type, void>;

    const int n_values = TEAM_REDUCE_SIZE / sizeof(value_type);
    type* team_scratch =
        (type*)((char*)m_glb_scratch + TEAM_REDUCE_SIZE * omp_get_team_num());
    for (int i = m_team_rank; i < n_values; i += m_team_size) {
      team_scratch[i] = value_type();
    }

#pragma omp barrier

    for (int k = 0; k < m_team_size; k += n_values) {
      if ((k <= m_team_rank) && (k + n_values > m_team_rank))
        team_scratch[m_team_rank % n_values] += value;
#pragma omp barrier
    }

    for (int d = 1; d < n_values; d *= 2) {
      if ((m_team_rank + d < n_values) && (m_team_rank % (2 * d) == 0)) {
        team_scratch[m_team_rank] += team_scratch[m_team_rank + d];
      }
#pragma omp barrier
    }
    return team_scratch[0];
  }
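  // Illustrative semantics of the exclusive team scan described below (note
  // that the OpenMPTarget implementation further down is still a FIXME stub):
  // for per-thread contributions {1, 2, 3, 4} in team_rank() order, the scan
  // returns {0, 1, 3, 6}, and an optional global_accum would be atomically
  // incremented by the team total (10).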
  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
   *          with intra-team non-deterministic ordering accumulation.
   *
   *  The global inter-team accumulation value will, at the end of the
   *  league's parallel execution, be the scan's total.
   *  Parallel execution ordering of the league's teams is non-deterministic.
   *  As such the base value for each team's scan operation is similarly
   *  non-deterministic.
   */
  template <typename ArgType>
  KOKKOS_INLINE_FUNCTION ArgType
  team_scan(const ArgType& /*value*/, ArgType* const /*global_accum*/) const {
    // FIXME_OPENMPTARGET
    /*  // Make sure there is enough scratch space:
      using type =
        std::conditional_t<(sizeof(ArgType) < TEAM_REDUCE_SIZE), ArgType, void>;

      volatile type * const work_value  = ((type*) m_exec.scratch_thread());

      *work_value = value ;

      memory_fence();

      if ( team_fan_in() ) {
        // The last thread to synchronize returns true, all other threads wait
      for team_fan_out()
        // m_team_base[0]                 == highest ranking team member
        // m_team_base[ m_team_size - 1 ] == lowest ranking team member
        //
        // 1) copy from lower to higher rank, initialize lowest rank to zero
        // 2) prefix sum from lowest to highest rank, skipping lowest rank

        type accum = 0 ;

        if ( global_accum ) {
          for ( int i = m_team_size ; i-- ; ) {
            type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i
      )->scratch_thread()); accum += val ;
          }
          accum = atomic_fetch_add( global_accum , accum );
        }

        for ( int i = m_team_size ; i-- ; ) {
          type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i
      )->scratch_thread()); const type offset = accum ; accum += val ; val =
      offset ;
        }

        memory_fence();
      }

      team_fan_out();

      return *work_value ;*/
    return ArgType();
  }

  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
   *
   *  The highest rank thread can compute the reduction total as
   *    reduction_total = dev.team_scan( value ) + value ;
   */
  template <typename Type>
  KOKKOS_INLINE_FUNCTION Type team_scan(const Type& value) const {
    return this->template team_scan<Type>(value, 0);
  }

  //----------------------------------------
  // Private for the driver

 private:
  using space = execution_space::scratch_memory_space;

 public:
  // FIXME_OPENMPTARGET - 512 (16*32) bytes at the beginning of the scratch
  // space for each league is reserved for the reduction. It should actually be
  // based on the ValueType of the reduction variable.
  inline OpenMPTargetExecTeamMember(
      const int league_rank, const int league_size, const int team_size,
      const int vector_length  // const TeamPolicyInternal< OpenMPTarget,
                               // Properties ...> & team
      ,
      void* const glb_scratch, const int shmem_block_index,
      const int shmem_size_L0, const int shmem_size_L1)
      : m_team_scratch_size{shmem_size_L0, shmem_size_L1},
        m_team_rank(0),
        m_team_size(team_size),
        m_league_rank(league_rank),
        m_league_size(league_size),
        m_vector_length(vector_length),
        m_shmem_block_index(shmem_block_index),
        m_glb_scratch(glb_scratch) {
    const int omp_tid = omp_get_thread_num();
    m_team_shared     = scratch_memory_space(
        ((char*)glb_scratch +
         m_shmem_block_index *
             (shmem_size_L0 + shmem_size_L1 +
              ((shmem_size_L0 + shmem_size_L1) * 10 / 100) + TEAM_REDUCE_SIZE)),
        shmem_size_L0,
        ((char*)glb_scratch +
         m_shmem_block_index * (shmem_size_L0 + shmem_size_L1 +
                                ((shmem_size_L0 + shmem_size_L1) * 10 / 100) +
                                TEAM_REDUCE_SIZE)) +
            shmem_size_L0 + ((shmem_size_L0 + shmem_size_L1) * 10 / 100) +
            TEAM_REDUCE_SIZE,
        shmem_size_L1);
    m_reduce_scratch =
        (char*)glb_scratch +
        shmem_block_index *
            (shmem_size_L0 + shmem_size_L1 +
             ((shmem_size_L0 + shmem_size_L1) * 10 / 100) + TEAM_REDUCE_SIZE);
    m_league_rank = league_rank;
    m_team_rank   = omp_tid;
    m_vector_lane = 0;
  }

  static inline int team_reduce_size() { return TEAM_REDUCE_SIZE; }
};

template <class... Properties>
class TeamPolicyInternal<Kokkos::Experimental::OpenMPTarget, Properties...>
    : public PolicyTraits<Properties...> {
 public:
  //! Tag this class as a kokkos execution policy
  using execution_policy = TeamPolicyInternal;

  using traits = PolicyTraits<Properties...>;

  //----------------------------------------

  template <class FunctorType>
  inline static int team_size_max(const FunctorType&, const ParallelForTag&) {
    return 256;
  }

  template <class FunctorType>
  inline static int team_size_max(const FunctorType&,
                                  const ParallelReduceTag&) {
    return 256;
  }

  template <class FunctorType, class ReducerType>
  inline static int team_size_max(const FunctorType&, const ReducerType&,
                                  const ParallelReduceTag&) {
    return 256;
  }

  template <class FunctorType>
  inline static int team_size_recommended(const FunctorType&,
                                          const ParallelForTag&) {
    return 128;
  }

  template <class FunctorType>
  inline static int team_size_recommended(const FunctorType&,
                                          const ParallelReduceTag&) {
    return 128;
  }

  template <class FunctorType, class ReducerType>
  inline static int team_size_recommended(const FunctorType&,
                                          const ReducerType&,
                                          const ParallelReduceTag&) {
    return 128;
  }

  //----------------------------------------

 private:
  int m_league_size;
  int m_team_size;
  int m_vector_length;
  int m_team_alloc;
  int m_team_iter;
  std::array<size_t, 2> m_team_scratch_size;
  std::array<size_t, 2> m_thread_scratch_size;
  bool m_tune_team_size;
  bool m_tune_vector_length;
  constexpr const static size_t default_team_size = 256;
  int m_chunk_size;

  inline void init(const int league_size_request, const int team_size_request,
                   const int vector_length_request) {
    m_league_size = league_size_request;

    // Minimum team size should be 32 for OpenMPTarget backend.
    if (team_size_request < 32) {
      Kokkos::Impl::OpenMPTarget_abort(
          "OpenMPTarget backend requires a minimum of 32 threads per team.\n");
    } else
      m_team_size = team_size_request;

    m_vector_length = vector_length_request;
    set_auto_chunk_size();
  }

  template <typename ExecSpace, typename... OtherProperties>
  friend class TeamPolicyInternal;

 public:
  inline bool impl_auto_team_size() const { return m_tune_team_size; }
  inline bool impl_auto_vector_length() const { return m_tune_vector_length; }
  inline void impl_set_team_size(const size_t size) { m_team_size = size; }
  inline void impl_set_vector_length(const size_t length) {
    m_tune_vector_length = length;
  }
  inline int impl_vector_length() const { return m_vector_length; }
  KOKKOS_DEPRECATED inline int vector_length() const {
    return impl_vector_length();
  }
  inline int team_size() const { return m_team_size; }
  inline int league_size() const { return m_league_size; }
  inline size_t scratch_size(const int& level, int team_size_ = -1) const {
    if (team_size_ < 0) team_size_ = m_team_size;
    return m_team_scratch_size[level] +
           team_size_ * m_thread_scratch_size[level];
  }

  inline Kokkos::Experimental::OpenMPTarget space() const {
    return Kokkos::Experimental::OpenMPTarget();
  }

  template <class... OtherProperties>
  TeamPolicyInternal(const TeamPolicyInternal<OtherProperties...>& p)
      : m_league_size(p.m_league_size),
        m_team_size(p.m_team_size),
        m_vector_length(p.m_vector_length),
        m_team_alloc(p.m_team_alloc),
        m_team_iter(p.m_team_iter),
        m_team_scratch_size(p.m_team_scratch_size),
        m_thread_scratch_size(p.m_thread_scratch_size),
        m_tune_team_size(p.m_tune_team_size),
        m_tune_vector_length(p.m_tune_vector_length),
        m_chunk_size(p.m_chunk_size) {}

  /** \brief  Specify league size, request team size */
  TeamPolicyInternal(typename traits::execution_space&, int league_size_request,
                     int team_size_request, int vector_length_request = 1)
      : m_team_scratch_size{0, 0},
        m_thread_scratch_size{0, 0},
        m_tune_team_size(false),
        m_tune_vector_length(false),
        m_chunk_size(0) {
    init(league_size_request, team_size_request, vector_length_request);
  }

  TeamPolicyInternal(typename traits::execution_space&, int league_size_request,
                     const Kokkos::AUTO_t& /* team_size_request */
                     ,
                     int vector_length_request = 1)
      : m_team_scratch_size{0, 0},
        m_thread_scratch_size{0, 0},
        m_tune_team_size(true),
        m_tune_vector_length(false),
        m_chunk_size(0) {
    init(league_size_request, default_team_size / vector_length_request,
         vector_length_request);
  }

  TeamPolicyInternal(typename traits::execution_space&, int league_size_request,
                     const Kokkos::AUTO_t& /* team_size_request */
                     ,
                     const Kokkos::AUTO_t& /* vector_length_request */)
      : m_team_scratch_size{0, 0},
        m_thread_scratch_size{0, 0},
        m_tune_team_size(true),
        m_tune_vector_length(true),
        m_chunk_size(0) {
    init(league_size_request, default_team_size, 1);
  }
  TeamPolicyInternal(typename traits::execution_space&, int league_size_request,
                     int team_size_request,
                     const Kokkos::AUTO_t& /* vector_length_request */)
      : m_team_scratch_size{0, 0},
        m_thread_scratch_size{0, 0},
        m_tune_team_size(false),
        m_tune_vector_length(true),
        m_chunk_size(0) {
    init(league_size_request, team_size_request, 1);
  }

  TeamPolicyInternal(int league_size_request, int team_size_request,
                     int vector_length_request = 1)
      : m_team_scratch_size{0, 0},
        m_thread_scratch_size{0, 0},
        m_tune_team_size(false),
        m_tune_vector_length(false),
        m_chunk_size(0) {
    init(league_size_request, team_size_request, vector_length_request);
  }

  TeamPolicyInternal(int league_size_request,
                     const Kokkos::AUTO_t& /* team_size_request */
                     ,
                     int vector_length_request = 1)
      : m_team_scratch_size{0, 0},
        m_thread_scratch_size{0, 0},
        m_tune_team_size(true),
        m_tune_vector_length(false),
        m_chunk_size(0) {
    init(league_size_request, default_team_size / vector_length_request,
         vector_length_request);
  }

  TeamPolicyInternal(int league_size_request,
                     const Kokkos::AUTO_t& /* team_size_request */
                     ,
                     const Kokkos::AUTO_t& /* vector_length_request */)
      : m_team_scratch_size{0, 0},
        m_thread_scratch_size{0, 0},
        m_tune_team_size(true),
        m_tune_vector_length(true),
        m_chunk_size(0) {
    init(league_size_request, default_team_size, 1);
  }
  TeamPolicyInternal(int league_size_request, int team_size_request,
                     const Kokkos::AUTO_t& /* vector_length_request */)
      : m_team_scratch_size{0, 0},
        m_thread_scratch_size{0, 0},
        m_tune_team_size(false),
        m_tune_vector_length(true),
        m_chunk_size(0) {
    init(league_size_request, team_size_request, 1);
  }
  inline static size_t vector_length_max() {
    return 32; /* TODO: this is bad. Need logic that is compiler and backend
                  aware */
  }
  inline int team_alloc() const { return m_team_alloc; }
  inline int team_iter() const { return m_team_iter; }

  inline int chunk_size() const { return m_chunk_size; }

  /** \brief set chunk_size to a discrete value*/
  inline TeamPolicyInternal& set_chunk_size(
      typename traits::index_type chunk_size_) {
    m_chunk_size = chunk_size_;
    return *this;
  }

  /** \brief set per team scratch size for a specific level of the scratch
   * hierarchy */
  inline TeamPolicyInternal& set_scratch_size(const int& level,
                                              const PerTeamValue& per_team) {
    m_team_scratch_size[level] = per_team.value;
    return *this;
  }

  /** \brief set per thread scratch size for a specific level of the scratch
   * hierarchy */
  inline TeamPolicyInternal& set_scratch_size(
      const int& level, const PerThreadValue& per_thread) {
    m_thread_scratch_size[level] = per_thread.value;
    return *this;
  }

  /** \brief set per thread and per team scratch size for a specific level of
   * the scratch hierarchy */
  inline TeamPolicyInternal& set_scratch_size(
      const int& level, const PerTeamValue& per_team,
      const PerThreadValue& per_thread) {
    m_team_scratch_size[level]   = per_team.value;
    m_thread_scratch_size[level] = per_thread.value;
    return *this;
  }
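  // A minimal usage sketch for the scratch-size setters (illustrative only;
  // the league/team sizes and the TeamPolicy alias used below are assumptions):
  //
  //   auto policy = Kokkos::TeamPolicy<Kokkos::Experimental::OpenMPTarget>(
  //                     league_size, team_size)
  //                     .set_scratch_size(0, Kokkos::PerTeam(4096),
  //                                       Kokkos::PerThread(64));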

 private:
  /** \brief finalize chunk_size if it was set to AUTO*/
  inline void set_auto_chunk_size() {
    int concurrency = 2048 * 128;

    if (concurrency == 0) concurrency = 1;

    if (m_chunk_size > 0) {
      if (!Impl::is_integral_power_of_two(m_chunk_size))
        Kokkos::abort("TeamPolicy blocking granularity must be power of two");
    }

    int new_chunk_size = 1;
    while (new_chunk_size * 100 * concurrency < m_league_size)
      new_chunk_size *= 2;
    if (new_chunk_size < 128) {
      new_chunk_size = 1;
      while ((new_chunk_size * 40 * concurrency < m_league_size) &&
             (new_chunk_size < 128))
        new_chunk_size *= 2;
    }
    m_chunk_size = new_chunk_size;
  }

 public:
  using member_type = Impl::OpenMPTargetExecTeamMember;
};
}  // namespace Impl

}  // namespace Kokkos

namespace Kokkos {

template <typename iType>
KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
    iType, Impl::OpenMPTargetExecTeamMember>
TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread,
                const iType& count) {
  return Impl::TeamThreadRangeBoundariesStruct<
      iType, Impl::OpenMPTargetExecTeamMember>(thread, count);
}

template <typename iType1, typename iType2>
KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
    typename std::common_type<iType1, iType2>::type,
    Impl::OpenMPTargetExecTeamMember>
TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread,
                const iType1& begin, const iType2& end) {
  using iType = typename std::common_type<iType1, iType2>::type;
  return Impl::TeamThreadRangeBoundariesStruct<
      iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(begin),
                                               iType(end));
}

template <typename iType>
KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
    iType, Impl::OpenMPTargetExecTeamMember>
ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread,
                  const iType& count) {
  return Impl::ThreadVectorRangeBoundariesStruct<
      iType, Impl::OpenMPTargetExecTeamMember>(thread, count);
}

template <typename iType1, typename iType2>
KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
    typename std::common_type<iType1, iType2>::type,
    Impl::OpenMPTargetExecTeamMember>
ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread,
                  const iType1& arg_begin, const iType2& arg_end) {
  using iType = typename std::common_type<iType1, iType2>::type;
  return Impl::ThreadVectorRangeBoundariesStruct<
      iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(arg_begin),
                                               iType(arg_end));
}

template <typename iType>
KOKKOS_INLINE_FUNCTION Impl::TeamVectorRangeBoundariesStruct<
    iType, Impl::OpenMPTargetExecTeamMember>
TeamVectorRange(const Impl::OpenMPTargetExecTeamMember& thread,
                const iType& count) {
  return Impl::TeamVectorRangeBoundariesStruct<
      iType, Impl::OpenMPTargetExecTeamMember>(thread, count);
}

template <typename iType1, typename iType2>
KOKKOS_INLINE_FUNCTION Impl::TeamVectorRangeBoundariesStruct<
    typename std::common_type<iType1, iType2>::type,
    Impl::OpenMPTargetExecTeamMember>
TeamVectorRange(const Impl::OpenMPTargetExecTeamMember& thread,
                const iType1& arg_begin, const iType2& arg_end) {
  using iType = typename std::common_type<iType1, iType2>::type;
  return Impl::TeamVectorRangeBoundariesStruct<
      iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(arg_begin),
                                               iType(arg_end));
}

KOKKOS_INLINE_FUNCTION
Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember> PerTeam(
    const Impl::OpenMPTargetExecTeamMember& thread) {
  return Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember>(thread);
}

KOKKOS_INLINE_FUNCTION
Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember> PerThread(
    const Impl::OpenMPTargetExecTeamMember& thread) {
  return Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember>(thread);
}
}  // namespace Kokkos

namespace Kokkos {

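// A minimal usage sketch of a team-level parallel_for (illustrative only; the
// team member "member", the extent N, and the view row_sum are assumptions):
//
//   Kokkos::parallel_for(Kokkos::TeamThreadRange(member, N),
//                        [&](const int i) { row_sum(i) = 0; });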
/** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each
 * i=0..N-1.
 *
 * The range i=0..N-1 is mapped to all threads of the calling thread team.
 */
template <typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION void parallel_for(
    const Impl::TeamThreadRangeBoundariesStruct<
        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
    const Lambda& lambda) {
#pragma omp for nowait schedule(static, 1)
  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i);
}

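// A minimal usage sketch of the team-level reduction declared below
// (illustrative only; "member", N, and the view x are assumptions):
//
//   double sum = 0;
//   Kokkos::parallel_reduce(Kokkos::TeamThreadRange(member, N),
//                           [&](const int i, double& lsum) { lsum += x(i); },
//                           sum);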
/** \brief  Inter-thread parallel_reduce. Executes lambda(iType i,
 * ValueType & val) for each i=0..N-1.
 *
 * The range i=0..N-1 is mapped to all threads of the calling thread team
 * and a summation of val is performed and put into result.
 */

template <typename iType, class Lambda, typename ValueType>
KOKKOS_INLINE_FUNCTION
    std::enable_if_t<!Kokkos::is_reducer_type<ValueType>::value>
    parallel_reduce(
        const Impl::TeamThreadRangeBoundariesStruct<
            iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
        const Lambda& lambda, ValueType& result) {
  // FIXME_OPENMPTARGET - Make sure that if it is an array reduction, the
  // number of elements in the array is <= 32. For the reduction we allocate
  // 16 bytes per element in the scratch space; hence, 16*32 = 512.
  static_assert(sizeof(ValueType) <=
                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);

  ValueType* TeamThread_scratch =
      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());

#pragma omp barrier
  TeamThread_scratch[0] = ValueType();
#pragma omp barrier

  if constexpr (std::is_arithmetic<ValueType>::value) {
#pragma omp for reduction(+ : TeamThread_scratch[:1])
    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
      ValueType tmp = ValueType();
      lambda(i, tmp);
      TeamThread_scratch[0] += tmp;
    }
  } else {
#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)

#pragma omp for reduction(custom : TeamThread_scratch[:1])
    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
      ValueType tmp = ValueType();
      lambda(i, tmp);
      TeamThread_scratch[0] += tmp;
    }
  }

  result = TeamThread_scratch[0];
}

#if !defined(KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND)
// For some reason the actual version we wanted to write doesn't work
// and crashes. We should try this with every new compiler
// This is the variant we actually wanted to write
template <typename iType, class Lambda, typename ReducerType>
KOKKOS_INLINE_FUNCTION
    std::enable_if_t<Kokkos::is_reducer_type<ReducerType>::value>
    parallel_reduce(
        const Impl::TeamThreadRangeBoundariesStruct<
            iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
        const Lambda& lambda, ReducerType result) {
  using ValueType = typename ReducerType::value_type;

#pragma omp declare reduction(                                               \
    custominner:ValueType                                                    \
    : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
    initializer(                                                             \
        Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))

  // FIXME_OPENMPTARGET - Make sure that if it is an array reduction, the
  // number of elements in the array is <= 32. For the reduction we allocate
  // 16 bytes per element in the scratch space; hence, 16*32 = 512.
  static_assert(sizeof(ValueType) <=
                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);

  ValueType* TeamThread_scratch =
      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());

#pragma omp barrier
  // These three lines all cause a crash
  Impl::OpenMPTargetReducerWrapper<ReducerType>::init(TeamThread_scratch[0]);
//  result.init(TeamThread_scratch[0]);
//  Impl::OpenMPTargetReducerWrapper<ReducerType> red;
//  red.init(TeamThread_scratch[0]);
#pragma omp barrier

#pragma omp for reduction(custominner : TeamThread_scratch[:1])
  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
    ValueType tmp;
    result.init(tmp);
    lambda(i, tmp);
    // This line causes a crash
    Impl::OpenMPTargetReducerWrapper<ReducerType>::join(TeamThread_scratch[0],
                                                        tmp);
  }
  result.reference() = TeamThread_scratch[0];
}
#else
template <typename iType, class Lambda, typename ReducerType>
KOKKOS_INLINE_FUNCTION
    std::enable_if_t<Kokkos::is_reducer_type<ReducerType>::value>
    parallel_reduce(
        const Impl::TeamThreadRangeBoundariesStruct<
            iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
        const Lambda& lambda, ReducerType result) {
  using ValueType = typename ReducerType::value_type;

  // FIXME_OPENMPTARGET - Make sure that if it is an array reduction, the
  // number of elements in the array is <= 32. For the reduction we allocate
  // 16 bytes per element in the scratch space; hence, 16*32 = 512.
  static_assert(sizeof(ValueType) <=
                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);

  ValueType* TeamThread_scratch =
      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());

#pragma omp declare reduction(                                               \
    omp_red_teamthread_reducer:ValueType                                     \
    : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
    initializer(                                                             \
        Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))

#pragma omp barrier
  ValueType tmp;
  result.init(tmp);
  TeamThread_scratch[0] = tmp;
#pragma omp barrier

  iType team_size = iType(omp_get_num_threads());
#pragma omp for reduction(omp_red_teamthread_reducer \
                          : TeamThread_scratch[:1]) schedule(static, 1)
  for (iType t = 0; t < team_size; t++) {
    ValueType tmp2;
    result.init(tmp2);

    for (iType i = loop_boundaries.start + t; i < loop_boundaries.end;
         i += team_size) {
      lambda(i, tmp2);
    }
    TeamThread_scratch[0] = tmp2;
  }

  result.reference() = TeamThread_scratch[0];
}
#endif  // KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND

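// A minimal usage sketch of the join-based overload declared below
// (illustrative only; "member", N, and the view x are assumptions):
//
//   double team_max = std::numeric_limits<double>::lowest();  // neutral element
//   Kokkos::parallel_reduce(
//       Kokkos::TeamThreadRange(member, N),
//       [&](const int i, double& lmax) { lmax = lmax > x(i) ? lmax : x(i); },
//       [](double& dest, const double& src) { dest = dest > src ? dest : src; },
//       team_max);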
/** \brief  Inter-thread parallel_reduce. Executes lambda(iType i,
 * ValueType & val) for each i=0..N-1.
 *
 * The range i=0..N-1 is mapped to all threads of the calling thread team
 * and a reduction of val is performed using JoinType(ValueType& val, const
 * ValueType& update) and put into init_result. The input value of init_result
 * is used as initializer for temporary variables of ValueType. Therefore the
 * input value should be the neutral element with respect to the join operation
 * (e.g. '0 for +-' or '1 for *').
 */
template <typename iType, class Lambda, typename ValueType, class JoinType>
KOKKOS_INLINE_FUNCTION void parallel_reduce(
    const Impl::TeamThreadRangeBoundariesStruct<
        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
    const Lambda& lambda, const JoinType& join, ValueType& init_result) {
  ValueType* TeamThread_scratch =
      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());

  // FIXME_OPENMPTARGET - Make sure that if it is an array reduction, the
  // number of elements in the array is <= 32. For the reduction we allocate
  // 16 bytes per element in the scratch space; hence, 16*32 = 512.
  static_assert(sizeof(ValueType) <=
                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
1338 
1339 #pragma omp barrier
1340   TeamThread_scratch[0] = init_result;
1341 #pragma omp barrier
1342 
1343   if constexpr (std::is_arithmetic<ValueType>::value) {
1344 #pragma omp for reduction(+ : TeamThread_scratch[:1])
1345     for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
1346       ValueType tmp = ValueType();
1347       lambda(i, tmp);
1348       TeamThread_scratch[0] += tmp;
1349     }
1350   } else {
1351 #pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
1352 
1353 #pragma omp for reduction(custom : TeamThread_scratch[:1])
1354     for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
1355       ValueType tmp = ValueType();
1356       lambda(i, tmp);
1357       join(TeamThread_scratch[0], tmp);
1358     }
1359   }
1360 
1361   init_result = TeamThread_scratch[0];
1362 }
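// Usage sketch (illustrative only, not part of this header): the overload
// above takes an explicit join operation and an in/out initial value, which
// should be the neutral element of that join. The array "x" and the length
// "n" are assumptions for the example.
//
//   double team_sum = 0.;  // neutral element of the join below
//   Kokkos::parallel_reduce(
//       Kokkos::TeamThreadRange(team, n),
//       [&](const int i, double& val) { val += x[i]; },
//       [](double& dst, const double& src) { dst += src; }, team_sum);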

// This is largely the same code as in HIP and CUDA except for the member name
template <typename iType, class FunctorType>
KOKKOS_INLINE_FUNCTION void parallel_scan(
    const Impl::TeamThreadRangeBoundariesStruct<
        iType, Impl::OpenMPTargetExecTeamMember>& loop_bounds,
    const FunctorType& lambda) {
  // Extract value_type from lambda
  using value_type = typename Kokkos::Impl::FunctorAnalysis<
      Kokkos::Impl::FunctorPatternInterface::SCAN, void,
      FunctorType>::value_type;

  const auto start = loop_bounds.start;
  const auto end   = loop_bounds.end;
  // Note this thing is called .member in the CUDA specialization of
  // TeamThreadRangeBoundariesStruct
  auto& member         = loop_bounds.team;
  const auto team_size = member.team_size();
  const auto team_rank = member.team_rank();
  const auto nchunk    = (end - start + team_size - 1) / team_size;
  value_type accum     = 0;
  // the team has to process one or more chunks of the prefix scan
  for (iType i = 0; i < nchunk; ++i) {
    auto ii = start + i * team_size + team_rank;
    // local accumulation for this chunk
    value_type local_accum = 0;
    // user updates value with prefix value
    if (ii < loop_bounds.end) lambda(ii, local_accum, false);
    // perform team scan
    local_accum = member.team_scan(local_accum);
    // add this chunk's accumulation to the total accumulation
    auto val = accum + local_accum;
    // user updates their data with total accumulation
    if (ii < loop_bounds.end) lambda(ii, val, true);
    // the last value needs to be propagated to the next chunk
    if (team_rank == team_size - 1) accum = val;
    // broadcast the last value to the rest of the team
    member.team_broadcast(accum, team_size - 1);
  }
}
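// Usage sketch (illustrative only, not part of this header): an exclusive
// prefix sum over a TeamThreadRange. The views "counts" and "offsets" and the
// extent "n" are assumptions for the example.
//
//   Kokkos::parallel_scan(
//       Kokkos::TeamThreadRange(team, n),
//       [&](const int i, int& partial, const bool final) {
//         if (final) offsets(i) = partial;  // exclusive prefix sum
//         partial += counts(i);
//       });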

}  // namespace Kokkos

namespace Kokkos {
/** \brief  Intra-thread vector parallel_for. Executes lambda(iType i) for each
 * i=0..N-1.
 *
 * The range i=0..N-1 is mapped to all vector lanes of the calling thread.
 */
1413 template <typename iType, class Lambda>
parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember> & loop_boundaries,const Lambda & lambda)1414 KOKKOS_INLINE_FUNCTION void parallel_for(
1415     const Impl::ThreadVectorRangeBoundariesStruct<
1416         iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
1417     const Lambda& lambda) {
1418 #pragma omp simd
1419   for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i);
1420 }
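// Usage sketch (illustrative only, not part of this header): the innermost
// level of the hierarchy, typically nested inside a TeamThreadRange. The
// views "A" and "x" and the extents "nrows_per_team"/"m" are assumptions.
//
//   Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nrows_per_team),
//                        [&](const int row) {
//     Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, m),
//                          [&](const int j) { A(row, j) = 2.0 * x(j); });
//   });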

/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i,
 * ValueType & val) for each i=0..N-1.
 *
 * The range i=0..N-1 is mapped to all vector lanes of the calling thread
 * and a summation of val is performed and put into result.
 */
template <typename iType, class Lambda, typename ValueType>
KOKKOS_INLINE_FUNCTION void parallel_reduce(
    const Impl::ThreadVectorRangeBoundariesStruct<
        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
    const Lambda& lambda, ValueType& result) {
  ValueType vector_reduce = ValueType();

  if constexpr (std::is_arithmetic<ValueType>::value) {
#pragma omp simd reduction(+ : vector_reduce)
    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
      ValueType tmp = ValueType();
      lambda(i, tmp);
      vector_reduce += tmp;
    }
  } else {
#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)

#pragma omp simd reduction(custom : vector_reduce)
    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
      lambda(i, vector_reduce);
    }
  }

  result = vector_reduce;
}
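// Usage sketch (illustrative only, not part of this header): a per-thread dot
// product over vector lanes. The views "a" and "b" and the row length "m" are
// assumptions for the example.
//
//   double dot = 0.;
//   Kokkos::parallel_reduce(
//       Kokkos::ThreadVectorRange(team, m),
//       [&](const int j, double& lsum) { lsum += a(j) * b(j); }, dot);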

template <typename iType, class Lambda, typename ReducerType>
KOKKOS_INLINE_FUNCTION
    std::enable_if_t<Kokkos::is_reducer_type<ReducerType>::value>
    parallel_reduce(
        const Impl::ThreadVectorRangeBoundariesStruct<
            iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
        const Lambda& lambda, ReducerType const& result) {
  using ValueType = typename ReducerType::value_type;

#pragma omp declare reduction(                                               \
    custom:ValueType                                                         \
    : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
    initializer(                                                             \
        Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))

  ValueType vector_reduce;
  Impl::OpenMPTargetReducerWrapper<ReducerType>::init(vector_reduce);

#pragma omp simd reduction(custom : vector_reduce)
  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
    lambda(i, vector_reduce);
  }

  result.reference() = vector_reduce;
}
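// Usage sketch (illustrative only, not part of this header): the reducer
// overload above, here with Kokkos::Min. The view "x" and the extent "m" are
// assumptions for the example.
//
//   double lane_min;
//   Kokkos::parallel_reduce(
//       Kokkos::ThreadVectorRange(team, m),
//       [&](const int j, double& lmin) { lmin = x(j) < lmin ? x(j) : lmin; },
//       Kokkos::Min<double>(lane_min));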

/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i,
 * ValueType & val) for each i=0..N-1.
 *
 * The range i=0..N-1 is mapped to all vector lanes of the calling thread
 * and a reduction of val is performed using JoinType(ValueType& val, const
 * ValueType& update) and put into init_result. The input value of init_result
 * is used as initializer for temporary variables of ValueType. Therefore the
 * input value should be the neutral element with respect to the join operation
 * (e.g. '0 for +-' or '1 for *').
 */
template <typename iType, class Lambda, typename ValueType, class JoinType>
KOKKOS_INLINE_FUNCTION void parallel_reduce(
    const Impl::ThreadVectorRangeBoundariesStruct<
        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
    const Lambda& lambda, const JoinType& join, ValueType& init_result) {
  ValueType result = init_result;

  // FIXME_OPENMPTARGET - think about using omp simd here; a user-provided
  // join does not work with the OpenMP reduction clause.
  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
    ValueType tmp = ValueType();
    lambda(i, tmp);
    join(result, tmp);
  }

  init_result = result;
}
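// Usage sketch (illustrative only, not part of this header): the explicit-join
// overload above at the vector-lane level. The view "x" and the extent "m" are
// assumptions; the initial value should be the neutral element of the join.
//
//   double lane_sum = 0.;
//   Kokkos::parallel_reduce(
//       Kokkos::ThreadVectorRange(team, m),
//       [&](const int j, double& val) { val += x(j); },
//       [](double& dst, const double& src) { dst += src; }, lane_sum);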

/** \brief  Intra-thread vector parallel exclusive prefix sum. Executes
 * lambda(iType i, ValueType & val, bool final) for each i=0..N-1.
 *
 * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan
 * operation is performed. Depending on the target execution space the operator
 * might be called twice: once with final=false and once with final=true. When
 * final==true val contains the prefix sum value. The contribution of this "i"
 * needs to be added to val no matter whether final==true or not. In a serial
 * execution (i.e. team_size==1) the operator is only called once with
 * final==true. scan_val will be set to the final sum value over all vector
 * lanes.
 */
template <typename iType, class FunctorType>
KOKKOS_INLINE_FUNCTION void parallel_scan(
    const Impl::ThreadVectorRangeBoundariesStruct<
        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
    const FunctorType& lambda) {
  using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, void>;
  using value_type  = typename ValueTraits::value_type;

  value_type scan_val = value_type();

#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
       i += loop_boundaries.increment) {
    lambda(i, scan_val, true);
  }
}
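// Usage sketch (illustrative only, not part of this header): an exclusive
// prefix sum over vector lanes. The views "flags" and "offsets" and the
// extent "m" are assumptions for the example.
//
//   Kokkos::parallel_scan(
//       Kokkos::ThreadVectorRange(team, m),
//       [&](const int j, int& partial, const bool final) {
//         if (final) offsets(j) = partial;
//         partial += flags(j);
//       });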

}  // namespace Kokkos

namespace Kokkos {
/** \brief  Intra-team vector parallel_for. Executes lambda(iType i) for each
 * i=0..N-1.
 *
 * The range i=0..N-1 is mapped to all vector lanes of the calling team.
 */
template <typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION void parallel_for(
    const Impl::TeamVectorRangeBoundariesStruct<
        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
    const Lambda& lambda) {
#pragma omp for simd nowait schedule(static, 1)
  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i);
}
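// Usage sketch (illustrative only, not part of this header): TeamVectorRange
// distributes the range over all threads and vector lanes of the team in a
// single nesting level. The views "y" and "x" and the extent "n" are
// assumptions for the example.
//
//   Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n),
//                        [&](const int i) { y(i) = 0.5 * x(i); });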

/** \brief  Intra-team vector parallel_reduce. Executes lambda(iType i,
 * ValueType & val) for each i=0..N-1.
 *
 * The range i=0..N-1 is mapped to all vector lanes of the calling team
 * and a summation of val is performed and put into result.
 */
template <typename iType, class Lambda, typename ValueType>
KOKKOS_INLINE_FUNCTION void parallel_reduce(
    const Impl::TeamVectorRangeBoundariesStruct<
        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
    const Lambda& lambda, ValueType& result) {
  // FIXME_OPENMPTARGET - Make sure that if it's an array reduction, the number
  // of elements in the array is <= 32. For the reduction we allocate 16 bytes
  // per element in the scratch space; hence, 16*32 = 512.
  static_assert(sizeof(ValueType) <=
                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);

  ValueType* TeamVector_scratch =
      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());

#pragma omp barrier
  TeamVector_scratch[0] = ValueType();
#pragma omp barrier

  if constexpr (std::is_arithmetic<ValueType>::value) {
#pragma omp for simd reduction(+ : TeamVector_scratch[:1])
    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
      ValueType tmp = ValueType();
      lambda(i, tmp);
      TeamVector_scratch[0] += tmp;
    }
  } else {
#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)

#pragma omp for simd reduction(custom : TeamVector_scratch[:1])
    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
      ValueType tmp = ValueType();
      lambda(i, tmp);
      TeamVector_scratch[0] += tmp;
    }
  }

  result = TeamVector_scratch[0];
}
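// Usage sketch (illustrative only, not part of this header): a team-wide sum
// over a TeamVectorRange. The view "x" and the extent "n" are assumptions for
// the example.
//
//   double team_sum = 0.;
//   Kokkos::parallel_reduce(
//       Kokkos::TeamVectorRange(team, n),
//       [&](const int i, double& lsum) { lsum += x(i); }, team_sum);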

#if !defined(KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND)
template <typename iType, class Lambda, typename ReducerType>
KOKKOS_INLINE_FUNCTION
    std::enable_if_t<Kokkos::is_reducer_type<ReducerType>::value>
    parallel_reduce(
        const Impl::TeamVectorRangeBoundariesStruct<
            iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
        const Lambda& lambda, ReducerType const& result) {
  using ValueType = typename ReducerType::value_type;

  // FIXME_OPENMPTARGET - Make sure that if it's an array reduction, the number
  // of elements in the array is <= 32. For the reduction we allocate 16 bytes
  // per element in the scratch space; hence, 16*32 = 512.
  static_assert(sizeof(ValueType) <=
                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);

#pragma omp declare reduction(                                               \
    custom:ValueType                                                         \
    : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
    initializer(                                                             \
        Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))

  ValueType* TeamVector_scratch =
      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());

#pragma omp barrier
  Impl::OpenMPTargetReducerWrapper<ReducerType>::init(TeamVector_scratch[0]);
#pragma omp barrier

#pragma omp for simd reduction(custom : TeamVector_scratch[:1])
  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
    ValueType tmp = ValueType();
    lambda(i, tmp);
    TeamVector_scratch[0] += tmp;
  }

  result.reference() = TeamVector_scratch[0];
}
#else
template <typename iType, class Lambda, typename ReducerType>
KOKKOS_INLINE_FUNCTION
    std::enable_if_t<Kokkos::is_reducer_type<ReducerType>::value>
    parallel_reduce(
        const Impl::TeamVectorRangeBoundariesStruct<
            iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
        const Lambda& lambda, ReducerType const& result) {
  using ValueType = typename ReducerType::value_type;

  // FIXME_OPENMPTARGET - Make sure that if it's an array reduction, the number
  // of elements in the array is <= 32. For the reduction we allocate 16 bytes
  // per element in the scratch space; hence, 16*32 = 512.
  static_assert(sizeof(ValueType) <=
                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);

  ValueType* TeamVector_scratch =
      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());

#pragma omp declare reduction(                                               \
    omp_red_teamthread_reducer:ValueType                                     \
    : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
    initializer(                                                             \
        Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))

#pragma omp barrier
  ValueType tmp;
  result.init(tmp);
  TeamVector_scratch[0] = tmp;
#pragma omp barrier

  iType team_size = iType(omp_get_num_threads());
#pragma omp for simd reduction(omp_red_teamthread_reducer \
                               : TeamVector_scratch[:1]) schedule(static, 1)
  for (iType t = 0; t < team_size; t++) {
    ValueType tmp2;
    result.init(tmp2);

    for (iType i = loop_boundaries.start + t; i < loop_boundaries.end;
         i += team_size) {
      lambda(i, tmp2);
    }
    TeamVector_scratch[0] = tmp2;
  }

  result.reference() = TeamVector_scratch[0];
}
#endif  // KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
}  // namespace Kokkos

#undef KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND

namespace Kokkos {

template <class FunctorType>
KOKKOS_INLINE_FUNCTION void single(
    const Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember>&
    /*single_struct*/,
    const FunctorType& lambda) {
  lambda();
}

template <class FunctorType>
KOKKOS_INLINE_FUNCTION void single(
    const Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember>&
        single_struct,
    const FunctorType& lambda) {
  if (single_struct.team_member.team_rank() == 0) lambda();
}

template <class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION void single(
    const Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember>&
    /*single_struct*/,
    const FunctorType& lambda, ValueType& val) {
  lambda(val);
}

template <class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION void single(
    const Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember>&
        single_struct,
    const FunctorType& lambda, ValueType& val) {
  if (single_struct.team_member.team_rank() == 0) {
    lambda(val);
  }
  single_struct.team_member.team_broadcast(val, 0);
}
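
// Usage sketch (illustrative only, not part of this header): Kokkos::single
// with PerTeam runs the lambda once per team (team_rank() == 0 above) and,
// for the value overload, broadcasts the result to every thread of the team;
// PerThread runs it once per thread. The helper "compute_seed" is an assumed
// placeholder.
//
//   double seed = 0.;
//   Kokkos::single(Kokkos::PerTeam(team),
//                  [&](double& v) { v = compute_seed(team.league_rank()); },
//                  seed);
//   // seed now holds the same value on all threads of the team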
}  // namespace Kokkos

#endif /* #ifndef KOKKOS_OPENMPTARGETEXEC_HPP */