1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 // Kokkos v. 3.0
6 // Copyright (2020) National Technology & Engineering
7 // Solutions of Sandia, LLC (NTESS).
8 //
9 // Under the terms of Contract DE-NA0003525 with NTESS,
10 // the U.S. Government retains certain rights in this software.
11 //
12 // Redistribution and use in source and binary forms, with or without
13 // modification, are permitted provided that the following conditions are
14 // met:
15 //
16 // 1. Redistributions of source code must retain the above copyright
17 // notice, this list of conditions and the following disclaimer.
18 //
19 // 2. Redistributions in binary form must reproduce the above copyright
20 // notice, this list of conditions and the following disclaimer in the
21 // documentation and/or other materials provided with the distribution.
22 //
23 // 3. Neither the name of the Corporation nor the names of the
24 // contributors may be used to endorse or promote products derived from
25 // this software without specific prior written permission.
26 //
27 // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
28 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
31 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 //
39 // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
40 //
41 // ************************************************************************
42 //@HEADER
43 */
44
45 #ifndef KOKKOS_OPENMPTARGETEXEC_HPP
46 #define KOKKOS_OPENMPTARGETEXEC_HPP
47
48 #include <impl/Kokkos_Traits.hpp>
49 #include <impl/Kokkos_Spinwait.hpp>
50
51 #include <Kokkos_Atomic.hpp>
52 #include "Kokkos_OpenMPTarget_Abort.hpp"
53
54 // FIXME_OPENMPTARGET - Using this macro to implement a workaround for
55 // hierarchical reducers. It avoids hitting the code path which we wanted to
56 // write but doesn't work. undef'ed at the end.
57 #define KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
58
59 //----------------------------------------------------------------------------
60 //----------------------------------------------------------------------------
61
62 namespace Kokkos {
63 namespace Impl {
64
// Fallback wrapper mapping an arbitrary Kokkos reducer onto the interface the
// OpenMPTarget backend expects.  Only the explicit specializations below are
// actually supported; this generic version merely emits a diagnostic from
// every operation instead of reducing anything.
template <class Reducer>
struct OpenMPTargetReducerWrapper {
  using value_type = typename Reducer::value_type;

// WORKAROUND OPENMPTARGET
// This pragma omp declare target should not be necessary, but Intel compiler
// fails without it
#pragma omp declare target
  // Combine a partial result into the destination.  Unsupported here: the
  // arguments are ignored and only a message is printed.
  KOKKOS_INLINE_FUNCTION
  static void join(value_type&, const value_type&) {
    printf(
        "Using a generic unknown Reducer for the OpenMPTarget backend is not "
        "implemented.");
  }

  // Volatile overload used by legacy join code paths; same unsupported stub.
  KOKKOS_INLINE_FUNCTION
  static void join(volatile value_type&, const volatile value_type&) {
    printf(
        "Using a generic unknown Reducer for the OpenMPTarget backend is not "
        "implemented.");
  }

  // Initialize a value to the reduction identity; same unsupported stub.
  KOKKOS_INLINE_FUNCTION
  static void init(value_type&) {
    printf(
        "Using a generic unknown Reducer for the OpenMPTarget backend is not "
        "implemented.");
  }
#pragma omp end declare target
};
95
96 template <class Scalar, class Space>
97 struct OpenMPTargetReducerWrapper<Sum<Scalar, Space>> {
98 public:
99 // Required
100 using value_type = typename std::remove_cv<Scalar>::type;
101
102 // WORKAROUND OPENMPTARGET
103 // This pragma omp declare target should not be necessary, but Intel compiler
104 // fails without it
105 #pragma omp declare target
106 // Required
107 KOKKOS_INLINE_FUNCTION
joinKokkos::Impl::OpenMPTargetReducerWrapper108 static void join(value_type& dest, const value_type& src) { dest += src; }
109
110 KOKKOS_INLINE_FUNCTION
joinKokkos::Impl::OpenMPTargetReducerWrapper111 static void join(volatile value_type& dest, const volatile value_type& src) {
112 dest += src;
113 }
114
115 KOKKOS_INLINE_FUNCTION
initKokkos::Impl::OpenMPTargetReducerWrapper116 static void init(value_type& val) {
117 val = reduction_identity<value_type>::sum();
118 }
119 #pragma omp end declare target
120 };
121
122 template <class Scalar, class Space>
123 struct OpenMPTargetReducerWrapper<Prod<Scalar, Space>> {
124 public:
125 // Required
126 using value_type = typename std::remove_cv<Scalar>::type;
127
128 // WORKAROUND OPENMPTARGET
129 // This pragma omp declare target should not be necessary, but Intel compiler
130 // fails without it
131 #pragma omp declare target
132 // Required
133 KOKKOS_INLINE_FUNCTION
joinKokkos::Impl::OpenMPTargetReducerWrapper134 static void join(value_type& dest, const value_type& src) { dest *= src; }
135
136 KOKKOS_INLINE_FUNCTION
joinKokkos::Impl::OpenMPTargetReducerWrapper137 static void join(volatile value_type& dest, const volatile value_type& src) {
138 dest *= src;
139 }
140
141 KOKKOS_INLINE_FUNCTION
initKokkos::Impl::OpenMPTargetReducerWrapper142 static void init(value_type& val) {
143 val = reduction_identity<value_type>::prod();
144 }
145 #pragma omp end declare target
146 };
147
148 template <class Scalar, class Space>
149 struct OpenMPTargetReducerWrapper<Min<Scalar, Space>> {
150 public:
151 // Required
152 using value_type = typename std::remove_cv<Scalar>::type;
153
154 // WORKAROUND OPENMPTARGET
155 // This pragma omp declare target should not be necessary, but Intel compiler
156 // fails without it
157 #pragma omp declare target
158 // Required
159 KOKKOS_INLINE_FUNCTION
joinKokkos::Impl::OpenMPTargetReducerWrapper160 static void join(value_type& dest, const value_type& src) {
161 if (src < dest) dest = src;
162 }
163
164 KOKKOS_INLINE_FUNCTION
joinKokkos::Impl::OpenMPTargetReducerWrapper165 static void join(volatile value_type& dest, const volatile value_type& src) {
166 if (src < dest) dest = src;
167 }
168
169 KOKKOS_INLINE_FUNCTION
initKokkos::Impl::OpenMPTargetReducerWrapper170 static void init(value_type& val) {
171 val = reduction_identity<value_type>::min();
172 }
173 #pragma omp end declare target
174 };
175
176 template <class Scalar, class Space>
177 struct OpenMPTargetReducerWrapper<Max<Scalar, Space>> {
178 public:
179 // Required
180 using value_type = typename std::remove_cv<Scalar>::type;
181
182 // WORKAROUND OPENMPTARGET
183 // This pragma omp declare target should not be necessary, but Intel compiler
184 // fails without it
185 #pragma omp declare target
186 // Required
187 KOKKOS_INLINE_FUNCTION
joinKokkos::Impl::OpenMPTargetReducerWrapper188 static void join(value_type& dest, const value_type& src) {
189 if (src > dest) dest = src;
190 }
191
192 KOKKOS_INLINE_FUNCTION
joinKokkos::Impl::OpenMPTargetReducerWrapper193 static void join(volatile value_type& dest, const volatile value_type& src) {
194 if (src > dest) dest = src;
195 }
196
197 // Required
198 KOKKOS_INLINE_FUNCTION
initKokkos::Impl::OpenMPTargetReducerWrapper199 static void init(value_type& val) {
200 val = reduction_identity<value_type>::max();
201 }
202 #pragma omp end declare target
203 };
204
205 template <class Scalar, class Space>
206 struct OpenMPTargetReducerWrapper<LAnd<Scalar, Space>> {
207 public:
208 // Required
209 using value_type = typename std::remove_cv<Scalar>::type;
210
211 // WORKAROUND OPENMPTARGET
212 // This pragma omp declare target should not be necessary, but Intel compiler
213 // fails without it
214 #pragma omp declare target
215 KOKKOS_INLINE_FUNCTION
joinKokkos::Impl::OpenMPTargetReducerWrapper216 static void join(value_type& dest, const value_type& src) {
217 dest = dest && src;
218 }
219
220 KOKKOS_INLINE_FUNCTION
joinKokkos::Impl::OpenMPTargetReducerWrapper221 static void join(volatile value_type& dest, const volatile value_type& src) {
222 dest = dest && src;
223 }
224
225 KOKKOS_INLINE_FUNCTION
initKokkos::Impl::OpenMPTargetReducerWrapper226 static void init(value_type& val) {
227 val = reduction_identity<value_type>::land();
228 }
229 #pragma omp end declare target
230 };
231
232 template <class Scalar, class Space>
233 struct OpenMPTargetReducerWrapper<LOr<Scalar, Space>> {
234 public:
235 // Required
236 using value_type = typename std::remove_cv<Scalar>::type;
237
238 using result_view_type = Kokkos::View<value_type, Space>;
239
240 // WORKAROUND OPENMPTARGET
241 // This pragma omp declare target should not be necessary, but Intel compiler
242 // fails without it
243 #pragma omp declare target
244 // Required
245 KOKKOS_INLINE_FUNCTION
joinKokkos::Impl::OpenMPTargetReducerWrapper246 static void join(value_type& dest, const value_type& src) {
247 dest = dest || src;
248 }
249
250 KOKKOS_INLINE_FUNCTION
joinKokkos::Impl::OpenMPTargetReducerWrapper251 static void join(volatile value_type& dest, const volatile value_type& src) {
252 dest = dest || src;
253 }
254
255 KOKKOS_INLINE_FUNCTION
initKokkos::Impl::OpenMPTargetReducerWrapper256 static void init(value_type& val) {
257 val = reduction_identity<value_type>::lor();
258 }
259 #pragma omp end declare target
260 };
261
262 template <class Scalar, class Space>
263 struct OpenMPTargetReducerWrapper<BAnd<Scalar, Space>> {
264 public:
265 // Required
266 using value_type = typename std::remove_cv<Scalar>::type;
267
268 // WORKAROUND OPENMPTARGET
269 // This pragma omp declare target should not be necessary, but Intel compiler
270 // fails without it
271 #pragma omp declare target
272 // Required
273 KOKKOS_INLINE_FUNCTION
joinKokkos::Impl::OpenMPTargetReducerWrapper274 static void join(value_type& dest, const value_type& src) {
275 dest = dest & src;
276 }
277
278 KOKKOS_INLINE_FUNCTION
joinKokkos::Impl::OpenMPTargetReducerWrapper279 static void join(volatile value_type& dest, const volatile value_type& src) {
280 dest = dest & src;
281 }
282
283 KOKKOS_INLINE_FUNCTION
initKokkos::Impl::OpenMPTargetReducerWrapper284 static void init(value_type& val) {
285 val = reduction_identity<value_type>::band();
286 }
287 #pragma omp end declare target
288 };
289
290 template <class Scalar, class Space>
291 struct OpenMPTargetReducerWrapper<BOr<Scalar, Space>> {
292 public:
293 // Required
294 using value_type = typename std::remove_cv<Scalar>::type;
295
296 // WORKAROUND OPENMPTARGET
297 // This pragma omp declare target should not be necessary, but Intel compiler
298 // fails without it
299 #pragma omp declare target
300 // Required
301 KOKKOS_INLINE_FUNCTION
joinKokkos::Impl::OpenMPTargetReducerWrapper302 static void join(value_type& dest, const value_type& src) {
303 dest = dest | src;
304 }
305
306 KOKKOS_INLINE_FUNCTION
joinKokkos::Impl::OpenMPTargetReducerWrapper307 static void join(volatile value_type& dest, const volatile value_type& src) {
308 dest = dest | src;
309 }
310
311 KOKKOS_INLINE_FUNCTION
initKokkos::Impl::OpenMPTargetReducerWrapper312 static void init(value_type& val) {
313 val = reduction_identity<value_type>::bor();
314 }
315 #pragma omp end declare target
316 };
317
318 template <class Scalar, class Index, class Space>
319 struct OpenMPTargetReducerWrapper<MinLoc<Scalar, Index, Space>> {
320 private:
321 using scalar_type = typename std::remove_cv<Scalar>::type;
322 using index_type = typename std::remove_cv<Index>::type;
323
324 public:
325 // Required
326 using value_type = ValLocScalar<scalar_type, index_type>;
327
328 // WORKAROUND OPENMPTARGET
329 // This pragma omp declare target should not be necessary, but Intel compiler
330 // fails without it
331 #pragma omp declare target
332 // Required
333 KOKKOS_INLINE_FUNCTION
joinKokkos::Impl::OpenMPTargetReducerWrapper334 static void join(value_type& dest, const value_type& src) {
335 if (src.val < dest.val) dest = src;
336 }
337
338 KOKKOS_INLINE_FUNCTION
joinKokkos::Impl::OpenMPTargetReducerWrapper339 static void join(volatile value_type& dest, const volatile value_type& src) {
340 if (src.val < dest.val) dest = src;
341 }
342
343 KOKKOS_INLINE_FUNCTION
initKokkos::Impl::OpenMPTargetReducerWrapper344 static void init(value_type& val) {
345 val.val = reduction_identity<scalar_type>::min();
346 val.loc = reduction_identity<index_type>::min();
347 }
348 #pragma omp end declare target
349 };
350
351 template <class Scalar, class Index, class Space>
352 struct OpenMPTargetReducerWrapper<MaxLoc<Scalar, Index, Space>> {
353 private:
354 using scalar_type = typename std::remove_cv<Scalar>::type;
355 using index_type = typename std::remove_cv<Index>::type;
356
357 public:
358 // Required
359 using value_type = ValLocScalar<scalar_type, index_type>;
360
361 // WORKAROUND OPENMPTARGET
362 // This pragma omp declare target should not be necessary, but Intel compiler
363 // fails without it
364 #pragma omp declare target
365 KOKKOS_INLINE_FUNCTION
joinKokkos::Impl::OpenMPTargetReducerWrapper366 static void join(value_type& dest, const value_type& src) {
367 if (src.val > dest.val) dest = src;
368 }
369
370 KOKKOS_INLINE_FUNCTION
joinKokkos::Impl::OpenMPTargetReducerWrapper371 static void join(volatile value_type& dest, const volatile value_type& src) {
372 if (src.val > dest.val) dest = src;
373 }
374
375 KOKKOS_INLINE_FUNCTION
initKokkos::Impl::OpenMPTargetReducerWrapper376 static void init(value_type& val) {
377 val.val = reduction_identity<scalar_type>::max();
378 val.loc = reduction_identity<index_type>::min();
379 }
380 #pragma omp end declare target
381 };
382
383 template <class Scalar, class Space>
384 struct OpenMPTargetReducerWrapper<MinMax<Scalar, Space>> {
385 private:
386 using scalar_type = typename std::remove_cv<Scalar>::type;
387
388 public:
389 // Required
390 using value_type = MinMaxScalar<scalar_type>;
391
392 // WORKAROUND OPENMPTARGET
393 // This pragma omp declare target should not be necessary, but Intel compiler
394 // fails without it
395 #pragma omp declare target
396 // Required
397 KOKKOS_INLINE_FUNCTION
joinKokkos::Impl::OpenMPTargetReducerWrapper398 static void join(value_type& dest, const value_type& src) {
399 if (src.min_val < dest.min_val) {
400 dest.min_val = src.min_val;
401 }
402 if (src.max_val > dest.max_val) {
403 dest.max_val = src.max_val;
404 }
405 }
406
407 KOKKOS_INLINE_FUNCTION
joinKokkos::Impl::OpenMPTargetReducerWrapper408 static void join(volatile value_type& dest, const volatile value_type& src) {
409 if (src.min_val < dest.min_val) {
410 dest.min_val = src.min_val;
411 }
412 if (src.max_val > dest.max_val) {
413 dest.max_val = src.max_val;
414 }
415 }
416
417 KOKKOS_INLINE_FUNCTION
initKokkos::Impl::OpenMPTargetReducerWrapper418 static void init(value_type& val) {
419 val.max_val = reduction_identity<scalar_type>::max();
420 val.min_val = reduction_identity<scalar_type>::min();
421 }
422 #pragma omp end declare target
423 };
424
425 template <class Scalar, class Index, class Space>
426 struct OpenMPTargetReducerWrapper<MinMaxLoc<Scalar, Index, Space>> {
427 private:
428 using scalar_type = typename std::remove_cv<Scalar>::type;
429 using index_type = typename std::remove_cv<Index>::type;
430
431 public:
432 // Required
433 using value_type = MinMaxLocScalar<scalar_type, index_type>;
434
435 // WORKAROUND OPENMPTARGET
436 // This pragma omp declare target should not be necessary, but Intel compiler
437 // fails without it
438 #pragma omp declare target
439 // Required
440 KOKKOS_INLINE_FUNCTION
joinKokkos::Impl::OpenMPTargetReducerWrapper441 static void join(value_type& dest, const value_type& src) {
442 if (src.min_val < dest.min_val) {
443 dest.min_val = src.min_val;
444 dest.min_loc = src.min_loc;
445 }
446 if (src.max_val > dest.max_val) {
447 dest.max_val = src.max_val;
448 dest.max_loc = src.max_loc;
449 }
450 }
451
452 KOKKOS_INLINE_FUNCTION
joinKokkos::Impl::OpenMPTargetReducerWrapper453 static void join(volatile value_type& dest, const volatile value_type& src) {
454 if (src.min_val < dest.min_val) {
455 dest.min_val = src.min_val;
456 dest.min_loc = src.min_loc;
457 }
458 if (src.max_val > dest.max_val) {
459 dest.max_val = src.max_val;
460 dest.max_loc = src.max_loc;
461 }
462 }
463
464 KOKKOS_INLINE_FUNCTION
initKokkos::Impl::OpenMPTargetReducerWrapper465 static void init(value_type& val) {
466 val.max_val = reduction_identity<scalar_type>::max();
467 val.min_val = reduction_identity<scalar_type>::min();
468 val.max_loc = reduction_identity<index_type>::min();
469 val.min_loc = reduction_identity<index_type>::min();
470 }
471 #pragma omp end declare target
472 };
473 /*
474 template<class ReducerType>
475 class OpenMPTargetReducerWrapper {
476 public:
477 const ReducerType& reducer;
478 using value_type = typename ReducerType::value_type;
479 value_type& value;
480
481 KOKKOS_INLINE_FUNCTION
482 void join(const value_type& upd) {
483 reducer.join(value,upd);
484 }
485
486 KOKKOS_INLINE_FUNCTION
487 void init(const value_type& upd) {
488 reducer.init(value,upd);
489 }
490 };*/
491
492 } // namespace Impl
493 } // namespace Kokkos
494
495 namespace Kokkos {
496 namespace Impl {
497
498 //----------------------------------------------------------------------------
499 /** \brief Data for OpenMPTarget thread execution */
500
// Backend-wide state and helpers for OpenMPTarget execution: scratch-memory
// allocation shared by all teams, lock arrays, and sanity checks.  All
// members are static; definitions live in the backend's .cpp file.
class OpenMPTargetExec {
 public:
  // FIXME_OPENMPTARGET - Currently the maximum number of
  // teams possible is calculated based on NVIDIA's Volta GPU. In
  // future this value should be based on the chosen architecture for the
  // OpenMPTarget backend.
  enum { MAX_ACTIVE_THREADS = 2080 * 80 };
  enum { MAX_ACTIVE_TEAMS = MAX_ACTIVE_THREADS / 32 };

 private:
  static void* scratch_ptr;

 public:
  // Abort with the given message unless called from the master process /
  // an initialized backend (contracts enforced in the .cpp file).
  static void verify_is_process(const char* const);
  static void verify_initialized(const char* const);

  // Per-team lock storage used for inter-team synchronization primitives.
  static int* get_lock_array(int num_teams);
  static void* get_scratch_ptr();
  static void clear_scratch();
  static void clear_lock_array();
  // (Re)allocate the global scratch block sized from per-team reduce,
  // team-shared, and thread-local byte counts.
  static void resize_scratch(int64_t team_reduce_bytes,
                             int64_t team_shared_bytes,
                             int64_t thread_local_bytes);

  static void* m_scratch_ptr;        // base of the device scratch allocation
  static int64_t m_scratch_size;     // size of that allocation in bytes
  static int* m_lock_array;          // lock storage, see get_lock_array()
  static int64_t m_lock_size;
  static uint32_t* m_uniquetoken_ptr;  // storage backing UniqueToken
};
531
532 } // namespace Impl
533 } // namespace Kokkos
534
535 //----------------------------------------------------------------------------
536 //----------------------------------------------------------------------------
537
538 namespace Kokkos {
539 namespace Impl {
540
// Team handle passed to user functors in hierarchical parallelism on the
// OpenMPTarget backend.  Wraps the OpenMP team/thread identifiers and the
// slice of the global scratch allocation owned by this team.
class OpenMPTargetExecTeamMember {
 public:
  // Bytes reserved at the start of each team's scratch block for team-level
  // reductions/broadcasts (see the FIXME above the constructor).
  enum { TEAM_REDUCE_SIZE = 512 };

  /** \brief Thread states for team synchronization */
  enum { Active = 0, Rendezvous = 1 };

  using execution_space = Kokkos::Experimental::OpenMPTarget;
  using scratch_memory_space = execution_space::scratch_memory_space;

  scratch_memory_space m_team_shared;  // L0/L1 scratch handle for this team
  int m_team_scratch_size[2];          // scratch bytes requested per level
  int m_team_rank;
  int m_team_size;
  int m_league_rank;
  int m_league_size;
  int m_vector_length;
  int m_vector_lane;
  int m_shmem_block_index;  // index of the scratch block this team owns
  void* m_glb_scratch;      // base of the backend-wide scratch allocation
  void* m_reduce_scratch;   // this team's TEAM_REDUCE_SIZE reduction area

  /*
  // Fan-in team threads, root of the fan-in which does not block returns true
  inline
  bool team_fan_in() const
    {
      memory_fence();
      for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! (
  m_team_rank_rev & n ) ; n <<= 1 ) {

        m_exec.pool_rev( m_team_base_rev + j )->state_wait( Active );
      }

      if ( m_team_rank_rev ) {
        m_exec.state_set( Rendezvous );
        memory_fence();
        m_exec.state_wait( Rendezvous );
      }

      return 0 == m_team_rank_rev ;
    }

  inline
  void team_fan_out() const
    {
      memory_fence();
      for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! (
  m_team_rank_rev & n ) ; n <<= 1 ) { m_exec.pool_rev( m_team_base_rev + j
  )->state_set( Active ); memory_fence();
      }
    }
  */
 public:
  // Level-0 (team-shared) scratch space.
  KOKKOS_INLINE_FUNCTION
  const execution_space::scratch_memory_space& team_shmem() const {
    return m_team_shared.set_team_thread_mode(0, 1, 0);
  }

  // Team-shared scratch space at the requested level.
  KOKKOS_INLINE_FUNCTION
  const execution_space::scratch_memory_space& team_scratch(int level) const {
    return m_team_shared.set_team_thread_mode(level, 1,
                                              m_team_scratch_size[level]);
  }

  // Per-thread slice of the scratch space at the requested level.
  KOKKOS_INLINE_FUNCTION
  const execution_space::scratch_memory_space& thread_scratch(int level) const {
    return m_team_shared.set_team_thread_mode(level, team_size(), team_rank());
  }

  KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank; }
  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size; }
  KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank; }
  KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size; }
  // Raw pointer to this team's reduction scratch area.
  KOKKOS_INLINE_FUNCTION void* impl_reduce_scratch() const {
    return m_reduce_scratch;
  }

  // Synchronize all threads of the team (maps directly onto an OpenMP
  // barrier).
  KOKKOS_INLINE_FUNCTION void team_barrier() const {
#pragma omp barrier
  }

  // Broadcast `value` from thread `thread_id` to every thread in the team via
  // the team's reduction scratch area.
  template <class ValueType>
  KOKKOS_INLINE_FUNCTION void team_broadcast(ValueType& value,
                                             int thread_id) const {
    // Make sure there is enough scratch space: if ValueType does not fit in
    // TEAM_REDUCE_SIZE, `type` becomes void and the code below fails to
    // compile rather than overrunning the buffer.
    using type =
        typename std::conditional<(sizeof(ValueType) < TEAM_REDUCE_SIZE),
                                  ValueType, void>::type;
    type* team_scratch = reinterpret_cast<type*>(
        ((char*)(m_glb_scratch) + TEAM_REDUCE_SIZE * omp_get_team_num()));
#pragma omp barrier
    if (team_rank() == thread_id) *team_scratch = value;
#pragma omp barrier
    value = *team_scratch;
  }

  // Apply `f` to the broadcasting thread's value first, then broadcast.
  template <class Closure, class ValueType>
  KOKKOS_INLINE_FUNCTION void team_broadcast(const Closure& f, ValueType& value,
                                             const int& thread_id) const {
    f(value);
    team_broadcast(value, thread_id);
  }

  // Team-wide reduction of `value`; every thread receives the result.
  // NOTE(review): `op` is constructed from `op_in` but never used — the loops
  // below always combine with operator+=, so only sum-like JoinOps are
  // honored.  This appears related to the hierarchical-reducers workaround
  // macro at the top of the file; confirm before relying on custom JoinOps.
  template <class ValueType, class JoinOp>
  KOKKOS_INLINE_FUNCTION ValueType team_reduce(const ValueType& value,
                                               const JoinOp& op_in) const {
#pragma omp barrier

    using value_type = ValueType;
    const JoinLambdaAdapter<value_type, JoinOp> op(op_in);

    // Make sure there is enough scratch space:
    using type = std::conditional_t<(sizeof(value_type) < TEAM_REDUCE_SIZE),
                                    value_type, void>;

    // Phase 1: zero the n_values accumulation slots in this team's scratch.
    const int n_values = TEAM_REDUCE_SIZE / sizeof(value_type);
    type* team_scratch =
        (type*)((char*)m_glb_scratch + TEAM_REDUCE_SIZE * omp_get_team_num());
    for (int i = m_team_rank; i < n_values; i += m_team_size) {
      team_scratch[i] = value_type();
    }

#pragma omp barrier

    // Phase 2: in waves of n_values threads, each thread adds its value into
    // a distinct slot (rank modulo n_values); barriers keep waves disjoint.
    for (int k = 0; k < m_team_size; k += n_values) {
      if ((k <= m_team_rank) && (k + n_values > m_team_rank))
        team_scratch[m_team_rank % n_values] += value;
#pragma omp barrier
    }

    // Phase 3: tree-reduce the slots down into slot 0.
    for (int d = 1; d < n_values; d *= 2) {
      if ((m_team_rank + d < n_values) && (m_team_rank % (2 * d) == 0)) {
        team_scratch[m_team_rank] += team_scratch[m_team_rank + d];
      }
#pragma omp barrier
    }
    return team_scratch[0];
  }
  /** \brief Intra-team exclusive prefix sum with team_rank() ordering
   *  with intra-team non-deterministic ordering accumulation.
   *
   *  The global inter-team accumulation value will, at the end of the
   *  league's parallel execution, be the scan's total.
   *  Parallel execution ordering of the league's teams is non-deterministic.
   *  As such the base value for each team's scan operation is similarly
   *  non-deterministic.
   */
  template <typename ArgType>
  KOKKOS_INLINE_FUNCTION ArgType
  team_scan(const ArgType& /*value*/, ArgType* const /*global_accum*/) const {
    // FIXME_OPENMPTARGET
    // Not implemented: the inputs are ignored and a default-constructed
    // ArgType (zero for arithmetic types) is returned to every caller.
    /* // Make sure there is enough scratch space:
    using type =
      std::conditional_t<(sizeof(ArgType) < TEAM_REDUCE_SIZE), ArgType, void>;

    volatile type * const work_value  = ((type*) m_exec.scratch_thread());

    *work_value = value ;

    memory_fence();

    if ( team_fan_in() ) {
      // The last thread to synchronize returns true, all other threads wait
    for team_fan_out()
      // m_team_base[0]                 == highest ranking team member
      // m_team_base[ m_team_size - 1 ] == lowest ranking team member
      //
      // 1) copy from lower to higher rank, initialize lowest rank to zero
      // 2) prefix sum from lowest to highest rank, skipping lowest rank

      type accum = 0 ;

      if ( global_accum ) {
        for ( int i = m_team_size ; i-- ; ) {
          type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i
    )->scratch_thread()); accum += val ;
        }
        accum = atomic_fetch_add( global_accum , accum );
      }

      for ( int i = m_team_size ; i-- ; ) {
        type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i
    )->scratch_thread()); const type offset = accum ; accum += val ; val =
    offset ;
      }

      memory_fence();
    }

    team_fan_out();

    return *work_value ;*/
    return ArgType();
  }

  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
   *
   *  The highest rank thread can compute the reduction total as
   *    reduction_total = dev.team_scan( value ) + value ;
   */
  template <typename Type>
  KOKKOS_INLINE_FUNCTION Type team_scan(const Type& value) const {
    // Delegates with a null global accumulator (team-local scan only).
    return this->template team_scan<Type>(value, 0);
  }

  //----------------------------------------
  // Private for the driver

 private:
  using space = execution_space::scratch_memory_space;

 public:
  // FIXME_OPENMPTARGET - 512(16*32) bytes at the begining of the scratch space
  // for each league is saved for reduction. It should actually be based on the
  // ValueType of the reduction variable.
  // Each team's block is sized as L0 + L1 + 10% padding + TEAM_REDUCE_SIZE;
  // the block index selects this team's slice of the global allocation.
  // NOTE(review): m_reduce_scratch and the L0 scratch pointer both point at
  // the start of the block — confirm the intended layout places the reduce
  // area before L0 rather than overlapping it.
  inline OpenMPTargetExecTeamMember(
      const int league_rank, const int league_size, const int team_size,
      const int vector_length  // const TeamPolicyInternal< OpenMPTarget,
                               // Properties ...> & team
      ,
      void* const glb_scratch, const int shmem_block_index,
      const int shmem_size_L0, const int shmem_size_L1)
      : m_team_scratch_size{shmem_size_L0, shmem_size_L1},
        m_team_rank(0),
        m_team_size(team_size),
        m_league_rank(league_rank),
        m_league_size(league_size),
        m_vector_length(vector_length),
        m_shmem_block_index(shmem_block_index),
        m_glb_scratch(glb_scratch) {
    const int omp_tid = omp_get_thread_num();
    m_team_shared = scratch_memory_space(
        ((char*)glb_scratch +
         m_shmem_block_index *
             (shmem_size_L0 + shmem_size_L1 +
              ((shmem_size_L0 + shmem_size_L1) * 10 / 100) + TEAM_REDUCE_SIZE)),
        shmem_size_L0,
        ((char*)glb_scratch +
         m_shmem_block_index * (shmem_size_L0 + shmem_size_L1 +
                                ((shmem_size_L0 + shmem_size_L1) * 10 / 100) +
                                TEAM_REDUCE_SIZE)) +
            shmem_size_L0 + ((shmem_size_L0 + shmem_size_L1) * 10 / 100) +
            TEAM_REDUCE_SIZE,
        shmem_size_L1);
    m_reduce_scratch =
        (char*)glb_scratch +
        shmem_block_index *
            (shmem_size_L0 + shmem_size_L1 +
             ((shmem_size_L0 + shmem_size_L1) * 10 / 100) + TEAM_REDUCE_SIZE);
    m_league_rank = league_rank;
    m_team_rank = omp_tid;
    m_vector_lane = 0;
  }

  // Bytes reserved per team for reductions.
  static inline int team_reduce_size() { return TEAM_REDUCE_SIZE; }
};
798
799 template <class... Properties>
800 class TeamPolicyInternal<Kokkos::Experimental::OpenMPTarget, Properties...>
801 : public PolicyTraits<Properties...> {
802 public:
803 //! Tag this class as a kokkos execution policy
804 using execution_policy = TeamPolicyInternal;
805
806 using traits = PolicyTraits<Properties...>;
807
808 //----------------------------------------
809
810 template <class FunctorType>
team_size_max(const FunctorType &,const ParallelForTag &)811 inline static int team_size_max(const FunctorType&, const ParallelForTag&) {
812 return 256;
813 }
814
815 template <class FunctorType>
team_size_max(const FunctorType &,const ParallelReduceTag &)816 inline static int team_size_max(const FunctorType&,
817 const ParallelReduceTag&) {
818 return 256;
819 }
820
821 template <class FunctorType, class ReducerType>
team_size_max(const FunctorType &,const ReducerType &,const ParallelReduceTag &)822 inline static int team_size_max(const FunctorType&, const ReducerType&,
823 const ParallelReduceTag&) {
824 return 256;
825 }
826
827 template <class FunctorType>
team_size_recommended(const FunctorType &,const ParallelForTag &)828 inline static int team_size_recommended(const FunctorType&,
829 const ParallelForTag&) {
830 return 128;
831 }
832
833 template <class FunctorType>
team_size_recommended(const FunctorType &,const ParallelReduceTag &)834 inline static int team_size_recommended(const FunctorType&,
835 const ParallelReduceTag&) {
836 return 128;
837 }
838
839 template <class FunctorType, class ReducerType>
team_size_recommended(const FunctorType &,const ReducerType &,const ParallelReduceTag &)840 inline static int team_size_recommended(const FunctorType&,
841 const ReducerType&,
842 const ParallelReduceTag&) {
843 return 128;
844 }
845
846 //----------------------------------------
847
848 private:
  int m_league_size;   // number of teams in the league
  int m_team_size;     // threads per team
  int m_vector_length; // vector lanes per thread
  int m_team_alloc;
  int m_team_iter;
  std::array<size_t, 2> m_team_scratch_size;    // per-level team-wide bytes
  std::array<size_t, 2> m_thread_scratch_size;  // per-level per-thread bytes
  bool m_tune_team_size;      // team size came from Kokkos::AUTO
  bool m_tune_vector_length;  // vector length came from Kokkos::AUTO
  constexpr const static size_t default_team_size = 256;
  int m_chunk_size;
860
init(const int league_size_request,const int team_size_request,const int vector_length_request)861 inline void init(const int league_size_request, const int team_size_request,
862 const int vector_length_request) {
863 m_league_size = league_size_request;
864
865 // Minimum team size should be 32 for OpenMPTarget backend.
866 if (team_size_request < 32) {
867 Kokkos::Impl::OpenMPTarget_abort(
868 "OpenMPTarget backend requires a minimum of 32 threads per team.\n");
869 } else
870 m_team_size = team_size_request;
871
872 m_vector_length = vector_length_request;
873 set_auto_chunk_size();
874 }
875
876 template <typename ExecSpace, typename... OtherProperties>
877 friend class TeamPolicyInternal;
878
879 public:
  // True when the team size was requested as Kokkos::AUTO and may be tuned.
  inline bool impl_auto_team_size() const { return m_tune_team_size; }
  // True when the vector length is open for tuning.
  inline bool impl_auto_vector_length() const { return m_tune_vector_length; }
  // Override the team size (used by the tuning machinery).
  inline void impl_set_team_size(const size_t size) { m_team_size = size; }
impl_set_vector_length(const size_t length)883 inline void impl_set_vector_length(const size_t length) {
884 m_tune_vector_length = length;
885 }
  // Vector lanes per thread of this policy.
  inline int impl_vector_length() const { return m_vector_length; }
  // Deprecated public spelling; forwards to impl_vector_length().
  KOKKOS_DEPRECATED inline int vector_length() const {
    return impl_vector_length();
  }
  inline int team_size() const { return m_team_size; }
  inline int league_size() const { return m_league_size; }
scratch_size(const int & level,int team_size_=-1) const892 inline size_t scratch_size(const int& level, int team_size_ = -1) const {
893 if (team_size_ < 0) team_size_ = m_team_size;
894 return m_team_scratch_size[level] +
895 team_size_ * m_thread_scratch_size[level];
896 }
897
space() const898 inline Kokkos::Experimental::OpenMPTarget space() const {
899 return Kokkos::Experimental::OpenMPTarget();
900 }
901
902 template <class... OtherProperties>
TeamPolicyInternal(const TeamPolicyInternal<OtherProperties...> & p)903 TeamPolicyInternal(const TeamPolicyInternal<OtherProperties...>& p)
904 : m_league_size(p.m_league_size),
905 m_team_size(p.m_team_size),
906 m_vector_length(p.m_vector_length),
907 m_team_alloc(p.m_team_alloc),
908 m_team_iter(p.m_team_iter),
909 m_team_scratch_size(p.m_team_scratch_size),
910 m_thread_scratch_size(p.m_thread_scratch_size),
911 m_tune_team_size(p.m_tune_team_size),
912 m_tune_vector_length(p.m_tune_vector_length),
913 m_chunk_size(p.m_chunk_size) {}
914
915 /** \brief Specify league size, request team size */
TeamPolicyInternal(typename traits::execution_space &,int league_size_request,int team_size_request,int vector_length_request=1)916 TeamPolicyInternal(typename traits::execution_space&, int league_size_request,
917 int team_size_request, int vector_length_request = 1)
918 : m_team_scratch_size{0, 0},
919 m_thread_scratch_size{0, 0},
920 m_tune_team_size(false),
921 m_tune_vector_length(false),
922 m_chunk_size(0) {
923 init(league_size_request, team_size_request, vector_length_request);
924 }
925
TeamPolicyInternal(typename traits::execution_space &,int league_size_request,const Kokkos::AUTO_t &,int vector_length_request=1)926 TeamPolicyInternal(typename traits::execution_space&, int league_size_request,
927 const Kokkos::AUTO_t& /* team_size_request */
928 ,
929 int vector_length_request = 1)
930 : m_team_scratch_size{0, 0},
931 m_thread_scratch_size{0, 0},
932 m_tune_team_size(true),
933 m_tune_vector_length(false),
934 m_chunk_size(0) {
935 init(league_size_request, default_team_size / vector_length_request,
936 vector_length_request);
937 }
938
TeamPolicyInternal(typename traits::execution_space &,int league_size_request,const Kokkos::AUTO_t &,const Kokkos::AUTO_t &)939 TeamPolicyInternal(typename traits::execution_space&, int league_size_request,
940 const Kokkos::AUTO_t& /* team_size_request */
941 ,
942 const Kokkos::AUTO_t& /* vector_length_request */)
943 : m_team_scratch_size{0, 0},
944 m_thread_scratch_size{0, 0},
945 m_tune_team_size(true),
946 m_tune_vector_length(true),
947 m_chunk_size(0) {
948 init(league_size_request, default_team_size, 1);
949 }
TeamPolicyInternal(typename traits::execution_space &,int league_size_request,int team_size_request,const Kokkos::AUTO_t &)950 TeamPolicyInternal(typename traits::execution_space&, int league_size_request,
951 int team_size_request,
952 const Kokkos::AUTO_t& /* vector_length_request */)
953 : m_team_scratch_size{0, 0},
954 m_thread_scratch_size{0, 0},
955 m_tune_team_size(false),
956 m_tune_vector_length(true),
957 m_chunk_size(0) {
958 init(league_size_request, team_size_request, 1);
959 }
960
TeamPolicyInternal(int league_size_request,int team_size_request,int vector_length_request=1)961 TeamPolicyInternal(int league_size_request, int team_size_request,
962 int vector_length_request = 1)
963 : m_team_scratch_size{0, 0},
964 m_thread_scratch_size{0, 0},
965 m_tune_team_size(false),
966 m_tune_vector_length(false),
967 m_chunk_size(0) {
968 init(league_size_request, team_size_request, vector_length_request);
969 }
970
TeamPolicyInternal(int league_size_request,const Kokkos::AUTO_t &,int vector_length_request=1)971 TeamPolicyInternal(int league_size_request,
972 const Kokkos::AUTO_t& /* team_size_request */
973 ,
974 int vector_length_request = 1)
975 : m_team_scratch_size{0, 0},
976 m_thread_scratch_size{0, 0},
977 m_tune_team_size(true),
978 m_tune_vector_length(false),
979 m_chunk_size(0) {
980 init(league_size_request, default_team_size / vector_length_request,
981 vector_length_request);
982 }
983
TeamPolicyInternal(int league_size_request,const Kokkos::AUTO_t &,const Kokkos::AUTO_t &)984 TeamPolicyInternal(int league_size_request,
985 const Kokkos::AUTO_t& /* team_size_request */
986 ,
987 const Kokkos::AUTO_t& /* vector_length_request */)
988 : m_team_scratch_size{0, 0},
989 m_thread_scratch_size{0, 0},
990 m_tune_team_size(true),
991 m_tune_vector_length(true),
992 m_chunk_size(0) {
993 init(league_size_request, default_team_size, 1);
994 }
TeamPolicyInternal(int league_size_request,int team_size_request,const Kokkos::AUTO_t &)995 TeamPolicyInternal(int league_size_request, int team_size_request,
996 const Kokkos::AUTO_t& /* vector_length_request */)
997 : m_team_scratch_size{0, 0},
998 m_thread_scratch_size{0, 0},
999 m_tune_team_size(false),
1000 m_tune_vector_length(true),
1001 m_chunk_size(0) {
1002 init(league_size_request, team_size_request, 1);
1003 }
vector_length_max()1004 inline static size_t vector_length_max() {
1005 return 32; /* TODO: this is bad. Need logic that is compiler and backend
1006 aware */
1007 }
team_alloc() const1008 inline int team_alloc() const { return m_team_alloc; }
team_iter() const1009 inline int team_iter() const { return m_team_iter; }
1010
chunk_size() const1011 inline int chunk_size() const { return m_chunk_size; }
1012
1013 /** \brief set chunk_size to a discrete value*/
set_chunk_size(typename traits::index_type chunk_size_)1014 inline TeamPolicyInternal& set_chunk_size(
1015 typename traits::index_type chunk_size_) {
1016 m_chunk_size = chunk_size_;
1017 return *this;
1018 }
1019
1020 /** \brief set per team scratch size for a specific level of the scratch
1021 * hierarchy */
set_scratch_size(const int & level,const PerTeamValue & per_team)1022 inline TeamPolicyInternal& set_scratch_size(const int& level,
1023 const PerTeamValue& per_team) {
1024 m_team_scratch_size[level] = per_team.value;
1025 return *this;
1026 }
1027
1028 /** \brief set per thread scratch size for a specific level of the scratch
1029 * hierarchy */
set_scratch_size(const int & level,const PerThreadValue & per_thread)1030 inline TeamPolicyInternal& set_scratch_size(
1031 const int& level, const PerThreadValue& per_thread) {
1032 m_thread_scratch_size[level] = per_thread.value;
1033 return *this;
1034 }
1035
1036 /** \brief set per thread and per team scratch size for a specific level of
1037 * the scratch hierarchy */
set_scratch_size(const int & level,const PerTeamValue & per_team,const PerThreadValue & per_thread)1038 inline TeamPolicyInternal& set_scratch_size(
1039 const int& level, const PerTeamValue& per_team,
1040 const PerThreadValue& per_thread) {
1041 m_team_scratch_size[level] = per_team.value;
1042 m_thread_scratch_size[level] = per_thread.value;
1043 return *this;
1044 }
1045
1046 private:
1047 /** \brief finalize chunk_size if it was set to AUTO*/
set_auto_chunk_size()1048 inline void set_auto_chunk_size() {
1049 int concurrency = 2048 * 128;
1050
1051 if (concurrency == 0) concurrency = 1;
1052
1053 if (m_chunk_size > 0) {
1054 if (!Impl::is_integral_power_of_two(m_chunk_size))
1055 Kokkos::abort("TeamPolicy blocking granularity must be power of two");
1056 }
1057
1058 int new_chunk_size = 1;
1059 while (new_chunk_size * 100 * concurrency < m_league_size)
1060 new_chunk_size *= 2;
1061 if (new_chunk_size < 128) {
1062 new_chunk_size = 1;
1063 while ((new_chunk_size * 40 * concurrency < m_league_size) &&
1064 (new_chunk_size < 128))
1065 new_chunk_size *= 2;
1066 }
1067 m_chunk_size = new_chunk_size;
1068 }
1069
1070 public:
1071 using member_type = Impl::OpenMPTargetExecTeamMember;
1072 };
1073 } // namespace Impl
1074
1075 } // namespace Kokkos
1076
1077 namespace Kokkos {
1078
1079 template <typename iType>
1080 KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
1081 iType, Impl::OpenMPTargetExecTeamMember>
TeamThreadRange(const Impl::OpenMPTargetExecTeamMember & thread,const iType & count)1082 TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread,
1083 const iType& count) {
1084 return Impl::TeamThreadRangeBoundariesStruct<
1085 iType, Impl::OpenMPTargetExecTeamMember>(thread, count);
1086 }
1087
1088 template <typename iType1, typename iType2>
1089 KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
1090 typename std::common_type<iType1, iType2>::type,
1091 Impl::OpenMPTargetExecTeamMember>
TeamThreadRange(const Impl::OpenMPTargetExecTeamMember & thread,const iType1 & begin,const iType2 & end)1092 TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread,
1093 const iType1& begin, const iType2& end) {
1094 using iType = typename std::common_type<iType1, iType2>::type;
1095 return Impl::TeamThreadRangeBoundariesStruct<
1096 iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(begin),
1097 iType(end));
1098 }
1099
1100 template <typename iType>
1101 KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
1102 iType, Impl::OpenMPTargetExecTeamMember>
ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember & thread,const iType & count)1103 ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread,
1104 const iType& count) {
1105 return Impl::ThreadVectorRangeBoundariesStruct<
1106 iType, Impl::OpenMPTargetExecTeamMember>(thread, count);
1107 }
1108
1109 template <typename iType1, typename iType2>
1110 KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
1111 typename std::common_type<iType1, iType2>::type,
1112 Impl::OpenMPTargetExecTeamMember>
ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember & thread,const iType1 & arg_begin,const iType2 & arg_end)1113 ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread,
1114 const iType1& arg_begin, const iType2& arg_end) {
1115 using iType = typename std::common_type<iType1, iType2>::type;
1116 return Impl::ThreadVectorRangeBoundariesStruct<
1117 iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(arg_begin),
1118 iType(arg_end));
1119 }
1120
1121 template <typename iType>
1122 KOKKOS_INLINE_FUNCTION Impl::TeamVectorRangeBoundariesStruct<
1123 iType, Impl::OpenMPTargetExecTeamMember>
TeamVectorRange(const Impl::OpenMPTargetExecTeamMember & thread,const iType & count)1124 TeamVectorRange(const Impl::OpenMPTargetExecTeamMember& thread,
1125 const iType& count) {
1126 return Impl::TeamVectorRangeBoundariesStruct<
1127 iType, Impl::OpenMPTargetExecTeamMember>(thread, count);
1128 }
1129
1130 template <typename iType1, typename iType2>
1131 KOKKOS_INLINE_FUNCTION Impl::TeamVectorRangeBoundariesStruct<
1132 typename std::common_type<iType1, iType2>::type,
1133 Impl::OpenMPTargetExecTeamMember>
TeamVectorRange(const Impl::OpenMPTargetExecTeamMember & thread,const iType1 & arg_begin,const iType2 & arg_end)1134 TeamVectorRange(const Impl::OpenMPTargetExecTeamMember& thread,
1135 const iType1& arg_begin, const iType2& arg_end) {
1136 using iType = typename std::common_type<iType1, iType2>::type;
1137 return Impl::TeamVectorRangeBoundariesStruct<
1138 iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(arg_begin),
1139 iType(arg_end));
1140 }
1141
1142 KOKKOS_INLINE_FUNCTION
PerTeam(const Impl::OpenMPTargetExecTeamMember & thread)1143 Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember> PerTeam(
1144 const Impl::OpenMPTargetExecTeamMember& thread) {
1145 return Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember>(thread);
1146 }
1147
1148 KOKKOS_INLINE_FUNCTION
PerThread(const Impl::OpenMPTargetExecTeamMember & thread)1149 Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember> PerThread(
1150 const Impl::OpenMPTargetExecTeamMember& thread) {
1151 return Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember>(thread);
1152 }
1153 } // namespace Kokkos
1154
1155 namespace Kokkos {
1156
1157 /** \brief Inter-thread parallel_for. Executes lambda(iType i) for each
1158 * i=0..N-1.
1159 *
1160 * The range i=0..N-1 is mapped to all threads of the the calling thread team.
1161 */
1162 template <typename iType, class Lambda>
parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember> & loop_boundaries,const Lambda & lambda)1163 KOKKOS_INLINE_FUNCTION void parallel_for(
1164 const Impl::TeamThreadRangeBoundariesStruct<
1165 iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
1166 const Lambda& lambda) {
1167 #pragma omp for nowait schedule(static, 1)
1168 for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i);
1169 }
1170
1171 /** \brief Inter-thread vector parallel_reduce. Executes lambda(iType i,
1172 * ValueType & val) for each i=0..N-1.
1173 *
1174 * The range i=0..N-1 is mapped to all threads of the the calling thread team
1175 * and a summation of val is performed and put into result.
1176 */
1177
1178 template <typename iType, class Lambda, typename ValueType>
1179 KOKKOS_INLINE_FUNCTION
1180 std::enable_if_t<!Kokkos::is_reducer_type<ValueType>::value>
parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember> & loop_boundaries,const Lambda & lambda,ValueType & result)1181 parallel_reduce(
1182 const Impl::TeamThreadRangeBoundariesStruct<
1183 iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
1184 const Lambda& lambda, ValueType& result) {
1185 // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
1186 // elements in the array <= 32. For reduction we allocate, 16 bytes per
1187 // element in the scratch space, hence, 16*32 = 512.
1188 static_assert(sizeof(ValueType) <=
1189 Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
1190
1191 ValueType* TeamThread_scratch =
1192 static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
1193
1194 #pragma omp barrier
1195 TeamThread_scratch[0] = ValueType();
1196 #pragma omp barrier
1197
1198 if constexpr (std::is_arithmetic<ValueType>::value) {
1199 #pragma omp for reduction(+ : TeamThread_scratch[:1])
1200 for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
1201 ValueType tmp = ValueType();
1202 lambda(i, tmp);
1203 TeamThread_scratch[0] += tmp;
1204 }
1205 } else {
1206 #pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
1207
1208 #pragma omp for reduction(custom : TeamThread_scratch[:1])
1209 for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
1210 ValueType tmp = ValueType();
1211 lambda(i, tmp);
1212 TeamThread_scratch[0] += tmp;
1213 }
1214 }
1215
1216 result = TeamThread_scratch[0];
1217 }
1218
1219 #if !defined(KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND)
1220 // For some reason the actual version we wanted to write doesn't work
1221 // and crashes. We should try this with every new compiler
1222 // This is the variant we actually wanted to write
1223 template <typename iType, class Lambda, typename ReducerType>
1224 KOKKOS_INLINE_FUNCTION
1225 std::enable_if_t<Kokkos::is_reducer_type<ReducerType>::value>
parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember> & loop_boundaries,const Lambda & lambda,ReducerType result)1226 parallel_reduce(
1227 const Impl::TeamThreadRangeBoundariesStruct<
1228 iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
1229 const Lambda& lambda, ReducerType result) {
1230 using ValueType = typename ReducerType::value_type;
1231
1232 #pragma omp declare reduction( \
1233 custominner:ValueType \
1234 : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
1235 initializer( \
1236 Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
1237
1238 // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
1239 // elements in the array <= 32. For reduction we allocate, 16 bytes per
1240 // element in the scratch space, hence, 16*32 = 512.
1241 static_assert(sizeof(ValueType) <=
1242 Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
1243
1244 ValueType* TeamThread_scratch =
1245 static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
1246
1247 #pragma omp barrier
1248 // These three lines all cause crash
1249 Impl::OpenMPTargetReducerWrapper<ReducerType>::init(TeamThread_scratch[0]);
1250 // result.init(TeamThread_scratch[0]);
1251 // Impl::OpenMPTargetReducerWrapper<ReducerType> red;
1252 // red.init(TeamThread_scratch[0]);
1253 #pragma omp barrier
1254
1255 #pragma omp for reduction(custominner : TeamThread_scratch[:1])
1256 for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
1257 ValueType tmp;
1258 result.init(tmp);
1259 lambda(i, tmp);
1260 // This line causes a crash
1261 Impl::OpenMPTargetReducerWrapper<ReducerType>::join(TeamThread_scratch[0],
1262 tmp);
1263 }
1264 result.reference() = TeamThread_scratch[0];
1265 }
1266 #else
1267 template <typename iType, class Lambda, typename ReducerType>
1268 KOKKOS_INLINE_FUNCTION
1269 std::enable_if_t<Kokkos::is_reducer_type<ReducerType>::value>
parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember> & loop_boundaries,const Lambda & lambda,ReducerType result)1270 parallel_reduce(
1271 const Impl::TeamThreadRangeBoundariesStruct<
1272 iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
1273 const Lambda& lambda, ReducerType result) {
1274 using ValueType = typename ReducerType::value_type;
1275
1276 // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
1277 // elements in the array <= 32. For reduction we allocate, 16 bytes per
1278 // element in the scratch space, hence, 16*32 = 512.
1279 static_assert(sizeof(ValueType) <=
1280 Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
1281
1282 ValueType* TeamThread_scratch =
1283 static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
1284
1285 #pragma omp declare reduction( \
1286 omp_red_teamthread_reducer:ValueType \
1287 : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
1288 initializer( \
1289 Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
1290
1291 #pragma omp barrier
1292 ValueType tmp;
1293 result.init(tmp);
1294 TeamThread_scratch[0] = tmp;
1295 #pragma omp barrier
1296
1297 iType team_size = iType(omp_get_num_threads());
1298 #pragma omp for reduction(omp_red_teamthread_reducer \
1299 : TeamThread_scratch[:1]) schedule(static, 1)
1300 for (iType t = 0; t < team_size; t++) {
1301 ValueType tmp2;
1302 result.init(tmp2);
1303
1304 for (iType i = loop_boundaries.start + t; i < loop_boundaries.end;
1305 i += team_size) {
1306 lambda(i, tmp2);
1307 }
1308 TeamThread_scratch[0] = tmp2;
1309 }
1310
1311 result.reference() = TeamThread_scratch[0];
1312 }
1313 #endif // KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
1314
1315 /** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i,
1316 * ValueType & val) for each i=0..N-1.
1317 *
1318 * The range i=0..N-1 is mapped to all vector lanes of the the calling thread
1319 * and a reduction of val is performed using JoinType(ValueType& val, const
1320 * ValueType& update) and put into init_result. The input value of init_result
1321 * is used as initializer for temporary variables of ValueType. Therefore the
1322 * input value should be the neutral element with respect to the join operation
1323 * (e.g. '0 for +-' or '1 for *').
1324 */
1325 template <typename iType, class Lambda, typename ValueType, class JoinType>
parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember> & loop_boundaries,const Lambda & lambda,const JoinType & join,ValueType & init_result)1326 KOKKOS_INLINE_FUNCTION void parallel_reduce(
1327 const Impl::TeamThreadRangeBoundariesStruct<
1328 iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
1329 const Lambda& lambda, const JoinType& join, ValueType& init_result) {
1330 ValueType* TeamThread_scratch =
1331 static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
1332
1333 // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
1334 // elements in the array <= 32. For reduction we allocate, 16 bytes per
1335 // element in the scratch space, hence, 16*32 = 512.
1336 static_assert(sizeof(ValueType) <=
1337 Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
1338
1339 #pragma omp barrier
1340 TeamThread_scratch[0] = init_result;
1341 #pragma omp barrier
1342
1343 if constexpr (std::is_arithmetic<ValueType>::value) {
1344 #pragma omp for reduction(+ : TeamThread_scratch[:1])
1345 for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
1346 ValueType tmp = ValueType();
1347 lambda(i, tmp);
1348 TeamThread_scratch[0] += tmp;
1349 }
1350 } else {
1351 #pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
1352
1353 #pragma omp for reduction(custom : TeamThread_scratch[:1])
1354 for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
1355 ValueType tmp = ValueType();
1356 lambda(i, tmp);
1357 join(TeamThread_scratch[0], tmp);
1358 }
1359 }
1360
1361 init_result = TeamThread_scratch[0];
1362 }
1363
1364 // This is largely the same code as in HIP and CUDA except for the member name
1365 template <typename iType, class FunctorType>
parallel_scan(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember> & loop_bounds,const FunctorType & lambda)1366 KOKKOS_INLINE_FUNCTION void parallel_scan(
1367 const Impl::TeamThreadRangeBoundariesStruct<
1368 iType, Impl::OpenMPTargetExecTeamMember>& loop_bounds,
1369 const FunctorType& lambda) {
1370 // Extract value_type from lambda
1371 using value_type = typename Kokkos::Impl::FunctorAnalysis<
1372 Kokkos::Impl::FunctorPatternInterface::SCAN, void,
1373 FunctorType>::value_type;
1374
1375 const auto start = loop_bounds.start;
1376 const auto end = loop_bounds.end;
1377 // Note this thing is called .member in the CUDA specialization of
1378 // TeamThreadRangeBoundariesStruct
1379 auto& member = loop_bounds.team;
1380 const auto team_size = member.team_size();
1381 const auto team_rank = member.team_rank();
1382 const auto nchunk = (end - start + team_size - 1) / team_size;
1383 value_type accum = 0;
1384 // each team has to process one or more chunks of the prefix scan
1385 for (iType i = 0; i < nchunk; ++i) {
1386 auto ii = start + i * team_size + team_rank;
1387 // local accumulation for this chunk
1388 value_type local_accum = 0;
1389 // user updates value with prefix value
1390 if (ii < loop_bounds.end) lambda(ii, local_accum, false);
1391 // perform team scan
1392 local_accum = member.team_scan(local_accum);
1393 // add this blocks accum to total accumulation
1394 auto val = accum + local_accum;
1395 // user updates their data with total accumulation
1396 if (ii < loop_bounds.end) lambda(ii, val, true);
1397 // the last value needs to be propogated to next chunk
1398 if (team_rank == team_size - 1) accum = val;
1399 // broadcast last value to rest of the team
1400 member.team_broadcast(accum, team_size - 1);
1401 }
1402 }
1403
1404 } // namespace Kokkos
1405 #undef KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
1406
1407 namespace Kokkos {
1408 /** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each
1409 * i=0..N-1.
1410 *
1411 * The range i=0..N-1 is mapped to all vector lanes of the the calling thread.
1412 */
1413 template <typename iType, class Lambda>
parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember> & loop_boundaries,const Lambda & lambda)1414 KOKKOS_INLINE_FUNCTION void parallel_for(
1415 const Impl::ThreadVectorRangeBoundariesStruct<
1416 iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
1417 const Lambda& lambda) {
1418 #pragma omp simd
1419 for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i);
1420 }
1421
1422 /** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i,
1423 * ValueType & val) for each i=0..N-1.
1424 *
1425 * The range i=0..N-1 is mapped to all vector lanes of the the calling thread
1426 * and a summation of val is performed and put into result.
1427 */
1428 template <typename iType, class Lambda, typename ValueType>
parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember> & loop_boundaries,const Lambda & lambda,ValueType & result)1429 KOKKOS_INLINE_FUNCTION void parallel_reduce(
1430 const Impl::ThreadVectorRangeBoundariesStruct<
1431 iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
1432 const Lambda& lambda, ValueType& result) {
1433 ValueType vector_reduce = ValueType();
1434
1435 if constexpr (std::is_arithmetic<ValueType>::value) {
1436 #pragma omp simd reduction(+ : vector_reduce)
1437 for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
1438 ValueType tmp = ValueType();
1439 lambda(i, tmp);
1440 vector_reduce += tmp;
1441 }
1442 } else {
1443 #pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
1444
1445 #pragma omp simd reduction(custom : vector_reduce)
1446 for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
1447 lambda(i, vector_reduce);
1448 }
1449 }
1450
1451 result = vector_reduce;
1452 }
1453
1454 template <typename iType, class Lambda, typename ReducerType>
1455 KOKKOS_INLINE_FUNCTION
1456 std::enable_if_t<Kokkos::is_reducer_type<ReducerType>::value>
parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember> & loop_boundaries,const Lambda & lambda,ReducerType const & result)1457 parallel_reduce(
1458 const Impl::ThreadVectorRangeBoundariesStruct<
1459 iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
1460 const Lambda& lambda, ReducerType const& result) {
1461 using ValueType = typename ReducerType::value_type;
1462
1463 #pragma omp declare reduction( \
1464 custom:ValueType \
1465 : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
1466 initializer( \
1467 Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
1468
1469 ValueType vector_reduce;
1470 Impl::OpenMPTargetReducerWrapper<ReducerType>::init(vector_reduce);
1471
1472 #pragma omp simd reduction(custom : vector_reduce)
1473 for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
1474 lambda(i, vector_reduce);
1475 }
1476
1477 result.reference() = vector_reduce;
1478 }
1479
1480 /** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i,
1481 * ValueType & val) for each i=0..N-1.
1482 *
1483 * The range i=0..N-1 is mapped to all vector lanes of the the calling thread
1484 * and a reduction of val is performed using JoinType(ValueType& val, const
1485 * ValueType& update) and put into init_result. The input value of init_result
1486 * is used as initializer for temporary variables of ValueType. Therefore the
1487 * input value should be the neutral element with respect to the join operation
1488 * (e.g. '0 for +-' or '1 for *').
1489 */
1490 template <typename iType, class Lambda, typename ValueType, class JoinType>
parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember> & loop_boundaries,const Lambda & lambda,const JoinType & join,ValueType & init_result)1491 KOKKOS_INLINE_FUNCTION void parallel_reduce(
1492 const Impl::ThreadVectorRangeBoundariesStruct<
1493 iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
1494 const Lambda& lambda, const JoinType& join, ValueType& init_result) {
1495 ValueType result = init_result;
1496
1497 // FIXME_OPENMPTARGET think about omp simd
1498 // join does not work with omp reduction clause
1499 for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
1500 ValueType tmp = ValueType();
1501 lambda(i, tmp);
1502 join(result, tmp);
1503 }
1504
1505 init_result = result;
1506 }
1507
1508 /** \brief Intra-thread vector parallel exclusive prefix sum. Executes
1509 * lambda(iType i, ValueType & val, bool final) for each i=0..N-1.
1510 *
1511 * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan
1512 * operation is performed. Depending on the target execution space the operator
1513 * might be called twice: once with final=false and once with final=true. When
1514 * final==true val contains the prefix sum value. The contribution of this "i"
1515 * needs to be added to val no matter whether final==true or not. In a serial
1516 * execution (i.e. team_size==1) the operator is only called once with
1517 * final==true. Scan_val will be set to the final sum value over all vector
1518 * lanes.
1519 */
1520 template <typename iType, class FunctorType>
parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember> & loop_boundaries,const FunctorType & lambda)1521 KOKKOS_INLINE_FUNCTION void parallel_scan(
1522 const Impl::ThreadVectorRangeBoundariesStruct<
1523 iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
1524 const FunctorType& lambda) {
1525 using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, void>;
1526 using value_type = typename ValueTraits::value_type;
1527
1528 value_type scan_val = value_type();
1529
1530 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
1531 #pragma ivdep
1532 #endif
1533 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
1534 i += loop_boundaries.increment) {
1535 lambda(i, scan_val, true);
1536 }
1537 }
1538
1539 } // namespace Kokkos
1540
1541 namespace Kokkos {
1542 /** \brief Intra-team vector parallel_for. Executes lambda(iType i) for each
1543 * i=0..N-1.
1544 *
1545 * The range i=0..N-1 is mapped to all vector lanes of the the calling team.
1546 */
1547 template <typename iType, class Lambda>
parallel_for(const Impl::TeamVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember> & loop_boundaries,const Lambda & lambda)1548 KOKKOS_INLINE_FUNCTION void parallel_for(
1549 const Impl::TeamVectorRangeBoundariesStruct<
1550 iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
1551 const Lambda& lambda) {
1552 #pragma omp for simd nowait schedule(static, 1)
1553 for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i);
1554 }
1555
1556 /** \brief Intra-team vector parallel_reduce. Executes lambda(iType i,
1557 * ValueType & val) for each i=0..N-1.
1558 *
1559 * The range i=0..N-1 is mapped to all vector lanes of the the calling team
1560 * and a summation of val is performed and put into result.
1561 */
1562 template <typename iType, class Lambda, typename ValueType>
parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember> & loop_boundaries,const Lambda & lambda,ValueType & result)1563 KOKKOS_INLINE_FUNCTION void parallel_reduce(
1564 const Impl::TeamVectorRangeBoundariesStruct<
1565 iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
1566 const Lambda& lambda, ValueType& result) {
1567 // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
1568 // elements in the array <= 32. For reduction we allocate, 16 bytes per
1569 // element in the scratch space, hence, 16*32 = 512.
1570 static_assert(sizeof(ValueType) <=
1571 Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
1572
1573 ValueType* TeamVector_scratch =
1574 static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
1575
1576 #pragma omp barrier
1577 TeamVector_scratch[0] = ValueType();
1578 #pragma omp barrier
1579
1580 if constexpr (std::is_arithmetic<ValueType>::value) {
1581 #pragma omp for simd reduction(+ : TeamVector_scratch[:1])
1582 for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
1583 ValueType tmp = ValueType();
1584 lambda(i, tmp);
1585 TeamVector_scratch[0] += tmp;
1586 }
1587 } else {
1588 #pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
1589
1590 #pragma omp for simd reduction(custom : TeamVector_scratch[:1])
1591 for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
1592 ValueType tmp = ValueType();
1593 lambda(i, tmp);
1594 TeamVector_scratch[0] += tmp;
1595 }
1596 }
1597
1598 result = TeamVector_scratch[0];
1599 }
1600
1601 #if !defined(KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND)
1602 template <typename iType, class Lambda, typename ReducerType>
1603 KOKKOS_INLINE_FUNCTION
1604 std::enable_if_t<Kokkos::is_reducer_type<ReducerType>::value>
parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember> & loop_boundaries,const Lambda & lambda,ReducerType const & result)1605 parallel_reduce(
1606 const Impl::TeamVectorRangeBoundariesStruct<
1607 iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
1608 const Lambda& lambda, ReducerType const& result) {
1609 using ValueType = typename ReducerType::value_type;
1610
1611 // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
1612 // elements in the array <= 32. For reduction we allocate, 16 bytes per
1613 // element in the scratch space, hence, 16*32 = 512.
1614 static_assert(sizeof(ValueType) <=
1615 Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
1616
1617 #pragma omp declare reduction( \
1618 custom:ValueType \
1619 : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
1620 initializer( \
1621 Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
1622
1623 ValueType* TeamVector_scratch =
1624 static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
1625
1626 #pragma omp barrier
1627 Impl::OpenMPTargetReducerWrapper<ReducerType>::init(TeamVector_scratch[0]);
1628 #pragma omp barrier
1629
1630 #pragma omp for simd reduction(custom : TeamVector_scratch[:1])
1631 for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
1632 ValueType tmp = ValueType();
1633 lambda(i, tmp);
1634 TeamVector_scratch[0] += tmp;
1635 }
1636
1637 result.reference() = TeamVector_scratch[0];
1638 }
1639 #else
1640 template <typename iType, class Lambda, typename ReducerType>
1641 KOKKOS_INLINE_FUNCTION
1642 std::enable_if_t<Kokkos::is_reducer_type<ReducerType>::value>
parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember> & loop_boundaries,const Lambda & lambda,ReducerType const & result)1643 parallel_reduce(
1644 const Impl::TeamVectorRangeBoundariesStruct<
1645 iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
1646 const Lambda& lambda, ReducerType const& result) {
1647 using ValueType = typename ReducerType::value_type;
1648
1649 // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
1650 // elements in the array <= 32. For reduction we allocate, 16 bytes per
1651 // element in the scratch space, hence, 16*32 = 512.
1652 static_assert(sizeof(ValueType) <=
1653 Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
1654
1655 ValueType* TeamVector_scratch =
1656 static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
1657
1658 #pragma omp declare reduction( \
1659 omp_red_teamthread_reducer:ValueType \
1660 : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
1661 initializer( \
1662 Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
1663
1664 #pragma omp barrier
1665 ValueType tmp;
1666 result.init(tmp);
1667 TeamVector_scratch[0] = tmp;
1668 #pragma omp barrier
1669
1670 iType team_size = iType(omp_get_num_threads());
1671 #pragma omp for simd reduction(omp_red_teamthread_reducer \
1672 : TeamVector_scratch[:1]) schedule(static, 1)
1673 for (iType t = 0; t < team_size; t++) {
1674 ValueType tmp2;
1675 result.init(tmp2);
1676
1677 for (iType i = loop_boundaries.start + t; i < loop_boundaries.end;
1678 i += team_size) {
1679 lambda(i, tmp2);
1680 }
1681 TeamVector_scratch[0] = tmp2;
1682 }
1683
1684 result.reference() = TeamVector_scratch[0];
1685 }
1686 #endif // KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
1687 } // namespace Kokkos
1688
1689 #undef KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
1690
1691 namespace Kokkos {
1692
1693 template <class FunctorType>
single(const Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember> &,const FunctorType & lambda)1694 KOKKOS_INLINE_FUNCTION void single(
1695 const Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember>&
1696 /*single_struct*/,
1697 const FunctorType& lambda) {
1698 lambda();
1699 }
1700
1701 template <class FunctorType>
single(const Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember> & single_struct,const FunctorType & lambda)1702 KOKKOS_INLINE_FUNCTION void single(
1703 const Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember>&
1704 single_struct,
1705 const FunctorType& lambda) {
1706 if (single_struct.team_member.team_rank() == 0) lambda();
1707 }
1708
1709 template <class FunctorType, class ValueType>
single(const Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember> &,const FunctorType & lambda,ValueType & val)1710 KOKKOS_INLINE_FUNCTION void single(
1711 const Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember>&
1712 /*single_struct*/,
1713 const FunctorType& lambda, ValueType& val) {
1714 lambda(val);
1715 }
1716
1717 template <class FunctorType, class ValueType>
single(const Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember> & single_struct,const FunctorType & lambda,ValueType & val)1718 KOKKOS_INLINE_FUNCTION void single(
1719 const Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember>&
1720 single_struct,
1721 const FunctorType& lambda, ValueType& val) {
1722 if (single_struct.team_member.team_rank() == 0) {
1723 lambda(val);
1724 }
1725 single_struct.team_member.team_broadcast(val, 0);
1726 }
1727 } // namespace Kokkos
1728
1729 #endif /* #ifndef KOKKOS_OPENMPTARGETEXEC_HPP */
1730