1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 //                        Kokkos v. 3.0
6 //       Copyright (2020) National Technology & Engineering
7 //               Solutions of Sandia, LLC (NTESS).
8 //
9 // Under the terms of Contract DE-NA0003525 with NTESS,
10 // the U.S. Government retains certain rights in this software.
11 //
12 // Redistribution and use in source and binary forms, with or without
13 // modification, are permitted provided that the following conditions are
14 // met:
15 //
16 // 1. Redistributions of source code must retain the above copyright
17 // notice, this list of conditions and the following disclaimer.
18 //
19 // 2. Redistributions in binary form must reproduce the above copyright
20 // notice, this list of conditions and the following disclaimer in the
21 // documentation and/or other materials provided with the distribution.
22 //
23 // 3. Neither the name of the Corporation nor the names of the
24 // contributors may be used to endorse or promote products derived from
25 // this software without specific prior written permission.
26 //
27 // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
28 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
31 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 //
39 // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
40 //
41 // ************************************************************************
42 //@HEADER
43 */
44 
45 #include <cstdio>
46 #include <stdexcept>
47 #include <sstream>
48 #include <iostream>
49 
50 #include <Kokkos_Core.hpp>
51 
52 namespace Test {
53 
54 namespace {
55 
56 template <class ExecSpace, class ScheduleType>
57 struct TestTeamPolicy {
58   using team_member =
59       typename Kokkos::TeamPolicy<ScheduleType, ExecSpace>::member_type;
60   using view_type = Kokkos::View<int **, ExecSpace>;
61 
62   view_type m_flags;
63 
TestTeamPolicyTest::__anon4dcf17ce0111::TestTeamPolicy64   TestTeamPolicy(const size_t league_size)
65       : m_flags(
66             Kokkos::view_alloc(Kokkos::WithoutInitializing, "flags"),
67   // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
68 #ifdef KOKKOS_ENABLE_OPENMPTARGET
69             Kokkos::TeamPolicy<ScheduleType, ExecSpace>(1, 32).team_size_max(
70                 *this, Kokkos::ParallelReduceTag()),
71 #else
72             Kokkos::TeamPolicy<ScheduleType, ExecSpace>(1, 1).team_size_max(
73                 *this, Kokkos::ParallelReduceTag()),
74 #endif
75             league_size) {
76   }
77 
78   struct VerifyInitTag {};
79 
80   KOKKOS_INLINE_FUNCTION
operator ()Test::__anon4dcf17ce0111::TestTeamPolicy81   void operator()(const team_member &member) const {
82     const int tid =
83         member.team_rank() + member.team_size() * member.league_rank();
84 
85     m_flags(member.team_rank(), member.league_rank()) = tid;
86     static_assert(
87         (std::is_same<typename team_member::execution_space, ExecSpace>::value),
88         "TeamMember::execution_space is not the same as "
89         "TeamPolicy<>::execution_space");
90   }
91 
92   KOKKOS_INLINE_FUNCTION
operator ()Test::__anon4dcf17ce0111::TestTeamPolicy93   void operator()(const VerifyInitTag &, const team_member &member) const {
94     const int tid =
95         member.team_rank() + member.team_size() * member.league_rank();
96 
97     if (tid != m_flags(member.team_rank(), member.league_rank())) {
98       KOKKOS_IMPL_DO_NOT_USE_PRINTF(
99           "TestTeamPolicy member(%d,%d) error %d != %d\n", member.league_rank(),
100           member.team_rank(), tid,
101           m_flags(member.team_rank(), member.league_rank()));
102     }
103   }
104 
105   // Included for test_small_league_size.
TestTeamPolicyTest::__anon4dcf17ce0111::TestTeamPolicy106   TestTeamPolicy() : m_flags() {}
107 
108   // Included for test_small_league_size.
109   struct NoOpTag {};
110 
111   KOKKOS_INLINE_FUNCTION
operator ()Test::__anon4dcf17ce0111::TestTeamPolicy112   void operator()(const NoOpTag &, const team_member & /*member*/) const {}
113 
test_small_league_sizeTest::__anon4dcf17ce0111::TestTeamPolicy114   static void test_small_league_size() {
115     int bs = 8;   // batch size (number of elements per batch)
116     int ns = 16;  // total number of "problems" to process
117 
118     // Calculate total scratch memory space size.
119     const int level     = 0;
120     int mem_size        = 960;
121     const int num_teams = ns / bs;
122     Kokkos::TeamPolicy<ExecSpace, NoOpTag> policy(num_teams, Kokkos::AUTO());
123 
124     Kokkos::parallel_for(
125         policy.set_scratch_size(level, Kokkos::PerTeam(mem_size),
126                                 Kokkos::PerThread(0)),
127         TestTeamPolicy());
128   }
129 
test_constructorsTest::__anon4dcf17ce0111::TestTeamPolicy130   static void test_constructors() {
131     constexpr const int smallest_work = 1;
132     // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
133 #ifdef KOKKOS_ENABLE_OPENMPTARGET
134     Kokkos::TeamPolicy<ExecSpace, NoOpTag> none_auto(smallest_work, 32,
135                                                      smallest_work);
136 #else
137     Kokkos::TeamPolicy<ExecSpace, NoOpTag> none_auto(
138         smallest_work, smallest_work, smallest_work);
139 #endif
140     Kokkos::TeamPolicy<ExecSpace, NoOpTag> both_auto(
141         smallest_work, Kokkos::AUTO(), Kokkos::AUTO());
142     // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
143 #ifdef KOKKOS_ENABLE_OPENMPTARGET
144     Kokkos::TeamPolicy<ExecSpace, NoOpTag> auto_vector(smallest_work, 32,
145                                                        Kokkos::AUTO());
146 #else
147     Kokkos::TeamPolicy<ExecSpace, NoOpTag> auto_vector(
148         smallest_work, smallest_work, Kokkos::AUTO());
149 #endif
150     Kokkos::TeamPolicy<ExecSpace, NoOpTag> auto_team(
151         smallest_work, Kokkos::AUTO(), smallest_work);
152   }
153 
test_forTest::__anon4dcf17ce0111::TestTeamPolicy154   static void test_for(const size_t league_size) {
155     {
156       TestTeamPolicy functor(league_size);
157       using policy_type = Kokkos::TeamPolicy<ScheduleType, ExecSpace>;
158       using policy_type_init =
159           Kokkos::TeamPolicy<ScheduleType, ExecSpace, VerifyInitTag>;
160 
161       // FIXME_OPENMPTARGET temporary restriction for team size to be at least
162       // 32
163 #ifdef KOKKOS_ENABLE_OPENMPTARGET
164       const int team_size =
165           policy_type(league_size, 32)
166               .team_size_max(functor, Kokkos::ParallelForTag());
167       const int team_size_init =
168           policy_type_init(league_size, 32)
169               .team_size_max(functor, Kokkos::ParallelForTag());
170 #else
171       const int team_size =
172           policy_type(league_size, 1)
173               .team_size_max(functor, Kokkos::ParallelForTag());
174       const int team_size_init =
175           policy_type_init(league_size, 1)
176               .team_size_max(functor, Kokkos::ParallelForTag());
177 #endif
178 
179       Kokkos::parallel_for(policy_type(league_size, team_size), functor);
180       Kokkos::parallel_for(policy_type_init(league_size, team_size_init),
181                            functor);
182     }
183 
184     test_small_league_size();
185     test_constructors();
186   }
187 
188   struct ReduceTag {};
189 
190   using value_type = int64_t;
191 
192   KOKKOS_INLINE_FUNCTION
operator ()Test::__anon4dcf17ce0111::TestTeamPolicy193   void operator()(const team_member &member, value_type &update) const {
194     update += member.team_rank() + member.team_size() * member.league_rank();
195   }
196 
197   KOKKOS_INLINE_FUNCTION
operator ()Test::__anon4dcf17ce0111::TestTeamPolicy198   void operator()(const ReduceTag &, const team_member &member,
199                   value_type &update) const {
200     update +=
201         1 + member.team_rank() + member.team_size() * member.league_rank();
202   }
203 
test_reduceTest::__anon4dcf17ce0111::TestTeamPolicy204   static void test_reduce(const size_t league_size) {
205     TestTeamPolicy functor(league_size);
206 
207     using policy_type = Kokkos::TeamPolicy<ScheduleType, ExecSpace>;
208     using policy_type_reduce =
209         Kokkos::TeamPolicy<ScheduleType, ExecSpace, ReduceTag>;
210 
211     // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
212 #ifdef KOKKOS_ENABLE_OPENMPTARGET
213     const int team_size =
214         policy_type_reduce(league_size, 32)
215             .team_size_max(functor, Kokkos::ParallelReduceTag());
216 #else
217     const int team_size =
218         policy_type_reduce(league_size, 1)
219             .team_size_max(functor, Kokkos::ParallelReduceTag());
220 #endif
221 
222     const int64_t N = team_size * league_size;
223 
224     int64_t total = 0;
225 
226     Kokkos::parallel_reduce(policy_type(league_size, team_size), functor,
227                             total);
228     ASSERT_EQ(size_t((N - 1) * (N)) / 2, size_t(total));
229 
230     Kokkos::parallel_reduce(policy_type_reduce(league_size, team_size), functor,
231                             total);
232     ASSERT_EQ((size_t(N) * size_t(N + 1)) / 2, size_t(total));
233   }
234 };
235 
236 }  // namespace
237 
238 }  // namespace Test
239 
240 /*--------------------------------------------------------------------------*/
241 
242 namespace Test {
243 
244 template <typename ScalarType, class DeviceType, class ScheduleType>
245 class ReduceTeamFunctor {
246  public:
247   using execution_space = DeviceType;
248   using policy_type     = Kokkos::TeamPolicy<ScheduleType, execution_space>;
249   using size_type       = typename execution_space::size_type;
250 
251   struct value_type {
252     ScalarType value[3];
253   };
254 
255   const size_type nwork;
256 
257   KOKKOS_INLINE_FUNCTION
ReduceTeamFunctor(const size_type & arg_nwork)258   ReduceTeamFunctor(const size_type &arg_nwork) : nwork(arg_nwork) {}
259 
260   KOKKOS_INLINE_FUNCTION
ReduceTeamFunctor(const ReduceTeamFunctor & rhs)261   ReduceTeamFunctor(const ReduceTeamFunctor &rhs) : nwork(rhs.nwork) {}
262 
263   KOKKOS_INLINE_FUNCTION
init(value_type & dst) const264   void init(value_type &dst) const {
265     dst.value[0] = 0;
266     dst.value[1] = 0;
267     dst.value[2] = 0;
268   }
269 
270   KOKKOS_INLINE_FUNCTION
join(volatile value_type & dst,const volatile value_type & src) const271   void join(volatile value_type &dst, const volatile value_type &src) const {
272     dst.value[0] += src.value[0];
273     dst.value[1] += src.value[1];
274     dst.value[2] += src.value[2];
275   }
276 
277   KOKKOS_INLINE_FUNCTION
operator ()(const typename policy_type::member_type ind,value_type & dst) const278   void operator()(const typename policy_type::member_type ind,
279                   value_type &dst) const {
280     const int thread_rank =
281         ind.team_rank() + ind.team_size() * ind.league_rank();
282     const int thread_size = ind.team_size() * ind.league_size();
283     const int chunk       = (nwork + thread_size - 1) / thread_size;
284 
285     size_type iwork           = chunk * thread_rank;
286     const size_type iwork_end = iwork + chunk < nwork ? iwork + chunk : nwork;
287 
288     for (; iwork < iwork_end; ++iwork) {
289       dst.value[0] += 1;
290       dst.value[1] += iwork + 1;
291       dst.value[2] += nwork - iwork;
292     }
293   }
294 };
295 
296 }  // namespace Test
297 
298 namespace {
299 
300 template <typename ScalarType, class DeviceType, class ScheduleType>
301 class TestReduceTeam {
302  public:
303   using execution_space = DeviceType;
304   using policy_type     = Kokkos::TeamPolicy<ScheduleType, execution_space>;
305   using size_type       = typename execution_space::size_type;
306 
TestReduceTeam(const size_type & nwork)307   TestReduceTeam(const size_type &nwork) { run_test(nwork); }
308 
run_test(const size_type & nwork)309   void run_test(const size_type &nwork) {
310     using functor_type =
311         Test::ReduceTeamFunctor<ScalarType, execution_space, ScheduleType>;
312     using value_type = typename functor_type::value_type;
313     using result_type =
314         Kokkos::View<value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
315 
316     enum { Count = 3 };
317     enum { Repeat = 100 };
318 
319     value_type result[Repeat];
320 
321     const uint64_t nw   = nwork;
322     const uint64_t nsum = nw % 2 ? nw * ((nw + 1) / 2) : (nw / 2) * (nw + 1);
323 
324     policy_type team_exec(nw, 1);
325 
326     const unsigned team_size = team_exec.team_size_recommended(
327         functor_type(nwork), Kokkos::ParallelReduceTag());
328     const unsigned league_size = (nwork + team_size - 1) / team_size;
329 
330     team_exec = policy_type(league_size, team_size);
331 
332     for (unsigned i = 0; i < Repeat; ++i) {
333       result_type tmp(&result[i]);
334       Kokkos::parallel_reduce(team_exec, functor_type(nwork), tmp);
335     }
336 
337     execution_space().fence();
338 
339     for (unsigned i = 0; i < Repeat; ++i) {
340       for (unsigned j = 0; j < Count; ++j) {
341         const uint64_t correct = 0 == j % 3 ? nw : nsum;
342         ASSERT_EQ((ScalarType)correct, result[i].value[j]);
343       }
344     }
345   }
346 };
347 
348 }  // namespace
349 
350 /*--------------------------------------------------------------------------*/
351 
352 namespace Test {
353 
354 template <class DeviceType, class ScheduleType>
355 class ScanTeamFunctor {
356  public:
357   using execution_space = DeviceType;
358   using policy_type     = Kokkos::TeamPolicy<ScheduleType, execution_space>;
359   using value_type      = int64_t;
360 
361   Kokkos::View<value_type, execution_space> accum;
362   Kokkos::View<value_type, execution_space> total;
363 
ScanTeamFunctor()364   ScanTeamFunctor() : accum("accum"), total("total") {}
365 
366   KOKKOS_INLINE_FUNCTION
init(value_type & error) const367   void init(value_type &error) const { error = 0; }
368 
369   KOKKOS_INLINE_FUNCTION
join(value_type volatile & error,value_type volatile const & input) const370   void join(value_type volatile &error,
371             value_type volatile const &input) const {
372     if (input) error = 1;
373   }
374 
375   struct JoinMax {
376     using value_type = int64_t;
377 
378     KOKKOS_INLINE_FUNCTION
joinTest::ScanTeamFunctor::JoinMax379     void join(value_type volatile &dst,
380               value_type volatile const &input) const {
381       if (dst < input) dst = input;
382     }
383   };
384 
385   KOKKOS_INLINE_FUNCTION
operator ()(const typename policy_type::member_type ind,value_type & error) const386   void operator()(const typename policy_type::member_type ind,
387                   value_type &error) const {
388     if (0 == ind.league_rank() && 0 == ind.team_rank()) {
389       const int64_t thread_count = ind.league_size() * ind.team_size();
390       total()                    = (thread_count * (thread_count + 1)) / 2;
391     }
392 
393     // Team max:
394     int64_t m = (int64_t)(ind.league_rank() + ind.team_rank());
395     ind.team_reduce(Kokkos::Max<int64_t>(m));
396 
397     if (m != ind.league_rank() + (ind.team_size() - 1)) {
398       KOKKOS_IMPL_DO_NOT_USE_PRINTF(
399           "ScanTeamFunctor[%i.%i of %i.%i] reduce_max_answer(%li) != "
400           "reduce_max(%li)\n",
401           static_cast<int>(ind.league_rank()),
402           static_cast<int>(ind.team_rank()),
403           static_cast<int>(ind.league_size()),
404           static_cast<int>(ind.team_size()),
405           static_cast<long>(ind.league_rank() + (ind.team_size() - 1)),
406           static_cast<long>(m));
407     }
408 
409     // Scan:
410     const int64_t answer = (ind.league_rank() + 1) * ind.team_rank() +
411                            (ind.team_rank() * (ind.team_rank() + 1)) / 2;
412 
413     const int64_t result =
414         ind.team_scan(ind.league_rank() + 1 + ind.team_rank() + 1);
415 
416     const int64_t result2 =
417         ind.team_scan(ind.league_rank() + 1 + ind.team_rank() + 1);
418 
419     if (answer != result || answer != result2) {
420       KOKKOS_IMPL_DO_NOT_USE_PRINTF(
421           "ScanTeamFunctor[%i.%i of %i.%i] answer(%li) != scan_first(%li) or "
422           "scan_second(%li)\n",
423           static_cast<int>(ind.league_rank()),
424           static_cast<int>(ind.team_rank()),
425           static_cast<int>(ind.league_size()),
426           static_cast<int>(ind.team_size()), static_cast<long>(answer),
427           static_cast<long>(result), static_cast<long>(result2));
428 
429       error = 1;
430     }
431 
432     const int64_t thread_rank =
433         ind.team_rank() + ind.team_size() * ind.league_rank();
434     ind.team_scan(1 + thread_rank, accum.data());
435   }
436 };
437 
438 template <class DeviceType, class ScheduleType>
439 class TestScanTeam {
440  public:
441   using execution_space = DeviceType;
442   using value_type      = int64_t;
443   using policy_type     = Kokkos::TeamPolicy<ScheduleType, execution_space>;
444   using functor_type    = Test::ScanTeamFunctor<DeviceType, ScheduleType>;
445 
TestScanTeam(const size_t nteam)446   TestScanTeam(const size_t nteam) { run_test(nteam); }
447 
run_test(const size_t nteam)448   void run_test(const size_t nteam) {
449     using result_type =
450         Kokkos::View<int64_t, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
451 
452     const unsigned REPEAT = 100000;
453     unsigned Repeat;
454 
455     if (nteam == 0) {
456       Repeat = 1;
457     } else {
458       Repeat = (REPEAT + nteam - 1) / nteam;  // Error here.
459     }
460 
461     functor_type functor;
462 
463     policy_type team_exec(nteam, 1);
464     team_exec = policy_type(
465         nteam, team_exec.team_size_max(functor, Kokkos::ParallelReduceTag()));
466 
467     for (unsigned i = 0; i < Repeat; ++i) {
468       int64_t accum = 0;
469       int64_t total = 0;
470       int64_t error = 0;
471       Kokkos::deep_copy(functor.accum, total);
472 
473       Kokkos::parallel_reduce(team_exec, functor, result_type(&error));
474       DeviceType().fence();
475 
476       Kokkos::deep_copy(accum, functor.accum);
477       Kokkos::deep_copy(total, functor.total);
478 
479       ASSERT_EQ(error, 0);
480       ASSERT_EQ(total, accum);
481     }
482 
483     execution_space().fence();
484   }
485 };
486 
487 }  // namespace Test
488 
489 /*--------------------------------------------------------------------------*/
490 
491 namespace Test {
492 
493 template <class ExecSpace, class ScheduleType>
494 struct SharedTeamFunctor {
495   using execution_space = ExecSpace;
496   using value_type      = int;
497   using policy_type     = Kokkos::TeamPolicy<ScheduleType, execution_space>;
498 
499   enum { SHARED_COUNT = 1000 };
500 
501   using shmem_space = typename ExecSpace::scratch_memory_space;
502 
503   // TBD: MemoryUnmanaged should be the default for shared memory space.
504   using shared_int_array_type =
505       Kokkos::View<int *, shmem_space, Kokkos::MemoryUnmanaged>;
506 
507   // Tell how much shared memory will be required by this functor.
team_shmem_sizeTest::SharedTeamFunctor508   inline unsigned team_shmem_size(int /*team_size*/) const {
509     return shared_int_array_type::shmem_size(SHARED_COUNT) +
510            shared_int_array_type::shmem_size(SHARED_COUNT);
511   }
512 
513   KOKKOS_INLINE_FUNCTION
operator ()Test::SharedTeamFunctor514   void operator()(const typename policy_type::member_type &ind,
515                   value_type &update) const {
516     const shared_int_array_type shared_A(ind.team_shmem(), SHARED_COUNT);
517     const shared_int_array_type shared_B(ind.team_shmem(), SHARED_COUNT);
518 
519     if ((shared_A.data() == nullptr && SHARED_COUNT > 0) ||
520         (shared_B.data() == nullptr && SHARED_COUNT > 0)) {
521       KOKKOS_IMPL_DO_NOT_USE_PRINTF(
522           "member( %i/%i , %i/%i ) Failed to allocate shared memory of size "
523           "%lu\n",
524           static_cast<int>(ind.league_rank()),
525           static_cast<int>(ind.league_size()),
526           static_cast<int>(ind.team_rank()), static_cast<int>(ind.team_size()),
527           static_cast<unsigned long>(SHARED_COUNT));
528 
529       ++update;  // Failure to allocate is an error.
530     } else {
531       for (int i = ind.team_rank(); i < SHARED_COUNT; i += ind.team_size()) {
532         shared_A[i] = i + ind.league_rank();
533         shared_B[i] = 2 * i + ind.league_rank();
534       }
535 
536       ind.team_barrier();
537 
538       if (ind.team_rank() + 1 == ind.team_size()) {
539         for (int i = 0; i < SHARED_COUNT; ++i) {
540           if (shared_A[i] != i + ind.league_rank()) {
541             ++update;
542           }
543 
544           if (shared_B[i] != 2 * i + ind.league_rank()) {
545             ++update;
546           }
547         }
548       }
549     }
550   }
551 };
552 
553 }  // namespace Test
554 
555 namespace {
556 
557 template <class ExecSpace, class ScheduleType>
558 struct TestSharedTeam {
TestSharedTeam__anon4dcf17ce0611::TestSharedTeam559   TestSharedTeam() { run(); }
560 
run__anon4dcf17ce0611::TestSharedTeam561   void run() {
562     using Functor = Test::SharedTeamFunctor<ExecSpace, ScheduleType>;
563     using result_type =
564         Kokkos::View<typename Functor::value_type, Kokkos::HostSpace,
565                      Kokkos::MemoryUnmanaged>;
566 
567 #ifdef KOKKOS_ENABLE_OPENMPTARGET
568     const size_t team_size =
569         Kokkos::TeamPolicy<ScheduleType, ExecSpace>(64, 32).team_size_max(
570             Functor(), Kokkos::ParallelReduceTag());
571 
572     Kokkos::TeamPolicy<ScheduleType, ExecSpace> team_exec(32 / team_size,
573                                                           team_size);
574 #else
575     const size_t team_size =
576         Kokkos::TeamPolicy<ScheduleType, ExecSpace>(8192, 1).team_size_max(
577             Functor(), Kokkos::ParallelReduceTag());
578 
579     Kokkos::TeamPolicy<ScheduleType, ExecSpace> team_exec(8192 / team_size,
580                                                           team_size);
581 #endif
582 
583     typename Functor::value_type error_count = 0;
584 
585     Kokkos::parallel_reduce(team_exec, Functor(), result_type(&error_count));
586     Kokkos::fence();
587 
588     ASSERT_EQ(error_count, 0);
589   }
590 };
591 
592 }  // namespace
593 
594 namespace Test {
595 
596 #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
597 template <class MemorySpace, class ExecSpace, class ScheduleType>
598 struct TestLambdaSharedTeam {
TestLambdaSharedTeamTest::TestLambdaSharedTeam599   TestLambdaSharedTeam() { run(); }
600 
runTest::TestLambdaSharedTeam601   void run() {
602     using Functor     = Test::SharedTeamFunctor<ExecSpace, ScheduleType>;
603     using result_type = Kokkos::View<typename Functor::value_type, MemorySpace,
604                                      Kokkos::MemoryUnmanaged>;
605 
606     using shmem_space = typename ExecSpace::scratch_memory_space;
607 
608     // TBD: MemoryUnmanaged should be the default for shared memory space.
609     using shared_int_array_type =
610         Kokkos::View<int *, shmem_space, Kokkos::MemoryUnmanaged>;
611 
612     const int SHARED_COUNT = 1000;
613 #ifdef KOKKOS_ENABLE_OPENMPTARGET
614     int team_size = 32;
615 #else
616     int team_size = 1;
617 #endif
618 
619 #ifdef KOKKOS_ENABLE_CUDA
620     if (std::is_same<ExecSpace, Kokkos::Cuda>::value) team_size = 128;
621 #endif
622 
623     Kokkos::TeamPolicy<ScheduleType, ExecSpace> team_exec(8192 / team_size,
624                                                           team_size);
625     team_exec = team_exec.set_scratch_size(
626         0, Kokkos::PerTeam(SHARED_COUNT * 2 * sizeof(int)));
627 
628     typename Functor::value_type error_count = 0;
629 
630     Kokkos::parallel_reduce(
631         team_exec,
632         KOKKOS_LAMBDA(
633             const typename Kokkos::TeamPolicy<ScheduleType,
634                                               ExecSpace>::member_type &ind,
635             int &update) {
636           const shared_int_array_type shared_A(ind.team_shmem(), SHARED_COUNT);
637           const shared_int_array_type shared_B(ind.team_shmem(), SHARED_COUNT);
638 
639           if ((shared_A.data() == nullptr && SHARED_COUNT > 0) ||
640               (shared_B.data() == nullptr && SHARED_COUNT > 0)) {
641             KOKKOS_IMPL_DO_NOT_USE_PRINTF(
642                 "Failed to allocate shared memory of size %lu\n",
643                 static_cast<unsigned long>(SHARED_COUNT));
644 
645             ++update;  // Failure to allocate is an error.
646           } else {
647             for (int i = ind.team_rank(); i < SHARED_COUNT;
648                  i += ind.team_size()) {
649               shared_A[i] = i + ind.league_rank();
650               shared_B[i] = 2 * i + ind.league_rank();
651             }
652 
653             ind.team_barrier();
654 
655             if (ind.team_rank() + 1 == ind.team_size()) {
656               for (int i = 0; i < SHARED_COUNT; ++i) {
657                 if (shared_A[i] != i + ind.league_rank()) {
658                   ++update;
659                 }
660 
661                 if (shared_B[i] != 2 * i + ind.league_rank()) {
662                   ++update;
663                 }
664               }
665             }
666           }
667         },
668         result_type(&error_count));
669 
670     Kokkos::fence();
671 
672     ASSERT_EQ(error_count, 0);
673   }
674 };
675 #endif
676 
677 }  // namespace Test
678 
679 namespace Test {
680 
681 template <class ExecSpace, class ScheduleType>
682 struct ScratchTeamFunctor {
683   using execution_space = ExecSpace;
684   using value_type      = int;
685   using policy_type     = Kokkos::TeamPolicy<ScheduleType, execution_space>;
686 
687   enum { SHARED_TEAM_COUNT = 100 };
688   enum { SHARED_THREAD_COUNT = 10 };
689 
690   using shmem_space = typename ExecSpace::scratch_memory_space;
691 
692   // TBD: MemoryUnmanaged should be the default for shared memory space.
693   using shared_int_array_type =
694       Kokkos::View<size_t *, shmem_space, Kokkos::MemoryUnmanaged>;
695 
696   KOKKOS_INLINE_FUNCTION
operator ()Test::ScratchTeamFunctor697   void operator()(const typename policy_type::member_type &ind,
698                   value_type &update) const {
699     const shared_int_array_type scratch_ptr(ind.team_scratch(1),
700                                             3 * ind.team_size());
701     const shared_int_array_type scratch_A(ind.team_scratch(1),
702                                           SHARED_TEAM_COUNT);
703     const shared_int_array_type scratch_B(ind.thread_scratch(1),
704                                           SHARED_THREAD_COUNT);
705 
706     if ((scratch_ptr.data() == nullptr) ||
707         (scratch_A.data() == nullptr && SHARED_TEAM_COUNT > 0) ||
708         (scratch_B.data() == nullptr && SHARED_THREAD_COUNT > 0)) {
709       KOKKOS_IMPL_DO_NOT_USE_PRINTF(
710           "Failed to allocate shared memory of size %lu\n",
711           static_cast<unsigned long>(SHARED_TEAM_COUNT));
712 
713       ++update;  // Failure to allocate is an error.
714     } else {
715       Kokkos::parallel_for(
716           Kokkos::TeamThreadRange(ind, 0, (int)SHARED_TEAM_COUNT),
717           [&](const int &i) { scratch_A[i] = i + ind.league_rank(); });
718 
719       for (int i = 0; i < SHARED_THREAD_COUNT; i++) {
720         scratch_B[i] = 10000 * ind.league_rank() + 100 * ind.team_rank() + i;
721       }
722 
723       scratch_ptr[ind.team_rank()]                   = (size_t)scratch_A.data();
724       scratch_ptr[ind.team_rank() + ind.team_size()] = (size_t)scratch_B.data();
725 
726       ind.team_barrier();
727 
728       for (int i = 0; i < SHARED_TEAM_COUNT; i++) {
729         if (scratch_A[i] != size_t(i + ind.league_rank())) ++update;
730       }
731 
732       for (int i = 0; i < ind.team_size(); i++) {
733         if (scratch_ptr[0] != scratch_ptr[i]) ++update;
734       }
735 
736       if (scratch_ptr[1 + ind.team_size()] - scratch_ptr[0 + ind.team_size()] <
737           SHARED_THREAD_COUNT * sizeof(size_t)) {
738         ++update;
739       }
740 
741       for (int i = 1; i < ind.team_size(); i++) {
742         if ((scratch_ptr[i + ind.team_size()] -
743              scratch_ptr[i - 1 + ind.team_size()]) !=
744             (scratch_ptr[1 + ind.team_size()] -
745              scratch_ptr[0 + ind.team_size()])) {
746           ++update;
747         }
748       }
749     }
750   }
751 };
752 
753 }  // namespace Test
754 
755 namespace {
756 
757 template <class ExecSpace, class ScheduleType>
758 struct TestScratchTeam {
TestScratchTeam__anon4dcf17ce0a11::TestScratchTeam759   TestScratchTeam() { run(); }
760 
run__anon4dcf17ce0a11::TestScratchTeam761   void run() {
762     using Functor = Test::ScratchTeamFunctor<ExecSpace, ScheduleType>;
763     using result_type =
764         Kokkos::View<typename Functor::value_type, Kokkos::HostSpace,
765                      Kokkos::MemoryUnmanaged>;
766     using p_type = Kokkos::TeamPolicy<ScheduleType, ExecSpace>;
767 
768     typename Functor::value_type error_count = 0;
769 
770     int thread_scratch_size = Functor::shared_int_array_type::shmem_size(
771         Functor::SHARED_THREAD_COUNT);
772 
773 #ifdef KOKKOS_ENABLE_OPENMPTARGET
774     p_type team_exec = p_type(64, 32).set_scratch_size(
775         1,
776         Kokkos::PerTeam(Functor::shared_int_array_type::shmem_size(
777             Functor::SHARED_TEAM_COUNT)),
778         Kokkos::PerThread(thread_scratch_size + 3 * sizeof(int)));
779 #else
780     p_type team_exec = p_type(8192, 1).set_scratch_size(
781         1,
782         Kokkos::PerTeam(Functor::shared_int_array_type::shmem_size(
783             Functor::SHARED_TEAM_COUNT)),
784         Kokkos::PerThread(thread_scratch_size + 3 * sizeof(int)));
785 #endif
786 
787     const size_t team_size =
788         team_exec.team_size_max(Functor(), Kokkos::ParallelReduceTag());
789 
790     int team_scratch_size =
791         Functor::shared_int_array_type::shmem_size(Functor::SHARED_TEAM_COUNT) +
792         Functor::shared_int_array_type::shmem_size(3 * team_size);
793 
794 #ifdef KOKKOS_ENABLE_OPENMPTARGET
795     team_exec = p_type(64 / team_size, team_size);
796 #else
797     team_exec          = p_type(8192 / team_size, team_size);
798 #endif
799 
800     Kokkos::parallel_reduce(
801         team_exec.set_scratch_size(1, Kokkos::PerTeam(team_scratch_size),
802                                    Kokkos::PerThread(thread_scratch_size)),
803         Functor(), result_type(&error_count));
804     Kokkos::fence();
805     ASSERT_EQ(error_count, 0);
806   }
807 };
808 
809 }  // namespace
810 
811 namespace Test {
812 
813 template <class ExecSpace>
test_team_mulit_level_scratch_loop_body(const typename Kokkos::TeamPolicy<ExecSpace>::member_type & team)814 KOKKOS_INLINE_FUNCTION int test_team_mulit_level_scratch_loop_body(
815     const typename Kokkos::TeamPolicy<ExecSpace>::member_type &team) {
816   Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
817       a_team1(team.team_scratch(0), 128);
818   Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
819       a_thread1(team.thread_scratch(0), 16);
820   Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
821       a_team2(team.team_scratch(0), 128);
822   Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
823       a_thread2(team.thread_scratch(0), 16);
824 
825   Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
826       b_team1(team.team_scratch(1), 12800);
827   Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
828       b_thread1(team.thread_scratch(1), 1600);
829   Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
830       b_team2(team.team_scratch(1), 12800);
831   Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
832       b_thread2(team.thread_scratch(1), 1600);
833 
834   Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
835       a_team3(team.team_scratch(0), 128);
836   Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
837       a_thread3(team.thread_scratch(0), 16);
838   Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
839       b_team3(team.team_scratch(1), 12800);
840   Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
841       b_thread3(team.thread_scratch(1), 1600);
842 
843   // The explicit types for 0 and 128 are here to test TeamThreadRange accepting
844   // different types for begin and end.
845   Kokkos::parallel_for(Kokkos::TeamThreadRange(team, int(0), unsigned(128)),
846                        [&](const int &i) {
847                          a_team1(i) = 1000000 + i + team.league_rank() * 100000;
848                          a_team2(i) = 2000000 + i + team.league_rank() * 100000;
849                          a_team3(i) = 3000000 + i + team.league_rank() * 100000;
850                        });
851   team.team_barrier();
852 
853   Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, int(0), unsigned(16)),
854                        [&](const int &i) {
855                          a_thread1(i) = 1000000 + 100000 * team.team_rank() +
856                                         16 - i + team.league_rank() * 100000;
857                          a_thread2(i) = 2000000 + 100000 * team.team_rank() +
858                                         16 - i + team.league_rank() * 100000;
859                          a_thread3(i) = 3000000 + 100000 * team.team_rank() +
860                                         16 - i + team.league_rank() * 100000;
861                        });
862 
863   Kokkos::parallel_for(Kokkos::TeamThreadRange(team, int(0), unsigned(12800)),
864                        [&](const int &i) {
865                          b_team1(i) = 1000000 + i + team.league_rank() * 100000;
866                          b_team2(i) = 2000000 + i + team.league_rank() * 100000;
867                          b_team3(i) = 3000000 + i + team.league_rank() * 100000;
868                        });
869   team.team_barrier();
870 
871   Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, 1600),
872                        [&](const int &i) {
873                          b_thread1(i) = 1000000 + 100000 * team.team_rank() +
874                                         16 - i + team.league_rank() * 100000;
875                          b_thread2(i) = 2000000 + 100000 * team.team_rank() +
876                                         16 - i + team.league_rank() * 100000;
877                          b_thread3(i) = 3000000 + 100000 * team.team_rank() +
878                                         16 - i + team.league_rank() * 100000;
879                        });
880 
881   team.team_barrier();
882 
883   int error = 0;
884   Kokkos::parallel_for(
885       Kokkos::TeamThreadRange(team, 0, 128), [&](const int &i) {
886         if (a_team1(i) != 1000000 + i + team.league_rank() * 100000) error++;
887         if (a_team2(i) != 2000000 + i + team.league_rank() * 100000) error++;
888         if (a_team3(i) != 3000000 + i + team.league_rank() * 100000) error++;
889       });
890   team.team_barrier();
891 
892   Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, 16), [&](const int &i) {
893     if (a_thread1(i) != 1000000 + 100000 * team.team_rank() + 16 - i +
894                             team.league_rank() * 100000)
895       error++;
896     if (a_thread2(i) != 2000000 + 100000 * team.team_rank() + 16 - i +
897                             team.league_rank() * 100000)
898       error++;
899     if (a_thread3(i) != 3000000 + 100000 * team.team_rank() + 16 - i +
900                             team.league_rank() * 100000)
901       error++;
902   });
903 
904   Kokkos::parallel_for(
905       Kokkos::TeamThreadRange(team, 0, 12800), [&](const int &i) {
906         if (b_team1(i) != 1000000 + i + team.league_rank() * 100000) error++;
907         if (b_team2(i) != 2000000 + i + team.league_rank() * 100000) error++;
908         if (b_team3(i) != 3000000 + i + team.league_rank() * 100000) error++;
909       });
910   team.team_barrier();
911 
912   Kokkos::parallel_for(
913       Kokkos::ThreadVectorRange(team, 1600), [&](const int &i) {
914         if (b_thread1(i) != 1000000 + 100000 * team.team_rank() + 16 - i +
915                                 team.league_rank() * 100000)
916           error++;
917         if (b_thread2(i) != 2000000 + 100000 * team.team_rank() + 16 - i +
918                                 team.league_rank() * 100000)
919           error++;
920         if (b_thread3(i) != 3000000 + 100000 * team.team_rank() + 16 - i +
921                                 team.league_rank() * 100000)
922           error++;
923       });
924 
925   return error;
926 }
927 
// Work tag selecting the operator() overloads that take a reduction argument;
// used as the tag of the TeamPolicy passed to Kokkos::parallel_reduce.
struct TagReduce {};
// Work tag selecting the operator() overloads without a reduction argument;
// used as the tag of the TeamPolicy passed to Kokkos::parallel_for.
struct TagFor {};
930 
931 template <class ExecSpace, class ScheduleType>
932 struct ClassNoShmemSizeFunction {
933   using member_type =
934       typename Kokkos::TeamPolicy<ExecSpace, ScheduleType>::member_type;
935 
936   Kokkos::View<int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
937 
938   KOKKOS_INLINE_FUNCTION
operator ()Test::ClassNoShmemSizeFunction939   void operator()(const TagFor &, const member_type &team) const {
940     int error = test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
941     errors() += error;
942   }
943 
944   KOKKOS_INLINE_FUNCTION
operator ()Test::ClassNoShmemSizeFunction945   void operator()(const TagReduce &, const member_type &team,
946                   int &error) const {
947     error += test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
948   }
949 
runTest::ClassNoShmemSizeFunction950   void run() {
951     Kokkos::View<int, ExecSpace> d_errors =
952         Kokkos::View<int, ExecSpace>("Errors");
953     errors = d_errors;
954 
955     const int per_team0 =
956         3 *
957         Kokkos::View<double *, ExecSpace,
958                      Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(128);
959     const int per_thread0 =
960         3 *
961         Kokkos::View<double *, ExecSpace,
962                      Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(16);
963 
964     const int per_team1 =
965         3 * Kokkos::View<
966                 double *, ExecSpace,
967                 Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(12800);
968     const int per_thread1 =
969         3 * Kokkos::View<
970                 double *, ExecSpace,
971                 Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(1600);
972 
973     int team_size = 8;
974     if (team_size > ExecSpace::concurrency())
975       team_size = ExecSpace::concurrency();
976     {
977       Kokkos::TeamPolicy<TagFor, ExecSpace, ScheduleType> policy(10, team_size,
978                                                                  16);
979 
980       Kokkos::parallel_for(
981           policy
982               .set_scratch_size(0, Kokkos::PerTeam(per_team0),
983                                 Kokkos::PerThread(per_thread0))
984               .set_scratch_size(1, Kokkos::PerTeam(per_team1),
985                                 Kokkos::PerThread(per_thread1)),
986           *this);
987       Kokkos::fence();
988 
989       typename Kokkos::View<int, ExecSpace>::HostMirror h_errors =
990           Kokkos::create_mirror_view(d_errors);
991       Kokkos::deep_copy(h_errors, d_errors);
992       ASSERT_EQ(h_errors(), 0);
993     }
994 
995     {
996       int error = 0;
997       Kokkos::TeamPolicy<TagReduce, ExecSpace, ScheduleType> policy(
998           10, team_size, 16);
999 
1000       Kokkos::parallel_reduce(
1001           policy
1002               .set_scratch_size(0, Kokkos::PerTeam(per_team0),
1003                                 Kokkos::PerThread(per_thread0))
1004               .set_scratch_size(1, Kokkos::PerTeam(per_team1),
1005                                 Kokkos::PerThread(per_thread1)),
1006           *this, error);
1007 
1008       ASSERT_EQ(error, 0);
1009     }
1010   };
1011 };
1012 
1013 template <class ExecSpace, class ScheduleType>
1014 struct ClassWithShmemSizeFunction {
1015   using member_type =
1016       typename Kokkos::TeamPolicy<ExecSpace, ScheduleType>::member_type;
1017 
1018   Kokkos::View<int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
1019 
1020   KOKKOS_INLINE_FUNCTION
operator ()Test::ClassWithShmemSizeFunction1021   void operator()(const TagFor &, const member_type &team) const {
1022     int error = test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
1023     errors() += error;
1024   }
1025 
1026   KOKKOS_INLINE_FUNCTION
operator ()Test::ClassWithShmemSizeFunction1027   void operator()(const TagReduce &, const member_type &team,
1028                   int &error) const {
1029     error += test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
1030   }
1031 
runTest::ClassWithShmemSizeFunction1032   void run() {
1033     Kokkos::View<int, ExecSpace> d_errors =
1034         Kokkos::View<int, ExecSpace>("Errors");
1035     errors = d_errors;
1036 
1037     const int per_team1 =
1038         3 * Kokkos::View<
1039                 double *, ExecSpace,
1040                 Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(12800);
1041     const int per_thread1 =
1042         3 * Kokkos::View<
1043                 double *, ExecSpace,
1044                 Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(1600);
1045 
1046     int team_size = 8;
1047     if (team_size > ExecSpace::concurrency())
1048       team_size = ExecSpace::concurrency();
1049 
1050     {
1051       Kokkos::TeamPolicy<TagFor, ExecSpace, ScheduleType> policy(10, team_size,
1052                                                                  16);
1053 
1054       Kokkos::parallel_for(
1055           policy.set_scratch_size(1, Kokkos::PerTeam(per_team1),
1056                                   Kokkos::PerThread(per_thread1)),
1057           *this);
1058       Kokkos::fence();
1059 
1060       typename Kokkos::View<int, ExecSpace>::HostMirror h_errors =
1061           Kokkos::create_mirror_view(d_errors);
1062       Kokkos::deep_copy(h_errors, d_errors);
1063       ASSERT_EQ(h_errors(), 0);
1064     }
1065 
1066     {
1067       int error = 0;
1068       Kokkos::TeamPolicy<TagReduce, ExecSpace, ScheduleType> policy(
1069           10, team_size, 16);
1070 
1071       Kokkos::parallel_reduce(
1072           policy.set_scratch_size(1, Kokkos::PerTeam(per_team1),
1073                                   Kokkos::PerThread(per_thread1)),
1074           *this, error);
1075 
1076       ASSERT_EQ(error, 0);
1077     }
1078   };
1079 
team_shmem_sizeTest::ClassWithShmemSizeFunction1080   unsigned team_shmem_size(int team_size) const {
1081     const int per_team0 =
1082         3 *
1083         Kokkos::View<double *, ExecSpace,
1084                      Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(128);
1085     const int per_thread0 =
1086         3 *
1087         Kokkos::View<double *, ExecSpace,
1088                      Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(16);
1089     return per_team0 + team_size * per_thread0;
1090   }
1091 };
1092 
// Lambda-based variant of the multi-level scratch test.
//
// Runs the shared kernel body once through Kokkos::parallel_for (mismatches
// are accumulated in an Atomic-trait view) and once through
// Kokkos::parallel_reduce (mismatches are summed into a host int), on a
// league of 10 teams with both scratch levels requested via
// set_scratch_size. Asserts that both counts are zero.
//
// The whole body is compiled only when KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA is
// defined; otherwise the function does nothing.
template <class ExecSpace, class ScheduleType>
void test_team_mulit_level_scratch_test_lambda() {
#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
  using scratch_view =
      Kokkos::View<double *, ExecSpace,
                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >;

  // `errors` is the Atomic-trait handle captured by the kernel; `d_errors` is
  // the plain handle to the same allocation used for host-side transfers.
  Kokkos::View<int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
  Kokkos::View<int, ExecSpace> d_errors("Errors");
  errors = d_errors;

  // Three views of each kind are created by the kernel body per level.
  const int per_team0   = 3 * scratch_view::shmem_size(128);
  const int per_thread0 = 3 * scratch_view::shmem_size(16);
  const int per_team1   = 3 * scratch_view::shmem_size(12800);
  const int per_thread1 = 3 * scratch_view::shmem_size(1600);

  // Use 8 threads per team unless the execution space reports fewer.
  int team_size = 8;
  if (team_size > ExecSpace::concurrency())
    team_size = ExecSpace::concurrency();

  Kokkos::TeamPolicy<ExecSpace, ScheduleType> policy(10, team_size, 16);

  Kokkos::parallel_for(
      policy
          .set_scratch_size(0, Kokkos::PerTeam(per_team0),
                            Kokkos::PerThread(per_thread0))
          .set_scratch_size(1, Kokkos::PerTeam(per_team1),
                            Kokkos::PerThread(per_thread1)),
      KOKKOS_LAMBDA(
          const typename Kokkos::TeamPolicy<ExecSpace>::member_type &team) {
        int error = test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
        errors() += error;
      });
  Kokkos::fence();

  // Create the mirror from d_errors (not from the Atomic-trait alias) so that
  // the declared mirror type, the mirror source and the deep_copy source all
  // refer to the same view type, as in the functor-based tests.
  typename Kokkos::View<int, ExecSpace>::HostMirror h_errors =
      Kokkos::create_mirror_view(d_errors);
  Kokkos::deep_copy(h_errors, d_errors);
  ASSERT_EQ(h_errors(), 0);

  int error = 0;
  Kokkos::parallel_reduce(
      policy
          .set_scratch_size(0, Kokkos::PerTeam(per_team0),
                            Kokkos::PerThread(per_thread0))
          .set_scratch_size(1, Kokkos::PerTeam(per_team1),
                            Kokkos::PerThread(per_thread1)),
      KOKKOS_LAMBDA(
          const typename Kokkos::TeamPolicy<ExecSpace>::member_type &team,
          int &count) {
        count += test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
      },
      error);
  ASSERT_EQ(error, 0);
#endif
}
1158 
1159 }  // namespace Test
1160 
1161 namespace {
1162 
1163 template <class ExecSpace, class ScheduleType>
1164 struct TestMultiLevelScratchTeam {
TestMultiLevelScratchTeam__anon4dcf17ce1311::TestMultiLevelScratchTeam1165   TestMultiLevelScratchTeam() { run(); }
1166 
run__anon4dcf17ce1311::TestMultiLevelScratchTeam1167   void run() {
1168 #ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
1169     Test::test_team_mulit_level_scratch_test_lambda<ExecSpace, ScheduleType>();
1170 #endif
1171     Test::ClassNoShmemSizeFunction<ExecSpace, ScheduleType> c1;
1172     c1.run();
1173 
1174     Test::ClassWithShmemSizeFunction<ExecSpace, ScheduleType> c2;
1175     c2.run();
1176   }
1177 };
1178 
1179 }  // namespace
1180 
1181 namespace Test {
1182 
1183 template <class ExecSpace>
1184 struct TestShmemSize {
TestShmemSizeTest::TestShmemSize1185   TestShmemSize() { run(); }
1186 
runTest::TestShmemSize1187   void run() {
1188     using view_type = Kokkos::View<int64_t ***, ExecSpace>;
1189 
1190     size_t d1 = 5;
1191     size_t d2 = 6;
1192     size_t d3 = 7;
1193 
1194     size_t size = view_type::shmem_size(d1, d2, d3);
1195 
1196     ASSERT_EQ(size, (d1 * d2 * d3 + 1) * sizeof(int64_t));
1197 
1198     test_layout_stride();
1199   }
1200 
test_layout_strideTest::TestShmemSize1201   void test_layout_stride() {
1202     int rank       = 3;
1203     int order[3]   = {2, 0, 1};
1204     int extents[3] = {100, 10, 3};
1205     auto s1 =
1206         Kokkos::View<double ***, Kokkos::LayoutStride, ExecSpace>::shmem_size(
1207             Kokkos::LayoutStride::order_dimensions(rank, order, extents));
1208     auto s2 =
1209         Kokkos::View<double ***, Kokkos::LayoutRight, ExecSpace>::shmem_size(
1210             extents[0], extents[1], extents[2]);
1211     ASSERT_EQ(s1, s2);
1212   }
1213 };
1214 
1215 }  // namespace Test
1216 
1217 /*--------------------------------------------------------------------------*/
1218 
1219 namespace Test {
1220 
1221 namespace {
1222 
1223 template <class ExecSpace, class ScheduleType, class T, class Enabled = void>
1224 struct TestTeamBroadcast;
1225 
1226 template <class ExecSpace, class ScheduleType, class T>
1227 struct TestTeamBroadcast<
1228     ExecSpace, ScheduleType, T,
1229     typename std::enable_if<(sizeof(T) == sizeof(char)), void>::type> {
1230   using team_member =
1231       typename Kokkos::TeamPolicy<ScheduleType, ExecSpace>::member_type;
1232   using memory_space = typename ExecSpace::memory_space;
1233   using value_type   = T;
1234 
1235   const value_type offset;
1236 
TestTeamBroadcastTest::__anon4dcf17ce1411::TestTeamBroadcast1237   TestTeamBroadcast(const size_t /*league_size*/, const value_type os_)
1238       : offset(os_) {}
1239 
1240   struct BroadcastTag {};
1241 
1242   KOKKOS_INLINE_FUNCTION
operator ()Test::__anon4dcf17ce1411::TestTeamBroadcast1243   void operator()(const team_member &teamMember, value_type &update) const {
1244     int lid = teamMember.league_rank();
1245     int tid = teamMember.team_rank();
1246     int ts  = teamMember.team_size();
1247 
1248     value_type parUpdate = 0;
1249     value_type value     = (value_type)(tid % 0xFF) + offset;
1250 
1251     // broadcast boolean and value to team from source thread
1252     teamMember.team_broadcast(value, lid % ts);
1253 
1254     Kokkos::parallel_reduce(
1255         Kokkos::TeamThreadRange(teamMember, ts),
1256         [&](const int /*j*/, value_type &teamUpdate) { teamUpdate |= value; },
1257         Kokkos::BOr<value_type, memory_space>(parUpdate));
1258 
1259     if (teamMember.team_rank() == 0) update |= parUpdate;
1260   }
1261 
1262   KOKKOS_INLINE_FUNCTION
operator ()Test::__anon4dcf17ce1411::TestTeamBroadcast1263   void operator()(const BroadcastTag &, const team_member &teamMember,
1264                   value_type &update) const {
1265     int lid = teamMember.league_rank();
1266     int tid = teamMember.team_rank();
1267     int ts  = teamMember.team_size();
1268 
1269     value_type parUpdate = 0;
1270     value_type value     = (value_type)(tid % 0xFF) + offset;
1271 
1272     teamMember.team_broadcast([&](value_type &var) { var -= offset; }, value,
1273                               lid % ts);
1274 
1275     Kokkos::parallel_reduce(
1276         Kokkos::TeamThreadRange(teamMember, ts),
1277         [&](const int /*j*/, value_type &teamUpdate) { teamUpdate |= value; },
1278         Kokkos::BOr<value_type, memory_space>(parUpdate));
1279 
1280     if (teamMember.team_rank() == 0) update |= parUpdate;
1281   }
1282 
test_teambroadcastTest::__anon4dcf17ce1411::TestTeamBroadcast1283   static void test_teambroadcast(const size_t league_size,
1284                                  const value_type off) {
1285     TestTeamBroadcast functor(league_size, off);
1286 
1287     using policy_type = Kokkos::TeamPolicy<ScheduleType, ExecSpace>;
1288     using policy_type_f =
1289         Kokkos::TeamPolicy<ScheduleType, ExecSpace, BroadcastTag>;
1290 
1291     // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
1292 #ifdef KOKKOS_ENABLE_OPENMPTARGET
1293     int fake_team_size =
1294         std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value ? 32
1295                                                                            : 1;
1296 #else
1297     int fake_team_size = 1;
1298 #endif
1299     const int team_size =
1300         policy_type_f(league_size, fake_team_size)
1301             .team_size_max(
1302                 functor,
1303                 Kokkos::
1304                     ParallelReduceTag());  // printf("team_size=%d\n",team_size);
1305 
1306     // team_broadcast with value
1307     value_type total = 0;
1308 
1309     Kokkos::parallel_reduce(policy_type(league_size, team_size), functor,
1310                             Kokkos::BOr<value_type, Kokkos::HostSpace>(total));
1311 
1312     value_type expected_result = 0;
1313     for (unsigned int i = 0; i < league_size; i++) {
1314       value_type val = (value_type((i % team_size % 0xFF)) + off);
1315       expected_result |= val;
1316     }
1317     ASSERT_EQ(expected_result, total);
1318     // printf("team_broadcast with value --"
1319     //"expected_result=%x,"
1320     //"total=%x\n",expected_result, total);
1321 
1322     // team_broadcast with function object
1323     total = 0;
1324 
1325     Kokkos::parallel_reduce(policy_type_f(league_size, team_size), functor,
1326                             Kokkos::BOr<value_type, Kokkos::HostSpace>(total));
1327 
1328     expected_result = 0;
1329     for (unsigned int i = 0; i < league_size; i++) {
1330       value_type val = ((value_type)((i % team_size % 0xFF)));
1331       expected_result |= val;
1332     }
1333     ASSERT_EQ(expected_result, total);
1334     // printf("team_broadcast with function object --"
1335     // "expected_result=%x,"
1336     // "total=%x\n",expected_result, total);
1337   }
1338 };
1339 
1340 template <class ExecSpace, class ScheduleType, class T>
1341 struct TestTeamBroadcast<
1342     ExecSpace, ScheduleType, T,
1343     typename std::enable_if<(sizeof(T) > sizeof(char)), void>::type> {
1344   using team_member =
1345       typename Kokkos::TeamPolicy<ScheduleType, ExecSpace>::member_type;
1346   using value_type = T;
1347 
1348   const value_type offset;
1349 
TestTeamBroadcastTest::__anon4dcf17ce1411::TestTeamBroadcast1350   TestTeamBroadcast(const size_t /*league_size*/, const value_type os_)
1351       : offset(os_) {}
1352 
1353   struct BroadcastTag {};
1354 
1355   KOKKOS_INLINE_FUNCTION
operator ()Test::__anon4dcf17ce1411::TestTeamBroadcast1356   void operator()(const team_member &teamMember, value_type &update) const {
1357     int lid = teamMember.league_rank();
1358     int tid = teamMember.team_rank();
1359     int ts  = teamMember.team_size();
1360 
1361     value_type parUpdate = 0;
1362     value_type value     = (value_type)(tid * 3) + offset;
1363 
1364     // setValue is used to determine if the update should be
1365     // performed at the bottom.  The thread id must match the
1366     // thread id used to broadcast the value.  It is the
1367     // thread id that matches the league rank mod team size
1368     // this way each league rank will use a different thread id
1369     // which is likely not 0
1370     bool setValue = ((lid % ts) == tid);
1371 
1372     // broadcast boolean and value to team from source thread
1373     teamMember.team_broadcast(value, lid % ts);
1374     teamMember.team_broadcast(setValue, lid % ts);
1375 
1376     Kokkos::parallel_reduce(
1377         Kokkos::TeamThreadRange(teamMember, ts),
1378         [&](const int /*j*/, value_type &teamUpdate) { teamUpdate += value; },
1379         parUpdate);
1380 
1381     if (teamMember.team_rank() == 0 && setValue) update += parUpdate;
1382   }
1383 
1384   KOKKOS_INLINE_FUNCTION
operator ()Test::__anon4dcf17ce1411::TestTeamBroadcast1385   void operator()(const BroadcastTag &, const team_member &teamMember,
1386                   value_type &update) const {
1387     int lid = teamMember.league_rank();
1388     int tid = teamMember.team_rank();
1389     int ts  = teamMember.team_size();
1390 
1391     value_type parUpdate = 0;
1392     value_type value     = (value_type)(tid * 3) + offset;
1393 
1394     // setValue is used to determine if the update should be
1395     // performed at the bottom.  The thread id must match the
1396     // thread id used to broadcast the value.  It is the
1397     // thread id that matches the league rank mod team size
1398     // this way each league rank will use a different thread id
1399     // which is likely not 0. Note the logic is switched from
1400     // above because the functor switches it back.
1401     bool setValue = ((lid % ts) != tid);
1402 
1403     teamMember.team_broadcast([&](value_type &var) { var *= 2; }, value,
1404                               lid % ts);
1405     teamMember.team_broadcast([&](bool &bVar) { bVar = !bVar; }, setValue,
1406                               lid % ts);
1407 
1408     Kokkos::parallel_reduce(
1409         Kokkos::TeamThreadRange(teamMember, ts),
1410         [&](const int /*j*/, value_type &teamUpdate) { teamUpdate += value; },
1411         parUpdate);
1412 
1413     if (teamMember.team_rank() == 0 && setValue) update += parUpdate;
1414   }
1415 
1416   template <class ScalarType>
1417   static inline
1418       typename std::enable_if<!std::is_integral<ScalarType>::value, void>::type
compare_testTest::__anon4dcf17ce1411::TestTeamBroadcast1419       compare_test(ScalarType A, ScalarType B, double epsilon_factor) {
1420     if (std::is_same<ScalarType, double>::value ||
1421         std::is_same<ScalarType, float>::value) {
1422       ASSERT_NEAR((double)A, (double)B,
1423                   epsilon_factor * std::abs(A) *
1424                       std::numeric_limits<ScalarType>::epsilon());
1425     } else {
1426       ASSERT_EQ(A, B);
1427     }
1428   }
1429 
1430   template <class ScalarType>
1431   static inline
1432       typename std::enable_if<std::is_integral<ScalarType>::value, void>::type
compare_testTest::__anon4dcf17ce1411::TestTeamBroadcast1433       compare_test(ScalarType A, ScalarType B, double) {
1434     ASSERT_EQ(A, B);
1435   }
1436 
test_teambroadcastTest::__anon4dcf17ce1411::TestTeamBroadcast1437   static void test_teambroadcast(const size_t league_size,
1438                                  const value_type off) {
1439     TestTeamBroadcast functor(league_size, off);
1440 
1441     using policy_type = Kokkos::TeamPolicy<ScheduleType, ExecSpace>;
1442     using policy_type_f =
1443         Kokkos::TeamPolicy<ScheduleType, ExecSpace, BroadcastTag>;
1444 
1445     // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
1446 #ifdef KOKKOS_ENABLE_OPENMPTARGET
1447     int fake_team_size =
1448         std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value ? 32
1449                                                                            : 1;
1450 #else
1451     int fake_team_size = 1;
1452 #endif
1453     const int team_size =
1454         policy_type_f(league_size, fake_team_size)
1455             .team_size_max(
1456                 functor,
1457                 Kokkos::
1458                     ParallelReduceTag());  // printf("team_size=%d\n",team_size);
1459     // team_broadcast with value
1460     value_type total = 0;
1461 
1462     Kokkos::parallel_reduce(policy_type(league_size, team_size), functor,
1463                             total);
1464 
1465     value_type expected_result = 0;
1466     for (unsigned int i = 0; i < league_size; i++) {
1467       value_type val =
1468           (value_type((i % team_size) * 3) + off) * (value_type)team_size;
1469       expected_result += val;
1470     }
1471     // For comparison purposes treat the reduction as a random walk in the
1472     // least significant digit, which gives a typical walk distance
1473     // sqrt(league_size) Add 4x for larger sigma
1474     compare_test(expected_result, total, 4.0 * std::sqrt(league_size));
1475 
1476     // team_broadcast with function object
1477     total = 0;
1478 
1479     Kokkos::parallel_reduce(policy_type_f(league_size, team_size), functor,
1480                             total);
1481 
1482     expected_result = 0;
1483     for (unsigned int i = 0; i < league_size; i++) {
1484       value_type val = ((value_type)((i % team_size) * 3) + off) *
1485                        (value_type)(2 * team_size);
1486       expected_result += val;
1487     }
1488     // For comparison purposes treat the reduction as a random walk in the
1489     // least significant digit, which gives a typical walk distance
1490     // sqrt(league_size) Add 4x for larger sigma
1491     compare_test(expected_result, total, 4.0 * std::sqrt(league_size));
1492   }
1493 };
1494 
1495 template <class ExecSpace>
1496 struct TestScratchAlignment {
1497   struct TestScalar {
1498     double x, y, z;
1499   };
TestScratchAlignmentTest::__anon4dcf17ce1411::TestScratchAlignment1500   TestScratchAlignment() {
1501     test(true);
1502     test(false);
1503   }
1504   using ScratchView =
1505       Kokkos::View<TestScalar *, typename ExecSpace::scratch_memory_space>;
1506   using ScratchViewInt =
1507       Kokkos::View<int *, typename ExecSpace::scratch_memory_space>;
testTest::__anon4dcf17ce1411::TestScratchAlignment1508   void test(bool allocate_small) {
1509     int shmem_size = ScratchView::shmem_size(11);
1510 #ifdef KOKKOS_ENABLE_OPENMPTARGET
1511     int team_size = 32;
1512 #else
1513     int team_size      = 1;
1514 #endif
1515     if (allocate_small) shmem_size += ScratchViewInt::shmem_size(1);
1516     Kokkos::parallel_for(
1517         Kokkos::TeamPolicy<ExecSpace>(1, team_size)
1518             .set_scratch_size(0, Kokkos::PerTeam(shmem_size)),
1519         KOKKOS_LAMBDA(
1520             const typename Kokkos::TeamPolicy<ExecSpace>::member_type &team) {
1521           if (allocate_small) ScratchViewInt p(team.team_scratch(0), 1);
1522           ScratchView a(team.team_scratch(0), 11);
1523           if (ptrdiff_t(a.data()) % sizeof(TestScalar) != 0)
1524             Kokkos::abort("Error: invalid scratch view alignment\n");
1525         });
1526     Kokkos::fence();
1527   }
1528 };
1529 
1530 }  // namespace
1531 
1532 namespace {
1533 
// Launches a team kernel whose lambda receives the team member handle by
// value instead of by reference. Constructing an instance runs the kernel
// (only when lambda dispatch is enabled).
template <class ExecSpace>
struct TestTeamPolicyHandleByValue {
  using scalar     = double;
  using exec_space = ExecSpace;
  using mem_space  = typename ExecSpace::memory_space;

  TestTeamPolicyHandleByValue() { test(); }

  // Adds b into a element-wise on 1 x 1 views from inside a nested
  // TeamThreadRange loop; no result is checked here.
  void test() {
#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
    using matrix_type = Kokkos::View<scalar **, mem_space>;
    using policy_type = Kokkos::TeamPolicy<exec_space>;
    using member_type = typename policy_type::member_type;

    const int M = 1;
    const int N = 1;

    matrix_type a("a", M, N);
    matrix_type b("b", M, N);
    Kokkos::deep_copy(a, 0.0);
    Kokkos::deep_copy(b, 1.0);

    Kokkos::parallel_for(
        "test_tphandle_by_value", policy_type(M, Kokkos::AUTO(), 1),
        // The handle is deliberately taken by value.
        KOKKOS_LAMBDA(const member_type team) {
          const int row = team.league_rank();
          Kokkos::parallel_for(
              Kokkos::TeamThreadRange(team, 0, N),
              [&](const int col) { a(row, col) += b(row, col); });
        });
#endif
  }
};
1561 
1562 }  // namespace
1563 
1564 }  // namespace Test
1565 
1566 /*--------------------------------------------------------------------------*/
1567