1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 // Kokkos v. 3.0
6 // Copyright (2020) National Technology & Engineering
7 // Solutions of Sandia, LLC (NTESS).
8 //
9 // Under the terms of Contract DE-NA0003525 with NTESS,
10 // the U.S. Government retains certain rights in this software.
11 //
12 // Redistribution and use in source and binary forms, with or without
13 // modification, are permitted provided that the following conditions are
14 // met:
15 //
16 // 1. Redistributions of source code must retain the above copyright
17 // notice, this list of conditions and the following disclaimer.
18 //
19 // 2. Redistributions in binary form must reproduce the above copyright
20 // notice, this list of conditions and the following disclaimer in the
21 // documentation and/or other materials provided with the distribution.
22 //
23 // 3. Neither the name of the Corporation nor the names of the
24 // contributors may be used to endorse or promote products derived from
25 // this software without specific prior written permission.
26 //
27 // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
28 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
31 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 //
39 // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
40 //
41 // ************************************************************************
42 //@HEADER
43 */
44
45 #include <cstdio>
46 #include <stdexcept>
47 #include <sstream>
48 #include <iostream>
49
50 #include <Kokkos_Core.hpp>
51
52 namespace Test {
53
54 namespace {
55
56 template <class ExecSpace, class ScheduleType>
57 struct TestTeamPolicy {
58 using team_member =
59 typename Kokkos::TeamPolicy<ScheduleType, ExecSpace>::member_type;
60 using view_type = Kokkos::View<int **, ExecSpace>;
61
62 view_type m_flags;
63
TestTeamPolicyTest::__anon4dcf17ce0111::TestTeamPolicy64 TestTeamPolicy(const size_t league_size)
65 : m_flags(
66 Kokkos::view_alloc(Kokkos::WithoutInitializing, "flags"),
67 // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
68 #ifdef KOKKOS_ENABLE_OPENMPTARGET
69 Kokkos::TeamPolicy<ScheduleType, ExecSpace>(1, 32).team_size_max(
70 *this, Kokkos::ParallelReduceTag()),
71 #else
72 Kokkos::TeamPolicy<ScheduleType, ExecSpace>(1, 1).team_size_max(
73 *this, Kokkos::ParallelReduceTag()),
74 #endif
75 league_size) {
76 }
77
78 struct VerifyInitTag {};
79
80 KOKKOS_INLINE_FUNCTION
operator ()Test::__anon4dcf17ce0111::TestTeamPolicy81 void operator()(const team_member &member) const {
82 const int tid =
83 member.team_rank() + member.team_size() * member.league_rank();
84
85 m_flags(member.team_rank(), member.league_rank()) = tid;
86 static_assert(
87 (std::is_same<typename team_member::execution_space, ExecSpace>::value),
88 "TeamMember::execution_space is not the same as "
89 "TeamPolicy<>::execution_space");
90 }
91
92 KOKKOS_INLINE_FUNCTION
operator ()Test::__anon4dcf17ce0111::TestTeamPolicy93 void operator()(const VerifyInitTag &, const team_member &member) const {
94 const int tid =
95 member.team_rank() + member.team_size() * member.league_rank();
96
97 if (tid != m_flags(member.team_rank(), member.league_rank())) {
98 KOKKOS_IMPL_DO_NOT_USE_PRINTF(
99 "TestTeamPolicy member(%d,%d) error %d != %d\n", member.league_rank(),
100 member.team_rank(), tid,
101 m_flags(member.team_rank(), member.league_rank()));
102 }
103 }
104
105 // Included for test_small_league_size.
TestTeamPolicyTest::__anon4dcf17ce0111::TestTeamPolicy106 TestTeamPolicy() : m_flags() {}
107
108 // Included for test_small_league_size.
109 struct NoOpTag {};
110
111 KOKKOS_INLINE_FUNCTION
operator ()Test::__anon4dcf17ce0111::TestTeamPolicy112 void operator()(const NoOpTag &, const team_member & /*member*/) const {}
113
test_small_league_sizeTest::__anon4dcf17ce0111::TestTeamPolicy114 static void test_small_league_size() {
115 int bs = 8; // batch size (number of elements per batch)
116 int ns = 16; // total number of "problems" to process
117
118 // Calculate total scratch memory space size.
119 const int level = 0;
120 int mem_size = 960;
121 const int num_teams = ns / bs;
122 Kokkos::TeamPolicy<ExecSpace, NoOpTag> policy(num_teams, Kokkos::AUTO());
123
124 Kokkos::parallel_for(
125 policy.set_scratch_size(level, Kokkos::PerTeam(mem_size),
126 Kokkos::PerThread(0)),
127 TestTeamPolicy());
128 }
129
test_constructorsTest::__anon4dcf17ce0111::TestTeamPolicy130 static void test_constructors() {
131 constexpr const int smallest_work = 1;
132 // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
133 #ifdef KOKKOS_ENABLE_OPENMPTARGET
134 Kokkos::TeamPolicy<ExecSpace, NoOpTag> none_auto(smallest_work, 32,
135 smallest_work);
136 #else
137 Kokkos::TeamPolicy<ExecSpace, NoOpTag> none_auto(
138 smallest_work, smallest_work, smallest_work);
139 #endif
140 Kokkos::TeamPolicy<ExecSpace, NoOpTag> both_auto(
141 smallest_work, Kokkos::AUTO(), Kokkos::AUTO());
142 // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
143 #ifdef KOKKOS_ENABLE_OPENMPTARGET
144 Kokkos::TeamPolicy<ExecSpace, NoOpTag> auto_vector(smallest_work, 32,
145 Kokkos::AUTO());
146 #else
147 Kokkos::TeamPolicy<ExecSpace, NoOpTag> auto_vector(
148 smallest_work, smallest_work, Kokkos::AUTO());
149 #endif
150 Kokkos::TeamPolicy<ExecSpace, NoOpTag> auto_team(
151 smallest_work, Kokkos::AUTO(), smallest_work);
152 }
153
test_forTest::__anon4dcf17ce0111::TestTeamPolicy154 static void test_for(const size_t league_size) {
155 {
156 TestTeamPolicy functor(league_size);
157 using policy_type = Kokkos::TeamPolicy<ScheduleType, ExecSpace>;
158 using policy_type_init =
159 Kokkos::TeamPolicy<ScheduleType, ExecSpace, VerifyInitTag>;
160
161 // FIXME_OPENMPTARGET temporary restriction for team size to be at least
162 // 32
163 #ifdef KOKKOS_ENABLE_OPENMPTARGET
164 const int team_size =
165 policy_type(league_size, 32)
166 .team_size_max(functor, Kokkos::ParallelForTag());
167 const int team_size_init =
168 policy_type_init(league_size, 32)
169 .team_size_max(functor, Kokkos::ParallelForTag());
170 #else
171 const int team_size =
172 policy_type(league_size, 1)
173 .team_size_max(functor, Kokkos::ParallelForTag());
174 const int team_size_init =
175 policy_type_init(league_size, 1)
176 .team_size_max(functor, Kokkos::ParallelForTag());
177 #endif
178
179 Kokkos::parallel_for(policy_type(league_size, team_size), functor);
180 Kokkos::parallel_for(policy_type_init(league_size, team_size_init),
181 functor);
182 }
183
184 test_small_league_size();
185 test_constructors();
186 }
187
188 struct ReduceTag {};
189
190 using value_type = int64_t;
191
192 KOKKOS_INLINE_FUNCTION
operator ()Test::__anon4dcf17ce0111::TestTeamPolicy193 void operator()(const team_member &member, value_type &update) const {
194 update += member.team_rank() + member.team_size() * member.league_rank();
195 }
196
197 KOKKOS_INLINE_FUNCTION
operator ()Test::__anon4dcf17ce0111::TestTeamPolicy198 void operator()(const ReduceTag &, const team_member &member,
199 value_type &update) const {
200 update +=
201 1 + member.team_rank() + member.team_size() * member.league_rank();
202 }
203
test_reduceTest::__anon4dcf17ce0111::TestTeamPolicy204 static void test_reduce(const size_t league_size) {
205 TestTeamPolicy functor(league_size);
206
207 using policy_type = Kokkos::TeamPolicy<ScheduleType, ExecSpace>;
208 using policy_type_reduce =
209 Kokkos::TeamPolicy<ScheduleType, ExecSpace, ReduceTag>;
210
211 // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
212 #ifdef KOKKOS_ENABLE_OPENMPTARGET
213 const int team_size =
214 policy_type_reduce(league_size, 32)
215 .team_size_max(functor, Kokkos::ParallelReduceTag());
216 #else
217 const int team_size =
218 policy_type_reduce(league_size, 1)
219 .team_size_max(functor, Kokkos::ParallelReduceTag());
220 #endif
221
222 const int64_t N = team_size * league_size;
223
224 int64_t total = 0;
225
226 Kokkos::parallel_reduce(policy_type(league_size, team_size), functor,
227 total);
228 ASSERT_EQ(size_t((N - 1) * (N)) / 2, size_t(total));
229
230 Kokkos::parallel_reduce(policy_type_reduce(league_size, team_size), functor,
231 total);
232 ASSERT_EQ((size_t(N) * size_t(N + 1)) / 2, size_t(total));
233 }
234 };
235
236 } // namespace
237
238 } // namespace Test
239
240 /*--------------------------------------------------------------------------*/
241
242 namespace Test {
243
244 template <typename ScalarType, class DeviceType, class ScheduleType>
245 class ReduceTeamFunctor {
246 public:
247 using execution_space = DeviceType;
248 using policy_type = Kokkos::TeamPolicy<ScheduleType, execution_space>;
249 using size_type = typename execution_space::size_type;
250
251 struct value_type {
252 ScalarType value[3];
253 };
254
255 const size_type nwork;
256
257 KOKKOS_INLINE_FUNCTION
ReduceTeamFunctor(const size_type & arg_nwork)258 ReduceTeamFunctor(const size_type &arg_nwork) : nwork(arg_nwork) {}
259
260 KOKKOS_INLINE_FUNCTION
ReduceTeamFunctor(const ReduceTeamFunctor & rhs)261 ReduceTeamFunctor(const ReduceTeamFunctor &rhs) : nwork(rhs.nwork) {}
262
263 KOKKOS_INLINE_FUNCTION
init(value_type & dst) const264 void init(value_type &dst) const {
265 dst.value[0] = 0;
266 dst.value[1] = 0;
267 dst.value[2] = 0;
268 }
269
270 KOKKOS_INLINE_FUNCTION
join(volatile value_type & dst,const volatile value_type & src) const271 void join(volatile value_type &dst, const volatile value_type &src) const {
272 dst.value[0] += src.value[0];
273 dst.value[1] += src.value[1];
274 dst.value[2] += src.value[2];
275 }
276
277 KOKKOS_INLINE_FUNCTION
operator ()(const typename policy_type::member_type ind,value_type & dst) const278 void operator()(const typename policy_type::member_type ind,
279 value_type &dst) const {
280 const int thread_rank =
281 ind.team_rank() + ind.team_size() * ind.league_rank();
282 const int thread_size = ind.team_size() * ind.league_size();
283 const int chunk = (nwork + thread_size - 1) / thread_size;
284
285 size_type iwork = chunk * thread_rank;
286 const size_type iwork_end = iwork + chunk < nwork ? iwork + chunk : nwork;
287
288 for (; iwork < iwork_end; ++iwork) {
289 dst.value[0] += 1;
290 dst.value[1] += iwork + 1;
291 dst.value[2] += nwork - iwork;
292 }
293 }
294 };
295
296 } // namespace Test
297
298 namespace {
299
300 template <typename ScalarType, class DeviceType, class ScheduleType>
301 class TestReduceTeam {
302 public:
303 using execution_space = DeviceType;
304 using policy_type = Kokkos::TeamPolicy<ScheduleType, execution_space>;
305 using size_type = typename execution_space::size_type;
306
TestReduceTeam(const size_type & nwork)307 TestReduceTeam(const size_type &nwork) { run_test(nwork); }
308
run_test(const size_type & nwork)309 void run_test(const size_type &nwork) {
310 using functor_type =
311 Test::ReduceTeamFunctor<ScalarType, execution_space, ScheduleType>;
312 using value_type = typename functor_type::value_type;
313 using result_type =
314 Kokkos::View<value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
315
316 enum { Count = 3 };
317 enum { Repeat = 100 };
318
319 value_type result[Repeat];
320
321 const uint64_t nw = nwork;
322 const uint64_t nsum = nw % 2 ? nw * ((nw + 1) / 2) : (nw / 2) * (nw + 1);
323
324 policy_type team_exec(nw, 1);
325
326 const unsigned team_size = team_exec.team_size_recommended(
327 functor_type(nwork), Kokkos::ParallelReduceTag());
328 const unsigned league_size = (nwork + team_size - 1) / team_size;
329
330 team_exec = policy_type(league_size, team_size);
331
332 for (unsigned i = 0; i < Repeat; ++i) {
333 result_type tmp(&result[i]);
334 Kokkos::parallel_reduce(team_exec, functor_type(nwork), tmp);
335 }
336
337 execution_space().fence();
338
339 for (unsigned i = 0; i < Repeat; ++i) {
340 for (unsigned j = 0; j < Count; ++j) {
341 const uint64_t correct = 0 == j % 3 ? nw : nsum;
342 ASSERT_EQ((ScalarType)correct, result[i].value[j]);
343 }
344 }
345 }
346 };
347
348 } // namespace
349
350 /*--------------------------------------------------------------------------*/
351
352 namespace Test {
353
354 template <class DeviceType, class ScheduleType>
355 class ScanTeamFunctor {
356 public:
357 using execution_space = DeviceType;
358 using policy_type = Kokkos::TeamPolicy<ScheduleType, execution_space>;
359 using value_type = int64_t;
360
361 Kokkos::View<value_type, execution_space> accum;
362 Kokkos::View<value_type, execution_space> total;
363
ScanTeamFunctor()364 ScanTeamFunctor() : accum("accum"), total("total") {}
365
366 KOKKOS_INLINE_FUNCTION
init(value_type & error) const367 void init(value_type &error) const { error = 0; }
368
369 KOKKOS_INLINE_FUNCTION
join(value_type volatile & error,value_type volatile const & input) const370 void join(value_type volatile &error,
371 value_type volatile const &input) const {
372 if (input) error = 1;
373 }
374
375 struct JoinMax {
376 using value_type = int64_t;
377
378 KOKKOS_INLINE_FUNCTION
joinTest::ScanTeamFunctor::JoinMax379 void join(value_type volatile &dst,
380 value_type volatile const &input) const {
381 if (dst < input) dst = input;
382 }
383 };
384
385 KOKKOS_INLINE_FUNCTION
operator ()(const typename policy_type::member_type ind,value_type & error) const386 void operator()(const typename policy_type::member_type ind,
387 value_type &error) const {
388 if (0 == ind.league_rank() && 0 == ind.team_rank()) {
389 const int64_t thread_count = ind.league_size() * ind.team_size();
390 total() = (thread_count * (thread_count + 1)) / 2;
391 }
392
393 // Team max:
394 int64_t m = (int64_t)(ind.league_rank() + ind.team_rank());
395 ind.team_reduce(Kokkos::Max<int64_t>(m));
396
397 if (m != ind.league_rank() + (ind.team_size() - 1)) {
398 KOKKOS_IMPL_DO_NOT_USE_PRINTF(
399 "ScanTeamFunctor[%i.%i of %i.%i] reduce_max_answer(%li) != "
400 "reduce_max(%li)\n",
401 static_cast<int>(ind.league_rank()),
402 static_cast<int>(ind.team_rank()),
403 static_cast<int>(ind.league_size()),
404 static_cast<int>(ind.team_size()),
405 static_cast<long>(ind.league_rank() + (ind.team_size() - 1)),
406 static_cast<long>(m));
407 }
408
409 // Scan:
410 const int64_t answer = (ind.league_rank() + 1) * ind.team_rank() +
411 (ind.team_rank() * (ind.team_rank() + 1)) / 2;
412
413 const int64_t result =
414 ind.team_scan(ind.league_rank() + 1 + ind.team_rank() + 1);
415
416 const int64_t result2 =
417 ind.team_scan(ind.league_rank() + 1 + ind.team_rank() + 1);
418
419 if (answer != result || answer != result2) {
420 KOKKOS_IMPL_DO_NOT_USE_PRINTF(
421 "ScanTeamFunctor[%i.%i of %i.%i] answer(%li) != scan_first(%li) or "
422 "scan_second(%li)\n",
423 static_cast<int>(ind.league_rank()),
424 static_cast<int>(ind.team_rank()),
425 static_cast<int>(ind.league_size()),
426 static_cast<int>(ind.team_size()), static_cast<long>(answer),
427 static_cast<long>(result), static_cast<long>(result2));
428
429 error = 1;
430 }
431
432 const int64_t thread_rank =
433 ind.team_rank() + ind.team_size() * ind.league_rank();
434 ind.team_scan(1 + thread_rank, accum.data());
435 }
436 };
437
438 template <class DeviceType, class ScheduleType>
439 class TestScanTeam {
440 public:
441 using execution_space = DeviceType;
442 using value_type = int64_t;
443 using policy_type = Kokkos::TeamPolicy<ScheduleType, execution_space>;
444 using functor_type = Test::ScanTeamFunctor<DeviceType, ScheduleType>;
445
TestScanTeam(const size_t nteam)446 TestScanTeam(const size_t nteam) { run_test(nteam); }
447
run_test(const size_t nteam)448 void run_test(const size_t nteam) {
449 using result_type =
450 Kokkos::View<int64_t, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
451
452 const unsigned REPEAT = 100000;
453 unsigned Repeat;
454
455 if (nteam == 0) {
456 Repeat = 1;
457 } else {
458 Repeat = (REPEAT + nteam - 1) / nteam; // Error here.
459 }
460
461 functor_type functor;
462
463 policy_type team_exec(nteam, 1);
464 team_exec = policy_type(
465 nteam, team_exec.team_size_max(functor, Kokkos::ParallelReduceTag()));
466
467 for (unsigned i = 0; i < Repeat; ++i) {
468 int64_t accum = 0;
469 int64_t total = 0;
470 int64_t error = 0;
471 Kokkos::deep_copy(functor.accum, total);
472
473 Kokkos::parallel_reduce(team_exec, functor, result_type(&error));
474 DeviceType().fence();
475
476 Kokkos::deep_copy(accum, functor.accum);
477 Kokkos::deep_copy(total, functor.total);
478
479 ASSERT_EQ(error, 0);
480 ASSERT_EQ(total, accum);
481 }
482
483 execution_space().fence();
484 }
485 };
486
487 } // namespace Test
488
489 /*--------------------------------------------------------------------------*/
490
491 namespace Test {
492
493 template <class ExecSpace, class ScheduleType>
494 struct SharedTeamFunctor {
495 using execution_space = ExecSpace;
496 using value_type = int;
497 using policy_type = Kokkos::TeamPolicy<ScheduleType, execution_space>;
498
499 enum { SHARED_COUNT = 1000 };
500
501 using shmem_space = typename ExecSpace::scratch_memory_space;
502
503 // TBD: MemoryUnmanaged should be the default for shared memory space.
504 using shared_int_array_type =
505 Kokkos::View<int *, shmem_space, Kokkos::MemoryUnmanaged>;
506
507 // Tell how much shared memory will be required by this functor.
team_shmem_sizeTest::SharedTeamFunctor508 inline unsigned team_shmem_size(int /*team_size*/) const {
509 return shared_int_array_type::shmem_size(SHARED_COUNT) +
510 shared_int_array_type::shmem_size(SHARED_COUNT);
511 }
512
513 KOKKOS_INLINE_FUNCTION
operator ()Test::SharedTeamFunctor514 void operator()(const typename policy_type::member_type &ind,
515 value_type &update) const {
516 const shared_int_array_type shared_A(ind.team_shmem(), SHARED_COUNT);
517 const shared_int_array_type shared_B(ind.team_shmem(), SHARED_COUNT);
518
519 if ((shared_A.data() == nullptr && SHARED_COUNT > 0) ||
520 (shared_B.data() == nullptr && SHARED_COUNT > 0)) {
521 KOKKOS_IMPL_DO_NOT_USE_PRINTF(
522 "member( %i/%i , %i/%i ) Failed to allocate shared memory of size "
523 "%lu\n",
524 static_cast<int>(ind.league_rank()),
525 static_cast<int>(ind.league_size()),
526 static_cast<int>(ind.team_rank()), static_cast<int>(ind.team_size()),
527 static_cast<unsigned long>(SHARED_COUNT));
528
529 ++update; // Failure to allocate is an error.
530 } else {
531 for (int i = ind.team_rank(); i < SHARED_COUNT; i += ind.team_size()) {
532 shared_A[i] = i + ind.league_rank();
533 shared_B[i] = 2 * i + ind.league_rank();
534 }
535
536 ind.team_barrier();
537
538 if (ind.team_rank() + 1 == ind.team_size()) {
539 for (int i = 0; i < SHARED_COUNT; ++i) {
540 if (shared_A[i] != i + ind.league_rank()) {
541 ++update;
542 }
543
544 if (shared_B[i] != 2 * i + ind.league_rank()) {
545 ++update;
546 }
547 }
548 }
549 }
550 }
551 };
552
553 } // namespace Test
554
555 namespace {
556
557 template <class ExecSpace, class ScheduleType>
558 struct TestSharedTeam {
TestSharedTeam__anon4dcf17ce0611::TestSharedTeam559 TestSharedTeam() { run(); }
560
run__anon4dcf17ce0611::TestSharedTeam561 void run() {
562 using Functor = Test::SharedTeamFunctor<ExecSpace, ScheduleType>;
563 using result_type =
564 Kokkos::View<typename Functor::value_type, Kokkos::HostSpace,
565 Kokkos::MemoryUnmanaged>;
566
567 #ifdef KOKKOS_ENABLE_OPENMPTARGET
568 const size_t team_size =
569 Kokkos::TeamPolicy<ScheduleType, ExecSpace>(64, 32).team_size_max(
570 Functor(), Kokkos::ParallelReduceTag());
571
572 Kokkos::TeamPolicy<ScheduleType, ExecSpace> team_exec(32 / team_size,
573 team_size);
574 #else
575 const size_t team_size =
576 Kokkos::TeamPolicy<ScheduleType, ExecSpace>(8192, 1).team_size_max(
577 Functor(), Kokkos::ParallelReduceTag());
578
579 Kokkos::TeamPolicy<ScheduleType, ExecSpace> team_exec(8192 / team_size,
580 team_size);
581 #endif
582
583 typename Functor::value_type error_count = 0;
584
585 Kokkos::parallel_reduce(team_exec, Functor(), result_type(&error_count));
586 Kokkos::fence();
587
588 ASSERT_EQ(error_count, 0);
589 }
590 };
591
592 } // namespace
593
594 namespace Test {
595
596 #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
597 template <class MemorySpace, class ExecSpace, class ScheduleType>
598 struct TestLambdaSharedTeam {
TestLambdaSharedTeamTest::TestLambdaSharedTeam599 TestLambdaSharedTeam() { run(); }
600
runTest::TestLambdaSharedTeam601 void run() {
602 using Functor = Test::SharedTeamFunctor<ExecSpace, ScheduleType>;
603 using result_type = Kokkos::View<typename Functor::value_type, MemorySpace,
604 Kokkos::MemoryUnmanaged>;
605
606 using shmem_space = typename ExecSpace::scratch_memory_space;
607
608 // TBD: MemoryUnmanaged should be the default for shared memory space.
609 using shared_int_array_type =
610 Kokkos::View<int *, shmem_space, Kokkos::MemoryUnmanaged>;
611
612 const int SHARED_COUNT = 1000;
613 #ifdef KOKKOS_ENABLE_OPENMPTARGET
614 int team_size = 32;
615 #else
616 int team_size = 1;
617 #endif
618
619 #ifdef KOKKOS_ENABLE_CUDA
620 if (std::is_same<ExecSpace, Kokkos::Cuda>::value) team_size = 128;
621 #endif
622
623 Kokkos::TeamPolicy<ScheduleType, ExecSpace> team_exec(8192 / team_size,
624 team_size);
625 team_exec = team_exec.set_scratch_size(
626 0, Kokkos::PerTeam(SHARED_COUNT * 2 * sizeof(int)));
627
628 typename Functor::value_type error_count = 0;
629
630 Kokkos::parallel_reduce(
631 team_exec,
632 KOKKOS_LAMBDA(
633 const typename Kokkos::TeamPolicy<ScheduleType,
634 ExecSpace>::member_type &ind,
635 int &update) {
636 const shared_int_array_type shared_A(ind.team_shmem(), SHARED_COUNT);
637 const shared_int_array_type shared_B(ind.team_shmem(), SHARED_COUNT);
638
639 if ((shared_A.data() == nullptr && SHARED_COUNT > 0) ||
640 (shared_B.data() == nullptr && SHARED_COUNT > 0)) {
641 KOKKOS_IMPL_DO_NOT_USE_PRINTF(
642 "Failed to allocate shared memory of size %lu\n",
643 static_cast<unsigned long>(SHARED_COUNT));
644
645 ++update; // Failure to allocate is an error.
646 } else {
647 for (int i = ind.team_rank(); i < SHARED_COUNT;
648 i += ind.team_size()) {
649 shared_A[i] = i + ind.league_rank();
650 shared_B[i] = 2 * i + ind.league_rank();
651 }
652
653 ind.team_barrier();
654
655 if (ind.team_rank() + 1 == ind.team_size()) {
656 for (int i = 0; i < SHARED_COUNT; ++i) {
657 if (shared_A[i] != i + ind.league_rank()) {
658 ++update;
659 }
660
661 if (shared_B[i] != 2 * i + ind.league_rank()) {
662 ++update;
663 }
664 }
665 }
666 }
667 },
668 result_type(&error_count));
669
670 Kokkos::fence();
671
672 ASSERT_EQ(error_count, 0);
673 }
674 };
675 #endif
676
677 } // namespace Test
678
679 namespace Test {
680
681 template <class ExecSpace, class ScheduleType>
682 struct ScratchTeamFunctor {
683 using execution_space = ExecSpace;
684 using value_type = int;
685 using policy_type = Kokkos::TeamPolicy<ScheduleType, execution_space>;
686
687 enum { SHARED_TEAM_COUNT = 100 };
688 enum { SHARED_THREAD_COUNT = 10 };
689
690 using shmem_space = typename ExecSpace::scratch_memory_space;
691
692 // TBD: MemoryUnmanaged should be the default for shared memory space.
693 using shared_int_array_type =
694 Kokkos::View<size_t *, shmem_space, Kokkos::MemoryUnmanaged>;
695
696 KOKKOS_INLINE_FUNCTION
operator ()Test::ScratchTeamFunctor697 void operator()(const typename policy_type::member_type &ind,
698 value_type &update) const {
699 const shared_int_array_type scratch_ptr(ind.team_scratch(1),
700 3 * ind.team_size());
701 const shared_int_array_type scratch_A(ind.team_scratch(1),
702 SHARED_TEAM_COUNT);
703 const shared_int_array_type scratch_B(ind.thread_scratch(1),
704 SHARED_THREAD_COUNT);
705
706 if ((scratch_ptr.data() == nullptr) ||
707 (scratch_A.data() == nullptr && SHARED_TEAM_COUNT > 0) ||
708 (scratch_B.data() == nullptr && SHARED_THREAD_COUNT > 0)) {
709 KOKKOS_IMPL_DO_NOT_USE_PRINTF(
710 "Failed to allocate shared memory of size %lu\n",
711 static_cast<unsigned long>(SHARED_TEAM_COUNT));
712
713 ++update; // Failure to allocate is an error.
714 } else {
715 Kokkos::parallel_for(
716 Kokkos::TeamThreadRange(ind, 0, (int)SHARED_TEAM_COUNT),
717 [&](const int &i) { scratch_A[i] = i + ind.league_rank(); });
718
719 for (int i = 0; i < SHARED_THREAD_COUNT; i++) {
720 scratch_B[i] = 10000 * ind.league_rank() + 100 * ind.team_rank() + i;
721 }
722
723 scratch_ptr[ind.team_rank()] = (size_t)scratch_A.data();
724 scratch_ptr[ind.team_rank() + ind.team_size()] = (size_t)scratch_B.data();
725
726 ind.team_barrier();
727
728 for (int i = 0; i < SHARED_TEAM_COUNT; i++) {
729 if (scratch_A[i] != size_t(i + ind.league_rank())) ++update;
730 }
731
732 for (int i = 0; i < ind.team_size(); i++) {
733 if (scratch_ptr[0] != scratch_ptr[i]) ++update;
734 }
735
736 if (scratch_ptr[1 + ind.team_size()] - scratch_ptr[0 + ind.team_size()] <
737 SHARED_THREAD_COUNT * sizeof(size_t)) {
738 ++update;
739 }
740
741 for (int i = 1; i < ind.team_size(); i++) {
742 if ((scratch_ptr[i + ind.team_size()] -
743 scratch_ptr[i - 1 + ind.team_size()]) !=
744 (scratch_ptr[1 + ind.team_size()] -
745 scratch_ptr[0 + ind.team_size()])) {
746 ++update;
747 }
748 }
749 }
750 }
751 };
752
753 } // namespace Test
754
755 namespace {
756
757 template <class ExecSpace, class ScheduleType>
758 struct TestScratchTeam {
TestScratchTeam__anon4dcf17ce0a11::TestScratchTeam759 TestScratchTeam() { run(); }
760
run__anon4dcf17ce0a11::TestScratchTeam761 void run() {
762 using Functor = Test::ScratchTeamFunctor<ExecSpace, ScheduleType>;
763 using result_type =
764 Kokkos::View<typename Functor::value_type, Kokkos::HostSpace,
765 Kokkos::MemoryUnmanaged>;
766 using p_type = Kokkos::TeamPolicy<ScheduleType, ExecSpace>;
767
768 typename Functor::value_type error_count = 0;
769
770 int thread_scratch_size = Functor::shared_int_array_type::shmem_size(
771 Functor::SHARED_THREAD_COUNT);
772
773 #ifdef KOKKOS_ENABLE_OPENMPTARGET
774 p_type team_exec = p_type(64, 32).set_scratch_size(
775 1,
776 Kokkos::PerTeam(Functor::shared_int_array_type::shmem_size(
777 Functor::SHARED_TEAM_COUNT)),
778 Kokkos::PerThread(thread_scratch_size + 3 * sizeof(int)));
779 #else
780 p_type team_exec = p_type(8192, 1).set_scratch_size(
781 1,
782 Kokkos::PerTeam(Functor::shared_int_array_type::shmem_size(
783 Functor::SHARED_TEAM_COUNT)),
784 Kokkos::PerThread(thread_scratch_size + 3 * sizeof(int)));
785 #endif
786
787 const size_t team_size =
788 team_exec.team_size_max(Functor(), Kokkos::ParallelReduceTag());
789
790 int team_scratch_size =
791 Functor::shared_int_array_type::shmem_size(Functor::SHARED_TEAM_COUNT) +
792 Functor::shared_int_array_type::shmem_size(3 * team_size);
793
794 #ifdef KOKKOS_ENABLE_OPENMPTARGET
795 team_exec = p_type(64 / team_size, team_size);
796 #else
797 team_exec = p_type(8192 / team_size, team_size);
798 #endif
799
800 Kokkos::parallel_reduce(
801 team_exec.set_scratch_size(1, Kokkos::PerTeam(team_scratch_size),
802 Kokkos::PerThread(thread_scratch_size)),
803 Functor(), result_type(&error_count));
804 Kokkos::fence();
805 ASSERT_EQ(error_count, 0);
806 }
807 };
808
809 } // namespace
810
811 namespace Test {
812
813 template <class ExecSpace>
test_team_mulit_level_scratch_loop_body(const typename Kokkos::TeamPolicy<ExecSpace>::member_type & team)814 KOKKOS_INLINE_FUNCTION int test_team_mulit_level_scratch_loop_body(
815 const typename Kokkos::TeamPolicy<ExecSpace>::member_type &team) {
816 Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
817 a_team1(team.team_scratch(0), 128);
818 Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
819 a_thread1(team.thread_scratch(0), 16);
820 Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
821 a_team2(team.team_scratch(0), 128);
822 Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
823 a_thread2(team.thread_scratch(0), 16);
824
825 Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
826 b_team1(team.team_scratch(1), 12800);
827 Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
828 b_thread1(team.thread_scratch(1), 1600);
829 Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
830 b_team2(team.team_scratch(1), 12800);
831 Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
832 b_thread2(team.thread_scratch(1), 1600);
833
834 Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
835 a_team3(team.team_scratch(0), 128);
836 Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
837 a_thread3(team.thread_scratch(0), 16);
838 Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
839 b_team3(team.team_scratch(1), 12800);
840 Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
841 b_thread3(team.thread_scratch(1), 1600);
842
843 // The explicit types for 0 and 128 are here to test TeamThreadRange accepting
844 // different types for begin and end.
845 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, int(0), unsigned(128)),
846 [&](const int &i) {
847 a_team1(i) = 1000000 + i + team.league_rank() * 100000;
848 a_team2(i) = 2000000 + i + team.league_rank() * 100000;
849 a_team3(i) = 3000000 + i + team.league_rank() * 100000;
850 });
851 team.team_barrier();
852
853 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, int(0), unsigned(16)),
854 [&](const int &i) {
855 a_thread1(i) = 1000000 + 100000 * team.team_rank() +
856 16 - i + team.league_rank() * 100000;
857 a_thread2(i) = 2000000 + 100000 * team.team_rank() +
858 16 - i + team.league_rank() * 100000;
859 a_thread3(i) = 3000000 + 100000 * team.team_rank() +
860 16 - i + team.league_rank() * 100000;
861 });
862
863 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, int(0), unsigned(12800)),
864 [&](const int &i) {
865 b_team1(i) = 1000000 + i + team.league_rank() * 100000;
866 b_team2(i) = 2000000 + i + team.league_rank() * 100000;
867 b_team3(i) = 3000000 + i + team.league_rank() * 100000;
868 });
869 team.team_barrier();
870
871 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, 1600),
872 [&](const int &i) {
873 b_thread1(i) = 1000000 + 100000 * team.team_rank() +
874 16 - i + team.league_rank() * 100000;
875 b_thread2(i) = 2000000 + 100000 * team.team_rank() +
876 16 - i + team.league_rank() * 100000;
877 b_thread3(i) = 3000000 + 100000 * team.team_rank() +
878 16 - i + team.league_rank() * 100000;
879 });
880
881 team.team_barrier();
882
883 int error = 0;
884 Kokkos::parallel_for(
885 Kokkos::TeamThreadRange(team, 0, 128), [&](const int &i) {
886 if (a_team1(i) != 1000000 + i + team.league_rank() * 100000) error++;
887 if (a_team2(i) != 2000000 + i + team.league_rank() * 100000) error++;
888 if (a_team3(i) != 3000000 + i + team.league_rank() * 100000) error++;
889 });
890 team.team_barrier();
891
892 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, 16), [&](const int &i) {
893 if (a_thread1(i) != 1000000 + 100000 * team.team_rank() + 16 - i +
894 team.league_rank() * 100000)
895 error++;
896 if (a_thread2(i) != 2000000 + 100000 * team.team_rank() + 16 - i +
897 team.league_rank() * 100000)
898 error++;
899 if (a_thread3(i) != 3000000 + 100000 * team.team_rank() + 16 - i +
900 team.league_rank() * 100000)
901 error++;
902 });
903
904 Kokkos::parallel_for(
905 Kokkos::TeamThreadRange(team, 0, 12800), [&](const int &i) {
906 if (b_team1(i) != 1000000 + i + team.league_rank() * 100000) error++;
907 if (b_team2(i) != 2000000 + i + team.league_rank() * 100000) error++;
908 if (b_team3(i) != 3000000 + i + team.league_rank() * 100000) error++;
909 });
910 team.team_barrier();
911
912 Kokkos::parallel_for(
913 Kokkos::ThreadVectorRange(team, 1600), [&](const int &i) {
914 if (b_thread1(i) != 1000000 + 100000 * team.team_rank() + 16 - i +
915 team.league_rank() * 100000)
916 error++;
917 if (b_thread2(i) != 2000000 + 100000 * team.team_rank() + 16 - i +
918 team.league_rank() * 100000)
919 error++;
920 if (b_thread3(i) != 3000000 + 100000 * team.team_rank() + 16 - i +
921 team.league_rank() * 100000)
922 error++;
923 });
924
925 return error;
926 }
927
// Work tag selecting the operator() overloads that take an `int &` reduction
// argument; used with Kokkos::parallel_reduce in the functor tests below.
struct TagReduce {};
// Work tag selecting the operator() overloads without a reduction argument;
// used with Kokkos::parallel_for in the functor tests below.
struct TagFor {};
930
931 template <class ExecSpace, class ScheduleType>
932 struct ClassNoShmemSizeFunction {
933 using member_type =
934 typename Kokkos::TeamPolicy<ExecSpace, ScheduleType>::member_type;
935
936 Kokkos::View<int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
937
938 KOKKOS_INLINE_FUNCTION
operator ()Test::ClassNoShmemSizeFunction939 void operator()(const TagFor &, const member_type &team) const {
940 int error = test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
941 errors() += error;
942 }
943
944 KOKKOS_INLINE_FUNCTION
operator ()Test::ClassNoShmemSizeFunction945 void operator()(const TagReduce &, const member_type &team,
946 int &error) const {
947 error += test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
948 }
949
runTest::ClassNoShmemSizeFunction950 void run() {
951 Kokkos::View<int, ExecSpace> d_errors =
952 Kokkos::View<int, ExecSpace>("Errors");
953 errors = d_errors;
954
955 const int per_team0 =
956 3 *
957 Kokkos::View<double *, ExecSpace,
958 Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(128);
959 const int per_thread0 =
960 3 *
961 Kokkos::View<double *, ExecSpace,
962 Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(16);
963
964 const int per_team1 =
965 3 * Kokkos::View<
966 double *, ExecSpace,
967 Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(12800);
968 const int per_thread1 =
969 3 * Kokkos::View<
970 double *, ExecSpace,
971 Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(1600);
972
973 int team_size = 8;
974 if (team_size > ExecSpace::concurrency())
975 team_size = ExecSpace::concurrency();
976 {
977 Kokkos::TeamPolicy<TagFor, ExecSpace, ScheduleType> policy(10, team_size,
978 16);
979
980 Kokkos::parallel_for(
981 policy
982 .set_scratch_size(0, Kokkos::PerTeam(per_team0),
983 Kokkos::PerThread(per_thread0))
984 .set_scratch_size(1, Kokkos::PerTeam(per_team1),
985 Kokkos::PerThread(per_thread1)),
986 *this);
987 Kokkos::fence();
988
989 typename Kokkos::View<int, ExecSpace>::HostMirror h_errors =
990 Kokkos::create_mirror_view(d_errors);
991 Kokkos::deep_copy(h_errors, d_errors);
992 ASSERT_EQ(h_errors(), 0);
993 }
994
995 {
996 int error = 0;
997 Kokkos::TeamPolicy<TagReduce, ExecSpace, ScheduleType> policy(
998 10, team_size, 16);
999
1000 Kokkos::parallel_reduce(
1001 policy
1002 .set_scratch_size(0, Kokkos::PerTeam(per_team0),
1003 Kokkos::PerThread(per_thread0))
1004 .set_scratch_size(1, Kokkos::PerTeam(per_team1),
1005 Kokkos::PerThread(per_thread1)),
1006 *this, error);
1007
1008 ASSERT_EQ(error, 0);
1009 }
1010 };
1011 };
1012
1013 template <class ExecSpace, class ScheduleType>
1014 struct ClassWithShmemSizeFunction {
1015 using member_type =
1016 typename Kokkos::TeamPolicy<ExecSpace, ScheduleType>::member_type;
1017
1018 Kokkos::View<int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
1019
1020 KOKKOS_INLINE_FUNCTION
operator ()Test::ClassWithShmemSizeFunction1021 void operator()(const TagFor &, const member_type &team) const {
1022 int error = test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
1023 errors() += error;
1024 }
1025
1026 KOKKOS_INLINE_FUNCTION
operator ()Test::ClassWithShmemSizeFunction1027 void operator()(const TagReduce &, const member_type &team,
1028 int &error) const {
1029 error += test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
1030 }
1031
runTest::ClassWithShmemSizeFunction1032 void run() {
1033 Kokkos::View<int, ExecSpace> d_errors =
1034 Kokkos::View<int, ExecSpace>("Errors");
1035 errors = d_errors;
1036
1037 const int per_team1 =
1038 3 * Kokkos::View<
1039 double *, ExecSpace,
1040 Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(12800);
1041 const int per_thread1 =
1042 3 * Kokkos::View<
1043 double *, ExecSpace,
1044 Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(1600);
1045
1046 int team_size = 8;
1047 if (team_size > ExecSpace::concurrency())
1048 team_size = ExecSpace::concurrency();
1049
1050 {
1051 Kokkos::TeamPolicy<TagFor, ExecSpace, ScheduleType> policy(10, team_size,
1052 16);
1053
1054 Kokkos::parallel_for(
1055 policy.set_scratch_size(1, Kokkos::PerTeam(per_team1),
1056 Kokkos::PerThread(per_thread1)),
1057 *this);
1058 Kokkos::fence();
1059
1060 typename Kokkos::View<int, ExecSpace>::HostMirror h_errors =
1061 Kokkos::create_mirror_view(d_errors);
1062 Kokkos::deep_copy(h_errors, d_errors);
1063 ASSERT_EQ(h_errors(), 0);
1064 }
1065
1066 {
1067 int error = 0;
1068 Kokkos::TeamPolicy<TagReduce, ExecSpace, ScheduleType> policy(
1069 10, team_size, 16);
1070
1071 Kokkos::parallel_reduce(
1072 policy.set_scratch_size(1, Kokkos::PerTeam(per_team1),
1073 Kokkos::PerThread(per_thread1)),
1074 *this, error);
1075
1076 ASSERT_EQ(error, 0);
1077 }
1078 };
1079
team_shmem_sizeTest::ClassWithShmemSizeFunction1080 unsigned team_shmem_size(int team_size) const {
1081 const int per_team0 =
1082 3 *
1083 Kokkos::View<double *, ExecSpace,
1084 Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(128);
1085 const int per_thread0 =
1086 3 *
1087 Kokkos::View<double *, ExecSpace,
1088 Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(16);
1089 return per_team0 + team_size * per_thread0;
1090 }
1091 };
1092
// Lambda form of the multi-level scratch test. Compiled to an empty function
// unless KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA is defined.
//
// Runs test_team_mulit_level_scratch_loop_body once through parallel_for
// (errors accumulated in an atomic view) and once through parallel_reduce,
// asserting zero errors in both cases. Scratch for level 0 and level 1 is
// requested explicitly on the policy.
template <class ExecSpace, class ScheduleType>
void test_team_mulit_level_scratch_test_lambda() {
#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
  // `errors` is an atomic-access alias of `d_errors`; kernels write through
  // the alias, the host reads back through `d_errors`.
  Kokkos::View<int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
  Kokkos::View<int, ExecSpace> d_errors("Errors");
  errors = d_errors;

  // Scratch budget: room for three views of each extent at each level.
  const int per_team0 =
      3 *
      Kokkos::View<double *, ExecSpace,
                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(128);
  const int per_thread0 =
      3 *
      Kokkos::View<double *, ExecSpace,
                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(16);

  const int per_team1 =
      3 *
      Kokkos::View<double *, ExecSpace,
                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(12800);
  const int per_thread1 =
      3 *
      Kokkos::View<double *, ExecSpace,
                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(1600);

  // Use a team size of 8 unless the execution space offers less concurrency.
  int team_size = 8;
  if (team_size > ExecSpace::concurrency())
    team_size = ExecSpace::concurrency();

  Kokkos::TeamPolicy<ExecSpace, ScheduleType> policy(10, team_size, 16);

  Kokkos::parallel_for(
      policy
          .set_scratch_size(0, Kokkos::PerTeam(per_team0),
                            Kokkos::PerThread(per_thread0))
          .set_scratch_size(1, Kokkos::PerTeam(per_team1),
                            Kokkos::PerThread(per_thread1)),
      KOKKOS_LAMBDA(
          const typename Kokkos::TeamPolicy<ExecSpace>::member_type &team) {
        int error = test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
        errors() += error;
      });
  Kokkos::fence();

  // Create the mirror from the same view that is copied from, matching the
  // functor-based tests (previously the mirror was built from the atomic
  // alias while the copy used d_errors).
  typename Kokkos::View<int, ExecSpace>::HostMirror h_errors =
      Kokkos::create_mirror_view(d_errors);
  Kokkos::deep_copy(h_errors, d_errors);
  ASSERT_EQ(h_errors(), 0);

  int error = 0;
  Kokkos::parallel_reduce(
      policy
          .set_scratch_size(0, Kokkos::PerTeam(per_team0),
                            Kokkos::PerThread(per_thread0))
          .set_scratch_size(1, Kokkos::PerTeam(per_team1),
                            Kokkos::PerThread(per_thread1)),
      KOKKOS_LAMBDA(
          const typename Kokkos::TeamPolicy<ExecSpace>::member_type &team,
          int &count) {
        count += test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
      },
      error);
  ASSERT_EQ(error, 0);
#endif
}
1158
1159 } // namespace Test
1160
1161 namespace {
1162
1163 template <class ExecSpace, class ScheduleType>
1164 struct TestMultiLevelScratchTeam {
TestMultiLevelScratchTeam__anon4dcf17ce1311::TestMultiLevelScratchTeam1165 TestMultiLevelScratchTeam() { run(); }
1166
run__anon4dcf17ce1311::TestMultiLevelScratchTeam1167 void run() {
1168 #ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
1169 Test::test_team_mulit_level_scratch_test_lambda<ExecSpace, ScheduleType>();
1170 #endif
1171 Test::ClassNoShmemSizeFunction<ExecSpace, ScheduleType> c1;
1172 c1.run();
1173
1174 Test::ClassWithShmemSizeFunction<ExecSpace, ScheduleType> c2;
1175 c2.run();
1176 }
1177 };
1178
1179 } // namespace
1180
1181 namespace Test {
1182
1183 template <class ExecSpace>
1184 struct TestShmemSize {
TestShmemSizeTest::TestShmemSize1185 TestShmemSize() { run(); }
1186
runTest::TestShmemSize1187 void run() {
1188 using view_type = Kokkos::View<int64_t ***, ExecSpace>;
1189
1190 size_t d1 = 5;
1191 size_t d2 = 6;
1192 size_t d3 = 7;
1193
1194 size_t size = view_type::shmem_size(d1, d2, d3);
1195
1196 ASSERT_EQ(size, (d1 * d2 * d3 + 1) * sizeof(int64_t));
1197
1198 test_layout_stride();
1199 }
1200
test_layout_strideTest::TestShmemSize1201 void test_layout_stride() {
1202 int rank = 3;
1203 int order[3] = {2, 0, 1};
1204 int extents[3] = {100, 10, 3};
1205 auto s1 =
1206 Kokkos::View<double ***, Kokkos::LayoutStride, ExecSpace>::shmem_size(
1207 Kokkos::LayoutStride::order_dimensions(rank, order, extents));
1208 auto s2 =
1209 Kokkos::View<double ***, Kokkos::LayoutRight, ExecSpace>::shmem_size(
1210 extents[0], extents[1], extents[2]);
1211 ASSERT_EQ(s1, s2);
1212 }
1213 };
1214
1215 } // namespace Test
1216
1217 /*--------------------------------------------------------------------------*/
1218
1219 namespace Test {
1220
1221 namespace {
1222
// Primary template for the team_broadcast tests; declared only. The partial
// specializations below are selected through the `Enabled` parameter based on
// sizeof(T): one for sizeof(T) == sizeof(char), one for larger types.
template <class ExecSpace, class ScheduleType, class T, class Enabled = void>
struct TestTeamBroadcast;
1225
1226 template <class ExecSpace, class ScheduleType, class T>
1227 struct TestTeamBroadcast<
1228 ExecSpace, ScheduleType, T,
1229 typename std::enable_if<(sizeof(T) == sizeof(char)), void>::type> {
1230 using team_member =
1231 typename Kokkos::TeamPolicy<ScheduleType, ExecSpace>::member_type;
1232 using memory_space = typename ExecSpace::memory_space;
1233 using value_type = T;
1234
1235 const value_type offset;
1236
TestTeamBroadcastTest::__anon4dcf17ce1411::TestTeamBroadcast1237 TestTeamBroadcast(const size_t /*league_size*/, const value_type os_)
1238 : offset(os_) {}
1239
1240 struct BroadcastTag {};
1241
1242 KOKKOS_INLINE_FUNCTION
operator ()Test::__anon4dcf17ce1411::TestTeamBroadcast1243 void operator()(const team_member &teamMember, value_type &update) const {
1244 int lid = teamMember.league_rank();
1245 int tid = teamMember.team_rank();
1246 int ts = teamMember.team_size();
1247
1248 value_type parUpdate = 0;
1249 value_type value = (value_type)(tid % 0xFF) + offset;
1250
1251 // broadcast boolean and value to team from source thread
1252 teamMember.team_broadcast(value, lid % ts);
1253
1254 Kokkos::parallel_reduce(
1255 Kokkos::TeamThreadRange(teamMember, ts),
1256 [&](const int /*j*/, value_type &teamUpdate) { teamUpdate |= value; },
1257 Kokkos::BOr<value_type, memory_space>(parUpdate));
1258
1259 if (teamMember.team_rank() == 0) update |= parUpdate;
1260 }
1261
1262 KOKKOS_INLINE_FUNCTION
operator ()Test::__anon4dcf17ce1411::TestTeamBroadcast1263 void operator()(const BroadcastTag &, const team_member &teamMember,
1264 value_type &update) const {
1265 int lid = teamMember.league_rank();
1266 int tid = teamMember.team_rank();
1267 int ts = teamMember.team_size();
1268
1269 value_type parUpdate = 0;
1270 value_type value = (value_type)(tid % 0xFF) + offset;
1271
1272 teamMember.team_broadcast([&](value_type &var) { var -= offset; }, value,
1273 lid % ts);
1274
1275 Kokkos::parallel_reduce(
1276 Kokkos::TeamThreadRange(teamMember, ts),
1277 [&](const int /*j*/, value_type &teamUpdate) { teamUpdate |= value; },
1278 Kokkos::BOr<value_type, memory_space>(parUpdate));
1279
1280 if (teamMember.team_rank() == 0) update |= parUpdate;
1281 }
1282
test_teambroadcastTest::__anon4dcf17ce1411::TestTeamBroadcast1283 static void test_teambroadcast(const size_t league_size,
1284 const value_type off) {
1285 TestTeamBroadcast functor(league_size, off);
1286
1287 using policy_type = Kokkos::TeamPolicy<ScheduleType, ExecSpace>;
1288 using policy_type_f =
1289 Kokkos::TeamPolicy<ScheduleType, ExecSpace, BroadcastTag>;
1290
1291 // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
1292 #ifdef KOKKOS_ENABLE_OPENMPTARGET
1293 int fake_team_size =
1294 std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value ? 32
1295 : 1;
1296 #else
1297 int fake_team_size = 1;
1298 #endif
1299 const int team_size =
1300 policy_type_f(league_size, fake_team_size)
1301 .team_size_max(
1302 functor,
1303 Kokkos::
1304 ParallelReduceTag()); // printf("team_size=%d\n",team_size);
1305
1306 // team_broadcast with value
1307 value_type total = 0;
1308
1309 Kokkos::parallel_reduce(policy_type(league_size, team_size), functor,
1310 Kokkos::BOr<value_type, Kokkos::HostSpace>(total));
1311
1312 value_type expected_result = 0;
1313 for (unsigned int i = 0; i < league_size; i++) {
1314 value_type val = (value_type((i % team_size % 0xFF)) + off);
1315 expected_result |= val;
1316 }
1317 ASSERT_EQ(expected_result, total);
1318 // printf("team_broadcast with value --"
1319 //"expected_result=%x,"
1320 //"total=%x\n",expected_result, total);
1321
1322 // team_broadcast with function object
1323 total = 0;
1324
1325 Kokkos::parallel_reduce(policy_type_f(league_size, team_size), functor,
1326 Kokkos::BOr<value_type, Kokkos::HostSpace>(total));
1327
1328 expected_result = 0;
1329 for (unsigned int i = 0; i < league_size; i++) {
1330 value_type val = ((value_type)((i % team_size % 0xFF)));
1331 expected_result |= val;
1332 }
1333 ASSERT_EQ(expected_result, total);
1334 // printf("team_broadcast with function object --"
1335 // "expected_result=%x,"
1336 // "total=%x\n",expected_result, total);
1337 }
1338 };
1339
1340 template <class ExecSpace, class ScheduleType, class T>
1341 struct TestTeamBroadcast<
1342 ExecSpace, ScheduleType, T,
1343 typename std::enable_if<(sizeof(T) > sizeof(char)), void>::type> {
1344 using team_member =
1345 typename Kokkos::TeamPolicy<ScheduleType, ExecSpace>::member_type;
1346 using value_type = T;
1347
1348 const value_type offset;
1349
TestTeamBroadcastTest::__anon4dcf17ce1411::TestTeamBroadcast1350 TestTeamBroadcast(const size_t /*league_size*/, const value_type os_)
1351 : offset(os_) {}
1352
1353 struct BroadcastTag {};
1354
1355 KOKKOS_INLINE_FUNCTION
operator ()Test::__anon4dcf17ce1411::TestTeamBroadcast1356 void operator()(const team_member &teamMember, value_type &update) const {
1357 int lid = teamMember.league_rank();
1358 int tid = teamMember.team_rank();
1359 int ts = teamMember.team_size();
1360
1361 value_type parUpdate = 0;
1362 value_type value = (value_type)(tid * 3) + offset;
1363
1364 // setValue is used to determine if the update should be
1365 // performed at the bottom. The thread id must match the
1366 // thread id used to broadcast the value. It is the
1367 // thread id that matches the league rank mod team size
1368 // this way each league rank will use a different thread id
1369 // which is likely not 0
1370 bool setValue = ((lid % ts) == tid);
1371
1372 // broadcast boolean and value to team from source thread
1373 teamMember.team_broadcast(value, lid % ts);
1374 teamMember.team_broadcast(setValue, lid % ts);
1375
1376 Kokkos::parallel_reduce(
1377 Kokkos::TeamThreadRange(teamMember, ts),
1378 [&](const int /*j*/, value_type &teamUpdate) { teamUpdate += value; },
1379 parUpdate);
1380
1381 if (teamMember.team_rank() == 0 && setValue) update += parUpdate;
1382 }
1383
1384 KOKKOS_INLINE_FUNCTION
operator ()Test::__anon4dcf17ce1411::TestTeamBroadcast1385 void operator()(const BroadcastTag &, const team_member &teamMember,
1386 value_type &update) const {
1387 int lid = teamMember.league_rank();
1388 int tid = teamMember.team_rank();
1389 int ts = teamMember.team_size();
1390
1391 value_type parUpdate = 0;
1392 value_type value = (value_type)(tid * 3) + offset;
1393
1394 // setValue is used to determine if the update should be
1395 // performed at the bottom. The thread id must match the
1396 // thread id used to broadcast the value. It is the
1397 // thread id that matches the league rank mod team size
1398 // this way each league rank will use a different thread id
1399 // which is likely not 0. Note the logic is switched from
1400 // above because the functor switches it back.
1401 bool setValue = ((lid % ts) != tid);
1402
1403 teamMember.team_broadcast([&](value_type &var) { var *= 2; }, value,
1404 lid % ts);
1405 teamMember.team_broadcast([&](bool &bVar) { bVar = !bVar; }, setValue,
1406 lid % ts);
1407
1408 Kokkos::parallel_reduce(
1409 Kokkos::TeamThreadRange(teamMember, ts),
1410 [&](const int /*j*/, value_type &teamUpdate) { teamUpdate += value; },
1411 parUpdate);
1412
1413 if (teamMember.team_rank() == 0 && setValue) update += parUpdate;
1414 }
1415
1416 template <class ScalarType>
1417 static inline
1418 typename std::enable_if<!std::is_integral<ScalarType>::value, void>::type
compare_testTest::__anon4dcf17ce1411::TestTeamBroadcast1419 compare_test(ScalarType A, ScalarType B, double epsilon_factor) {
1420 if (std::is_same<ScalarType, double>::value ||
1421 std::is_same<ScalarType, float>::value) {
1422 ASSERT_NEAR((double)A, (double)B,
1423 epsilon_factor * std::abs(A) *
1424 std::numeric_limits<ScalarType>::epsilon());
1425 } else {
1426 ASSERT_EQ(A, B);
1427 }
1428 }
1429
1430 template <class ScalarType>
1431 static inline
1432 typename std::enable_if<std::is_integral<ScalarType>::value, void>::type
compare_testTest::__anon4dcf17ce1411::TestTeamBroadcast1433 compare_test(ScalarType A, ScalarType B, double) {
1434 ASSERT_EQ(A, B);
1435 }
1436
test_teambroadcastTest::__anon4dcf17ce1411::TestTeamBroadcast1437 static void test_teambroadcast(const size_t league_size,
1438 const value_type off) {
1439 TestTeamBroadcast functor(league_size, off);
1440
1441 using policy_type = Kokkos::TeamPolicy<ScheduleType, ExecSpace>;
1442 using policy_type_f =
1443 Kokkos::TeamPolicy<ScheduleType, ExecSpace, BroadcastTag>;
1444
1445 // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
1446 #ifdef KOKKOS_ENABLE_OPENMPTARGET
1447 int fake_team_size =
1448 std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value ? 32
1449 : 1;
1450 #else
1451 int fake_team_size = 1;
1452 #endif
1453 const int team_size =
1454 policy_type_f(league_size, fake_team_size)
1455 .team_size_max(
1456 functor,
1457 Kokkos::
1458 ParallelReduceTag()); // printf("team_size=%d\n",team_size);
1459 // team_broadcast with value
1460 value_type total = 0;
1461
1462 Kokkos::parallel_reduce(policy_type(league_size, team_size), functor,
1463 total);
1464
1465 value_type expected_result = 0;
1466 for (unsigned int i = 0; i < league_size; i++) {
1467 value_type val =
1468 (value_type((i % team_size) * 3) + off) * (value_type)team_size;
1469 expected_result += val;
1470 }
1471 // For comparison purposes treat the reduction as a random walk in the
1472 // least significant digit, which gives a typical walk distance
1473 // sqrt(league_size) Add 4x for larger sigma
1474 compare_test(expected_result, total, 4.0 * std::sqrt(league_size));
1475
1476 // team_broadcast with function object
1477 total = 0;
1478
1479 Kokkos::parallel_reduce(policy_type_f(league_size, team_size), functor,
1480 total);
1481
1482 expected_result = 0;
1483 for (unsigned int i = 0; i < league_size; i++) {
1484 value_type val = ((value_type)((i % team_size) * 3) + off) *
1485 (value_type)(2 * team_size);
1486 expected_result += val;
1487 }
1488 // For comparison purposes treat the reduction as a random walk in the
1489 // least significant digit, which gives a typical walk distance
1490 // sqrt(league_size) Add 4x for larger sigma
1491 compare_test(expected_result, total, 4.0 * std::sqrt(league_size));
1492 }
1493 };
1494
1495 template <class ExecSpace>
1496 struct TestScratchAlignment {
1497 struct TestScalar {
1498 double x, y, z;
1499 };
TestScratchAlignmentTest::__anon4dcf17ce1411::TestScratchAlignment1500 TestScratchAlignment() {
1501 test(true);
1502 test(false);
1503 }
1504 using ScratchView =
1505 Kokkos::View<TestScalar *, typename ExecSpace::scratch_memory_space>;
1506 using ScratchViewInt =
1507 Kokkos::View<int *, typename ExecSpace::scratch_memory_space>;
testTest::__anon4dcf17ce1411::TestScratchAlignment1508 void test(bool allocate_small) {
1509 int shmem_size = ScratchView::shmem_size(11);
1510 #ifdef KOKKOS_ENABLE_OPENMPTARGET
1511 int team_size = 32;
1512 #else
1513 int team_size = 1;
1514 #endif
1515 if (allocate_small) shmem_size += ScratchViewInt::shmem_size(1);
1516 Kokkos::parallel_for(
1517 Kokkos::TeamPolicy<ExecSpace>(1, team_size)
1518 .set_scratch_size(0, Kokkos::PerTeam(shmem_size)),
1519 KOKKOS_LAMBDA(
1520 const typename Kokkos::TeamPolicy<ExecSpace>::member_type &team) {
1521 if (allocate_small) ScratchViewInt p(team.team_scratch(0), 1);
1522 ScratchView a(team.team_scratch(0), 11);
1523 if (ptrdiff_t(a.data()) % sizeof(TestScalar) != 0)
1524 Kokkos::abort("Error: invalid scratch view alignment\n");
1525 });
1526 Kokkos::fence();
1527 }
1528 };
1529
1530 } // namespace
1531
1532 namespace {
1533
// Exercises a team kernel whose lambda takes the team member handle by value
// instead of by reference. The kernel only has to compile and run; no result
// is checked. Empty unless lambda dispatch is enabled.
template <class ExecSpace>
struct TestTeamPolicyHandleByValue {
  using scalar     = double;
  using exec_space = ExecSpace;
  using mem_space  = typename ExecSpace::memory_space;

  TestTeamPolicyHandleByValue() { test(); }

  void test() {
#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
    const int M = 1;
    const int N = 1;

    Kokkos::View<scalar **, mem_space> a("a", M, N);
    Kokkos::View<scalar **, mem_space> b("b", M, N);
    Kokkos::deep_copy(a, 0.0);
    Kokkos::deep_copy(b, 1.0);

    Kokkos::parallel_for(
        "test_tphandle_by_value",
        Kokkos::TeamPolicy<exec_space>(M, Kokkos::AUTO(), 1),
        KOKKOS_LAMBDA(
            // Deliberately passed by value (no reference).
            const typename Kokkos::TeamPolicy<exec_space>::member_type team) {
          const int row = team.league_rank();
          Kokkos::parallel_for(
              Kokkos::TeamThreadRange(team, 0, N),
              [&](const int col) { a(row, col) += b(row, col); });
        });
#endif
  }
};
1561
1562 } // namespace
1563
1564 } // namespace Test
1565
1566 /*--------------------------------------------------------------------------*/
1567