1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 // Kokkos v. 3.0
6 // Copyright (2020) National Technology & Engineering
7 // Solutions of Sandia, LLC (NTESS).
8 //
9 // Under the terms of Contract DE-NA0003525 with NTESS,
10 // the U.S. Government retains certain rights in this software.
11 //
12 // Redistribution and use in source and binary forms, with or without
13 // modification, are permitted provided that the following conditions are
14 // met:
15 //
16 // 1. Redistributions of source code must retain the above copyright
17 // notice, this list of conditions and the following disclaimer.
18 //
19 // 2. Redistributions in binary form must reproduce the above copyright
20 // notice, this list of conditions and the following disclaimer in the
21 // documentation and/or other materials provided with the distribution.
22 //
23 // 3. Neither the name of the Corporation nor the names of the
24 // contributors may be used to endorse or promote products derived from
25 // this software without specific prior written permission.
26 //
27 // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
28 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
31 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 //
39 // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
40 //
41 // ************************************************************************
42 //@HEADER
43 */
44
45 #include <Kokkos_Core.hpp>
46
47 #include <impl/Kokkos_Timer.hpp>
48 #include <iostream>
49 #include <cstdlib>
50 #include <cstdint>
51 #include <cinttypes>
52 #include <TestNonTrivialScalarTypes.hpp>
53
54 namespace TestTeamVector {
55
56 template <typename Scalar, class ExecutionSpace>
57 struct functor_team_for {
58 using policy_type = Kokkos::TeamPolicy<ExecutionSpace>;
59 using execution_space = ExecutionSpace;
60
61 Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag;
62
functor_team_forTestTeamVector::functor_team_for63 functor_team_for(Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_)
64 : flag(flag_) {}
65
66 using shmem_space = typename ExecutionSpace::scratch_memory_space;
67 using shared_int =
68 Kokkos::View<Scalar *, shmem_space, Kokkos::MemoryUnmanaged>;
team_shmem_sizeTestTeamVector::functor_team_for69 unsigned team_shmem_size(int team_size) const {
70 return shared_int::shmem_size(team_size * 13);
71 }
72
73 KOKKOS_INLINE_FUNCTION
operator ()TestTeamVector::functor_team_for74 void operator()(typename policy_type::member_type team) const {
75 using size_type = typename shmem_space::size_type;
76 const size_type shmemSize = team.team_size() * 13;
77 shared_int values = shared_int(team.team_shmem(), shmemSize);
78
79 if (values.data() == nullptr ||
80 static_cast<size_type>(values.extent(0)) < shmemSize) {
81 KOKKOS_IMPL_DO_NOT_USE_PRINTF(
82 "FAILED to allocate shared memory of size %u\n",
83 static_cast<unsigned int>(shmemSize));
84 } else {
85 // Initialize shared memory.
86 values(team.team_rank()) = 0;
87
88 // Accumulate value into per thread shared memory.
89 // This is non blocking.
90 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 131), [&](int i) {
91 values(team.team_rank()) +=
92 i - team.league_rank() + team.league_size() + team.team_size();
93 });
94
95 // Wait for all memory to be written.
96 team.team_barrier();
97
98 // One thread per team executes the comparison.
99 Kokkos::single(Kokkos::PerTeam(team), [&]() {
100 Scalar test = 0;
101 Scalar value = 0;
102
103 for (int i = 0; i < 131; ++i) {
104 test +=
105 i - team.league_rank() + team.league_size() + team.team_size();
106 }
107
108 for (int i = 0; i < team.team_size(); ++i) {
109 value += values(i);
110 }
111
112 if (test != value) {
113 KOKKOS_IMPL_DO_NOT_USE_PRINTF(
114 "FAILED team_parallel_for %i %i %f %f\n", team.league_rank(),
115 team.team_rank(), static_cast<double>(test),
116 static_cast<double>(value));
117 flag() = 1;
118 }
119 });
120 }
121 }
122 };
123
124 template <typename Scalar, class ExecutionSpace>
125 struct functor_team_reduce {
126 using policy_type = Kokkos::TeamPolicy<ExecutionSpace>;
127 using execution_space = ExecutionSpace;
128
129 Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag;
130
functor_team_reduceTestTeamVector::functor_team_reduce131 functor_team_reduce(
132 Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_)
133 : flag(flag_) {}
134
135 using shmem_space = typename ExecutionSpace::scratch_memory_space;
136 using shared_scalar_t =
137 Kokkos::View<Scalar *, shmem_space, Kokkos::MemoryUnmanaged>;
team_shmem_sizeTestTeamVector::functor_team_reduce138 unsigned team_shmem_size(int team_size) const {
139 return shared_scalar_t::shmem_size(team_size * 13);
140 }
141
142 KOKKOS_INLINE_FUNCTION
operator ()TestTeamVector::functor_team_reduce143 void operator()(typename policy_type::member_type team) const {
144 Scalar value = Scalar();
145 shared_scalar_t shared_value(team.team_scratch(0), 1);
146
147 Kokkos::parallel_reduce(
148 Kokkos::TeamThreadRange(team, 131),
149 [&](int i, Scalar &val) {
150 val += i - team.league_rank() + team.league_size() + team.team_size();
151 },
152 value);
153
154 Kokkos::parallel_reduce(
155 Kokkos::TeamThreadRange(team, 131),
156 [&](int i, Scalar &val) {
157 val += i - team.league_rank() + team.league_size() + team.team_size();
158 },
159 shared_value(0));
160
161 team.team_barrier();
162
163 Kokkos::single(Kokkos::PerTeam(team), [&]() {
164 Scalar test = 0;
165
166 for (int i = 0; i < 131; ++i) {
167 test += i - team.league_rank() + team.league_size() + team.team_size();
168 }
169
170 if (test != value) {
171 if (team.league_rank() == 0) {
172 KOKKOS_IMPL_DO_NOT_USE_PRINTF(
173 "FAILED team_parallel_reduce %i %i %lf %lf %lu\n",
174 team.league_rank(), team.team_rank(), static_cast<double>(test),
175 static_cast<double>(value),
176 static_cast<unsigned long>(sizeof(Scalar)));
177 }
178
179 flag() = 1;
180 }
181 if (test != shared_value(0)) {
182 if (team.league_rank() == 0) {
183 KOKKOS_IMPL_DO_NOT_USE_PRINTF(
184 "FAILED team_parallel_reduce with shared result %i %i %lf %lf "
185 "%lu\n",
186 team.league_rank(), team.team_rank(), static_cast<double>(test),
187 static_cast<double>(shared_value(0)),
188 static_cast<unsigned long>(sizeof(Scalar)));
189 }
190
191 flag() = 1;
192 }
193 });
194 }
195 };
196
197 template <typename Scalar, class ExecutionSpace>
198 struct functor_team_reduce_reducer {
199 using policy_type = Kokkos::TeamPolicy<ExecutionSpace>;
200 using execution_space = ExecutionSpace;
201
202 Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag;
203
functor_team_reduce_reducerTestTeamVector::functor_team_reduce_reducer204 functor_team_reduce_reducer(
205 Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_)
206 : flag(flag_) {}
207
208 using shmem_space = typename ExecutionSpace::scratch_memory_space;
209 using shared_scalar_t =
210 Kokkos::View<Scalar *, shmem_space, Kokkos::MemoryUnmanaged>;
team_shmem_sizeTestTeamVector::functor_team_reduce_reducer211 unsigned team_shmem_size(int team_size) const {
212 return shared_scalar_t::shmem_size(team_size * 13);
213 }
214
215 KOKKOS_INLINE_FUNCTION
operator ()TestTeamVector::functor_team_reduce_reducer216 void operator()(typename policy_type::member_type team) const {
217 Scalar value = 0;
218 shared_scalar_t shared_value(team.team_scratch(0), 1);
219
220 Kokkos::parallel_reduce(
221 Kokkos::TeamThreadRange(team, 131),
222 [&](int i, Scalar &val) {
223 val += i - team.league_rank() + team.league_size() + team.team_size();
224 },
225 Kokkos::Sum<Scalar>(value));
226
227 Kokkos::parallel_reduce(
228 Kokkos::TeamThreadRange(team, 131),
229 [&](int i, Scalar &val) {
230 val += i - team.league_rank() + team.league_size() + team.team_size();
231 },
232 Kokkos::Sum<Scalar>(shared_value(0)));
233
234 team.team_barrier();
235
236 Kokkos::single(Kokkos::PerTeam(team), [&]() {
237 Scalar test = 0;
238
239 for (int i = 0; i < 131; ++i) {
240 test += i - team.league_rank() + team.league_size() + team.team_size();
241 }
242
243 if (test != value) {
244 KOKKOS_IMPL_DO_NOT_USE_PRINTF(
245 "FAILED team_vector_parallel_reduce_reducer %i %i %lf %lf\n",
246 team.league_rank(), team.team_rank(), static_cast<double>(test),
247 static_cast<double>(value));
248
249 flag() = 1;
250 }
251 if (test != shared_value(0)) {
252 KOKKOS_IMPL_DO_NOT_USE_PRINTF(
253 "FAILED team_vector_parallel_reduce_reducer shared value %i %i %lf "
254 "%lf\n",
255 team.league_rank(), team.team_rank(), static_cast<double>(test),
256 static_cast<double>(shared_value(0)));
257
258 flag() = 1;
259 }
260 });
261 }
262 };
263
264 template <typename Scalar, class ExecutionSpace>
265 struct functor_team_vector_for {
266 using policy_type = Kokkos::TeamPolicy<ExecutionSpace>;
267 using execution_space = ExecutionSpace;
268
269 Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag;
270
functor_team_vector_forTestTeamVector::functor_team_vector_for271 functor_team_vector_for(
272 Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_)
273 : flag(flag_) {}
274
275 using shmem_space = typename ExecutionSpace::scratch_memory_space;
276 using shared_int =
277 Kokkos::View<Scalar *, shmem_space, Kokkos::MemoryUnmanaged>;
team_shmem_sizeTestTeamVector::functor_team_vector_for278 unsigned team_shmem_size(int team_size) const {
279 return shared_int::shmem_size(team_size * 13);
280 }
281
282 KOKKOS_INLINE_FUNCTION
operator ()TestTeamVector::functor_team_vector_for283 void operator()(typename policy_type::member_type team) const {
284 using size_type = typename shared_int::size_type;
285
286 const size_type shmemSize = team.team_size() * 13;
287 shared_int values = shared_int(team.team_shmem(), shmemSize);
288
289 if (values.data() == nullptr ||
290 static_cast<size_type>(values.extent(0)) < shmemSize) {
291 KOKKOS_IMPL_DO_NOT_USE_PRINTF(
292 "FAILED to allocate shared memory of size %u\n",
293 static_cast<unsigned int>(shmemSize));
294 } else {
295 team.team_barrier();
296
297 Kokkos::single(Kokkos::PerThread(team),
298 [&]() { values(team.team_rank()) = 0; });
299
300 Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 131), [&](int i) {
301 Kokkos::single(Kokkos::PerThread(team), [&]() {
302 values(team.team_rank()) +=
303 i - team.league_rank() + team.league_size() + team.team_size();
304 });
305 });
306
307 team.team_barrier();
308
309 Kokkos::single(Kokkos::PerTeam(team), [&]() {
310 Scalar test = 0;
311 Scalar value = 0;
312
313 for (int i = 0; i < 131; ++i) {
314 test +=
315 i - team.league_rank() + team.league_size() + team.team_size();
316 }
317
318 for (int i = 0; i < team.team_size(); ++i) {
319 value += values(i);
320 }
321
322 if (test != value) {
323 KOKKOS_IMPL_DO_NOT_USE_PRINTF(
324 "FAILED team_vector_parallel_for %i %i %f %f\n",
325 team.league_rank(), team.team_rank(), static_cast<double>(test),
326 static_cast<double>(value));
327
328 flag() = 1;
329 }
330 });
331 }
332 }
333 };
334
335 template <typename Scalar, class ExecutionSpace>
336 struct functor_team_vector_reduce {
337 using policy_type = Kokkos::TeamPolicy<ExecutionSpace>;
338 using execution_space = ExecutionSpace;
339
340 Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag;
functor_team_vector_reduceTestTeamVector::functor_team_vector_reduce341 functor_team_vector_reduce(
342 Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_)
343 : flag(flag_) {}
344
345 using shmem_space = typename ExecutionSpace::scratch_memory_space;
346 using shared_int =
347 Kokkos::View<Scalar *, shmem_space, Kokkos::MemoryUnmanaged>;
team_shmem_sizeTestTeamVector::functor_team_vector_reduce348 unsigned team_shmem_size(int team_size) const {
349 return shared_int::shmem_size(team_size * 13);
350 }
351
352 KOKKOS_INLINE_FUNCTION
operator ()TestTeamVector::functor_team_vector_reduce353 void operator()(typename policy_type::member_type team) const {
354 Scalar value = Scalar();
355
356 Kokkos::parallel_reduce(
357 Kokkos::TeamThreadRange(team, 131),
358 [&](int i, Scalar &val) {
359 val += i - team.league_rank() + team.league_size() + team.team_size();
360 },
361 value);
362
363 team.team_barrier();
364
365 Kokkos::single(Kokkos::PerTeam(team), [&]() {
366 Scalar test = 0;
367
368 for (int i = 0; i < 131; ++i) {
369 test += i - team.league_rank() + team.league_size() + team.team_size();
370 }
371
372 if (test != value) {
373 if (team.league_rank() == 0) {
374 KOKKOS_IMPL_DO_NOT_USE_PRINTF(
375 "FAILED team_vector_parallel_reduce %i %i %f %f %lu\n",
376 team.league_rank(), team.team_rank(), static_cast<double>(test),
377 static_cast<double>(value),
378 static_cast<unsigned long>(sizeof(Scalar)));
379 }
380
381 flag() = 1;
382 }
383 });
384 }
385 };
386
387 template <typename Scalar, class ExecutionSpace>
388 struct functor_team_vector_reduce_reducer {
389 using policy_type = Kokkos::TeamPolicy<ExecutionSpace>;
390 using execution_space = ExecutionSpace;
391
392 Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag;
393
functor_team_vector_reduce_reducerTestTeamVector::functor_team_vector_reduce_reducer394 functor_team_vector_reduce_reducer(
395 Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_)
396 : flag(flag_) {}
397
398 using shmem_space = typename ExecutionSpace::scratch_memory_space;
399 using shared_int =
400 Kokkos::View<Scalar *, shmem_space, Kokkos::MemoryUnmanaged>;
team_shmem_sizeTestTeamVector::functor_team_vector_reduce_reducer401 unsigned team_shmem_size(int team_size) const {
402 return shared_int::shmem_size(team_size * 13);
403 }
404
405 KOKKOS_INLINE_FUNCTION
operator ()TestTeamVector::functor_team_vector_reduce_reducer406 void operator()(typename policy_type::member_type team) const {
407 Scalar value = 0;
408
409 Kokkos::parallel_reduce(
410 Kokkos::TeamThreadRange(team, 131),
411 [&](int i, Scalar &val) {
412 val += i - team.league_rank() + team.league_size() + team.team_size();
413 },
414 Kokkos::Sum<Scalar>(value));
415
416 team.team_barrier();
417
418 Kokkos::single(Kokkos::PerTeam(team), [&]() {
419 Scalar test = 0;
420
421 for (int i = 0; i < 131; ++i) {
422 test += i - team.league_rank() + team.league_size() + team.team_size();
423 }
424
425 if (test != value) {
426 KOKKOS_IMPL_DO_NOT_USE_PRINTF(
427 "FAILED team_vector_parallel_reduce_reducer %i %i %f %f\n",
428 team.league_rank(), team.team_rank(), static_cast<double>(test),
429 static_cast<double>(value));
430
431 flag() = 1;
432 }
433 });
434 }
435 };
436
// Test functor for Kokkos::single(PerThread) with a value argument, used
// over the vector range [nStart, nEnd).
//
// The per-thread variable `value` is first overwritten inside a
// ThreadVectorRange parallel_for, then set to 1 through single(PerThread).
// A vector-level reduction adds `value` once per index of the range, and the
// result is compared with value * (nEnd - nStart). A mismatch sets flag().
template <typename Scalar, class ExecutionSpace>
struct functor_vec_single {
  using policy_type     = Kokkos::TeamPolicy<ExecutionSpace>;
  using execution_space = ExecutionSpace;

  // Rank-0 failure indicator (non-zero means failure).
  Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag;
  // Bounds of the vector range used by the test: [nStart, nEnd).
  int nStart;
  int nEnd;

  functor_vec_single(
      Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_,
      const int start_, const int end_)
      : flag(flag_), nStart(start_), nEnd(end_) {}

  KOKKOS_INLINE_FUNCTION
  void operator()(typename policy_type::member_type team) const {
    // Warning: this test case intentionally violates permissible semantics.
    // It is not valid to get references to members of the enclosing region
    // inside a parallel_for and write to it.
    Scalar value = 0;

    Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, nStart, nEnd),
                         [&](int i) {
                           value = i;  // This write is violating Kokkos
                                       // semantics for nested parallelism.
                         });

    // The lambda writes 1 into `value` through the reference parameter; the
    // failure message below calls this a "broadcast".
    Kokkos::single(
        Kokkos::PerThread(team), [&](Scalar &val) { val = 1; }, value);

    // Add `value` once for every index in [nStart, nEnd).
    Scalar value2 = 0;
    Kokkos::parallel_reduce(
        Kokkos::ThreadVectorRange(team, nStart, nEnd),
        [&](int /*i*/, Scalar &val) { val += value; }, value2);

    // The reduction must equal `value` times the number of indices.
    if (value2 != (value * Scalar(nEnd - nStart))) {
      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
          "FAILED vector_single broadcast %i %i %f %f\n", team.league_rank(),
          team.team_rank(), (double)value2, (double)value);

      flag() = 1;
    }
  }
};
481
482 template <typename Scalar, class ExecutionSpace>
483 struct functor_vec_for {
484 using policy_type = Kokkos::TeamPolicy<ExecutionSpace>;
485 using execution_space = ExecutionSpace;
486
487 Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag;
488
functor_vec_forTestTeamVector::functor_vec_for489 functor_vec_for(Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_)
490 : flag(flag_) {}
491
492 using shmem_space = typename ExecutionSpace::scratch_memory_space;
493 using shared_int =
494 Kokkos::View<Scalar *, shmem_space, Kokkos::MemoryUnmanaged>;
team_shmem_sizeTestTeamVector::functor_vec_for495 unsigned team_shmem_size(int team_size) const {
496 return shared_int::shmem_size(team_size * 13);
497 }
498
499 KOKKOS_INLINE_FUNCTION
operator ()TestTeamVector::functor_vec_for500 void operator()(typename policy_type::member_type team) const {
501 shared_int values = shared_int(team.team_shmem(), team.team_size() * 13);
502
503 if (values.data() == nullptr ||
504 values.extent(0) < (unsigned)team.team_size() * 13) {
505 KOKKOS_IMPL_DO_NOT_USE_PRINTF("FAILED to allocate memory of size %i\n",
506 static_cast<int>(team.team_size() * 13));
507 flag() = 1;
508 } else {
509 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, 13), [&](int i) {
510 values(13 * team.team_rank() + i) =
511 i - team.team_rank() - team.league_rank() + team.league_size() +
512 team.team_size();
513 });
514
515 Kokkos::single(Kokkos::PerThread(team), [&]() {
516 Scalar test = 0;
517 Scalar value = 0;
518
519 for (int i = 0; i < 13; ++i) {
520 test += i - team.team_rank() - team.league_rank() +
521 team.league_size() + team.team_size();
522 value += values(13 * team.team_rank() + i);
523 }
524
525 if (test != value) {
526 KOKKOS_IMPL_DO_NOT_USE_PRINTF("FAILED vector_par_for %i %i %f %f\n",
527 team.league_rank(), team.team_rank(),
528 static_cast<double>(test),
529 static_cast<double>(value));
530
531 flag() = 1;
532 }
533 });
534 }
535 }
536 };
537
538 template <typename Scalar, class ExecutionSpace>
539 struct functor_vec_red {
540 using policy_type = Kokkos::TeamPolicy<ExecutionSpace>;
541 using execution_space = ExecutionSpace;
542
543 Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag;
544
functor_vec_redTestTeamVector::functor_vec_red545 functor_vec_red(Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_)
546 : flag(flag_) {}
547
548 KOKKOS_INLINE_FUNCTION
operator ()TestTeamVector::functor_vec_red549 void operator()(typename policy_type::member_type team) const {
550 Scalar value = 0;
551
552 // When no reducer is given the default is summation.
553 Kokkos::parallel_reduce(
554 Kokkos::ThreadVectorRange(team, 13),
555 [&](int i, Scalar &val) { val += i; }, value);
556
557 Kokkos::single(Kokkos::PerThread(team), [&]() {
558 Scalar test = 0;
559
560 for (int i = 0; i < 13; i++) test += i;
561
562 if (test != value) {
563 KOKKOS_IMPL_DO_NOT_USE_PRINTF("FAILED vector_par_reduce %i %i %f %f\n",
564 team.league_rank(), team.team_rank(),
565 (double)test, (double)value);
566
567 flag() = 1;
568 }
569 });
570 }
571 };
572
573 template <typename Scalar, class ExecutionSpace>
574 struct functor_vec_red_reducer {
575 using policy_type = Kokkos::TeamPolicy<ExecutionSpace>;
576 using execution_space = ExecutionSpace;
577
578 Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag;
579
functor_vec_red_reducerTestTeamVector::functor_vec_red_reducer580 functor_vec_red_reducer(
581 Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_)
582 : flag(flag_) {}
583
584 KOKKOS_INLINE_FUNCTION
operator ()TestTeamVector::functor_vec_red_reducer585 void operator()(typename policy_type::member_type team) const {
586 // Must initialize to the identity value for the reduce operation
587 // for this test:
588 // ( identity, operation ) = ( 1 , *= )
589 Scalar value = 1;
590
591 Kokkos::parallel_reduce(
592 Kokkos::ThreadVectorRange(team, 13),
593 [&](int i, Scalar &val) { val *= (i % 5 + 1); },
594 Kokkos::Prod<Scalar>(value));
595
596 Kokkos::single(Kokkos::PerThread(team), [&]() {
597 Scalar test = 1;
598
599 for (int i = 0; i < 13; i++) test *= (i % 5 + 1);
600
601 if (test != value) {
602 KOKKOS_IMPL_DO_NOT_USE_PRINTF(
603 "FAILED vector_par_reduce_reducer %i %i %f %f\n",
604 team.league_rank(), team.team_rank(), (double)test, (double)value);
605
606 flag() = 1;
607 }
608 });
609 }
610 };
611
612 template <typename Scalar, class ExecutionSpace>
613 struct functor_vec_scan {
614 using policy_type = Kokkos::TeamPolicy<ExecutionSpace>;
615 using execution_space = ExecutionSpace;
616
617 Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag;
functor_vec_scanTestTeamVector::functor_vec_scan618 functor_vec_scan(Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_)
619 : flag(flag_) {}
620
621 KOKKOS_INLINE_FUNCTION
operator ()TestTeamVector::functor_vec_scan622 void operator()(typename policy_type::member_type team) const {
623 Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team, 13),
624 [&](int i, Scalar &val, bool final) {
625 val += i;
626
627 if (final) {
628 Scalar test = 0;
629 for (int k = 0; k <= i; k++) test += k;
630
631 if (test != val) {
632 KOKKOS_IMPL_DO_NOT_USE_PRINTF(
633 "FAILED vector_par_scan %i %i %f %f\n",
634 team.league_rank(), team.team_rank(),
635 (double)test, (double)val);
636
637 flag() = 1;
638 }
639 }
640 });
641 }
642 };
643
644 template <typename Scalar, class ExecutionSpace>
645 struct functor_reduce {
646 using value_type = double;
647 using policy_type = Kokkos::TeamPolicy<ExecutionSpace>;
648 using execution_space = ExecutionSpace;
649
650 Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag;
functor_reduceTestTeamVector::functor_reduce651 functor_reduce(Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_)
652 : flag(flag_) {}
653
654 KOKKOS_INLINE_FUNCTION
operator ()TestTeamVector::functor_reduce655 void operator()(typename policy_type::member_type team, double &sum) const {
656 sum += team.league_rank() * 100 + team.thread_rank();
657 }
658 };
659
660 template <typename Scalar, class ExecutionSpace>
test_scalar(int nteams,int team_size,int test)661 bool test_scalar(int nteams, int team_size, int test) {
662 Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> d_flag("flag");
663 typename Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace>::HostMirror
664 h_flag("h_flag");
665 h_flag() = 0;
666 Kokkos::deep_copy(d_flag, h_flag);
667
668 if (test == 0) {
669 Kokkos::parallel_for(
670 std::string("A"),
671 Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size, 8),
672 functor_vec_red<Scalar, ExecutionSpace>(d_flag));
673 } else if (test == 1) {
674 Kokkos::parallel_for(
675 Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size, 8),
676 functor_vec_red_reducer<Scalar, ExecutionSpace>(d_flag));
677 } else if (test == 2) {
678 Kokkos::parallel_for(
679 Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size, 8),
680 functor_vec_scan<Scalar, ExecutionSpace>(d_flag));
681 } else if (test == 3) {
682 Kokkos::parallel_for(
683 Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size, 8),
684 functor_vec_for<Scalar, ExecutionSpace>(d_flag));
685 } else if (test == 4) {
686 Kokkos::parallel_for(
687 "B", Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size, 8),
688 functor_vec_single<Scalar, ExecutionSpace>(d_flag, 0, 13));
689 } else if (test == 5) {
690 Kokkos::parallel_for(Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size),
691 functor_team_for<Scalar, ExecutionSpace>(d_flag));
692 } else if (test == 6) {
693 Kokkos::parallel_for(Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size),
694 functor_team_reduce<Scalar, ExecutionSpace>(d_flag));
695 } else if (test == 7) {
696 Kokkos::parallel_for(
697 Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size),
698 functor_team_reduce_reducer<Scalar, ExecutionSpace>(d_flag));
699 } else if (test == 8) {
700 Kokkos::parallel_for(
701 Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size, 8),
702 functor_team_vector_for<Scalar, ExecutionSpace>(d_flag));
703 } else if (test == 9) {
704 Kokkos::parallel_for(
705 Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size, 8),
706 functor_team_vector_reduce<Scalar, ExecutionSpace>(d_flag));
707 } else if (test == 10) {
708 Kokkos::parallel_for(
709 Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size, 8),
710 functor_team_vector_reduce_reducer<Scalar, ExecutionSpace>(d_flag));
711 } else if (test == 11) {
712 Kokkos::parallel_for(
713 "B", Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size, 8),
714 functor_vec_single<Scalar, ExecutionSpace>(d_flag, 4, 13));
715 }
716
717 Kokkos::deep_copy(h_flag, d_flag);
718
719 return (h_flag() == 0);
720 }
721
722 template <class ExecutionSpace>
Test(int test)723 bool Test(int test) {
724 bool passed = true;
725
726 int team_size = 33;
727 if (team_size > int(ExecutionSpace::concurrency()))
728 team_size = int(ExecutionSpace::concurrency());
729 passed = passed && test_scalar<int, ExecutionSpace>(317, team_size, test);
730 passed = passed &&
731 test_scalar<long long int, ExecutionSpace>(317, team_size, test);
732 passed = passed && test_scalar<float, ExecutionSpace>(317, team_size, test);
733 passed = passed && test_scalar<double, ExecutionSpace>(317, team_size, test);
734 passed = passed &&
735 test_scalar<Test::my_complex, ExecutionSpace>(317, team_size, test);
736 passed = passed && test_scalar<Test::array_reduce<double, 1>, ExecutionSpace>(
737 317, team_size, test);
738 passed = passed && test_scalar<Test::array_reduce<float, 1>, ExecutionSpace>(
739 317, team_size, test);
740 passed = passed && test_scalar<Test::array_reduce<double, 3>, ExecutionSpace>(
741 317, team_size, test);
742
743 return passed;
744 }
745
746 } // namespace TestTeamVector
747
748 namespace Test {
749
750 // Computes y^T*A*x
751 // ( modified from kokkos-tutorials/GTC2016/Exercises/ThreeLevelPar )
752
753 #if (!defined(KOKKOS_ENABLE_CUDA)) || defined(KOKKOS_ENABLE_CUDA_LAMBDA)
754 template <typename ScalarType, class DeviceType>
755 class TestTripleNestedReduce {
756 public:
757 using execution_space = DeviceType;
758 using size_type = typename execution_space::size_type;
759
TestTripleNestedReduce(const size_type & nrows,const size_type & ncols,const size_type & team_size,const size_type & vector_length)760 TestTripleNestedReduce(const size_type &nrows, const size_type &ncols,
761 const size_type &team_size,
762 const size_type &vector_length) {
763 run_test(nrows, ncols, team_size, vector_length);
764 }
765
run_test(const size_type & nrows,const size_type & ncols,size_type team_size,const size_type & vector_length)766 void run_test(const size_type &nrows, const size_type &ncols,
767 size_type team_size, const size_type &vector_length) {
768 if (team_size > size_type(DeviceType::execution_space::concurrency()))
769 team_size = size_type(DeviceType::execution_space::concurrency());
770
771 #ifdef KOKKOS_ENABLE_HPX
772 team_size = 1;
773 if (!std::is_same<execution_space, Kokkos::Experimental::HPX>::value) {
774 team_size = 1;
775 }
776 #endif
777
778 // using Layout = Kokkos::LayoutLeft;
779 using Layout = Kokkos::LayoutRight;
780
781 using ViewVector = Kokkos::View<ScalarType *, DeviceType>;
782 using ViewMatrix = Kokkos::View<ScalarType **, Layout, DeviceType>;
783
784 ViewVector y("y", nrows);
785 ViewVector x("x", ncols);
786 ViewMatrix A("A", nrows, ncols);
787
788 using range_policy = Kokkos::RangePolicy<DeviceType>;
789
790 // Initialize y vector.
791 Kokkos::parallel_for(
792 range_policy(0, nrows), KOKKOS_LAMBDA(const int i) { y(i) = 1; });
793
794 // Initialize x vector.
795 Kokkos::parallel_for(
796 range_policy(0, ncols), KOKKOS_LAMBDA(const int i) { x(i) = 1; });
797 Kokkos::fence();
798
799 using team_policy = Kokkos::TeamPolicy<DeviceType>;
800 using member_type = typename Kokkos::TeamPolicy<DeviceType>::member_type;
801
802 // Initialize A matrix, note 2D indexing computation.
803 Kokkos::parallel_for(
804 team_policy(nrows, Kokkos::AUTO),
805 KOKKOS_LAMBDA(const member_type &teamMember) {
806 const int j = teamMember.league_rank();
807 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, ncols),
808 [&](const int i) { A(j, i) = 1; });
809 });
810 Kokkos::fence();
811
812 // Three level parallelism kernel to force caching of vector x.
813 ScalarType result = 0.0;
814 int chunk_size = 128;
815 Kokkos::parallel_reduce(
816 team_policy(nrows / chunk_size, team_size, vector_length),
817 KOKKOS_LAMBDA(const member_type &teamMember, double &update) {
818 const int row_start = teamMember.league_rank() * chunk_size;
819 const int row_end = row_start + chunk_size;
820 Kokkos::parallel_for(
821 Kokkos::TeamThreadRange(teamMember, row_start, row_end),
822 [&](const int i) {
823 ScalarType sum_i = 0.0;
824 Kokkos::parallel_reduce(
825 Kokkos::ThreadVectorRange(teamMember, ncols),
826 [&](const int j, ScalarType &innerUpdate) {
827 innerUpdate += A(i, j) * x(j);
828 },
829 sum_i);
830 Kokkos::single(Kokkos::PerThread(teamMember),
831 [&]() { update += y(i) * sum_i; });
832 });
833 },
834 result);
835 Kokkos::fence();
836
837 const ScalarType solution = (ScalarType)nrows * (ScalarType)ncols;
838 if (int64_t(solution) != int64_t(result)) {
839 printf(" TestTripleNestedReduce failed solution(%" PRId64
840 ") != result(%" PRId64
841 "),"
842 " nrows(%" PRId32 ") ncols(%" PRId32 ") league_size(%" PRId32
843 ") team_size(%" PRId32 ")\n",
844 int64_t(solution), int64_t(result), int32_t(nrows), int32_t(ncols),
845 int32_t(nrows / chunk_size), int32_t(team_size));
846 }
847
848 ASSERT_EQ(solution, result);
849 }
850 };
851
852 #else // #if ( ! defined( KOKKOS_ENABLE_CUDA ) ) || defined(
853 // KOKKOS_ENABLE_CUDA_LAMBDA )
854
// Placeholder used when the real test is compiled out (CUDA enabled without
// CUDA lambda support). It keeps the same class name and type aliases so
// that callers still compile; the constructor accepts and ignores four
// arguments and performs no work.
template <typename ScalarType, class DeviceType>
class TestTripleNestedReduce {
 public:
  using execution_space = DeviceType;
  using size_type       = typename execution_space::size_type;

  TestTripleNestedReduce(const size_type & /*unused*/,
                         const size_type /*unused*/,
                         const size_type & /*unused*/,
                         const size_type /*unused*/) {
    // Intentionally empty.
  }
};
864
865 #endif
866
867 namespace VectorScanReducer {
// Selects the flavour of prefix scan being checked. The underlying type is
// bool: Inclusive corresponds to false and Exclusive to true.
enum class ScanType : bool {
  Inclusive = false,
  Exclusive = true
};
869
870 template <typename ExecutionSpace, ScanType scan_type, int n,
871 int n_vector_range, class Reducer>
872 struct checkScan {
873 const int n_team_thread_range = 1000;
874 const int n_per_team = n_team_thread_range * n_vector_range;
875
876 using size_type = typename ExecutionSpace::size_type;
877 using value_type = typename Reducer::value_type;
878 using view_type = Kokkos::View<value_type[n], ExecutionSpace>;
879
880 view_type inputs = view_type{"inputs"};
881 view_type outputs = view_type{"outputs"};
882
883 value_type result;
884 Reducer reducer = {result};
885
  // Scan body executed for every index j of the vector-level parallel_scan.
  //
  // The element handled is j + m_team_offset + m_thread_offset. Its input is
  // combined into the running value `update` with m_reducer.join, and on the
  // final pass the running value is stored in m_outputs:
  //  - inclusive scan: join first, then store (the output includes the
  //    element's own input);
  //  - exclusive scan: store first, then join (the output excludes it).
  struct ThreadVectorFunctor {
    KOKKOS_FUNCTION void operator()(const size_type j, value_type &update,
                                    const bool final) const {
      const size_type element = j + m_team_offset + m_thread_offset;
      const auto tmp          = m_inputs(element);
      if (scan_type == ScanType::Inclusive) {
        m_reducer.join(update, tmp);
        if (final) {
          m_outputs(element) = update;
        }
      } else {
        if (final) {
          m_outputs(element) = update;
        }
        m_reducer.join(update, tmp);
      }
    }

    // All members are references; the functor does not own what it refers
    // to. NOTE(review): the referenced objects presumably must outlive the
    // parallel_scan call that uses this functor - confirm against callers.
    const Reducer &m_reducer;
    const size_type &m_team_offset;
    const size_type &m_thread_offset;
    const view_type &m_outputs;
    const view_type &m_inputs;
  };
910
  // Body executed for every index i of the team-thread-level parallel_for.
  //
  // Index i selects a segment of n_vector_range consecutive elements that
  // starts at i * n_vector_range (relative to the team's offset). A
  // vector-level parallel_scan with ThreadVectorFunctor and m_reducer is run
  // over that segment.
  struct TeamThreadRangeFunctor {
    KOKKOS_FUNCTION void operator()(const size_type i) const {
      const size_type thread_offset = i * n_vector_range;
      Kokkos::parallel_scan(
          Kokkos::ThreadVectorRange(m_team, n_vector_range),
          ThreadVectorFunctor{m_reducer, m_team_offset, thread_offset,
                              m_outputs, m_inputs},
          m_reducer);
    }

    // All members are references; the functor does not own what it refers
    // to.
    const typename Kokkos::TeamPolicy<ExecutionSpace>::member_type &m_team;
    const Reducer &m_reducer;
    const size_type &m_team_offset;
    const view_type &m_outputs;
    const view_type &m_inputs;
  };
927
operator ()Test::VectorScanReducer::checkScan928 KOKKOS_FUNCTION void operator()(
929 const typename Kokkos::TeamPolicy<ExecutionSpace>::member_type &team)
930 const {
931 const size_type iTeam = team.league_rank();
932 const size_type iTeamOffset = iTeam * n_per_team;
933 Kokkos::parallel_for(
934 Kokkos::TeamThreadRange(team, n_team_thread_range),
935 TeamThreadRangeFunctor{team, reducer, iTeamOffset, outputs, inputs});
936 }
937
operator ()Test::VectorScanReducer::checkScan938 KOKKOS_FUNCTION void operator()(size_type i) const { inputs(i) = i * 1. / n; }
939
runTest::VectorScanReducer::checkScan940 void run() {
941 const int n_teams = n / n_per_team;
942
943 Kokkos::parallel_for(Kokkos::RangePolicy<ExecutionSpace>(0, n), *this);
944
945 // run ThreadVectorRange parallel_scan
946 Kokkos::TeamPolicy<ExecutionSpace> policy(n_teams, Kokkos::AUTO,
947 Kokkos::AUTO);
948 const std::string label =
949 (scan_type == ScanType::Inclusive ? std::string("inclusive")
950 : std::string("exclusive")) +
951 "Scan" + typeid(Reducer).name();
952 Kokkos::parallel_for(label, policy, *this);
953 Kokkos::fence();
954
955 auto host_outputs =
956 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, outputs);
957 auto host_inputs =
958 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, inputs);
959
960 Kokkos::View<value_type[n], Kokkos::HostSpace> expected("expected");
961 {
962 value_type identity;
963 reducer.init(identity);
964 for (int i = 0; i < expected.extent_int(0); ++i) {
965 const int vector = i % n_vector_range;
966 const value_type accum = vector == 0 ? identity : expected(i - 1);
967 const value_type val =
968 scan_type == ScanType::Inclusive
969 ? host_inputs(i)
970 : (vector == 0 ? identity : host_inputs(i - 1));
971 expected(i) = accum;
972 reducer.join(expected(i), val);
973 }
974 }
975 for (int i = 0; i < host_outputs.extent_int(0); ++i)
976 ASSERT_EQ(host_outputs(i), expected(i));
977 }
978 };
979 } // namespace VectorScanReducer
980
981 #if !(defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) || defined(KOKKOS_ENABLE_HIP))
TEST(TEST_CATEGORY, team_vector) {
  // Runs TestTeamVector::Test for ids 0 through 11 in ascending order.
  // ASSERT_TRUE ends the test case at the first id that reports failure.
  constexpr int last_test_id = 11;
  for (int test_id = 0; test_id <= last_test_id; ++test_id) {
    ASSERT_TRUE((TestTeamVector::Test<TEST_EXECSPACE>(test_id)))
        << "test id " << test_id;
  }
}
996 #endif
997
998 #if !defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
TEST(TEST_CATEGORY, triple_nested_parallelism) {
  using NestedReduce = TestTripleNestedReduce<double, TEST_EXECSPACE>;

// With KOKKOS_ENABLE_DEBUG enabled, the functor uses too many registers to run
// with a team size of 32 on GPUs, 16 is the max possible (at least on a K80
// GPU) See https://github.com/kokkos/kokkos/issues/1513
// The first two configurations are therefore skipped when the execution
// space under test is Kokkos::Cuda in a debug build.
#if defined(KOKKOS_ENABLE_DEBUG) && defined(KOKKOS_ENABLE_CUDA)
  const bool run_register_heavy_cases =
      !std::is_same<TEST_EXECSPACE, Kokkos::Cuda>::value;
#else
  const bool run_register_heavy_cases = true;
#endif

  if (run_register_heavy_cases) {
    NestedReduce(8192, 2048, 32, 32);
    NestedReduce(8192, 2048, 32, 16);
  }

  // These configurations run for every execution space.
  NestedReduce(8192, 2048, 16, 16);
  NestedReduce(8192, 2048, 16, 33);
  NestedReduce(8192, 2048, 16, 19);
  NestedReduce(8192, 2048, 7, 16);
}
1016 #endif
1017
TEST(TEST_CATEGORY, parallel_scan_with_reducers) {
  using namespace VectorScanReducer;
  using T = double;

  static constexpr int n              = 1000000;
  static constexpr int n_vector_range = 100;

  // One fixture type per (reducer, scan flavour) pair. Each fixture is
  // created as a temporary, run, and destroyed before the next one starts.
  using ProdReducer = Kokkos::Prod<T, TEST_EXECSPACE>;
  using MaxReducer  = Kokkos::Max<T, TEST_EXECSPACE>;
  using MinReducer  = Kokkos::Min<T, TEST_EXECSPACE>;

  using ExclusiveProd = checkScan<TEST_EXECSPACE, ScanType::Exclusive, n,
                                  n_vector_range, ProdReducer>;
  using InclusiveProd = checkScan<TEST_EXECSPACE, ScanType::Inclusive, n,
                                  n_vector_range, ProdReducer>;
  using ExclusiveMax  = checkScan<TEST_EXECSPACE, ScanType::Exclusive, n,
                                 n_vector_range, MaxReducer>;
  using InclusiveMax  = checkScan<TEST_EXECSPACE, ScanType::Inclusive, n,
                                 n_vector_range, MaxReducer>;
  using ExclusiveMin  = checkScan<TEST_EXECSPACE, ScanType::Exclusive, n,
                                 n_vector_range, MinReducer>;
  using InclusiveMin  = checkScan<TEST_EXECSPACE, ScanType::Inclusive, n,
                                 n_vector_range, MinReducer>;

  ExclusiveProd().run();
  InclusiveProd().run();

  ExclusiveMax().run();
  InclusiveMax().run();

  ExclusiveMin().run();
  InclusiveMin().run();
}
1046
1047 } // namespace Test
1048