1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 //                        Kokkos v. 3.0
6 //       Copyright (2020) National Technology & Engineering
7 //               Solutions of Sandia, LLC (NTESS).
8 //
9 // Under the terms of Contract DE-NA0003525 with NTESS,
10 // the U.S. Government retains certain rights in this software.
11 //
12 // Redistribution and use in source and binary forms, with or without
13 // modification, are permitted provided that the following conditions are
14 // met:
15 //
16 // 1. Redistributions of source code must retain the above copyright
17 // notice, this list of conditions and the following disclaimer.
18 //
19 // 2. Redistributions in binary form must reproduce the above copyright
20 // notice, this list of conditions and the following disclaimer in the
21 // documentation and/or other materials provided with the distribution.
22 //
23 // 3. Neither the name of the Corporation nor the names of the
24 // contributors may be used to endorse or promote products derived from
25 // this software without specific prior written permission.
26 //
27 // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
28 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
31 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 //
39 // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
40 //
41 // ************************************************************************
42 //@HEADER
43 */
44 
45 #include <Kokkos_Core.hpp>
46 
47 #include <impl/Kokkos_Timer.hpp>
48 #include <iostream>
49 #include <cstdlib>
50 #include <cstdint>
51 #include <cinttypes>
52 #include <TestNonTrivialScalarTypes.hpp>
53 
54 namespace TestTeamVector {
55 
56 template <typename Scalar, class ExecutionSpace>
57 struct functor_team_for {
58   using policy_type     = Kokkos::TeamPolicy<ExecutionSpace>;
59   using execution_space = ExecutionSpace;
60 
61   Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag;
62 
functor_team_forTestTeamVector::functor_team_for63   functor_team_for(Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_)
64       : flag(flag_) {}
65 
66   using shmem_space = typename ExecutionSpace::scratch_memory_space;
67   using shared_int =
68       Kokkos::View<Scalar *, shmem_space, Kokkos::MemoryUnmanaged>;
team_shmem_sizeTestTeamVector::functor_team_for69   unsigned team_shmem_size(int team_size) const {
70     return shared_int::shmem_size(team_size * 13);
71   }
72 
73   KOKKOS_INLINE_FUNCTION
operator ()TestTeamVector::functor_team_for74   void operator()(typename policy_type::member_type team) const {
75     using size_type           = typename shmem_space::size_type;
76     const size_type shmemSize = team.team_size() * 13;
77     shared_int values         = shared_int(team.team_shmem(), shmemSize);
78 
79     if (values.data() == nullptr ||
80         static_cast<size_type>(values.extent(0)) < shmemSize) {
81       KOKKOS_IMPL_DO_NOT_USE_PRINTF(
82           "FAILED to allocate shared memory of size %u\n",
83           static_cast<unsigned int>(shmemSize));
84     } else {
85       // Initialize shared memory.
86       values(team.team_rank()) = 0;
87 
88       // Accumulate value into per thread shared memory.
89       // This is non blocking.
90       Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 131), [&](int i) {
91         values(team.team_rank()) +=
92             i - team.league_rank() + team.league_size() + team.team_size();
93       });
94 
95       // Wait for all memory to be written.
96       team.team_barrier();
97 
98       // One thread per team executes the comparison.
99       Kokkos::single(Kokkos::PerTeam(team), [&]() {
100         Scalar test  = 0;
101         Scalar value = 0;
102 
103         for (int i = 0; i < 131; ++i) {
104           test +=
105               i - team.league_rank() + team.league_size() + team.team_size();
106         }
107 
108         for (int i = 0; i < team.team_size(); ++i) {
109           value += values(i);
110         }
111 
112         if (test != value) {
113           KOKKOS_IMPL_DO_NOT_USE_PRINTF(
114               "FAILED team_parallel_for %i %i %f %f\n", team.league_rank(),
115               team.team_rank(), static_cast<double>(test),
116               static_cast<double>(value));
117           flag() = 1;
118         }
119       });
120     }
121   }
122 };
123 
124 template <typename Scalar, class ExecutionSpace>
125 struct functor_team_reduce {
126   using policy_type     = Kokkos::TeamPolicy<ExecutionSpace>;
127   using execution_space = ExecutionSpace;
128 
129   Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag;
130 
functor_team_reduceTestTeamVector::functor_team_reduce131   functor_team_reduce(
132       Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_)
133       : flag(flag_) {}
134 
135   using shmem_space = typename ExecutionSpace::scratch_memory_space;
136   using shared_scalar_t =
137       Kokkos::View<Scalar *, shmem_space, Kokkos::MemoryUnmanaged>;
team_shmem_sizeTestTeamVector::functor_team_reduce138   unsigned team_shmem_size(int team_size) const {
139     return shared_scalar_t::shmem_size(team_size * 13);
140   }
141 
142   KOKKOS_INLINE_FUNCTION
operator ()TestTeamVector::functor_team_reduce143   void operator()(typename policy_type::member_type team) const {
144     Scalar value = Scalar();
145     shared_scalar_t shared_value(team.team_scratch(0), 1);
146 
147     Kokkos::parallel_reduce(
148         Kokkos::TeamThreadRange(team, 131),
149         [&](int i, Scalar &val) {
150           val += i - team.league_rank() + team.league_size() + team.team_size();
151         },
152         value);
153 
154     Kokkos::parallel_reduce(
155         Kokkos::TeamThreadRange(team, 131),
156         [&](int i, Scalar &val) {
157           val += i - team.league_rank() + team.league_size() + team.team_size();
158         },
159         shared_value(0));
160 
161     team.team_barrier();
162 
163     Kokkos::single(Kokkos::PerTeam(team), [&]() {
164       Scalar test = 0;
165 
166       for (int i = 0; i < 131; ++i) {
167         test += i - team.league_rank() + team.league_size() + team.team_size();
168       }
169 
170       if (test != value) {
171         if (team.league_rank() == 0) {
172           KOKKOS_IMPL_DO_NOT_USE_PRINTF(
173               "FAILED team_parallel_reduce %i %i %lf %lf %lu\n",
174               team.league_rank(), team.team_rank(), static_cast<double>(test),
175               static_cast<double>(value),
176               static_cast<unsigned long>(sizeof(Scalar)));
177         }
178 
179         flag() = 1;
180       }
181       if (test != shared_value(0)) {
182         if (team.league_rank() == 0) {
183           KOKKOS_IMPL_DO_NOT_USE_PRINTF(
184               "FAILED team_parallel_reduce with shared result %i %i %lf %lf "
185               "%lu\n",
186               team.league_rank(), team.team_rank(), static_cast<double>(test),
187               static_cast<double>(shared_value(0)),
188               static_cast<unsigned long>(sizeof(Scalar)));
189         }
190 
191         flag() = 1;
192       }
193     });
194   }
195 };
196 
197 template <typename Scalar, class ExecutionSpace>
198 struct functor_team_reduce_reducer {
199   using policy_type     = Kokkos::TeamPolicy<ExecutionSpace>;
200   using execution_space = ExecutionSpace;
201 
202   Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag;
203 
functor_team_reduce_reducerTestTeamVector::functor_team_reduce_reducer204   functor_team_reduce_reducer(
205       Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_)
206       : flag(flag_) {}
207 
208   using shmem_space = typename ExecutionSpace::scratch_memory_space;
209   using shared_scalar_t =
210       Kokkos::View<Scalar *, shmem_space, Kokkos::MemoryUnmanaged>;
team_shmem_sizeTestTeamVector::functor_team_reduce_reducer211   unsigned team_shmem_size(int team_size) const {
212     return shared_scalar_t::shmem_size(team_size * 13);
213   }
214 
215   KOKKOS_INLINE_FUNCTION
operator ()TestTeamVector::functor_team_reduce_reducer216   void operator()(typename policy_type::member_type team) const {
217     Scalar value = 0;
218     shared_scalar_t shared_value(team.team_scratch(0), 1);
219 
220     Kokkos::parallel_reduce(
221         Kokkos::TeamThreadRange(team, 131),
222         [&](int i, Scalar &val) {
223           val += i - team.league_rank() + team.league_size() + team.team_size();
224         },
225         Kokkos::Sum<Scalar>(value));
226 
227     Kokkos::parallel_reduce(
228         Kokkos::TeamThreadRange(team, 131),
229         [&](int i, Scalar &val) {
230           val += i - team.league_rank() + team.league_size() + team.team_size();
231         },
232         Kokkos::Sum<Scalar>(shared_value(0)));
233 
234     team.team_barrier();
235 
236     Kokkos::single(Kokkos::PerTeam(team), [&]() {
237       Scalar test = 0;
238 
239       for (int i = 0; i < 131; ++i) {
240         test += i - team.league_rank() + team.league_size() + team.team_size();
241       }
242 
243       if (test != value) {
244         KOKKOS_IMPL_DO_NOT_USE_PRINTF(
245             "FAILED team_vector_parallel_reduce_reducer %i %i %lf %lf\n",
246             team.league_rank(), team.team_rank(), static_cast<double>(test),
247             static_cast<double>(value));
248 
249         flag() = 1;
250       }
251       if (test != shared_value(0)) {
252         KOKKOS_IMPL_DO_NOT_USE_PRINTF(
253             "FAILED team_vector_parallel_reduce_reducer shared value %i %i %lf "
254             "%lf\n",
255             team.league_rank(), team.team_rank(), static_cast<double>(test),
256             static_cast<double>(shared_value(0)));
257 
258         flag() = 1;
259       }
260     });
261   }
262 };
263 
264 template <typename Scalar, class ExecutionSpace>
265 struct functor_team_vector_for {
266   using policy_type     = Kokkos::TeamPolicy<ExecutionSpace>;
267   using execution_space = ExecutionSpace;
268 
269   Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag;
270 
functor_team_vector_forTestTeamVector::functor_team_vector_for271   functor_team_vector_for(
272       Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_)
273       : flag(flag_) {}
274 
275   using shmem_space = typename ExecutionSpace::scratch_memory_space;
276   using shared_int =
277       Kokkos::View<Scalar *, shmem_space, Kokkos::MemoryUnmanaged>;
team_shmem_sizeTestTeamVector::functor_team_vector_for278   unsigned team_shmem_size(int team_size) const {
279     return shared_int::shmem_size(team_size * 13);
280   }
281 
282   KOKKOS_INLINE_FUNCTION
operator ()TestTeamVector::functor_team_vector_for283   void operator()(typename policy_type::member_type team) const {
284     using size_type = typename shared_int::size_type;
285 
286     const size_type shmemSize = team.team_size() * 13;
287     shared_int values         = shared_int(team.team_shmem(), shmemSize);
288 
289     if (values.data() == nullptr ||
290         static_cast<size_type>(values.extent(0)) < shmemSize) {
291       KOKKOS_IMPL_DO_NOT_USE_PRINTF(
292           "FAILED to allocate shared memory of size %u\n",
293           static_cast<unsigned int>(shmemSize));
294     } else {
295       team.team_barrier();
296 
297       Kokkos::single(Kokkos::PerThread(team),
298                      [&]() { values(team.team_rank()) = 0; });
299 
300       Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 131), [&](int i) {
301         Kokkos::single(Kokkos::PerThread(team), [&]() {
302           values(team.team_rank()) +=
303               i - team.league_rank() + team.league_size() + team.team_size();
304         });
305       });
306 
307       team.team_barrier();
308 
309       Kokkos::single(Kokkos::PerTeam(team), [&]() {
310         Scalar test  = 0;
311         Scalar value = 0;
312 
313         for (int i = 0; i < 131; ++i) {
314           test +=
315               i - team.league_rank() + team.league_size() + team.team_size();
316         }
317 
318         for (int i = 0; i < team.team_size(); ++i) {
319           value += values(i);
320         }
321 
322         if (test != value) {
323           KOKKOS_IMPL_DO_NOT_USE_PRINTF(
324               "FAILED team_vector_parallel_for %i %i %f %f\n",
325               team.league_rank(), team.team_rank(), static_cast<double>(test),
326               static_cast<double>(value));
327 
328           flag() = 1;
329         }
330       });
331     }
332   }
333 };
334 
335 template <typename Scalar, class ExecutionSpace>
336 struct functor_team_vector_reduce {
337   using policy_type     = Kokkos::TeamPolicy<ExecutionSpace>;
338   using execution_space = ExecutionSpace;
339 
340   Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag;
functor_team_vector_reduceTestTeamVector::functor_team_vector_reduce341   functor_team_vector_reduce(
342       Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_)
343       : flag(flag_) {}
344 
345   using shmem_space = typename ExecutionSpace::scratch_memory_space;
346   using shared_int =
347       Kokkos::View<Scalar *, shmem_space, Kokkos::MemoryUnmanaged>;
team_shmem_sizeTestTeamVector::functor_team_vector_reduce348   unsigned team_shmem_size(int team_size) const {
349     return shared_int::shmem_size(team_size * 13);
350   }
351 
352   KOKKOS_INLINE_FUNCTION
operator ()TestTeamVector::functor_team_vector_reduce353   void operator()(typename policy_type::member_type team) const {
354     Scalar value = Scalar();
355 
356     Kokkos::parallel_reduce(
357         Kokkos::TeamThreadRange(team, 131),
358         [&](int i, Scalar &val) {
359           val += i - team.league_rank() + team.league_size() + team.team_size();
360         },
361         value);
362 
363     team.team_barrier();
364 
365     Kokkos::single(Kokkos::PerTeam(team), [&]() {
366       Scalar test = 0;
367 
368       for (int i = 0; i < 131; ++i) {
369         test += i - team.league_rank() + team.league_size() + team.team_size();
370       }
371 
372       if (test != value) {
373         if (team.league_rank() == 0) {
374           KOKKOS_IMPL_DO_NOT_USE_PRINTF(
375               "FAILED team_vector_parallel_reduce %i %i %f %f %lu\n",
376               team.league_rank(), team.team_rank(), static_cast<double>(test),
377               static_cast<double>(value),
378               static_cast<unsigned long>(sizeof(Scalar)));
379         }
380 
381         flag() = 1;
382       }
383     });
384   }
385 };
386 
387 template <typename Scalar, class ExecutionSpace>
388 struct functor_team_vector_reduce_reducer {
389   using policy_type     = Kokkos::TeamPolicy<ExecutionSpace>;
390   using execution_space = ExecutionSpace;
391 
392   Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag;
393 
functor_team_vector_reduce_reducerTestTeamVector::functor_team_vector_reduce_reducer394   functor_team_vector_reduce_reducer(
395       Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_)
396       : flag(flag_) {}
397 
398   using shmem_space = typename ExecutionSpace::scratch_memory_space;
399   using shared_int =
400       Kokkos::View<Scalar *, shmem_space, Kokkos::MemoryUnmanaged>;
team_shmem_sizeTestTeamVector::functor_team_vector_reduce_reducer401   unsigned team_shmem_size(int team_size) const {
402     return shared_int::shmem_size(team_size * 13);
403   }
404 
405   KOKKOS_INLINE_FUNCTION
operator ()TestTeamVector::functor_team_vector_reduce_reducer406   void operator()(typename policy_type::member_type team) const {
407     Scalar value = 0;
408 
409     Kokkos::parallel_reduce(
410         Kokkos::TeamThreadRange(team, 131),
411         [&](int i, Scalar &val) {
412           val += i - team.league_rank() + team.league_size() + team.team_size();
413         },
414         Kokkos::Sum<Scalar>(value));
415 
416     team.team_barrier();
417 
418     Kokkos::single(Kokkos::PerTeam(team), [&]() {
419       Scalar test = 0;
420 
421       for (int i = 0; i < 131; ++i) {
422         test += i - team.league_rank() + team.league_size() + team.team_size();
423       }
424 
425       if (test != value) {
426         KOKKOS_IMPL_DO_NOT_USE_PRINTF(
427             "FAILED team_vector_parallel_reduce_reducer %i %i %f %f\n",
428             team.league_rank(), team.team_rank(), static_cast<double>(test),
429             static_cast<double>(value));
430 
431         flag() = 1;
432       }
433     });
434   }
435 };
436 
437 template <typename Scalar, class ExecutionSpace>
438 struct functor_vec_single {
439   using policy_type     = Kokkos::TeamPolicy<ExecutionSpace>;
440   using execution_space = ExecutionSpace;
441 
442   Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag;
443   int nStart;
444   int nEnd;
445 
functor_vec_singleTestTeamVector::functor_vec_single446   functor_vec_single(
447       Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_,
448       const int start_, const int end_)
449       : flag(flag_), nStart(start_), nEnd(end_) {}
450 
451   KOKKOS_INLINE_FUNCTION
operator ()TestTeamVector::functor_vec_single452   void operator()(typename policy_type::member_type team) const {
453     // Warning: this test case intentionally violates permissible semantics.
454     // It is not valid to get references to members of the enclosing region
455     // inside a parallel_for and write to it.
456     Scalar value = 0;
457 
458     Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, nStart, nEnd),
459                          [&](int i) {
460                            value = i;  // This write is violating Kokkos
461                                        // semantics for nested parallelism.
462                          });
463 
464     Kokkos::single(
465         Kokkos::PerThread(team), [&](Scalar &val) { val = 1; }, value);
466 
467     Scalar value2 = 0;
468     Kokkos::parallel_reduce(
469         Kokkos::ThreadVectorRange(team, nStart, nEnd),
470         [&](int /*i*/, Scalar &val) { val += value; }, value2);
471 
472     if (value2 != (value * Scalar(nEnd - nStart))) {
473       KOKKOS_IMPL_DO_NOT_USE_PRINTF(
474           "FAILED vector_single broadcast %i %i %f %f\n", team.league_rank(),
475           team.team_rank(), (double)value2, (double)value);
476 
477       flag() = 1;
478     }
479   }
480 };
481 
482 template <typename Scalar, class ExecutionSpace>
483 struct functor_vec_for {
484   using policy_type     = Kokkos::TeamPolicy<ExecutionSpace>;
485   using execution_space = ExecutionSpace;
486 
487   Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag;
488 
functor_vec_forTestTeamVector::functor_vec_for489   functor_vec_for(Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_)
490       : flag(flag_) {}
491 
492   using shmem_space = typename ExecutionSpace::scratch_memory_space;
493   using shared_int =
494       Kokkos::View<Scalar *, shmem_space, Kokkos::MemoryUnmanaged>;
team_shmem_sizeTestTeamVector::functor_vec_for495   unsigned team_shmem_size(int team_size) const {
496     return shared_int::shmem_size(team_size * 13);
497   }
498 
499   KOKKOS_INLINE_FUNCTION
operator ()TestTeamVector::functor_vec_for500   void operator()(typename policy_type::member_type team) const {
501     shared_int values = shared_int(team.team_shmem(), team.team_size() * 13);
502 
503     if (values.data() == nullptr ||
504         values.extent(0) < (unsigned)team.team_size() * 13) {
505       KOKKOS_IMPL_DO_NOT_USE_PRINTF("FAILED to allocate memory of size %i\n",
506                                     static_cast<int>(team.team_size() * 13));
507       flag() = 1;
508     } else {
509       Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, 13), [&](int i) {
510         values(13 * team.team_rank() + i) =
511             i - team.team_rank() - team.league_rank() + team.league_size() +
512             team.team_size();
513       });
514 
515       Kokkos::single(Kokkos::PerThread(team), [&]() {
516         Scalar test  = 0;
517         Scalar value = 0;
518 
519         for (int i = 0; i < 13; ++i) {
520           test += i - team.team_rank() - team.league_rank() +
521                   team.league_size() + team.team_size();
522           value += values(13 * team.team_rank() + i);
523         }
524 
525         if (test != value) {
526           KOKKOS_IMPL_DO_NOT_USE_PRINTF("FAILED vector_par_for %i %i %f %f\n",
527                                         team.league_rank(), team.team_rank(),
528                                         static_cast<double>(test),
529                                         static_cast<double>(value));
530 
531           flag() = 1;
532         }
533       });
534     }
535   }
536 };
537 
538 template <typename Scalar, class ExecutionSpace>
539 struct functor_vec_red {
540   using policy_type     = Kokkos::TeamPolicy<ExecutionSpace>;
541   using execution_space = ExecutionSpace;
542 
543   Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag;
544 
functor_vec_redTestTeamVector::functor_vec_red545   functor_vec_red(Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_)
546       : flag(flag_) {}
547 
548   KOKKOS_INLINE_FUNCTION
operator ()TestTeamVector::functor_vec_red549   void operator()(typename policy_type::member_type team) const {
550     Scalar value = 0;
551 
552     // When no reducer is given the default is summation.
553     Kokkos::parallel_reduce(
554         Kokkos::ThreadVectorRange(team, 13),
555         [&](int i, Scalar &val) { val += i; }, value);
556 
557     Kokkos::single(Kokkos::PerThread(team), [&]() {
558       Scalar test = 0;
559 
560       for (int i = 0; i < 13; i++) test += i;
561 
562       if (test != value) {
563         KOKKOS_IMPL_DO_NOT_USE_PRINTF("FAILED vector_par_reduce %i %i %f %f\n",
564                                       team.league_rank(), team.team_rank(),
565                                       (double)test, (double)value);
566 
567         flag() = 1;
568       }
569     });
570   }
571 };
572 
573 template <typename Scalar, class ExecutionSpace>
574 struct functor_vec_red_reducer {
575   using policy_type     = Kokkos::TeamPolicy<ExecutionSpace>;
576   using execution_space = ExecutionSpace;
577 
578   Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag;
579 
functor_vec_red_reducerTestTeamVector::functor_vec_red_reducer580   functor_vec_red_reducer(
581       Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_)
582       : flag(flag_) {}
583 
584   KOKKOS_INLINE_FUNCTION
operator ()TestTeamVector::functor_vec_red_reducer585   void operator()(typename policy_type::member_type team) const {
586     // Must initialize to the identity value for the reduce operation
587     // for this test:
588     //   ( identity, operation ) = ( 1 , *= )
589     Scalar value = 1;
590 
591     Kokkos::parallel_reduce(
592         Kokkos::ThreadVectorRange(team, 13),
593         [&](int i, Scalar &val) { val *= (i % 5 + 1); },
594         Kokkos::Prod<Scalar>(value));
595 
596     Kokkos::single(Kokkos::PerThread(team), [&]() {
597       Scalar test = 1;
598 
599       for (int i = 0; i < 13; i++) test *= (i % 5 + 1);
600 
601       if (test != value) {
602         KOKKOS_IMPL_DO_NOT_USE_PRINTF(
603             "FAILED vector_par_reduce_reducer %i %i %f %f\n",
604             team.league_rank(), team.team_rank(), (double)test, (double)value);
605 
606         flag() = 1;
607       }
608     });
609   }
610 };
611 
612 template <typename Scalar, class ExecutionSpace>
613 struct functor_vec_scan {
614   using policy_type     = Kokkos::TeamPolicy<ExecutionSpace>;
615   using execution_space = ExecutionSpace;
616 
617   Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag;
functor_vec_scanTestTeamVector::functor_vec_scan618   functor_vec_scan(Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_)
619       : flag(flag_) {}
620 
621   KOKKOS_INLINE_FUNCTION
operator ()TestTeamVector::functor_vec_scan622   void operator()(typename policy_type::member_type team) const {
623     Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team, 13),
624                           [&](int i, Scalar &val, bool final) {
625                             val += i;
626 
627                             if (final) {
628                               Scalar test = 0;
629                               for (int k = 0; k <= i; k++) test += k;
630 
631                               if (test != val) {
632                                 KOKKOS_IMPL_DO_NOT_USE_PRINTF(
633                                     "FAILED vector_par_scan %i %i %f %f\n",
634                                     team.league_rank(), team.team_rank(),
635                                     (double)test, (double)val);
636 
637                                 flag() = 1;
638                               }
639                             }
640                           });
641   }
642 };
643 
644 template <typename Scalar, class ExecutionSpace>
645 struct functor_reduce {
646   using value_type      = double;
647   using policy_type     = Kokkos::TeamPolicy<ExecutionSpace>;
648   using execution_space = ExecutionSpace;
649 
650   Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag;
functor_reduceTestTeamVector::functor_reduce651   functor_reduce(Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_)
652       : flag(flag_) {}
653 
654   KOKKOS_INLINE_FUNCTION
operator ()TestTeamVector::functor_reduce655   void operator()(typename policy_type::member_type team, double &sum) const {
656     sum += team.league_rank() * 100 + team.thread_rank();
657   }
658 };
659 
660 template <typename Scalar, class ExecutionSpace>
test_scalar(int nteams,int team_size,int test)661 bool test_scalar(int nteams, int team_size, int test) {
662   Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> d_flag("flag");
663   typename Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace>::HostMirror
664       h_flag("h_flag");
665   h_flag() = 0;
666   Kokkos::deep_copy(d_flag, h_flag);
667 
668   if (test == 0) {
669     Kokkos::parallel_for(
670         std::string("A"),
671         Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size, 8),
672         functor_vec_red<Scalar, ExecutionSpace>(d_flag));
673   } else if (test == 1) {
674     Kokkos::parallel_for(
675         Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size, 8),
676         functor_vec_red_reducer<Scalar, ExecutionSpace>(d_flag));
677   } else if (test == 2) {
678     Kokkos::parallel_for(
679         Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size, 8),
680         functor_vec_scan<Scalar, ExecutionSpace>(d_flag));
681   } else if (test == 3) {
682     Kokkos::parallel_for(
683         Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size, 8),
684         functor_vec_for<Scalar, ExecutionSpace>(d_flag));
685   } else if (test == 4) {
686     Kokkos::parallel_for(
687         "B", Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size, 8),
688         functor_vec_single<Scalar, ExecutionSpace>(d_flag, 0, 13));
689   } else if (test == 5) {
690     Kokkos::parallel_for(Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size),
691                          functor_team_for<Scalar, ExecutionSpace>(d_flag));
692   } else if (test == 6) {
693     Kokkos::parallel_for(Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size),
694                          functor_team_reduce<Scalar, ExecutionSpace>(d_flag));
695   } else if (test == 7) {
696     Kokkos::parallel_for(
697         Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size),
698         functor_team_reduce_reducer<Scalar, ExecutionSpace>(d_flag));
699   } else if (test == 8) {
700     Kokkos::parallel_for(
701         Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size, 8),
702         functor_team_vector_for<Scalar, ExecutionSpace>(d_flag));
703   } else if (test == 9) {
704     Kokkos::parallel_for(
705         Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size, 8),
706         functor_team_vector_reduce<Scalar, ExecutionSpace>(d_flag));
707   } else if (test == 10) {
708     Kokkos::parallel_for(
709         Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size, 8),
710         functor_team_vector_reduce_reducer<Scalar, ExecutionSpace>(d_flag));
711   } else if (test == 11) {
712     Kokkos::parallel_for(
713         "B", Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size, 8),
714         functor_vec_single<Scalar, ExecutionSpace>(d_flag, 4, 13));
715   }
716 
717   Kokkos::deep_copy(h_flag, d_flag);
718 
719   return (h_flag() == 0);
720 }
721 
722 template <class ExecutionSpace>
Test(int test)723 bool Test(int test) {
724   bool passed = true;
725 
726   int team_size = 33;
727   if (team_size > int(ExecutionSpace::concurrency()))
728     team_size = int(ExecutionSpace::concurrency());
729   passed = passed && test_scalar<int, ExecutionSpace>(317, team_size, test);
730   passed = passed &&
731            test_scalar<long long int, ExecutionSpace>(317, team_size, test);
732   passed = passed && test_scalar<float, ExecutionSpace>(317, team_size, test);
733   passed = passed && test_scalar<double, ExecutionSpace>(317, team_size, test);
734   passed = passed &&
735            test_scalar<Test::my_complex, ExecutionSpace>(317, team_size, test);
736   passed = passed && test_scalar<Test::array_reduce<double, 1>, ExecutionSpace>(
737                          317, team_size, test);
738   passed = passed && test_scalar<Test::array_reduce<float, 1>, ExecutionSpace>(
739                          317, team_size, test);
740   passed = passed && test_scalar<Test::array_reduce<double, 3>, ExecutionSpace>(
741                          317, team_size, test);
742 
743   return passed;
744 }
745 
746 }  // namespace TestTeamVector
747 
748 namespace Test {
749 
750 // Computes y^T*A*x
751 // ( modified from kokkos-tutorials/GTC2016/Exercises/ThreeLevelPar )
752 
753 #if (!defined(KOKKOS_ENABLE_CUDA)) || defined(KOKKOS_ENABLE_CUDA_LAMBDA)
754 template <typename ScalarType, class DeviceType>
755 class TestTripleNestedReduce {
756  public:
757   using execution_space = DeviceType;
758   using size_type       = typename execution_space::size_type;
759 
TestTripleNestedReduce(const size_type & nrows,const size_type & ncols,const size_type & team_size,const size_type & vector_length)760   TestTripleNestedReduce(const size_type &nrows, const size_type &ncols,
761                          const size_type &team_size,
762                          const size_type &vector_length) {
763     run_test(nrows, ncols, team_size, vector_length);
764   }
765 
run_test(const size_type & nrows,const size_type & ncols,size_type team_size,const size_type & vector_length)766   void run_test(const size_type &nrows, const size_type &ncols,
767                 size_type team_size, const size_type &vector_length) {
768     if (team_size > size_type(DeviceType::execution_space::concurrency()))
769       team_size = size_type(DeviceType::execution_space::concurrency());
770 
771 #ifdef KOKKOS_ENABLE_HPX
772     team_size = 1;
773     if (!std::is_same<execution_space, Kokkos::Experimental::HPX>::value) {
774       team_size = 1;
775     }
776 #endif
777 
778     // using Layout = Kokkos::LayoutLeft;
779     using Layout = Kokkos::LayoutRight;
780 
781     using ViewVector = Kokkos::View<ScalarType *, DeviceType>;
782     using ViewMatrix = Kokkos::View<ScalarType **, Layout, DeviceType>;
783 
784     ViewVector y("y", nrows);
785     ViewVector x("x", ncols);
786     ViewMatrix A("A", nrows, ncols);
787 
788     using range_policy = Kokkos::RangePolicy<DeviceType>;
789 
790     // Initialize y vector.
791     Kokkos::parallel_for(
792         range_policy(0, nrows), KOKKOS_LAMBDA(const int i) { y(i) = 1; });
793 
794     // Initialize x vector.
795     Kokkos::parallel_for(
796         range_policy(0, ncols), KOKKOS_LAMBDA(const int i) { x(i) = 1; });
797     Kokkos::fence();
798 
799     using team_policy = Kokkos::TeamPolicy<DeviceType>;
800     using member_type = typename Kokkos::TeamPolicy<DeviceType>::member_type;
801 
802     // Initialize A matrix, note 2D indexing computation.
803     Kokkos::parallel_for(
804         team_policy(nrows, Kokkos::AUTO),
805         KOKKOS_LAMBDA(const member_type &teamMember) {
806           const int j = teamMember.league_rank();
807           Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, ncols),
808                                [&](const int i) { A(j, i) = 1; });
809         });
810     Kokkos::fence();
811 
812     // Three level parallelism kernel to force caching of vector x.
813     ScalarType result = 0.0;
814     int chunk_size    = 128;
815     Kokkos::parallel_reduce(
816         team_policy(nrows / chunk_size, team_size, vector_length),
817         KOKKOS_LAMBDA(const member_type &teamMember, double &update) {
818           const int row_start = teamMember.league_rank() * chunk_size;
819           const int row_end   = row_start + chunk_size;
820           Kokkos::parallel_for(
821               Kokkos::TeamThreadRange(teamMember, row_start, row_end),
822               [&](const int i) {
823                 ScalarType sum_i = 0.0;
824                 Kokkos::parallel_reduce(
825                     Kokkos::ThreadVectorRange(teamMember, ncols),
826                     [&](const int j, ScalarType &innerUpdate) {
827                       innerUpdate += A(i, j) * x(j);
828                     },
829                     sum_i);
830                 Kokkos::single(Kokkos::PerThread(teamMember),
831                                [&]() { update += y(i) * sum_i; });
832               });
833         },
834         result);
835     Kokkos::fence();
836 
837     const ScalarType solution = (ScalarType)nrows * (ScalarType)ncols;
838     if (int64_t(solution) != int64_t(result)) {
839       printf("  TestTripleNestedReduce failed solution(%" PRId64
840              ") != result(%" PRId64
841              "),"
842              " nrows(%" PRId32 ") ncols(%" PRId32 ") league_size(%" PRId32
843              ") team_size(%" PRId32 ")\n",
844              int64_t(solution), int64_t(result), int32_t(nrows), int32_t(ncols),
845              int32_t(nrows / chunk_size), int32_t(team_size));
846     }
847 
848     ASSERT_EQ(solution, result);
849   }
850 };
851 
852 #else  // #if ( ! defined( KOKKOS_ENABLE_CUDA ) ) || defined(
853        // KOKKOS_ENABLE_CUDA_LAMBDA )
854 
/// Placeholder variant of TestTripleNestedReduce selected by the `#else`
/// branch of the KOKKOS_ENABLE_CUDA / KOKKOS_ENABLE_CUDA_LAMBDA guard.
///
/// It keeps the same template parameters, member aliases and constructor
/// signature as the full implementation so callers compile unchanged, but
/// performs no work.
template <typename ScalarType, class DeviceType>
class TestTripleNestedReduce {
 public:
  using execution_space = DeviceType;
  using size_type       = typename execution_space::size_type;

  /// Accepts (nrows, ncols, team_size, vector_length) and ignores them.
  TestTripleNestedReduce(const size_type & /*nrows*/,
                         const size_type & /*ncols*/,
                         const size_type & /*team_size*/,
                         const size_type & /*vector_length*/) {}
};
864 
865 #endif
866 
867 namespace VectorScanReducer {
868 enum class ScanType : bool { Inclusive, Exclusive };
869 
870 template <typename ExecutionSpace, ScanType scan_type, int n,
871           int n_vector_range, class Reducer>
872 struct checkScan {
873   const int n_team_thread_range = 1000;
874   const int n_per_team          = n_team_thread_range * n_vector_range;
875 
876   using size_type  = typename ExecutionSpace::size_type;
877   using value_type = typename Reducer::value_type;
878   using view_type  = Kokkos::View<value_type[n], ExecutionSpace>;
879 
880   view_type inputs  = view_type{"inputs"};
881   view_type outputs = view_type{"outputs"};
882 
883   value_type result;
884   Reducer reducer = {result};
885 
886   struct ThreadVectorFunctor {
operator ()Test::VectorScanReducer::checkScan::ThreadVectorFunctor887     KOKKOS_FUNCTION void operator()(const size_type j, value_type &update,
888                                     const bool final) const {
889       const size_type element = j + m_team_offset + m_thread_offset;
890       const auto tmp          = m_inputs(element);
891       if (scan_type == ScanType::Inclusive) {
892         m_reducer.join(update, tmp);
893         if (final) {
894           m_outputs(element) = update;
895         }
896       } else {
897         if (final) {
898           m_outputs(element) = update;
899         }
900         m_reducer.join(update, tmp);
901       }
902     }
903 
904     const Reducer &m_reducer;
905     const size_type &m_team_offset;
906     const size_type &m_thread_offset;
907     const view_type &m_outputs;
908     const view_type &m_inputs;
909   };
910 
911   struct TeamThreadRangeFunctor {
operator ()Test::VectorScanReducer::checkScan::TeamThreadRangeFunctor912     KOKKOS_FUNCTION void operator()(const size_type i) const {
913       const size_type thread_offset = i * n_vector_range;
914       Kokkos::parallel_scan(
915           Kokkos::ThreadVectorRange(m_team, n_vector_range),
916           ThreadVectorFunctor{m_reducer, m_team_offset, thread_offset,
917                               m_outputs, m_inputs},
918           m_reducer);
919     }
920 
921     const typename Kokkos::TeamPolicy<ExecutionSpace>::member_type &m_team;
922     const Reducer &m_reducer;
923     const size_type &m_team_offset;
924     const view_type &m_outputs;
925     const view_type &m_inputs;
926   };
927 
operator ()Test::VectorScanReducer::checkScan928   KOKKOS_FUNCTION void operator()(
929       const typename Kokkos::TeamPolicy<ExecutionSpace>::member_type &team)
930       const {
931     const size_type iTeam       = team.league_rank();
932     const size_type iTeamOffset = iTeam * n_per_team;
933     Kokkos::parallel_for(
934         Kokkos::TeamThreadRange(team, n_team_thread_range),
935         TeamThreadRangeFunctor{team, reducer, iTeamOffset, outputs, inputs});
936   }
937 
operator ()Test::VectorScanReducer::checkScan938   KOKKOS_FUNCTION void operator()(size_type i) const { inputs(i) = i * 1. / n; }
939 
runTest::VectorScanReducer::checkScan940   void run() {
941     const int n_teams = n / n_per_team;
942 
943     Kokkos::parallel_for(Kokkos::RangePolicy<ExecutionSpace>(0, n), *this);
944 
945     // run ThreadVectorRange parallel_scan
946     Kokkos::TeamPolicy<ExecutionSpace> policy(n_teams, Kokkos::AUTO,
947                                               Kokkos::AUTO);
948     const std::string label =
949         (scan_type == ScanType::Inclusive ? std::string("inclusive")
950                                           : std::string("exclusive")) +
951         "Scan" + typeid(Reducer).name();
952     Kokkos::parallel_for(label, policy, *this);
953     Kokkos::fence();
954 
955     auto host_outputs =
956         Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, outputs);
957     auto host_inputs =
958         Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, inputs);
959 
960     Kokkos::View<value_type[n], Kokkos::HostSpace> expected("expected");
961     {
962       value_type identity;
963       reducer.init(identity);
964       for (int i = 0; i < expected.extent_int(0); ++i) {
965         const int vector       = i % n_vector_range;
966         const value_type accum = vector == 0 ? identity : expected(i - 1);
967         const value_type val =
968             scan_type == ScanType::Inclusive
969                 ? host_inputs(i)
970                 : (vector == 0 ? identity : host_inputs(i - 1));
971         expected(i) = accum;
972         reducer.join(expected(i), val);
973       }
974     }
975     for (int i = 0; i < host_outputs.extent_int(0); ++i)
976       ASSERT_EQ(host_outputs(i), expected(i));
977   }
978 };
979 }  // namespace VectorScanReducer
980 
981 #if !(defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) || defined(KOKKOS_ENABLE_HIP))
// Runs TestTeamVector::Test for every test-case index from 0 through 11.
// The assertion is fatal, so the first failing case ends the test; its index
// is included in the failure message.
TEST(TEST_CATEGORY, team_vector) {
  constexpr int num_test_cases = 12;
  for (int test_case = 0; test_case < num_test_cases; ++test_case) {
    ASSERT_TRUE((TestTeamVector::Test<TEST_EXECSPACE>(test_case)))
        << "TestTeamVector::Test failed for test case " << test_case;
  }
}
996 #endif
997 
998 #if !defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
// Runs the triple-nested reduction test for several team-size /
// vector-length combinations on an 8192 x 2048 problem.
TEST(TEST_CATEGORY, triple_nested_parallelism) {
  using NestedReduce = TestTripleNestedReduce<double, TEST_EXECSPACE>;

// With KOKKOS_ENABLE_DEBUG enabled, the functor uses too many registers to run
// with a team size of 32 on GPUs, 16 is the max possible (at least on a K80
// GPU) See https://github.com/kokkos/kokkos/issues/1513
#if defined(KOKKOS_ENABLE_DEBUG) && defined(KOKKOS_ENABLE_CUDA)
  const bool run_team_size_32 =
      !std::is_same<TEST_EXECSPACE, Kokkos::Cuda>::value;
#else
  const bool run_team_size_32 = true;
#endif

  if (run_team_size_32) {
    NestedReduce(8192, 2048, 32, 32);
    NestedReduce(8192, 2048, 32, 16);
  }

  NestedReduce(8192, 2048, 16, 16);
  NestedReduce(8192, 2048, 16, 33);
  NestedReduce(8192, 2048, 16, 19);
  NestedReduce(8192, 2048, 7, 16);
}
1016 #endif
1017 
// Exercises checkScan with the Prod, Max and Min reducers, running the
// exclusive variant followed by the inclusive variant for each of them.
TEST(TEST_CATEGORY, parallel_scan_with_reducers) {
  using namespace VectorScanReducer;

  using Scalar      = double;
  using ProdReducer = Kokkos::Prod<Scalar, TEST_EXECSPACE>;
  using MaxReducer  = Kokkos::Max<Scalar, TEST_EXECSPACE>;
  using MinReducer  = Kokkos::Min<Scalar, TEST_EXECSPACE>;

  static constexpr int num_elements = 1000000;
  static constexpr int vector_range = 100;

  // Product reducer.
  checkScan<TEST_EXECSPACE, ScanType::Exclusive, num_elements, vector_range,
            ProdReducer>()
      .run();
  checkScan<TEST_EXECSPACE, ScanType::Inclusive, num_elements, vector_range,
            ProdReducer>()
      .run();

  // Maximum reducer.
  checkScan<TEST_EXECSPACE, ScanType::Exclusive, num_elements, vector_range,
            MaxReducer>()
      .run();
  checkScan<TEST_EXECSPACE, ScanType::Inclusive, num_elements, vector_range,
            MaxReducer>()
      .run();

  // Minimum reducer.
  checkScan<TEST_EXECSPACE, ScanType::Exclusive, num_elements, vector_range,
            MinReducer>()
      .run();
  checkScan<TEST_EXECSPACE, ScanType::Inclusive, num_elements, vector_range,
            MinReducer>()
      .run();
}
1046 
1047 }  // namespace Test
1048