#include <cstdio>
#include <cstdlib>
#include <stdexcept>
#include <string>

#include <Kokkos_Core.hpp>
#include <gtest/gtest.h>
#include <PerfTest_Category.hpp>
4 
5 namespace Test {
6 
7 namespace {
8 template <class ExecSpace>
9 struct SpaceInstance {
createTest::__anon062167350111::SpaceInstance10   static ExecSpace create() { return ExecSpace(); }
destroyTest::__anon062167350111::SpaceInstance11   static void destroy(ExecSpace&) {}
overlapTest::__anon062167350111::SpaceInstance12   static bool overlap() { return false; }
13 };
14 
15 #ifndef KOKKOS_ENABLE_DEBUG
16 #ifdef KOKKOS_ENABLE_CUDA
17 template <>
18 struct SpaceInstance<Kokkos::Cuda> {
createTest::__anon062167350111::SpaceInstance19   static Kokkos::Cuda create() {
20     cudaStream_t stream;
21     cudaStreamCreate(&stream);
22     return Kokkos::Cuda(stream);
23   }
destroyTest::__anon062167350111::SpaceInstance24   static void destroy(Kokkos::Cuda& space) {
25     cudaStream_t stream = space.cuda_stream();
26     cudaStreamDestroy(stream);
27   }
overlapTest::__anon062167350111::SpaceInstance28   static bool overlap() {
29     bool value          = true;
30     auto local_rank_str = std::getenv("CUDA_LAUNCH_BLOCKING");
31     if (local_rank_str) {
32       value = (std::stoi(local_rank_str) == 0);
33     }
34     return value;
35   }
36 };
37 #endif
38 #endif
39 }  // namespace
40 
41 struct FunctorRange {
42   int M, R;
43   Kokkos::View<double**, TEST_EXECSPACE> a;
FunctorRangeTest::FunctorRange44   FunctorRange(int M_, int R_, Kokkos::View<double**, TEST_EXECSPACE> a_)
45       : M(M_), R(R_), a(a_) {}
46   KOKKOS_INLINE_FUNCTION
operator ()Test::FunctorRange47   void operator()(const int i) const {
48     for (int r = 0; r < R; r++)
49       for (int j = 0; j < M; j++) {
50         a(i, j) += 1.0;
51       }
52   }
53 };
54 
55 struct FunctorMDRange {
56   int M, R;
57   Kokkos::View<double**, TEST_EXECSPACE> a;
FunctorMDRangeTest::FunctorMDRange58   FunctorMDRange(int M_, int R_, Kokkos::View<double**, TEST_EXECSPACE> a_)
59       : M(M_), R(R_), a(a_) {}
60   KOKKOS_INLINE_FUNCTION
operator ()Test::FunctorMDRange61   void operator()(const int i, const int) const {
62     for (int j = 0; j < M; j++) a(i, j) += 1.0;
63   }
64 };
65 
66 struct FunctorTeam {
67   int M, R;
68   Kokkos::View<double**, Kokkos::LayoutRight, TEST_EXECSPACE> a;
FunctorTeamTest::FunctorTeam69   FunctorTeam(int M_, int R_,
70               Kokkos::View<double**, Kokkos::LayoutRight, TEST_EXECSPACE> a_)
71       : M(M_), R(R_), a(a_) {}
72   KOKKOS_INLINE_FUNCTION
operator ()Test::FunctorTeam73   void operator()(
74       const Kokkos::TeamPolicy<TEST_EXECSPACE>::member_type& team) const {
75     int i = team.league_rank();
76     for (int r = 0; r < R; r++) {
77       Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M),
78                            [&](const int j) { a(i, j) += 1.0; });
79     }
80   }
81 };
82 
83 struct FunctorRangeReduce {
84   int M, R;
85   Kokkos::View<double**, TEST_EXECSPACE> a;
FunctorRangeReduceTest::FunctorRangeReduce86   FunctorRangeReduce(int M_, int R_, Kokkos::View<double**, TEST_EXECSPACE> a_)
87       : M(M_), R(R_), a(a_) {}
88   KOKKOS_INLINE_FUNCTION
operator ()Test::FunctorRangeReduce89   void operator()(const int i, double& tmp) const {
90     for (int r = 0; r < R; r++)
91       for (int j = 0; j < M; j++) {
92         tmp += a(i, j);
93       }
94   }
95 };
96 
97 struct FunctorMDRangeReduce {
98   int M, R;
99   Kokkos::View<double**, TEST_EXECSPACE> a;
FunctorMDRangeReduceTest::FunctorMDRangeReduce100   FunctorMDRangeReduce(int M_, int R_,
101                        Kokkos::View<double**, TEST_EXECSPACE> a_)
102       : M(M_), R(R_), a(a_) {}
103   KOKKOS_INLINE_FUNCTION
operator ()Test::FunctorMDRangeReduce104   void operator()(const int i, const int, double& tmp) const {
105     for (int j = 0; j < M; j++) tmp += a(i, j);
106   }
107 };
108 
109 struct FunctorTeamReduce {
110   int M, R;
111   Kokkos::View<double**, Kokkos::LayoutRight, TEST_EXECSPACE> a;
FunctorTeamReduceTest::FunctorTeamReduce112   FunctorTeamReduce(
113       int M_, int R_,
114       Kokkos::View<double**, Kokkos::LayoutRight, TEST_EXECSPACE> a_)
115       : M(M_), R(R_), a(a_) {}
116   KOKKOS_INLINE_FUNCTION
operator ()Test::FunctorTeamReduce117   void operator()(const Kokkos::TeamPolicy<TEST_EXECSPACE>::member_type& team,
118                   double& tmp) const {
119     int i = team.league_rank();
120     for (int r = 0; r < R; r++) {
121       double val;
122       Kokkos::parallel_reduce(
123           Kokkos::TeamThreadRange(team, M),
124           [&](const int j, double& tmp2) { tmp2 += a(i, j); }, val);
125       tmp += val;
126     }
127   }
128 };
129 
TEST(default_exec,overlap_range_policy)130 TEST(default_exec, overlap_range_policy) {
131   int N = 2000;
132   int M = 10000;
133   int R = 10;
134 
135   TEST_EXECSPACE space;
136   TEST_EXECSPACE space1 = SpaceInstance<TEST_EXECSPACE>::create();
137   TEST_EXECSPACE space2 = SpaceInstance<TEST_EXECSPACE>::create();
138 
139   Kokkos::View<double**, TEST_EXECSPACE> a("A", N, M);
140   FunctorRange f(M, R, a);
141   FunctorRangeReduce fr(M, R, a);
142   Kokkos::parallel_for("default_exec::overlap_range_policy::kernel0",
143                        Kokkos::RangePolicy<TEST_EXECSPACE>(0, N),
144                        FunctorRange(M, R, a));
145 
146   Kokkos::parallel_for(
147       "default_exec::overlap_range_policy::kernel1",
148       Kokkos::Experimental::require(
149           Kokkos::RangePolicy<TEST_EXECSPACE>(space1, 0, N),
150           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
151       f);
152   Kokkos::parallel_for(
153       "default_exec::overlap_range_policy::kernel2",
154       Kokkos::Experimental::require(
155           Kokkos::RangePolicy<TEST_EXECSPACE>(space2, 0, N),
156           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
157       f);
158   Kokkos::fence();
159 
160   Kokkos::Timer timer;
161   Kokkos::parallel_for(
162       "default_exec::overlap_range_policy::kernel3",
163       Kokkos::Experimental::require(
164           Kokkos::RangePolicy<TEST_EXECSPACE>(space, 0, N),
165           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
166       f);
167   Kokkos::parallel_for(
168       "default_exec::overlap_range_policy::kernel4",
169       Kokkos::Experimental::require(
170           Kokkos::RangePolicy<TEST_EXECSPACE>(space, 0, N),
171           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
172       f);
173   Kokkos::fence();
174 
175   timer.reset();
176   Kokkos::parallel_for(
177       "default_exec::overlap_range_policy::kernel5",
178       Kokkos::Experimental::require(
179           Kokkos::RangePolicy<TEST_EXECSPACE>(space1, 0, N),
180           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
181       FunctorRange(M, R, a));
182   Kokkos::parallel_for(
183       "default_exec::overlap_range_policy::kernel6",
184       Kokkos::Experimental::require(
185           Kokkos::RangePolicy<TEST_EXECSPACE>(space2, 0, N),
186           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
187       FunctorRange(M, R, a));
188   Kokkos::fence();
189   double time_overlap = timer.seconds();
190 
191   timer.reset();
192   Kokkos::parallel_for(
193       "default_exec::overlap_range_policy::kernel7",
194       Kokkos::Experimental::require(
195           Kokkos::RangePolicy<TEST_EXECSPACE>(space, 0, N),
196           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
197       f);
198   Kokkos::parallel_for(
199       "default_exec::overlap_range_policy::kernel8",
200       Kokkos::Experimental::require(
201           Kokkos::RangePolicy<TEST_EXECSPACE>(space, 0, N),
202           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
203       f);
204   Kokkos::fence();
205   double time_end = timer.seconds();
206 
207   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
208     ASSERT_TRUE((time_end > 1.5 * time_overlap));
209   }
210   printf("Time RangePolicy: NonOverlap: %lf Time Overlap: %lf\n", time_end,
211          time_overlap);
212 
213   Kokkos::View<double, TEST_EXECSPACE> result("result");
214   Kokkos::View<double, TEST_EXECSPACE> result1("result1");
215   Kokkos::View<double, TEST_EXECSPACE> result2("result2");
216   Kokkos::View<double, Kokkos::HostSpace> h_result("h_result");
217   Kokkos::View<double, Kokkos::HostSpace> h_result1("h_result1");
218   Kokkos::View<double, Kokkos::HostSpace> h_result2("h_result2");
219 
220   timer.reset();
221   Kokkos::parallel_reduce(
222       "default_exec::overlap_range_policy::kernel_reduce",
223       Kokkos::Experimental::require(
224           Kokkos::RangePolicy<TEST_EXECSPACE>(space, 0, N),
225           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
226       fr, result);
227   Kokkos::fence();
228   double time_fenced = timer.seconds();
229   Kokkos::deep_copy(h_result, result);
230 
231   timer.reset();
232   Kokkos::parallel_reduce(
233       "default_exec::overlap_range_policy::kernel_reduce",
234       Kokkos::Experimental::require(
235           Kokkos::RangePolicy<TEST_EXECSPACE>(space, 0, N),
236           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
237       fr, result);
238   double time_not_fenced = timer.seconds();
239   Kokkos::fence();
240   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
241     ASSERT_TRUE(time_fenced > 2.0 * time_not_fenced);
242   }
243 
244   timer.reset();
245   Kokkos::parallel_reduce(
246       "default_exec::overlap_range_policy::kernel_reduce",
247       Kokkos::Experimental::require(
248           Kokkos::RangePolicy<TEST_EXECSPACE>(space, 0, N),
249           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
250       fr, result);
251   Kokkos::parallel_reduce(
252       "default_exec::overlap_range_policy::kernel_reduce",
253       Kokkos::Experimental::require(
254           Kokkos::RangePolicy<TEST_EXECSPACE>(space, 0, N),
255           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
256       fr, result);
257   Kokkos::fence();
258   double time_no_overlapped_reduce = timer.seconds();
259 
260   timer.reset();
261   Kokkos::parallel_reduce(
262       "default_exec::overlap_range_policy::kernel_reduce",
263       Kokkos::Experimental::require(
264           Kokkos::RangePolicy<TEST_EXECSPACE>(space1, 0, N),
265           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
266       fr, result1);
267   Kokkos::parallel_reduce(
268       "default_exec::overlap_range_policy::kernel_reduce",
269       Kokkos::Experimental::require(
270           Kokkos::RangePolicy<TEST_EXECSPACE>(space2, 0, N),
271           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
272       fr, result2);
273   Kokkos::fence();
274   double time_overlapped_reduce = timer.seconds();
275 
276   Kokkos::deep_copy(h_result2, result2);
277   Kokkos::deep_copy(h_result1, result1);
278 
279   ASSERT_EQ(h_result1(), h_result());
280   ASSERT_EQ(h_result2(), h_result());
281 
282   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
283     ASSERT_TRUE(time_overlapped_reduce < 1.5 * time_no_overlapped_reduce);
284   }
285   printf("Time RangePolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n",
286          time_no_overlapped_reduce, time_overlapped_reduce);
287   SpaceInstance<TEST_EXECSPACE>::destroy(space1);
288   SpaceInstance<TEST_EXECSPACE>::destroy(space2);
289 }
290 
TEST(default_exec,overlap_mdrange_policy)291 TEST(default_exec, overlap_mdrange_policy) {
292   int N = 200;
293   int M = 10000;
294   int R = 10;
295 
296   TEST_EXECSPACE space;
297   TEST_EXECSPACE space1 = SpaceInstance<TEST_EXECSPACE>::create();
298   TEST_EXECSPACE space2 = SpaceInstance<TEST_EXECSPACE>::create();
299 
300   Kokkos::View<double**, TEST_EXECSPACE> a("A", N, M);
301   FunctorMDRange f(M, R, a);
302   FunctorMDRangeReduce fr(M, R, a);
303   Kokkos::parallel_for(
304       "default_exec::overlap_range_policy::kernel0",
305       Kokkos::Experimental::require(
306           Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>({0, 0},
307                                                                  {N, R}),
308           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
309       FunctorMDRange(M, R, a));
310 
311   Kokkos::parallel_for(
312       "default_exec::overlap_range_policy::kernel1",
313       Kokkos::Experimental::require(
314           Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space1, {0, 0},
315                                                                  {N, R}),
316           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
317       f);
318   Kokkos::parallel_for(
319       "default_exec::overlap_range_policy::kernel2",
320       Kokkos::Experimental::require(
321           Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space2, {0, 0},
322                                                                  {N, R}),
323           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
324       f);
325   Kokkos::fence();
326 
327   Kokkos::Timer timer;
328   Kokkos::parallel_for(
329       "default_exec::overlap_range_policy::kernel3",
330       Kokkos::Experimental::require(
331           Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space, {0, 0},
332                                                                  {N, R}),
333           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
334       f);
335   Kokkos::parallel_for(
336       "default_exec::overlap_range_policy::kernel4",
337       Kokkos::Experimental::require(
338           Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space, {0, 0},
339                                                                  {N, R}),
340           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
341       f);
342   Kokkos::fence();
343 
344   timer.reset();
345   Kokkos::parallel_for(
346       "default_exec::overlap_range_policy::kernel5",
347       Kokkos::Experimental::require(
348           Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space1, {0, 0},
349                                                                  {N, R}),
350           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
351       FunctorMDRange(M, R, a));
352   Kokkos::parallel_for(
353       "default_exec::overlap_range_policy::kernel6",
354       Kokkos::Experimental::require(
355           Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space2, {0, 0},
356                                                                  {N, R}),
357           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
358       FunctorMDRange(M, R, a));
359   Kokkos::fence();
360   double time_overlap = timer.seconds();
361 
362   timer.reset();
363   Kokkos::parallel_for(
364       "default_exec::overlap_range_policy::kernel7",
365       Kokkos::Experimental::require(
366           Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space, {0, 0},
367                                                                  {N, R}),
368           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
369       f);
370   Kokkos::parallel_for(
371       "default_exec::overlap_range_policy::kernel8",
372       Kokkos::Experimental::require(
373           Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space, {0, 0},
374                                                                  {N, R}),
375           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
376       f);
377   Kokkos::fence();
378   double time_end = timer.seconds();
379 
380   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
381     ASSERT_TRUE((time_end > 1.5 * time_overlap));
382   }
383   printf("Time MDRangePolicy: NonOverlap: %lf Time Overlap: %lf\n", time_end,
384          time_overlap);
385 
386   Kokkos::View<double, TEST_EXECSPACE> result("result");
387   Kokkos::View<double, TEST_EXECSPACE> result1("result1");
388   Kokkos::View<double, TEST_EXECSPACE> result2("result2");
389   Kokkos::View<double, Kokkos::HostSpace> h_result("h_result");
390   Kokkos::View<double, Kokkos::HostSpace> h_result1("h_result1");
391   Kokkos::View<double, Kokkos::HostSpace> h_result2("h_result2");
392 
393   timer.reset();
394   Kokkos::parallel_reduce(
395       "default_exec::overlap_mdrange_policy::kernel_reduce",
396       Kokkos::Experimental::require(
397           Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space, {0, 0},
398                                                                  {N, R}),
399           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
400       fr, result);
401   Kokkos::fence();
402   double time_fenced = timer.seconds();
403   Kokkos::deep_copy(h_result, result);
404 
405   timer.reset();
406   Kokkos::parallel_reduce(
407       "default_exec::overlap_mdrange_policy::kernel_reduce",
408       Kokkos::Experimental::require(
409           Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space, {0, 0},
410                                                                  {N, R}),
411           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
412       fr, result);
413   double time_not_fenced = timer.seconds();
414   Kokkos::fence();
415   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
416     ASSERT_TRUE(time_fenced > 2.0 * time_not_fenced);
417   }
418 
419   timer.reset();
420   Kokkos::parallel_reduce(
421       "default_exec::overlap_mdrange_policy::kernel_reduce",
422       Kokkos::Experimental::require(
423           Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space, {0, 0},
424                                                                  {N, R}),
425           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
426       fr, result);
427   Kokkos::parallel_reduce(
428       "default_exec::overlap_mdrange_policy::kernel_reduce",
429       Kokkos::Experimental::require(
430           Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space, {0, 0},
431                                                                  {N, R}),
432           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
433       fr, result);
434   Kokkos::fence();
435   double time_no_overlapped_reduce = timer.seconds();
436 
437   timer.reset();
438   Kokkos::parallel_reduce(
439       "default_exec::overlap_mdrange_policy::kernel_reduce",
440       Kokkos::Experimental::require(
441           Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space1, {0, 0},
442                                                                  {N, R}),
443           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
444       fr, result1);
445   Kokkos::parallel_reduce(
446       "default_exec::overlap_mdrange_policy::kernel_reduce",
447       Kokkos::Experimental::require(
448           Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space2, {0, 0},
449                                                                  {N, R}),
450           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
451       fr, result2);
452   Kokkos::fence();
453   double time_overlapped_reduce = timer.seconds();
454 
455   Kokkos::deep_copy(h_result2, result2);
456   Kokkos::deep_copy(h_result1, result1);
457 
458   ASSERT_EQ(h_result1(), h_result());
459   ASSERT_EQ(h_result2(), h_result());
460 
461   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
462     ASSERT_TRUE(time_overlapped_reduce < 1.5 * time_no_overlapped_reduce);
463   }
464   printf("Time MDRangePolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n",
465          time_no_overlapped_reduce, time_overlapped_reduce);
466   SpaceInstance<TEST_EXECSPACE>::destroy(space2);
467   SpaceInstance<TEST_EXECSPACE>::destroy(space1);
468 }
469 
TEST(default_exec,overlap_team_policy)470 TEST(default_exec, overlap_team_policy) {
471   int N = 20;
472   int M = 1000000;
473   int R = 10;
474 
475   TEST_EXECSPACE space;
476   TEST_EXECSPACE space1 = SpaceInstance<TEST_EXECSPACE>::create();
477   TEST_EXECSPACE space2 = SpaceInstance<TEST_EXECSPACE>::create();
478 
479   Kokkos::View<double**, Kokkos::LayoutRight, TEST_EXECSPACE> a("A", N, M);
480   FunctorTeam f(M, R, a);
481   FunctorTeamReduce fr(M, R, a);
482   Kokkos::parallel_for(
483       "default_exec::overlap_range_policy::kernel0",
484       Kokkos::Experimental::require(
485           Kokkos::TeamPolicy<TEST_EXECSPACE>(N, Kokkos::AUTO),
486           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
487       FunctorTeam(M, R, a));
488 
489   Kokkos::parallel_for(
490       "default_exec::overlap_range_policy::kernel1",
491       Kokkos::Experimental::require(
492           Kokkos::TeamPolicy<TEST_EXECSPACE>(space1, N, Kokkos::AUTO),
493           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
494       f);
495   Kokkos::parallel_for(
496       "default_exec::overlap_range_policy::kernel2",
497       Kokkos::Experimental::require(
498           Kokkos::TeamPolicy<TEST_EXECSPACE>(space2, N, Kokkos::AUTO),
499           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
500       f);
501   Kokkos::fence();
502 
503   Kokkos::Timer timer;
504   Kokkos::parallel_for(
505       "default_exec::overlap_range_policy::kernel3",
506       Kokkos::Experimental::require(
507           Kokkos::TeamPolicy<TEST_EXECSPACE>(space, N, Kokkos::AUTO),
508           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
509       f);
510   Kokkos::parallel_for(
511       "default_exec::overlap_range_policy::kernel4",
512       Kokkos::Experimental::require(
513           Kokkos::TeamPolicy<TEST_EXECSPACE>(space, N, Kokkos::AUTO),
514           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
515       f);
516   Kokkos::fence();
517 
518   timer.reset();
519   Kokkos::parallel_for(
520       "default_exec::overlap_range_policy::kernel5",
521       Kokkos::Experimental::require(
522           Kokkos::TeamPolicy<TEST_EXECSPACE>(space1, N, Kokkos::AUTO),
523           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
524       FunctorTeam(M, R, a));
525   Kokkos::parallel_for(
526       "default_exec::overlap_range_policy::kernel6",
527       Kokkos::Experimental::require(
528           Kokkos::TeamPolicy<TEST_EXECSPACE>(space2, N, Kokkos::AUTO),
529           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
530       FunctorTeam(M, R, a));
531   Kokkos::fence();
532   double time_overlap = timer.seconds();
533 
534   timer.reset();
535   Kokkos::parallel_for(
536       "default_exec::overlap_range_policy::kernel7",
537       Kokkos::Experimental::require(
538           Kokkos::TeamPolicy<TEST_EXECSPACE>(space, N, Kokkos::AUTO),
539           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
540       f);
541   Kokkos::parallel_for(
542       "default_exec::overlap_range_policy::kernel8",
543       Kokkos::Experimental::require(
544           Kokkos::TeamPolicy<TEST_EXECSPACE>(space, N, Kokkos::AUTO),
545           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
546       f);
547   Kokkos::fence();
548   double time_end = timer.seconds();
549 
550   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
551     ASSERT_TRUE((time_end > 1.5 * time_overlap));
552   }
553   printf("Time TeamPolicy: NonOverlap: %lf Time Overlap: %lf\n", time_end,
554          time_overlap);
555 
556   Kokkos::View<double, TEST_EXECSPACE> result("result");
557   Kokkos::View<double, TEST_EXECSPACE> result1("result1");
558   Kokkos::View<double, TEST_EXECSPACE> result2("result2");
559   Kokkos::View<double, Kokkos::HostSpace> h_result("h_result");
560   Kokkos::View<double, Kokkos::HostSpace> h_result1("h_result1");
561   Kokkos::View<double, Kokkos::HostSpace> h_result2("h_result2");
562 
563   timer.reset();
564   Kokkos::parallel_reduce(
565       "default_exec::overlap_team_policy::kernel_reduce",
566       Kokkos::Experimental::require(
567           Kokkos::TeamPolicy<TEST_EXECSPACE>(space, N, Kokkos::AUTO),
568           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
569       fr, result);
570   Kokkos::fence();
571   double time_fenced = timer.seconds();
572   Kokkos::deep_copy(h_result, result);
573 
574   timer.reset();
575   Kokkos::parallel_reduce(
576       "default_exec::overlap_team_policy::kernel_reduce",
577       Kokkos::Experimental::require(
578           Kokkos::TeamPolicy<TEST_EXECSPACE>(space, N, Kokkos::AUTO),
579           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
580       fr, result);
581   double time_not_fenced = timer.seconds();
582   Kokkos::fence();
583   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
584     ASSERT_TRUE(time_fenced > 2.0 * time_not_fenced);
585   }
586   timer.reset();
587   Kokkos::parallel_reduce(
588       "default_exec::overlap_team_policy::kernel_reduce",
589       Kokkos::Experimental::require(
590           Kokkos::TeamPolicy<TEST_EXECSPACE>(space, N, Kokkos::AUTO),
591           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
592       fr, result);
593   Kokkos::parallel_reduce(
594       "default_exec::overlap_team_policy::kernel_reduce",
595       Kokkos::Experimental::require(
596           Kokkos::TeamPolicy<TEST_EXECSPACE>(space, N, Kokkos::AUTO),
597           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
598       fr, result);
599   Kokkos::fence();
600   double time_no_overlapped_reduce = timer.seconds();
601 
602   timer.reset();
603   Kokkos::parallel_reduce(
604       "default_exec::overlap_team_policy::kernel_reduce",
605       Kokkos::Experimental::require(
606           Kokkos::TeamPolicy<TEST_EXECSPACE>(space1, N, Kokkos::AUTO),
607           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
608       fr, result1);
609   Kokkos::parallel_reduce(
610       "default_exec::overlap_team_policy::kernel_reduce",
611       Kokkos::Experimental::require(
612           Kokkos::TeamPolicy<TEST_EXECSPACE>(space2, N, Kokkos::AUTO),
613           Kokkos::Experimental::WorkItemProperty::HintLightWeight),
614       fr, result2);
615   Kokkos::fence();
616   double time_overlapped_reduce = timer.seconds();
617 
618   Kokkos::deep_copy(h_result2, result2);
619   Kokkos::deep_copy(h_result1, result1);
620 
621   ASSERT_EQ(h_result1(), h_result());
622   ASSERT_EQ(h_result2(), h_result());
623 
624   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
625     ASSERT_TRUE(time_overlapped_reduce < 1.5 * time_no_overlapped_reduce);
626   }
627   printf("Time TeamPolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n",
628          time_no_overlapped_reduce, time_overlapped_reduce);
629   SpaceInstance<TEST_EXECSPACE>::destroy(space1);
630   SpaceInstance<TEST_EXECSPACE>::destroy(space2);
631 }
632 }  // namespace Test
633