/*
//@HEADER
// ************************************************************************
//
//                        Kokkos v. 3.0
//       Copyright (2020) National Technology & Engineering
//               Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_IMPL_CUDA_TASK_HPP
#define KOKKOS_IMPL_CUDA_TASK_HPP

#include <Kokkos_Macros.hpp>
#if defined(KOKKOS_ENABLE_TASKDAG)

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

#include <Kokkos_Core_fwd.hpp>

#include <impl/Kokkos_TaskBase.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp>  // CUDA_SAFE_CALL
#include <impl/Kokkos_TaskTeamMember.hpp>

//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {
namespace {

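// The address of a __device__ function can only be taken from device code, so
// this single-thread kernel records each task type's apply/destroy entry
// points into device-accessible storage (see get_function_pointer below).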
template <typename TaskType>
__global__ void set_cuda_task_base_apply_function_pointer(
    typename TaskType::function_type* ptr,
    typename TaskType::destroy_type* dtor) {
  *ptr  = TaskType::apply;
  *dtor = TaskType::destroy;
}

template <typename Scheduler>
__global__ void cuda_task_queue_execute(Scheduler scheduler,
                                        int32_t shmem_size) {
  TaskQueueSpecialization<Scheduler>::driver(std::move(scheduler), shmem_size);
}

}  // namespace

template <class, class>
class TaskExec;

template <class QueueType>
class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
 public:
  using scheduler_type  = SimpleTaskScheduler<Kokkos::Cuda, QueueType>;
  using execution_space = Kokkos::Cuda;
  using memory_space    = Kokkos::CudaUVMSpace;
  using member_type     = TaskExec<Kokkos::Cuda, scheduler_type>;

  enum : long { max_league_size = 16 };
  enum : int { warps_per_block = 4 };

  KOKKOS_INLINE_FUNCTION
  static void iff_single_thread_recursive_execute(scheduler_type const&) {}

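  // One task team is mapped to one warp, so the maximum number of concurrent
  // teams matches the warps_per_block warps launched on each multiprocessor
  // by execute() below.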
  static int get_max_team_count(execution_space const&) {
    return Kokkos::Impl::cuda_internal_multiprocessor_count() * warps_per_block;
  }

  __device__ static void driver(scheduler_type scheduler,
                                int32_t shmem_per_warp) {
    using queue_type     = typename scheduler_type::task_queue_type;
    using task_base_type = typename scheduler_type::task_base_type;
    using runnable_task_base_type =
        typename scheduler_type::runnable_task_base_type;
    using scheduling_info_storage_type = SchedulingInfoStorage<
        runnable_task_base_type,
        typename scheduler_type::task_scheduling_info_type>;

    extern __shared__ int32_t shmem_all[];

    int32_t* const warp_shmem =
        shmem_all + (threadIdx.z * shmem_per_warp) / sizeof(int32_t);

    task_base_type* const shared_memory_task_copy = (task_base_type*)warp_shmem;

    const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x;

    member_type single_exec(scheduler, warp_shmem, 1);
    member_type team_exec(scheduler, warp_shmem, blockDim.y);

    auto& queue          = scheduler.queue();
    auto& team_scheduler = team_exec.scheduler();

    auto current_task = OptionalRef<task_base_type>();

    // Loop until all queues are empty and no tasks in flight
    while (!queue.is_done()) {
      if (warp_lane == 0) {  // should be (?) same as team_exec.team_rank() == 0
        // pop off a task
        current_task =
            queue.pop_ready_task(team_scheduler.team_scheduler_info());
      }

      // Broadcast task pointer:

      // Sync before the broadcast
      KOKKOS_IMPL_CUDA_SYNCWARP;

      // pretend it's an int* for shuffle purposes
      ((int*)&current_task)[0] =
          KOKKOS_IMPL_CUDA_SHFL(((int*)&current_task)[0], 0, 32);
      ((int*)&current_task)[1] =
          KOKKOS_IMPL_CUDA_SHFL(((int*)&current_task)[1], 0, 32);

      if (current_task) {
        KOKKOS_ASSERT(!current_task->as_runnable_task().get_respawn_flag());

        int32_t b = sizeof(scheduling_info_storage_type) / sizeof(int32_t);
        static_assert(
            sizeof(scheduling_info_storage_type) % sizeof(int32_t) == 0,
            "bad task size");
        int32_t const e = current_task->get_allocation_size() / sizeof(int32_t);
        KOKKOS_ASSERT(current_task->get_allocation_size() % sizeof(int32_t) ==
                      0);

        int32_t volatile* const task_mem =
            (int32_t volatile*)current_task.get();

        // do a coordinated copy of the task closure from global to shared
        // memory:
        for (int32_t i = warp_lane; i < e; i += CudaTraits::WarpSize) {
          warp_shmem[i] = task_mem[i];
        }

        // Synchronize threads of the warp and ensure memory
        // writes are visible to all threads in the warp.
        KOKKOS_IMPL_CUDA_SYNCWARP;

        if (shared_memory_task_copy->is_team_runnable()) {
          // Thread Team Task
          shared_memory_task_copy->as_runnable_task().run(team_exec);
        } else if (threadIdx.y == 0) {
          // TODO @tasking @optimization DSH Change this to warp_lane == 0 when
          // we allow blockDim.x to be more than 1
          // Single Thread Task
          shared_memory_task_copy->as_runnable_task().run(single_exec);
        }

        // Synchronize threads of the warp and ensure memory
        // writes are visible to all threads in the warp.

        KOKKOS_IMPL_CUDA_SYNCWARP;

        // if(warp_lane < b % CudaTraits::WarpSize) b += CudaTraits::WarpSize;
        // b -= b % CudaTraits::WarpSize;

        // copy task closure from shared to global memory:
        for (int32_t i = b + warp_lane; i < e; i += CudaTraits::WarpSize) {
          task_mem[i] = warp_shmem[i];
        }

        // Synchronize threads of the warp and ensure memory
        // writes are visible to root thread of the warp for
        // respawn or completion.

        KOKKOS_IMPL_CUDA_SYNCWARP;

        if (warp_lane == 0) {
          // If respawn requested copy respawn data back to main memory
          if (shared_memory_task_copy->as_runnable_task().get_respawn_flag()) {
            if (shared_memory_task_copy->as_runnable_task().has_predecessor()) {
              // It's not necessary to make this a volatile write because
              // the next read of the predecessor is on this thread in complete,
              // and the predecessor is cleared there (using a volatile write)
              current_task->as_runnable_task().acquire_predecessor_from(
                  shared_memory_task_copy->as_runnable_task());
            }

            // It may not be necessary to make this a volatile write, since the
            // next read will be done by this thread in complete where the
            // rescheduling occurs, but since the task could be stolen later
            // before this is written again, we should do the volatile write
            // here.  (It might not be necessary though because I don't know
            // where else the priority would be read after it is scheduled
            // by this thread; for now, we leave it volatile, but we should
            // benchmark the cost of this.)
            current_task.as_volatile()->set_priority(
                shared_memory_task_copy->get_priority());

            // It's not necessary to make this a volatile write, since the
            // next read of it (if true) will be by this thread in `complete()`,
            // which will unset the flag (using volatile) once it has handled
            // the respawn
            current_task->as_runnable_task().set_respawn_flag();
          }

          queue.complete((*std::move(current_task)).as_runnable_task(),
                         team_scheduler.team_scheduler_info());
        }
      }
    }
  }

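  // Launch configuration: one block per multiprocessor, each holding
  // warps_per_block warps.  blockDim.x is the vector length (currently 1),
  // blockDim.y is the team size (a full warp), and blockDim.z is the number
  // of warps (teams) per block.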
  static void execute(scheduler_type const& scheduler) {
    const int shared_per_warp = 2048;
    const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1);
    const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block);
    const int shared_total    = shared_per_warp * warps_per_block;
    const cudaStream_t stream = nullptr;

    KOKKOS_ASSERT(
        static_cast<long>(grid.x * grid.y * grid.z * block.x * block.y *
                          block.z) ==
        static_cast<long>(get_max_team_count(scheduler.get_execution_space()) *
                          Kokkos::Impl::CudaTraits::WarpSize));

    auto& queue = scheduler.queue();

    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    // Query the stack size, in bytes:

    size_t previous_stack_size = 0;
    CUDA_SAFE_CALL(
        cudaDeviceGetLimit(&previous_stack_size, cudaLimitStackSize));

    // If not large enough then set the stack size, in bytes:

    const size_t larger_stack_size = 1 << 11;

    if (previous_stack_size < larger_stack_size) {
      CUDA_SAFE_CALL(cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size));
    }

    cuda_task_queue_execute<<<grid, block, shared_total, stream>>>(
        scheduler, shared_per_warp);

    CUDA_SAFE_CALL(cudaGetLastError());

    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    if (previous_stack_size < larger_stack_size) {
      CUDA_SAFE_CALL(
          cudaDeviceSetLimit(cudaLimitStackSize, previous_stack_size));
    }
  }

  template <typename TaskType>
  static
      // TODO @tasking @optimization DSH specialize this for trivially
      // destructible types
      void
      get_function_pointer(typename TaskType::function_type& ptr,
                           typename TaskType::destroy_type& dtor) {
    using function_type = typename TaskType::function_type;
    using destroy_type  = typename TaskType::destroy_type;

    // TODO @tasking @minor DSH make sure there aren't any alignment concerns?
    void* storage = cuda_internal_scratch_unified(
        Kokkos::Cuda(), sizeof(function_type) + sizeof(destroy_type));
    function_type* ptr_ptr = (function_type*)storage;
    destroy_type* dtor_ptr =
        (destroy_type*)((char*)storage + sizeof(function_type));

    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    set_cuda_task_base_apply_function_pointer<TaskType>
        <<<1, 1>>>(ptr_ptr, dtor_ptr);

    CUDA_SAFE_CALL(cudaGetLastError());
    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    ptr  = *ptr_ptr;
    dtor = *dtor_ptr;
  }
};

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

template <class Scheduler>
class TaskQueueSpecializationConstrained<
    Scheduler,
    typename std::enable_if<std::is_same<typename Scheduler::execution_space,
                                         Kokkos::Cuda>::value>::type> {
 public:
  using scheduler_type  = Scheduler;
  using execution_space = Kokkos::Cuda;
  using memory_space    = Kokkos::CudaUVMSpace;
  using member_type     = TaskExec<Kokkos::Cuda, Scheduler>;

  enum : long { max_league_size = 16 };

  KOKKOS_INLINE_FUNCTION
  static void iff_single_thread_recursive_execute(scheduler_type const&) {}

  __device__ static void driver(scheduler_type scheduler,
                                int32_t shmem_per_warp) {
    using queue_type     = typename scheduler_type::queue_type;
    using task_root_type = TaskBase;

    extern __shared__ int32_t shmem_all[];

    task_root_type* const end = (task_root_type*)task_root_type::EndTag;
    task_root_type* const no_more_tasks_sentinel = nullptr;

    int32_t* const warp_shmem =
        shmem_all + (threadIdx.z * shmem_per_warp) / sizeof(int32_t);

    task_root_type* const task_shmem = (task_root_type*)warp_shmem;

    const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x;

    member_type single_exec(scheduler, warp_shmem, 1);
    member_type team_exec(scheduler, warp_shmem, blockDim.y);

    auto& team_queue = team_exec.scheduler().queue();

    task_root_type* task_ptr = no_more_tasks_sentinel;

    // Loop until all queues are empty and no tasks in flight

    do {
      // Each team lead attempts to acquire either a thread team task
      // or collection of single thread tasks for the team.

      if (0 == warp_lane) {
        if (*((volatile int*)&team_queue.m_ready_count) > 0) {
          task_ptr = end;
          // Attempt to acquire a task
          // Loop by priority and then type
          for (int i = 0; i < queue_type::NumQueue && end == task_ptr; ++i) {
            for (int j = 0; j < 2 && end == task_ptr; ++j) {
              task_ptr = queue_type::pop_ready_task(&team_queue.m_ready[i][j]);
            }
          }
        } else {
          // returns nullptr if and only if all other queues have a ready
          // count of 0 also. Otherwise, returns a task from another queue
          // or `end` if one couldn't be popped
          task_ptr = team_queue.attempt_to_steal_task();
#if 0
          if(task_ptr != no_more_tasks_sentinel && task_ptr != end) {
            std::printf("task stolen on rank %d\n", team_exec.league_rank());
          }
#endif
        }
      }

      // Synchronize warp with memory fence before broadcasting task pointer:

      // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "A" );
      KOKKOS_IMPL_CUDA_SYNCWARP;

      // Broadcast task pointer:

      ((int*)&task_ptr)[0] = KOKKOS_IMPL_CUDA_SHFL(((int*)&task_ptr)[0], 0, 32);
      ((int*)&task_ptr)[1] = KOKKOS_IMPL_CUDA_SHFL(((int*)&task_ptr)[1], 0, 32);

#if defined(KOKKOS_ENABLE_DEBUG)
      KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN("TaskQueue CUDA task_ptr");
#endif

      if (0 == task_ptr) break;  // 0 == queue->m_ready_count

      if (end != task_ptr) {
        // Whole warp copy task's closure to/from shared memory.
        // Use all threads of warp for coalesced read/write.

        int32_t const b = sizeof(task_root_type) / sizeof(int32_t);
        int32_t const e =
            *((int32_t volatile*)(&task_ptr->m_alloc_size)) / sizeof(int32_t);

        int32_t volatile* const task_mem = (int32_t volatile*)task_ptr;

        KOKKOS_ASSERT(e * sizeof(int32_t) < shmem_per_warp);

        // copy task closure from global to shared memory:

        for (int32_t i = warp_lane; i < e; i += CudaTraits::WarpSize) {
          warp_shmem[i] = task_mem[i];
        }

        // Synchronize threads of the warp and ensure memory
        // writes are visible to all threads in the warp.

        // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "B" );
        KOKKOS_IMPL_CUDA_SYNCWARP;

        if (task_root_type::TaskTeam == task_shmem->m_task_type) {
          // Thread Team Task
          (*task_shmem->m_apply)(task_shmem, &team_exec);
        } else if (0 == threadIdx.y) {
          // Single Thread Task
          (*task_shmem->m_apply)(task_shmem, &single_exec);
        }

        // Synchronize threads of the warp and ensure memory
        // writes are visible to all threads in the warp.

        // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "C" );
        KOKKOS_IMPL_CUDA_SYNCWARP;

        // copy task closure from shared to global memory:

        for (int32_t i = b + warp_lane; i < e; i += CudaTraits::WarpSize) {
          task_mem[i] = warp_shmem[i];
        }

        // Synchronize threads of the warp and ensure memory
        // writes are visible to root thread of the warp for
        // respawn or completion.

        // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "D" );
        KOKKOS_IMPL_CUDA_SYNCWARP;

        // If respawn requested copy respawn data back to main memory

        if (0 == warp_lane) {
          if (((task_root_type*)task_root_type::LockTag) !=
              task_shmem->m_next) {
            ((volatile task_root_type*)task_ptr)->m_next = task_shmem->m_next;
            ((volatile task_root_type*)task_ptr)->m_priority =
                task_shmem->m_priority;
          }

          team_queue.complete(task_ptr);
        }
      }
    } while (1);
  }

  static void execute(scheduler_type const& scheduler) {
    const int shared_per_warp = 2048;
    const int warps_per_block = 4;
    const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1);
    // const dim3 grid( 1 , 1 , 1 );
    const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block);
    const int shared_total    = shared_per_warp * warps_per_block;
    const cudaStream_t stream = 0;

    auto& queue = scheduler.queue();
    queue.initialize_team_queues(warps_per_block * grid.x);

    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    // Query the stack size, in bytes:

    size_t previous_stack_size = 0;
    CUDA_SAFE_CALL(
        cudaDeviceGetLimit(&previous_stack_size, cudaLimitStackSize));

    // If not large enough then set the stack size, in bytes:

    const size_t larger_stack_size = 2048;

    if (previous_stack_size < larger_stack_size) {
      CUDA_SAFE_CALL(cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size));
    }

    cuda_task_queue_execute<<<grid, block, shared_total, stream>>>(
        scheduler, shared_per_warp);

    CUDA_SAFE_CALL(cudaGetLastError());

    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    if (previous_stack_size < larger_stack_size) {
      CUDA_SAFE_CALL(
          cudaDeviceSetLimit(cudaLimitStackSize, previous_stack_size));
    }
  }

  template <typename TaskType>
  static void get_function_pointer(typename TaskType::function_type& ptr,
                                   typename TaskType::destroy_type& dtor) {
    using function_type = typename TaskType::function_type;
    using destroy_type  = typename TaskType::destroy_type;

    void* storage = cuda_internal_scratch_unified(
        Kokkos::Cuda(), sizeof(function_type) + sizeof(destroy_type));
    function_type* ptr_ptr = (function_type*)storage;
    destroy_type* dtor_ptr =
        (destroy_type*)((char*)storage + sizeof(function_type));

    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    set_cuda_task_base_apply_function_pointer<TaskType>
        <<<1, 1>>>(ptr_ptr, dtor_ptr);

    CUDA_SAFE_CALL(cudaGetLastError());
    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    ptr  = *ptr_ptr;
    dtor = *dtor_ptr;
  }
};

extern template class TaskQueue<
    Kokkos::Cuda,
    default_tasking_memory_space_for_execution_space_t<Kokkos::Cuda>>;

}  // namespace Impl
}  // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

/**\brief  Impl::TaskExec<Cuda> is the TaskScheduler<Cuda>::member_type
 *         passed to tasks running in a Cuda space.
 *
 *  Cuda thread blocks for tasking are dimensioned:
 *    blockDim.x == vector length
 *    blockDim.y == team size
 *    blockDim.z == number of teams
 *  where
 *    blockDim.x * blockDim.y == WarpSize
 *
 *  Current implementation requires blockDim.x == 1.
 *  Vector level parallelism with blockDim.x > 1 on Volta will
 *  require a vector-level synchronization mask for vector-level
 *  collective operations.
 *
 *  Both single thread and thread team tasks are run by a full Cuda warp.
 *  A single thread task is called by warp lane #0 and the remaining
 *  lanes of the warp are idle.
 *
 *  When executing a single thread task the syncwarp or other
 *  warp synchronizing functions must not be called.
 */
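//
//  A rough usage sketch (illustrative only; `MyTask` is hypothetical and this
//  assumes the generic Kokkos::TeamThreadRange overload dispatches to the
//  boundaries structs defined later in this header):
//
//    struct MyTask {
//      using value_type = void;
//      template <class TeamMember>
//      KOKKOS_INLINE_FUNCTION void operator()(TeamMember& member) {
//        // member is a TaskExec<Kokkos::Cuda, Scheduler> when run on Cuda
//        Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 32),
//                             [&](int i) { /* per-iteration work */ });
//      }
//    };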
template <class Scheduler>
class TaskExec<Kokkos::Cuda, Scheduler> {
 private:
  enum : int { WarpSize = Kokkos::Impl::CudaTraits::WarpSize };

  TaskExec(TaskExec&&)      = delete;
  TaskExec(TaskExec const&) = delete;
  TaskExec& operator=(TaskExec&&) = delete;
  TaskExec& operator=(TaskExec const&) = delete;

  friend class Kokkos::Impl::TaskQueue<
      Kokkos::Cuda,
      default_tasking_memory_space_for_execution_space_t<Kokkos::Cuda>>;
  template <class, class>
  friend class Kokkos::Impl::TaskQueueSpecializationConstrained;
  template <class>
  friend class Kokkos::Impl::TaskQueueSpecialization;

  int32_t* m_team_shmem;
  const int m_team_size;
  Scheduler m_scheduler;

  // If constructed with arg_team_size == 1 the object
  // can only be used by 0 == threadIdx.y.
  KOKKOS_INLINE_FUNCTION
  TaskExec(Scheduler const& parent_scheduler, int32_t* arg_team_shmem,
           int arg_team_size = blockDim.y)
      : m_team_shmem(arg_team_shmem),
        m_team_size(arg_team_size),
        m_scheduler(parent_scheduler.get_team_scheduler(league_rank())) {}

 public:
  using thread_team_member = TaskExec;

#if defined(__CUDA_ARCH__)
  __device__ int team_rank() const { return threadIdx.y; }
  __device__ int team_size() const { return m_team_size; }
  //__device__ int league_rank() const { return threadIdx.z; }
  __device__ int league_rank() const {
    return blockIdx.x * blockDim.z + threadIdx.z;
  }
  __device__ int league_size() const { return blockDim.z * gridDim.x; }

  __device__ void team_barrier() const {
    if (1 < m_team_size) {
      KOKKOS_IMPL_CUDA_SYNCWARP;
    }
  }

  template <class ValueType>
  __device__ void team_broadcast(ValueType& val, const int thread_id) const {
    if (1 < m_team_size) {
      // WarpSize = blockDim.x * blockDim.y
      // thread_id < blockDim.y
      ValueType tmp(val);  // input might not be register variable
      Impl::in_place_shfl(val, tmp, blockDim.x * thread_id, WarpSize);
    }
  }

#else
  __host__ int team_rank() const { return 0; }
  __host__ int team_size() const { return 0; }
  __host__ int league_rank() const { return 0; }
  __host__ int league_size() const { return 0; }
  __host__ void team_barrier() const {}
  template <class ValueType>
  __host__ void team_broadcast(ValueType&, const int) const {}
#endif

  KOKKOS_INLINE_FUNCTION Scheduler const& scheduler() const noexcept {
    return m_scheduler;
  }
  KOKKOS_INLINE_FUNCTION Scheduler& scheduler() noexcept { return m_scheduler; }
};

}  // namespace Impl
}  // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

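// The boundaries structs below bake the calling thread's offset into `start`
// and the team / vector size into `increment`, so the parallel_for and
// parallel_reduce overloads further down can partition iterations with a
// plain strided for-loop.
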
template <typename iType, typename Scheduler>
struct TeamThreadRangeBoundariesStruct<iType,
                                       TaskExec<Kokkos::Cuda, Scheduler>> {
  using index_type  = iType;
  using member_type = TaskExec<Kokkos::Cuda, Scheduler>;

  const iType start;
  const iType end;
  const iType increment;
  member_type const& thread;

#if defined(__CUDA_ARCH__)

  __device__ inline TeamThreadRangeBoundariesStruct(
      member_type const& arg_thread, const iType& arg_count)
      : start(threadIdx.y),
        end(arg_count),
        increment(blockDim.y),
        thread(arg_thread) {}

  __device__ inline TeamThreadRangeBoundariesStruct(
      member_type const& arg_thread, const iType& arg_start,
      const iType& arg_end)
      : start(arg_start + threadIdx.y),
        end(arg_end),
        increment(blockDim.y),
        thread(arg_thread) {}

#else

  TeamThreadRangeBoundariesStruct(member_type const& arg_thread,
                                  const iType& arg_count);

  TeamThreadRangeBoundariesStruct(member_type const& arg_thread,
                                  const iType& arg_start, const iType& arg_end);

#endif
};

//----------------------------------------------------------------------------

template <typename iType, typename Scheduler>
struct ThreadVectorRangeBoundariesStruct<iType,
                                         TaskExec<Kokkos::Cuda, Scheduler>> {
  using index_type  = iType;
  using member_type = TaskExec<Kokkos::Cuda, Scheduler>;

  const index_type start;
  const index_type end;
  const index_type increment;
  const member_type& thread;

#if defined(__CUDA_ARCH__)

  __device__ inline ThreadVectorRangeBoundariesStruct(
      member_type const& arg_thread, const index_type& arg_count)
      : start(threadIdx.x),
        end(arg_count),
        increment(blockDim.x),
        thread(arg_thread) {}

  __device__ inline ThreadVectorRangeBoundariesStruct(
      member_type const& arg_thread, const index_type& arg_begin,
      const index_type& arg_end)
      : start(arg_begin + threadIdx.x),
        end(arg_end),
        increment(blockDim.x),
        thread(arg_thread) {}

#else

  ThreadVectorRangeBoundariesStruct(member_type const& arg_thread,
                                    const index_type& arg_count);

  ThreadVectorRangeBoundariesStruct(member_type const& arg_thread,
                                    const index_type& arg_begin,
                                    const index_type& arg_end);

#endif
};

}  // namespace Impl
}  // namespace Kokkos

//----------------------------------------------------------------------------

namespace Kokkos {

// template<typename iType>
// KOKKOS_INLINE_FUNCTION
// Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Cuda >
// > TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread, const iType
// & count )
//{
//  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec<
//  Kokkos::Cuda > >( thread, count );
//}
//
// template<typename iType1, typename iType2>
// KOKKOS_INLINE_FUNCTION
// Impl::TeamThreadRangeBoundariesStruct
//  < typename std::common_type<iType1,iType2>::type
//  , Impl::TaskExec< Kokkos::Cuda > >
// TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread
//               , const iType1 & begin, const iType2 & end )
//{
//  using iType = typename std::common_type< iType1, iType2 >::type;
//  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec<
//  Kokkos::Cuda > >(
//           thread, iType(begin), iType(end) );
//}
//
// template<typename iType>
// KOKKOS_INLINE_FUNCTION
// Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda >
// > ThreadVectorRange( const Impl::TaskExec< Kokkos::Cuda > & thread
//                 , const iType & count )
//{
//  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec<
//  Kokkos::Cuda > >(thread,count);
//}
//
// template<typename iType>
// KOKKOS_INLINE_FUNCTION
// Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda >
// > ThreadVectorRange( const Impl::TaskExec< Kokkos::Cuda > & thread
//                 , const iType & arg_begin
//                 , const iType & arg_end )
//{
//  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec<
//  Kokkos::Cuda > >(thread,arg_begin,arg_end);
//}

// KOKKOS_INLINE_FUNCTION
// Impl::ThreadSingleStruct<Impl::TaskExec< Kokkos::Cuda > >
// PerTeam(const Impl::TaskExec< Kokkos::Cuda >& thread)
// {
//   return Impl::ThreadSingleStruct<Impl::TaskExec< Kokkos::Cuda > >(thread);
// }

// KOKKOS_INLINE_FUNCTION
// Impl::VectorSingleStruct<Impl::TaskExec< Kokkos::Cuda > >
// PerThread(const Impl::TaskExec< Kokkos::Cuda >& thread)
// {
//   return Impl::VectorSingleStruct<Impl::TaskExec< Kokkos::Cuda > >(thread);
// }

/** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each
 * i=0..N-1.
 *
 * The range i=0..N-1 is mapped to all threads of the calling thread team.
 */
template <typename iType, class Lambda, class Scheduler>
KOKKOS_INLINE_FUNCTION void parallel_for(
    const Impl::TeamThreadRangeBoundariesStruct<
        iType, Impl::TaskExec<Kokkos::Cuda, Scheduler>>& loop_boundaries,
    const Lambda& lambda) {
  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
       i += loop_boundaries.increment) {
    lambda(i);
  }
}

template <typename iType, class Lambda, class Scheduler>
KOKKOS_INLINE_FUNCTION void parallel_for(
    const Impl::ThreadVectorRangeBoundariesStruct<
        iType, Impl::TaskExec<Kokkos::Cuda, Scheduler>>& loop_boundaries,
    const Lambda& lambda) {
  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
       i += loop_boundaries.increment) {
    lambda(i);
  }
}

// reduce across corresponding lanes between team members within warp
// assume stride*team_size == warp_size
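// e.g. with stride == 4 (vector length) and team_size == 8, lanes
// {i, i+4, ..., i+28} are joined into lane i for each i in [0, 4)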
template <typename ValueType, class JoinType>
KOKKOS_INLINE_FUNCTION void strided_shfl_warp_reduction(const JoinType& join,
                                                        ValueType& val,
                                                        int team_size,
                                                        int stride) {
  for (int lane_delta = (team_size * stride) >> 1; lane_delta >= stride;
       lane_delta >>= 1) {
    join(val, Kokkos::shfl_down(val, lane_delta, team_size * stride));
  }
}

// multiple within-warp non-strided reductions
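// e.g. with vec_length == 8, lanes {0..7} are joined into lane 0, lanes
// {8..15} into lane 8, and so on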
template <typename ValueType, class JoinType>
KOKKOS_INLINE_FUNCTION void multi_shfl_warp_reduction(const JoinType& join,
                                                      ValueType& val,
                                                      int vec_length) {
  for (int lane_delta = vec_length >> 1; lane_delta; lane_delta >>= 1) {
    join(val, Kokkos::shfl_down(val, lane_delta, vec_length));
  }
}

// broadcast within warp
template <class ValueType>
KOKKOS_INLINE_FUNCTION ValueType shfl_warp_broadcast(ValueType& val,
                                                     int src_lane, int width) {
  if (1 < width) {
    return Kokkos::shfl(val, src_lane, width);
  } else {
    return val;
  }
}

/*// all-reduce across corresponding vector lanes between team members within
warp
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec<
Kokkos::Cuda > >& loop_boundaries, const Lambda & lambda, const JoinType& join,
   ValueType& initialized_result) {

  ValueType result = initialized_result;
  for( iType i = loop_boundaries.start; i < loop_boundaries.end;
i+=loop_boundaries.increment) { lambda(i,result);
  }
  initialized_result = result;

  strided_shfl_warp_reduction<ValueType, JoinType>(
                          join,
                          initialized_result,
                          loop_boundaries.thread.team_size(),
                          blockDim.x);
  initialized_result = shfl_warp_broadcast<ValueType>( initialized_result,
threadIdx.x, Impl::CudaTraits::WarpSize );
}*/

// all-reduce across corresponding vector lanes between team members within warp
// if no join() provided, use sum
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
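// After the strided reduction the result lives in team member 0 (lanes
// [0, blockDim.x)); the broadcast reads from the lane with the caller's
// threadIdx.x so every team member sees the reduced value.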
template <typename iType, class Lambda, typename ValueType, class Scheduler>
KOKKOS_INLINE_FUNCTION void parallel_reduce(
    const Impl::TeamThreadRangeBoundariesStruct<
        iType, Impl::TaskExec<Kokkos::Cuda, Scheduler>>& loop_boundaries,
    const Lambda& lambda, ValueType& initialized_result) {
  // TODO @internal_documentation what is the point of creating this temporary?
  ValueType result = initialized_result;
  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
       i += loop_boundaries.increment) {
    lambda(i, result);
  }
  initialized_result = result;

  if (1 < loop_boundaries.thread.team_size()) {
    strided_shfl_warp_reduction(
        [&](ValueType& val1, const ValueType& val2) { val1 += val2; },
        initialized_result, loop_boundaries.thread.team_size(), blockDim.x);

    initialized_result = shfl_warp_broadcast<ValueType>(
        initialized_result, threadIdx.x, Impl::CudaTraits::WarpSize);
  }
}

template <typename iType, class Lambda, typename ReducerType, class Scheduler>
KOKKOS_INLINE_FUNCTION void parallel_reduce(
    const Impl::TeamThreadRangeBoundariesStruct<
        iType, Impl::TaskExec<Kokkos::Cuda, Scheduler>>& loop_boundaries,
    const Lambda& lambda, const ReducerType& reducer) {
  using ValueType = typename ReducerType::value_type;
  // TODO @internal_documentation what is the point of creating this temporary?
  ValueType result = ValueType();
  reducer.init(result);

  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
       i += loop_boundaries.increment) {
    lambda(i, result);
  }

  if (1 < loop_boundaries.thread.team_size()) {
    strided_shfl_warp_reduction(
        [&](ValueType& val1, const ValueType& val2) {
          reducer.join(val1, val2);
        },
        result, loop_boundaries.thread.team_size(), blockDim.x);

    reducer.reference() = shfl_warp_broadcast<ValueType>(
        result, threadIdx.x, Impl::CudaTraits::WarpSize);
  } else {
    reducer.reference() = result;
  }
}
// all-reduce within team members within warp
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
/*template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec<
Kokkos::Cuda > >& loop_boundaries, const Lambda & lambda, const JoinType& join,
   ValueType& initialized_result) {

  ValueType result = initialized_result;
  for( iType i = loop_boundaries.start; i < loop_boundaries.end;
i+=loop_boundaries.increment) { lambda(i,result);
  }
  initialized_result = result;

  multi_shfl_warp_reduction<ValueType, JoinType>(join, initialized_result,
blockDim.x); initialized_result = shfl_warp_broadcast<ValueType>(
initialized_result, 0, blockDim.x );
}*/

// all-reduce within team members within warp
// if no join() provided, use sum
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template <typename iType, class Lambda, typename ValueType, class Scheduler>
KOKKOS_INLINE_FUNCTION void parallel_reduce(
    const Impl::ThreadVectorRangeBoundariesStruct<
        iType, Impl::TaskExec<Kokkos::Cuda, Scheduler>>& loop_boundaries,
    const Lambda& lambda, ValueType& initialized_result) {
  ValueType result = initialized_result;

  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
       i += loop_boundaries.increment) {
    lambda(i, result);
  }

  initialized_result = result;

  if (1 < loop_boundaries.thread.team_size()) {
    // initialized_result = multi_shfl_warp_reduction(
    multi_shfl_warp_reduction(
        [&](ValueType& val1, const ValueType& val2) { val1 += val2; },
        initialized_result, blockDim.x);

    initialized_result =
        shfl_warp_broadcast<ValueType>(initialized_result, 0, blockDim.x);
  }
}

template <typename iType, class Lambda, typename ReducerType, class Scheduler>
KOKKOS_INLINE_FUNCTION void parallel_reduce(
    const Impl::ThreadVectorRangeBoundariesStruct<
        iType, Impl::TaskExec<Kokkos::Cuda, Scheduler>>& loop_boundaries,
    const Lambda& lambda, const ReducerType& reducer) {
  using ValueType = typename ReducerType::value_type;

  ValueType result = ValueType();
  reducer.init(result);

  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
       i += loop_boundaries.increment) {
    lambda(i, result);
  }

  if (1 < loop_boundaries.thread.team_size()) {
    multi_shfl_warp_reduction(
        [&](ValueType& val1, const ValueType& val2) {
          reducer.join(val1, val2);
        },
        result, blockDim.x);

    reducer.reference() = shfl_warp_broadcast<ValueType>(result, 0, blockDim.x);
  } else {
    reducer.reference() = result;
  }
}
// scan across corresponding vector lanes between team members within warp
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
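// For each chunk of blockDim.y iterations: every member computes its local
// contribution, an inclusive warp scan with stride blockDim.x combines them,
// the result is shifted down by one member to make the scan exclusive, and
// 'accum' carries the running total into the next chunk.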
template <typename iType, class Closure, class Scheduler>
KOKKOS_INLINE_FUNCTION void parallel_scan(
    const Impl::TeamThreadRangeBoundariesStruct<
        iType, Impl::TaskExec<Kokkos::Cuda, Scheduler>>& loop_boundaries,
    const Closure& closure) {
  // Extract value_type from closure

  using value_type = typename Kokkos::Impl::FunctorAnalysis<
      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;

  if (1 < loop_boundaries.thread.team_size()) {
    // make sure all threads perform all loop iterations
    const iType bound = loop_boundaries.end + loop_boundaries.start;
    const int lane    = threadIdx.y * blockDim.x;

    value_type accum = 0;
    value_type val, y, local_total;

    for (iType i = loop_boundaries.start; i < bound;
         i += loop_boundaries.increment) {
      val = 0;
      if (i < loop_boundaries.end) closure(i, val, false);

      // intra-blockDim.y exclusive scan on 'val'
      // accum = accumulated total so far; local_total = sum for this iteration

      // INCLUSIVE scan
      for (int offset = blockDim.x; offset < Impl::CudaTraits::WarpSize;
           offset <<= 1) {
        y = Kokkos::shfl_up(val, offset, Impl::CudaTraits::WarpSize);
        if (lane >= offset) {
          val += y;
        }
      }

      // pass the iteration's total to all threads
      local_total = shfl_warp_broadcast<value_type>(
          val, threadIdx.x + Impl::CudaTraits::WarpSize - blockDim.x,
          Impl::CudaTraits::WarpSize);

      // make EXCLUSIVE scan by shifting values over one
      val = Kokkos::shfl_up(val, blockDim.x, Impl::CudaTraits::WarpSize);
      if (threadIdx.y == 0) {
        val = 0;
      }

      val += accum;
      if (i < loop_boundaries.end) closure(i, val, true);
      accum += local_total;
    }
  } else {
    value_type accum = 0;
    for (iType i = loop_boundaries.start; i < loop_boundaries.end;
         i += loop_boundaries.increment) {
      closure(i, accum, true);
    }
  }
}

// scan within team member (vector) within warp
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
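// Same approach as the team-level scan above, but the scan runs across the
// vector lanes (blockDim.x) of a single team member rather than across team
// members.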
template <typename iType, class Closure, class Scheduler>
KOKKOS_INLINE_FUNCTION void parallel_scan(
    const Impl::ThreadVectorRangeBoundariesStruct<
        iType, Impl::TaskExec<Kokkos::Cuda, Scheduler>>& loop_boundaries,
    const Closure& closure) {
  // Extract value_type from closure

  using value_type = typename Kokkos::Impl::FunctorAnalysis<
      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;

  if (1 < loop_boundaries.thread.team_size()) {
    // make sure all threads perform all loop iterations
    const iType bound = loop_boundaries.end + loop_boundaries.start;

    value_type accum = 0;
    value_type val, y, local_total;

    for (iType i = loop_boundaries.start; i < bound;
         i += loop_boundaries.increment) {
      val = 0;
      if (i < loop_boundaries.end) closure(i, val, false);

      // intra-blockDim.x exclusive scan on 'val'
      // accum = accumulated total so far; local_total = sum for this iteration

      // INCLUSIVE scan
      for (int offset = 1; offset < blockDim.x; offset <<= 1) {
        y = Kokkos::shfl_up(val, offset, blockDim.x);
        if (threadIdx.x >= offset) {
          val += y;
        }
      }

      // pass the iteration's total to all threads
      local_total =
          shfl_warp_broadcast<value_type>(val, blockDim.x - 1, blockDim.x);

      // make EXCLUSIVE scan by shifting values over one
      val = Kokkos::shfl_up(val, 1, blockDim.x);
      if (threadIdx.x == 0) {
        val = 0;
      }

      val += accum;
      if (i < loop_boundaries.end) closure(i, val, true);
      accum += local_total;
    }
  } else {
    value_type accum = 0;
    for (iType i = loop_boundaries.start; i < loop_boundaries.end;
         i += loop_boundaries.increment) {
      closure(i, accum, true);
    }
  }
}

} /* namespace Kokkos */

namespace Kokkos {

template <class FunctorType, class Scheduler>
KOKKOS_INLINE_FUNCTION void single(
    const Impl::VectorSingleStruct<Impl::TaskExec<Kokkos::Cuda, Scheduler>>&,
    const FunctorType& lambda) {
#ifdef __CUDA_ARCH__
  if (threadIdx.x == 0) lambda();
#endif
}

template <class FunctorType, class Scheduler>
KOKKOS_INLINE_FUNCTION void single(
    const Impl::ThreadSingleStruct<Impl::TaskExec<Kokkos::Cuda, Scheduler>>&,
    const FunctorType& lambda) {
#ifdef __CUDA_ARCH__
  if (threadIdx.x == 0 && threadIdx.y == 0) lambda();
#endif
}

template <class FunctorType, class ValueType, class Scheduler>
KOKKOS_INLINE_FUNCTION void single(
    const Impl::VectorSingleStruct<Impl::TaskExec<Kokkos::Cuda, Scheduler>>& s,
    const FunctorType& lambda, ValueType& val) {
#ifdef __CUDA_ARCH__
  if (threadIdx.x == 0) lambda(val);
  if (1 < s.team_member.team_size()) {
    val = shfl(val, 0, blockDim.x);
  }
#endif
}

template <class FunctorType, class ValueType, class Scheduler>
KOKKOS_INLINE_FUNCTION void single(
    const Impl::ThreadSingleStruct<Impl::TaskExec<Kokkos::Cuda, Scheduler>>&
        single_struct,
    const FunctorType& lambda, ValueType& val) {
#ifdef __CUDA_ARCH__
  if (threadIdx.x == 0 && threadIdx.y == 0) {
    lambda(val);
  }
  single_struct.team_member.team_broadcast(val, 0);
#endif
}

}  // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
#endif /* #ifndef KOKKOS_IMPL_CUDA_TASK_HPP */