/*
//@HEADER
// ************************************************************************
//
//                        Kokkos v. 3.0
//       Copyright (2020) National Technology & Engineering
//               Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_IMPL_CUDA_TASK_HPP
#define KOKKOS_IMPL_CUDA_TASK_HPP

#include <Kokkos_Macros.hpp>
#if defined(KOKKOS_ENABLE_TASKDAG)

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

#include <Kokkos_Core_fwd.hpp>

#include <impl/Kokkos_TaskBase.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp>  // CUDA_SAFE_CALL
#include <impl/Kokkos_TaskTeamMember.hpp>

//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {
namespace {

template <typename TaskType>
__global__ void set_cuda_task_base_apply_function_pointer(
    typename TaskType::function_type* ptr,
    typename TaskType::destroy_type* dtor) {
  *ptr  = TaskType::apply;
  *dtor = TaskType::destroy;
}
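
// Device-side function pointers such as TaskType::apply cannot be taken on
// the host; this single-thread kernel runs on the device and stores their
// addresses into memory that the host can then read back (see
// get_function_pointer() below).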

template <typename Scheduler>
__global__ void cuda_task_queue_execute(Scheduler scheduler,
                                        int32_t shmem_size) {
  TaskQueueSpecialization<Scheduler>::driver(std::move(scheduler), shmem_size);
}

}  // namespace

template <class, class>
class TaskExec;

template <class QueueType>
class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
 public:
  using scheduler_type  = SimpleTaskScheduler<Kokkos::Cuda, QueueType>;
  using execution_space = Kokkos::Cuda;
  using memory_space    = Kokkos::CudaUVMSpace;
  using member_type     = TaskExec<Kokkos::Cuda, scheduler_type>;

  enum : long { max_league_size = 16 };
  enum : int { warps_per_block = 4 };

  KOKKOS_INLINE_FUNCTION
  static void iff_single_thread_recursive_execute(scheduler_type const&) {}

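  // One team per warp: execute() launches warps_per_block warps on each
  // multiprocessor, so the maximum team count is #SMs * warps_per_block.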
  static int get_max_team_count(execution_space const&) {
    return Kokkos::Impl::cuda_internal_multiprocessor_count() * warps_per_block;
  }

  __device__ static void driver(scheduler_type scheduler,
                                int32_t shmem_per_warp) {
    using queue_type     = typename scheduler_type::task_queue_type;
    using task_base_type = typename scheduler_type::task_base_type;
    using runnable_task_base_type =
        typename scheduler_type::runnable_task_base_type;
    using scheduling_info_storage_type = SchedulingInfoStorage<
        runnable_task_base_type,
        typename scheduler_type::task_scheduling_info_type>;

    extern __shared__ int32_t shmem_all[];

    int32_t* const warp_shmem =
        shmem_all + (threadIdx.z * shmem_per_warp) / sizeof(int32_t);

    task_base_type* const shared_memory_task_copy = (task_base_type*)warp_shmem;

    const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x;

    member_type single_exec(scheduler, warp_shmem, 1);
    member_type team_exec(scheduler, warp_shmem, blockDim.y);

    auto& queue          = scheduler.queue();
    auto& team_scheduler = team_exec.scheduler();

    auto current_task = OptionalRef<task_base_type>();

    // Loop until all queues are empty and no tasks in flight
    while (!queue.is_done()) {
      if (warp_lane == 0) {  // should be (?) same as team_exec.team_rank() == 0
        // pop off a task
        current_task =
            queue.pop_ready_task(team_scheduler.team_scheduler_info());
      }

      // Broadcast task pointer:

      // Sync before the broadcast
      KOKKOS_IMPL_CUDA_SYNCWARP;

      // pretend it's an int* for shuffle purposes
      ((int*)&current_task)[0] =
          KOKKOS_IMPL_CUDA_SHFL(((int*)&current_task)[0], 0, 32);
      ((int*)&current_task)[1] =
          KOKKOS_IMPL_CUDA_SHFL(((int*)&current_task)[1], 0, 32);
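      // (The 64-bit pointer is broadcast as two 32-bit halves because the
      // warp shuffle moves 32-bit values.)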

      if (current_task) {
        KOKKOS_ASSERT(!current_task->as_runnable_task().get_respawn_flag());

        int32_t b = sizeof(scheduling_info_storage_type) / sizeof(int32_t);
        static_assert(
            sizeof(scheduling_info_storage_type) % sizeof(int32_t) == 0,
            "bad task size");
        int32_t const e = current_task->get_allocation_size() / sizeof(int32_t);
        KOKKOS_ASSERT(current_task->get_allocation_size() % sizeof(int32_t) ==
                      0);

        int32_t volatile* const task_mem =
            (int32_t volatile*)current_task.get();

        // do a coordinated copy of the task closure from global to shared
        // memory:
        for (int32_t i = warp_lane; i < e; i += CudaTraits::WarpSize) {
          warp_shmem[i] = task_mem[i];
        }
        // Synchronize threads of the warp and ensure memory
        // writes are visible to all threads in the warp.
        KOKKOS_IMPL_CUDA_SYNCWARP;

        if (shared_memory_task_copy->is_team_runnable()) {
          // Thread Team Task
          shared_memory_task_copy->as_runnable_task().run(team_exec);
        } else if (threadIdx.y == 0) {
          // TODO @tasking @optimization DSH Change this to warp_lane == 0 when
          // we allow blockDim.x to be more than 1
          // Single Thread Task
          shared_memory_task_copy->as_runnable_task().run(single_exec);
        }

        // Synchronize threads of the warp and ensure memory
        // writes are visible to all threads in the warp.

        KOKKOS_IMPL_CUDA_SYNCWARP;

        // if(warp_lane < b % CudaTraits::WarpSize) b += CudaTraits::WarpSize;
        // b -= b % CudaTraits::WarpSize;

        // copy task closure from shared to global memory:
        for (int32_t i = b + warp_lane; i < e; i += CudaTraits::WarpSize) {
          task_mem[i] = warp_shmem[i];
        }

        // Synchronize threads of the warp and ensure memory
        // writes are visible to the root thread of the warp for
        // respawn or completion.

        KOKKOS_IMPL_CUDA_SYNCWARP;

        if (warp_lane == 0) {
          // If respawn requested copy respawn data back to main memory
          if (shared_memory_task_copy->as_runnable_task().get_respawn_flag()) {
            if (shared_memory_task_copy->as_runnable_task().has_predecessor()) {
              // It's not necessary to make this a volatile write because
              // the next read of the predecessor is on this thread in complete,
              // and the predecessor is cleared there (using a volatile write)
              current_task->as_runnable_task().acquire_predecessor_from(
                  shared_memory_task_copy->as_runnable_task());
            }

            // It may not be necessary to make this a volatile write, since the
            // next read will be done by this thread in complete where the
            // rescheduling occurs, but since the task could be stolen later
            // before this is written again, we should do the volatile write
            // here. (It might not be necessary though because I don't know
            // where else the priority would be read after it is scheduled
            // by this thread; for now, we leave it volatile, but we should
            // benchmark the cost of this.)
            current_task.as_volatile()->set_priority(
                shared_memory_task_copy->get_priority());

            // It's not necessary to make this a volatile write, since the
            // next read of it (if true) will be by this thread in `complete()`,
            // which will unset the flag (using volatile) once it has handled
            // the respawn
            current_task->as_runnable_task().set_respawn_flag();
          }

          queue.complete((*std::move(current_task)).as_runnable_task(),
                         team_scheduler.team_scheduler_info());
        }
      }
    }
  }

  static void execute(scheduler_type const& scheduler) {
    const int shared_per_warp = 2048;
    const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1);
    const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block);
    const int shared_total    = shared_per_warp * warps_per_block;
    const cudaStream_t stream = nullptr;

    KOKKOS_ASSERT(
        static_cast<long>(grid.x * grid.y * grid.z * block.x * block.y *
                          block.z) ==
        static_cast<long>(get_max_team_count(scheduler.get_execution_space()) *
                          Kokkos::Impl::CudaTraits::WarpSize));

    auto& queue = scheduler.queue();

    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    // Query the stack size, in bytes:

    size_t previous_stack_size = 0;
    CUDA_SAFE_CALL(
        cudaDeviceGetLimit(&previous_stack_size, cudaLimitStackSize));

    // If not large enough then set the stack size, in bytes:

    const size_t larger_stack_size = 1 << 11;

    if (previous_stack_size < larger_stack_size) {
      CUDA_SAFE_CALL(cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size));
    }
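
    // (The previous limit is restored below once the kernel has completed;
    // the larger stack presumably gives device-side task code more headroom.)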

    cuda_task_queue_execute<<<grid, block, shared_total, stream>>>(
        scheduler, shared_per_warp);

    CUDA_SAFE_CALL(cudaGetLastError());

    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    if (previous_stack_size < larger_stack_size) {
      CUDA_SAFE_CALL(
          cudaDeviceSetLimit(cudaLimitStackSize, previous_stack_size));
    }
  }

  template <typename TaskType>
  static
      // TODO @tasking @optimization DSH specialize this for trivially
      // destructible types
      void
      get_function_pointer(typename TaskType::function_type& ptr,
                           typename TaskType::destroy_type& dtor) {
    using function_type = typename TaskType::function_type;
    using destroy_type  = typename TaskType::destroy_type;

    // TODO @tasking @minor DSH make sure there aren't any alignment concerns?
    void* storage = cuda_internal_scratch_unified(
        Kokkos::Cuda(), sizeof(function_type) + sizeof(destroy_type));
    function_type* ptr_ptr = (function_type*)storage;
    destroy_type* dtor_ptr =
        (destroy_type*)((char*)storage + sizeof(function_type));

    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    set_cuda_task_base_apply_function_pointer<TaskType>
        <<<1, 1>>>(ptr_ptr, dtor_ptr);

    CUDA_SAFE_CALL(cudaGetLastError());
    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    ptr  = *ptr_ptr;
    dtor = *dtor_ptr;
  }
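
  // Usage sketch for the pattern above (names hypothetical): a host caller
  // obtains device-code addresses of MyTask::apply and MyTask::destroy via
  //
  //   typename MyTask::function_type apply_fn;
  //   typename MyTask::destroy_type destroy_fn;
  //   TaskQueueSpecialization<scheduler_type>::get_function_pointer<MyTask>(
  //       apply_fn, destroy_fn);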
};

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

template <class Scheduler>
class TaskQueueSpecializationConstrained<
    Scheduler,
    typename std::enable_if<std::is_same<typename Scheduler::execution_space,
                                         Kokkos::Cuda>::value>::type> {
 public:
  using scheduler_type  = Scheduler;
  using execution_space = Kokkos::Cuda;
  using memory_space    = Kokkos::CudaUVMSpace;
  using member_type     = TaskExec<Kokkos::Cuda, Scheduler>;

  enum : long { max_league_size = 16 };

  KOKKOS_INLINE_FUNCTION
  static void iff_single_thread_recursive_execute(scheduler_type const&) {}

  __device__ static void driver(scheduler_type scheduler,
                                int32_t shmem_per_warp) {
    using queue_type     = typename scheduler_type::queue_type;
    using task_root_type = TaskBase;

    extern __shared__ int32_t shmem_all[];

    task_root_type* const end = (task_root_type*)task_root_type::EndTag;
    task_root_type* const no_more_tasks_sentinel = nullptr;

    int32_t* const warp_shmem =
        shmem_all + (threadIdx.z * shmem_per_warp) / sizeof(int32_t);

    task_root_type* const task_shmem = (task_root_type*)warp_shmem;

    const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x;

    member_type single_exec(scheduler, warp_shmem, 1);
    member_type team_exec(scheduler, warp_shmem, blockDim.y);

    auto& team_queue = team_exec.scheduler().queue();

    task_root_type* task_ptr = no_more_tasks_sentinel;

    // Loop until all queues are empty and no tasks in flight

    do {
      // Each team lead attempts to acquire either a thread team task
      // or a collection of single-thread tasks for the team.

      if (0 == warp_lane) {
        if (*((volatile int*)&team_queue.m_ready_count) > 0) {
          task_ptr = end;
          // Attempt to acquire a task
          // Loop by priority and then type
          for (int i = 0; i < queue_type::NumQueue && end == task_ptr; ++i) {
            for (int j = 0; j < 2 && end == task_ptr; ++j) {
              task_ptr = queue_type::pop_ready_task(&team_queue.m_ready[i][j]);
            }
          }
        } else {
          // returns nullptr if and only if all other queues have a ready
          // count of 0 also. Otherwise, returns a task from another queue
          // or `end` if one couldn't be popped
          task_ptr = team_queue.attempt_to_steal_task();
#if 0
          if(task_ptr != no_more_tasks_sentinel && task_ptr != end) {
            std::printf("task stolen on rank %d\n", team_exec.league_rank());
          }
#endif
        }
      }

      // Synchronize warp with memory fence before broadcasting task pointer:

      // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "A" );
      KOKKOS_IMPL_CUDA_SYNCWARP;

      // Broadcast task pointer:

      ((int*)&task_ptr)[0] = KOKKOS_IMPL_CUDA_SHFL(((int*)&task_ptr)[0], 0, 32);
      ((int*)&task_ptr)[1] = KOKKOS_IMPL_CUDA_SHFL(((int*)&task_ptr)[1], 0, 32);

#if defined(KOKKOS_ENABLE_DEBUG)
      KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN("TaskQueue CUDA task_ptr");
#endif

      if (0 == task_ptr) break;  // 0 == queue->m_ready_count

      if (end != task_ptr) {
        // The whole warp copies the task's closure to/from shared memory.
        // Use all threads of the warp for coalesced read/write.

        int32_t const b = sizeof(task_root_type) / sizeof(int32_t);
        int32_t const e =
            *((int32_t volatile*)(&task_ptr->m_alloc_size)) / sizeof(int32_t);

        int32_t volatile* const task_mem = (int32_t volatile*)task_ptr;

        KOKKOS_ASSERT(e * sizeof(int32_t) < shmem_per_warp);

        // copy task closure from global to shared memory:

        for (int32_t i = warp_lane; i < e; i += CudaTraits::WarpSize) {
          warp_shmem[i] = task_mem[i];
        }

        // Synchronize threads of the warp and ensure memory
        // writes are visible to all threads in the warp.

        // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "B" );
        KOKKOS_IMPL_CUDA_SYNCWARP;

        if (task_root_type::TaskTeam == task_shmem->m_task_type) {
          // Thread Team Task
          (*task_shmem->m_apply)(task_shmem, &team_exec);
        } else if (0 == threadIdx.y) {
          // Single Thread Task
          (*task_shmem->m_apply)(task_shmem, &single_exec);
        }

        // Synchronize threads of the warp and ensure memory
        // writes are visible to all threads in the warp.

        // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "C" );
        KOKKOS_IMPL_CUDA_SYNCWARP;

        // copy task closure from shared to global memory:

        for (int32_t i = b + warp_lane; i < e; i += CudaTraits::WarpSize) {
          task_mem[i] = warp_shmem[i];
        }

        // Synchronize threads of the warp and ensure memory
        // writes are visible to the root thread of the warp for
        // respawn or completion.

        // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "D" );
        KOKKOS_IMPL_CUDA_SYNCWARP;

        // If respawn requested copy respawn data back to main memory

        if (0 == warp_lane) {
          if (((task_root_type*)task_root_type::LockTag) !=
              task_shmem->m_next) {
            ((volatile task_root_type*)task_ptr)->m_next = task_shmem->m_next;
            ((volatile task_root_type*)task_ptr)->m_priority =
                task_shmem->m_priority;
          }

          team_queue.complete(task_ptr);
        }
      }
    } while (1);
  }

  static void execute(scheduler_type const& scheduler) {
    const int shared_per_warp = 2048;
    const int warps_per_block = 4;
    const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1);
    // const dim3 grid( 1 , 1 , 1 );
    const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block);
    const int shared_total    = shared_per_warp * warps_per_block;
    const cudaStream_t stream = 0;

    auto& queue = scheduler.queue();
    queue.initialize_team_queues(warps_per_block * grid.x);

    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    // Query the stack size, in bytes:

    size_t previous_stack_size = 0;
    CUDA_SAFE_CALL(
        cudaDeviceGetLimit(&previous_stack_size, cudaLimitStackSize));

    // If not large enough then set the stack size, in bytes:

    const size_t larger_stack_size = 2048;

    if (previous_stack_size < larger_stack_size) {
      CUDA_SAFE_CALL(cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size));
    }

    cuda_task_queue_execute<<<grid, block, shared_total, stream>>>(
        scheduler, shared_per_warp);

    CUDA_SAFE_CALL(cudaGetLastError());

    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    if (previous_stack_size < larger_stack_size) {
      CUDA_SAFE_CALL(
          cudaDeviceSetLimit(cudaLimitStackSize, previous_stack_size));
    }
  }

  template <typename TaskType>
  static void get_function_pointer(typename TaskType::function_type& ptr,
                                   typename TaskType::destroy_type& dtor) {
    using function_type = typename TaskType::function_type;
    using destroy_type  = typename TaskType::destroy_type;

    void* storage = cuda_internal_scratch_unified(
        Kokkos::Cuda(), sizeof(function_type) + sizeof(destroy_type));
    function_type* ptr_ptr = (function_type*)storage;
    destroy_type* dtor_ptr =
        (destroy_type*)((char*)storage + sizeof(function_type));

    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    set_cuda_task_base_apply_function_pointer<TaskType>
        <<<1, 1>>>(ptr_ptr, dtor_ptr);

    CUDA_SAFE_CALL(cudaGetLastError());
    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    ptr  = *ptr_ptr;
    dtor = *dtor_ptr;
  }
};

extern template class TaskQueue<
    Kokkos::Cuda,
    default_tasking_memory_space_for_execution_space_t<Kokkos::Cuda>>;

}  // namespace Impl
}  // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

/**\brief  Impl::TaskExec<Cuda> is the TaskScheduler<Cuda>::member_type
 *         passed to tasks running in a Cuda space.
 *
 *  Cuda thread blocks for tasking are dimensioned:
 *    blockDim.x == vector length
 *    blockDim.y == team size
 *    blockDim.z == number of teams
 *  where
 *    blockDim.x * blockDim.y == WarpSize
 *
 *  Current implementation requires blockDim.x == 1.
 *  Vector-level parallelism with blockDim.x > 1 on Volta will
 *  require a vector-level synchronization mask for vector-level
 *  collective operations.
 *
 *  Both single thread and thread team tasks are run by a full Cuda warp.
 *  A single thread task is called by warp lane #0 and the remaining
 *  lanes of the warp are idle.
 *
 *  When executing a single thread task the syncwarp or other
 *  warp synchronizing functions must not be called.
 */
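//
// For example (a sketch of the geometry actually launched by execute() with
// WarpSize == 32): block == dim3(1, 32, 4) makes each warp one team, with
// warp_lane == threadIdx.x + threadIdx.y * blockDim.x and
// league_rank() == blockIdx.x * blockDim.z + threadIdx.z.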
template <class Scheduler>
class TaskExec<Kokkos::Cuda, Scheduler> {
 private:
  enum : int { WarpSize = Kokkos::Impl::CudaTraits::WarpSize };

  TaskExec(TaskExec&&)      = delete;
  TaskExec(TaskExec const&) = delete;
  TaskExec& operator=(TaskExec&&) = delete;
  TaskExec& operator=(TaskExec const&) = delete;

  friend class Kokkos::Impl::TaskQueue<
      Kokkos::Cuda,
      default_tasking_memory_space_for_execution_space_t<Kokkos::Cuda>>;
  template <class, class>
  friend class Kokkos::Impl::TaskQueueSpecializationConstrained;
  template <class>
  friend class Kokkos::Impl::TaskQueueSpecialization;

  int32_t* m_team_shmem;
  const int m_team_size;
  Scheduler m_scheduler;

  // If constructed with arg_team_size == 1 the object
  // can only be used by 0 == threadIdx.y.
  KOKKOS_INLINE_FUNCTION
  TaskExec(Scheduler const& parent_scheduler, int32_t* arg_team_shmem,
           int arg_team_size = blockDim.y)
      : m_team_shmem(arg_team_shmem),
        m_team_size(arg_team_size),
        m_scheduler(parent_scheduler.get_team_scheduler(league_rank())) {}

 public:
  using thread_team_member = TaskExec;

#if defined(__CUDA_ARCH__)
  __device__ int team_rank() const { return threadIdx.y; }
  __device__ int team_size() const { return m_team_size; }
  //__device__ int league_rank() const { return threadIdx.z; }
  __device__ int league_rank() const {
    return blockIdx.x * blockDim.z + threadIdx.z;
  }
  __device__ int league_size() const { return blockDim.z * gridDim.x; }

  __device__ void team_barrier() const {
    if (1 < m_team_size) {
      KOKKOS_IMPL_CUDA_SYNCWARP;
    }
  }

  template <class ValueType>
  __device__ void team_broadcast(ValueType& val, const int thread_id) const {
    if (1 < m_team_size) {
      // WarpSize == blockDim.x * blockDim.y
      // thread_id < blockDim.y
      ValueType tmp(val);  // input might not be register variable
      Impl::in_place_shfl(val, tmp, blockDim.x * thread_id, WarpSize);
    }
  }

#else
  __host__ int team_rank() const { return 0; }
  __host__ int team_size() const { return 0; }
  __host__ int league_rank() const { return 0; }
  __host__ int league_size() const { return 0; }
  __host__ void team_barrier() const {}
  template <class ValueType>
  __host__ void team_broadcast(ValueType&, const int) const {}
#endif

  KOKKOS_INLINE_FUNCTION Scheduler const& scheduler() const noexcept {
    return m_scheduler;
  }
  KOKKOS_INLINE_FUNCTION Scheduler& scheduler() noexcept { return m_scheduler; }
};

}  // namespace Impl
}  // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

template <typename iType, typename Scheduler>
struct TeamThreadRangeBoundariesStruct<iType,
                                       TaskExec<Kokkos::Cuda, Scheduler>> {
  using index_type  = iType;
  using member_type = TaskExec<Kokkos::Cuda, Scheduler>;

  const iType start;
  const iType end;
  const iType increment;
  member_type const& thread;

#if defined(__CUDA_ARCH__)

  __device__ inline TeamThreadRangeBoundariesStruct(
      member_type const& arg_thread, const iType& arg_count)
      : start(threadIdx.y),
        end(arg_count),
        increment(blockDim.y),
        thread(arg_thread) {}

  __device__ inline TeamThreadRangeBoundariesStruct(
      member_type const& arg_thread, const iType& arg_start,
      const iType& arg_end)
      : start(arg_start + threadIdx.y),
        end(arg_end),
        increment(blockDim.y),
        thread(arg_thread) {}

#else

  TeamThreadRangeBoundariesStruct(member_type const& arg_thread,
                                  const iType& arg_count);

  TeamThreadRangeBoundariesStruct(member_type const& arg_thread,
                                  const iType& arg_start, const iType& arg_end);

#endif
};

//----------------------------------------------------------------------------

template <typename iType, typename Scheduler>
struct ThreadVectorRangeBoundariesStruct<iType,
                                         TaskExec<Kokkos::Cuda, Scheduler>> {
  using index_type  = iType;
  using member_type = TaskExec<Kokkos::Cuda, Scheduler>;

  const index_type start;
  const index_type end;
  const index_type increment;
  const member_type& thread;

#if defined(__CUDA_ARCH__)

  __device__ inline ThreadVectorRangeBoundariesStruct(
      member_type const& arg_thread, const index_type& arg_count)
      : start(threadIdx.x),
        end(arg_count),
        increment(blockDim.x),
        thread(arg_thread) {}

  __device__ inline ThreadVectorRangeBoundariesStruct(
      member_type const& arg_thread, const index_type& arg_begin,
      const index_type& arg_end)
      : start(arg_begin + threadIdx.x),
        end(arg_end),
        increment(blockDim.x),
        thread(arg_thread) {}

#else

  ThreadVectorRangeBoundariesStruct(member_type const& arg_thread,
                                    const index_type& arg_count);

  ThreadVectorRangeBoundariesStruct(member_type const& arg_thread,
                                    const index_type& arg_begin,
                                    const index_type& arg_end);

#endif
};

}  // namespace Impl
}  // namespace Kokkos

//----------------------------------------------------------------------------

namespace Kokkos {

// template<typename iType>
// KOKKOS_INLINE_FUNCTION
// Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Cuda >
// > TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread, const iType
// & count )
//{
//   return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec<
//   Kokkos::Cuda > >( thread, count );
//}
//
// template<typename iType1, typename iType2>
// KOKKOS_INLINE_FUNCTION
// Impl::TeamThreadRangeBoundariesStruct
//   < typename std::common_type<iType1,iType2>::type
//   , Impl::TaskExec< Kokkos::Cuda > >
// TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread
//                , const iType1 & begin, const iType2 & end )
//{
//   using iType = typename std::common_type< iType1, iType2 >::type;
//   return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec<
//   Kokkos::Cuda > >(
//     thread, iType(begin), iType(end) );
//}
//
// template<typename iType>
// KOKKOS_INLINE_FUNCTION
// Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda >
// > ThreadVectorRange( const Impl::TaskExec< Kokkos::Cuda > & thread
//                    , const iType & count )
//{
//   return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec<
//   Kokkos::Cuda > >(thread,count);
//}
//
// template<typename iType>
// KOKKOS_INLINE_FUNCTION
// Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda >
// > ThreadVectorRange( const Impl::TaskExec< Kokkos::Cuda > & thread
//                    , const iType & arg_begin
//                    , const iType & arg_end )
//{
//   return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec<
//   Kokkos::Cuda > >(thread,arg_begin,arg_end);
//}

// KOKKOS_INLINE_FUNCTION
// Impl::ThreadSingleStruct<Impl::TaskExec< Kokkos::Cuda > >
// PerTeam(const Impl::TaskExec< Kokkos::Cuda >& thread)
// {
//   return Impl::ThreadSingleStruct<Impl::TaskExec< Kokkos::Cuda > >(thread);
// }

// KOKKOS_INLINE_FUNCTION
// Impl::VectorSingleStruct<Impl::TaskExec< Kokkos::Cuda > >
// PerThread(const Impl::TaskExec< Kokkos::Cuda >& thread)
// {
//   return Impl::VectorSingleStruct<Impl::TaskExec< Kokkos::Cuda > >(thread);
// }

/** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each
 *  i=0..N-1.
 *
 *  The range i=0..N-1 is mapped to all threads of the calling thread team.
 */
template <typename iType, class Lambda, class Scheduler>
KOKKOS_INLINE_FUNCTION void parallel_for(
    const Impl::TeamThreadRangeBoundariesStruct<
        iType, Impl::TaskExec<Kokkos::Cuda, Scheduler>>& loop_boundaries,
    const Lambda& lambda) {
  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
       i += loop_boundaries.increment) {
    lambda(i);
  }
}

template <typename iType, class Lambda, class Scheduler>
KOKKOS_INLINE_FUNCTION void parallel_for(
    const Impl::ThreadVectorRangeBoundariesStruct<
        iType, Impl::TaskExec<Kokkos::Cuda, Scheduler>>& loop_boundaries,
    const Lambda& lambda) {
  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
       i += loop_boundaries.increment) {
    lambda(i);
  }
}

// reduce across corresponding lanes between team members within warp
// assume stride*team_size == warp_size
template <typename ValueType, class JoinType>
KOKKOS_INLINE_FUNCTION void strided_shfl_warp_reduction(const JoinType& join,
                                                        ValueType& val,
                                                        int team_size,
                                                        int stride) {
  for (int lane_delta = (team_size * stride) >> 1; lane_delta >= stride;
       lane_delta >>= 1) {
    join(val, Kokkos::shfl_down(val, lane_delta, team_size * stride));
  }
}
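
// E.g. with team_size == 8 and stride == 4 (a 32-lane warp), lane_delta takes
// the values 16, 8, 4: each step joins a lane with its team counterpart
// lane_delta lanes away, leaving each team's result in its first lane.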

// multiple within-warp non-strided reductions
template <typename ValueType, class JoinType>
KOKKOS_INLINE_FUNCTION void multi_shfl_warp_reduction(const JoinType& join,
                                                      ValueType& val,
                                                      int vec_length) {
  for (int lane_delta = vec_length >> 1; lane_delta; lane_delta >>= 1) {
    join(val, Kokkos::shfl_down(val, lane_delta, vec_length));
  }
}
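
// E.g. with vec_length == 8, lane_delta takes the values 4, 2, 1, reducing
// the eight vector lanes of each team member independently into lane 0.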

// broadcast within warp
template <class ValueType>
KOKKOS_INLINE_FUNCTION ValueType shfl_warp_broadcast(ValueType& val,
                                                     int src_lane, int width) {
  if (1 < width) {
    return Kokkos::shfl(val, src_lane, width);
  } else {
    return val;
  }
}

/*// all-reduce across corresponding vector lanes between team members within
warp
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec<
Kokkos::Cuda > >& loop_boundaries, const Lambda & lambda, const JoinType& join,
ValueType& initialized_result) {

  ValueType result = initialized_result;
  for( iType i = loop_boundaries.start; i < loop_boundaries.end;
       i+=loop_boundaries.increment) {
    lambda(i,result);
  }
  initialized_result = result;

  strided_shfl_warp_reduction<ValueType, JoinType>(
    join,
    initialized_result,
    loop_boundaries.thread.team_size(),
    blockDim.x);
  initialized_result = shfl_warp_broadcast<ValueType>( initialized_result,
    threadIdx.x, Impl::CudaTraits::WarpSize );
}*/

// all-reduce across corresponding vector lanes between team members within warp
// if no join() provided, use sum
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template <typename iType, class Lambda, typename ValueType, class Scheduler>
KOKKOS_INLINE_FUNCTION void parallel_reduce(
    const Impl::TeamThreadRangeBoundariesStruct<
        iType, Impl::TaskExec<Kokkos::Cuda, Scheduler>>& loop_boundaries,
    const Lambda& lambda, ValueType& initialized_result) {
  // TODO @internal_documentation what is the point of creating this temporary?
  ValueType result = initialized_result;
  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
       i += loop_boundaries.increment) {
    lambda(i, result);
  }
  initialized_result = result;

  if (1 < loop_boundaries.thread.team_size()) {
    strided_shfl_warp_reduction(
        [&](ValueType& val1, const ValueType& val2) { val1 += val2; },
        initialized_result, loop_boundaries.thread.team_size(), blockDim.x);

    initialized_result = shfl_warp_broadcast<ValueType>(
        initialized_result, threadIdx.x, Impl::CudaTraits::WarpSize);
  }
}

template <typename iType, class Lambda, typename ReducerType, class Scheduler>
KOKKOS_INLINE_FUNCTION void parallel_reduce(
    const Impl::TeamThreadRangeBoundariesStruct<
        iType, Impl::TaskExec<Kokkos::Cuda, Scheduler>>& loop_boundaries,
    const Lambda& lambda, const ReducerType& reducer) {
  using ValueType = typename ReducerType::value_type;
  // TODO @internal_documentation what is the point of creating this temporary?
  ValueType result = ValueType();
  reducer.init(result);

  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
       i += loop_boundaries.increment) {
    lambda(i, result);
  }

  if (1 < loop_boundaries.thread.team_size()) {
    strided_shfl_warp_reduction(
        [&](ValueType& val1, const ValueType& val2) {
          reducer.join(val1, val2);
        },
        result, loop_boundaries.thread.team_size(), blockDim.x);

    reducer.reference() = shfl_warp_broadcast<ValueType>(
        result, threadIdx.x, Impl::CudaTraits::WarpSize);
  } else {
    reducer.reference() = result;
  }
}

// all-reduce within team members within warp
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
/*template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec<
Kokkos::Cuda > >& loop_boundaries, const Lambda & lambda, const JoinType& join,
ValueType& initialized_result) {

  ValueType result = initialized_result;
  for( iType i = loop_boundaries.start; i < loop_boundaries.end;
       i+=loop_boundaries.increment) {
    lambda(i,result);
  }
  initialized_result = result;

  multi_shfl_warp_reduction<ValueType, JoinType>(join, initialized_result,
    blockDim.x);
  initialized_result = shfl_warp_broadcast<ValueType>(
    initialized_result, 0, blockDim.x );
}*/

// all-reduce within team members within warp
// if no join() provided, use sum
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template <typename iType, class Lambda, typename ValueType, class Scheduler>
KOKKOS_INLINE_FUNCTION void parallel_reduce(
    const Impl::ThreadVectorRangeBoundariesStruct<
        iType, Impl::TaskExec<Kokkos::Cuda, Scheduler>>& loop_boundaries,
    const Lambda& lambda, ValueType& initialized_result) {
  ValueType result = initialized_result;

  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
       i += loop_boundaries.increment) {
    lambda(i, result);
  }

  initialized_result = result;

  if (1 < loop_boundaries.thread.team_size()) {
    // initialized_result = multi_shfl_warp_reduction(
    multi_shfl_warp_reduction(
        [&](ValueType& val1, const ValueType& val2) { val1 += val2; },
        initialized_result, blockDim.x);

    initialized_result =
        shfl_warp_broadcast<ValueType>(initialized_result, 0, blockDim.x);
  }
}

template <typename iType, class Lambda, typename ReducerType, class Scheduler>
KOKKOS_INLINE_FUNCTION void parallel_reduce(
    const Impl::ThreadVectorRangeBoundariesStruct<
        iType, Impl::TaskExec<Kokkos::Cuda, Scheduler>>& loop_boundaries,
    const Lambda& lambda, const ReducerType& reducer) {
  using ValueType = typename ReducerType::value_type;

  ValueType result = ValueType();
  reducer.init(result);

  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
       i += loop_boundaries.increment) {
    lambda(i, result);
  }

  if (1 < loop_boundaries.thread.team_size()) {
    multi_shfl_warp_reduction(
        [&](ValueType& val1, const ValueType& val2) {
          reducer.join(val1, val2);
        },
        result, blockDim.x);

    reducer.reference() = shfl_warp_broadcast<ValueType>(result, 0, blockDim.x);
  } else {
    reducer.reference() = result;
  }
}

// scan across corresponding vector lanes between team members within warp
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template <typename iType, class Closure, class Scheduler>
KOKKOS_INLINE_FUNCTION void parallel_scan(
    const Impl::TeamThreadRangeBoundariesStruct<
        iType, Impl::TaskExec<Kokkos::Cuda, Scheduler>>& loop_boundaries,
    const Closure& closure) {
  // Extract value_type from closure

  using value_type = typename Kokkos::Impl::FunctorAnalysis<
      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;

  if (1 < loop_boundaries.thread.team_size()) {
    // make sure all threads perform all loop iterations
    const iType bound = loop_boundaries.end + loop_boundaries.start;
    const int lane    = threadIdx.y * blockDim.x;

    value_type accum = 0;
    value_type val, y, local_total;

    for (iType i = loop_boundaries.start; i < bound;
         i += loop_boundaries.increment) {
      val = 0;
      if (i < loop_boundaries.end) closure(i, val, false);

      // intra-blockDim.y exclusive scan on 'val'
      // accum = accumulated, sum in total for this iteration

      // INCLUSIVE scan
      for (int offset = blockDim.x; offset < Impl::CudaTraits::WarpSize;
           offset <<= 1) {
        y = Kokkos::shfl_up(val, offset, Impl::CudaTraits::WarpSize);
        if (lane >= offset) {
          val += y;
        }
      }

      // pass accum to all threads
      local_total = shfl_warp_broadcast<value_type>(
          val, threadIdx.x + Impl::CudaTraits::WarpSize - blockDim.x,
          Impl::CudaTraits::WarpSize);

      // make EXCLUSIVE scan by shifting values over one
      val = Kokkos::shfl_up(val, blockDim.x, Impl::CudaTraits::WarpSize);
      if (threadIdx.y == 0) {
        val = 0;
      }

      val += accum;
      if (i < loop_boundaries.end) closure(i, val, true);
      accum += local_total;
    }
  } else {
    value_type accum = 0;
    for (iType i = loop_boundaries.start; i < loop_boundaries.end;
         i += loop_boundaries.increment) {
      closure(i, accum, true);
    }
  }
}
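
// Worked example of the team-level scan above (blockDim.x == 1, team of 32):
// the inclusive-scan offsets are 1, 2, 4, 8, 16; the warp total is broadcast
// from lane 31 (threadIdx.x + WarpSize - blockDim.x); shifting the inclusive
// result up by blockDim.x and zeroing threadIdx.y == 0 yields the exclusive
// prefix passed to closure(i, val, true).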

// scan within team member (vector) within warp
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template <typename iType, class Closure, class Scheduler>
KOKKOS_INLINE_FUNCTION void parallel_scan(
    const Impl::ThreadVectorRangeBoundariesStruct<
        iType, Impl::TaskExec<Kokkos::Cuda, Scheduler>>& loop_boundaries,
    const Closure& closure) {
  // Extract value_type from closure

  using value_type = typename Kokkos::Impl::FunctorAnalysis<
      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;

  if (1 < loop_boundaries.thread.team_size()) {
    // make sure all threads perform all loop iterations
    const iType bound = loop_boundaries.end + loop_boundaries.start;

    value_type accum = 0;
    value_type val, y, local_total;

    for (iType i = loop_boundaries.start; i < bound;
         i += loop_boundaries.increment) {
      val = 0;
      if (i < loop_boundaries.end) closure(i, val, false);

      // intra-blockDim.x exclusive scan on 'val'
      // accum = accumulated, sum in total for this iteration

      // INCLUSIVE scan
      for (int offset = 1; offset < blockDim.x; offset <<= 1) {
        y = Kokkos::shfl_up(val, offset, blockDim.x);
        if (threadIdx.x >= offset) {
          val += y;
        }
      }

      // pass accum to all threads
      local_total =
          shfl_warp_broadcast<value_type>(val, blockDim.x - 1, blockDim.x);

      // make EXCLUSIVE scan by shifting values over one
      val = Kokkos::shfl_up(val, 1, blockDim.x);
      if (threadIdx.x == 0) {
        val = 0;
      }

      val += accum;
      if (i < loop_boundaries.end) closure(i, val, true);
      accum += local_total;
    }
  } else {
    value_type accum = 0;
    for (iType i = loop_boundaries.start; i < loop_boundaries.end;
         i += loop_boundaries.increment) {
      closure(i, accum, true);
    }
  }
}

} /* namespace Kokkos */

namespace Kokkos {

template <class FunctorType, class Scheduler>
KOKKOS_INLINE_FUNCTION void single(
    const Impl::VectorSingleStruct<Impl::TaskExec<Kokkos::Cuda, Scheduler>>&,
    const FunctorType& lambda) {
#ifdef __CUDA_ARCH__
  if (threadIdx.x == 0) lambda();
#endif
}

template <class FunctorType, class Scheduler>
KOKKOS_INLINE_FUNCTION void single(
    const Impl::ThreadSingleStruct<Impl::TaskExec<Kokkos::Cuda, Scheduler>>&,
    const FunctorType& lambda) {
#ifdef __CUDA_ARCH__
  if (threadIdx.x == 0 && threadIdx.y == 0) lambda();
#endif
}

template <class FunctorType, class ValueType, class Scheduler>
KOKKOS_INLINE_FUNCTION void single(
    const Impl::VectorSingleStruct<Impl::TaskExec<Kokkos::Cuda, Scheduler>>& s,
    const FunctorType& lambda, ValueType& val) {
#ifdef __CUDA_ARCH__
  if (threadIdx.x == 0) lambda(val);
  if (1 < s.team_member.team_size()) {
    val = shfl(val, 0, blockDim.x);
  }
#endif
}

template <class FunctorType, class ValueType, class Scheduler>
KOKKOS_INLINE_FUNCTION void single(
    const Impl::ThreadSingleStruct<Impl::TaskExec<Kokkos::Cuda, Scheduler>>&
        single_struct,
    const FunctorType& lambda, ValueType& val) {
#ifdef __CUDA_ARCH__
  if (threadIdx.x == 0 && threadIdx.y == 0) {
    lambda(val);
  }
  single_struct.team_member.team_broadcast(val, 0);
#endif
}

}  // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
#endif /* #ifndef KOKKOS_IMPL_CUDA_TASK_HPP */