//===---- reduction.cu - GPU OpenMP reduction implementation ----- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of reduction with the KMPC interface.
//
//===----------------------------------------------------------------------===//

#include "common/omptarget.h"
#include "common/target_atomic.h"
#include "target_impl.h"

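// The end-of-reduction entry points are currently no-ops on the device.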
EXTERN
void __kmpc_nvptx_end_reduce(int32_t global_tid) {}

EXTERN
void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {}

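// Shuffle 'val' down by 'delta' lanes within a logical warp of 'size' lanes
// and return the value received from the source lane.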
EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) {
  return __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, val, delta, size);
}

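// 64-bit variant of the shuffle above: split 'val' into two 32-bit halves,
// shuffle each half down by 'delta' lanes, then repack the result.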
EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
  uint32_t lo, hi;
  __kmpc_impl_unpack(val, lo, hi);
  hi = __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, hi, delta, size);
  lo = __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, lo, delta, size);
  return __kmpc_impl_pack(lo, hi);
}

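// Tree reduction over a fully active warp: the shuffle offset is halved
// every round and the final value ends up in lane 0.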
INLINE static void gpu_regular_warp_reduce(void *reduce_data,
                                           kmp_ShuffleReductFctPtr shflFct) {
  for (uint32_t mask = WARPSIZE / 2; mask > 0; mask /= 2) {
    shflFct(reduce_data, /*LaneId - not used= */ 0,
            /*Offset = */ mask, /*AlgoVersion=*/0);
  }
}

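// Tree reduction over the first 'size' contiguous lanes of a warp; 'tid' is
// the lane's position within that group and the result ends up in the lane
// with tid 0.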
INLINE static void gpu_irregular_warp_reduce(void *reduce_data,
                                             kmp_ShuffleReductFctPtr shflFct,
                                             uint32_t size, uint32_t tid) {
  uint32_t curr_size;
  uint32_t mask;
  curr_size = size;
  mask = curr_size / 2;
  while (mask > 0) {
    shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1);
    curr_size = (curr_size + 1) / 2;
    mask = curr_size / 2;
  }
}

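// Reduction over an arbitrary, possibly non-contiguous, set of active lanes
// (divergent SIMD lanes). Each round pairs an active lane with the next
// higher active lane until one value remains; returns 1 on the lane that
// holds the result (the first active lane).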
INLINE static uint32_t
gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
  uint32_t size, remote_id, physical_lane_id;
  physical_lane_id = GetThreadIdInBlock() % WARPSIZE;
  __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
  __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
  uint32_t logical_lane_id = __kmpc_impl_popc(Liveness & lanemask_lt) * 2;
  __kmpc_impl_lanemask_t lanemask_gt = __kmpc_impl_lanemask_gt();
  do {
    Liveness = __kmpc_impl_activemask();
    remote_id = __kmpc_impl_ffs(Liveness & lanemask_gt);
    size = __kmpc_impl_popc(Liveness);
    logical_lane_id /= 2;
    shflFct(reduce_data, /*LaneId =*/logical_lane_id,
            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
  } while (logical_lane_id % 2 == 0 && size > 1);
  return (logical_lane_id == 0);
}

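// Warp-level reduction for a SIMD region: a fully active warp uses the
// regular tree reduction, while a partially active warp falls back to the
// irregular SIMD reduction. The lane that ends up holding the result
// returns 1.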
EXTERN
int32_t __kmpc_nvptx_simd_reduce_nowait(int32_t global_tid, int32_t num_vars,
                                        size_t reduce_size, void *reduce_data,
                                        kmp_ShuffleReductFctPtr shflFct,
                                        kmp_InterWarpCopyFctPtr cpyFct) {
  __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
  if (Liveness == __kmpc_impl_all_lanes) {
    gpu_regular_warp_reduce(reduce_data, shflFct);
    return GetThreadIdInBlock() % WARPSIZE ==
           0; // Result on lane 0 of the simd warp.
  } else {
    return gpu_irregular_simd_reduce(
        reduce_data, shflFct); // Result on the first active lane.
  }
}

INLINE
static int32_t nvptx_parallel_reduce_nowait(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
    bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
  uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
  uint32_t NumThreads = GetNumberOfOmpThreads(isSPMDExecutionMode);
  if (NumThreads == 1)
    return 1;
  /*
   * This reduce function handles reductions within a team. It handles
   * parallel regions at both the L1 and L2 parallelism levels and supports
   * the Generic, SPMD, and NoOMP modes.
   *
   * 1. Reduce within a warp.
   * 2. Each warp master copies its value to warp 0 via shared memory.
   * 3. Warp 0 reduces to a single value.
   * 4. The reduced value is available in the thread that returns 1.
   */

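  // For illustration only (hypothetical, not part of this runtime): for a
  // simple int32_t sum reduction, the compiler-generated shuffle-and-reduce
  // callback passed as 'shflFct' could look roughly like
  //
  //   static void sum_shfl_fct(void *reduce_data, int16_t lane_id,
  //                            int16_t lane_offset, int16_t algo_version) {
  //     int32_t *sum = (int32_t *)reduce_data;
  //     *sum += __kmpc_shuffle_int32(*sum, lane_offset, WARPSIZE);
  //   }
  //
  // i.e. it pulls the partner lane's partial value via __kmpc_shuffle_int32
  // and folds it into the local copy; lane_id and algo_version are used by
  // the more general variants to guard lanes without valid data.
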
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
  uint32_t WarpId = BlockThreadId / WARPSIZE;

  // Volta execution model:
  // In Generic execution mode a parallel region has either 1 thread or a
  // multiple of 32 threads. In SPMD execution mode it may have any number of
  // threads.
  if ((NumThreads % WARPSIZE == 0) || (WarpId < WarpsNeeded - 1))
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (NumThreads > 1) // Only SPMD execution mode comes through this case.
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/NumThreads % WARPSIZE,
                              /*LaneId=*/GetThreadIdInBlock() % WARPSIZE);

  // When there are more than [warpsize] threads, a block reduction is
  // performed here.
  //
  // Only an L1 parallel region can enter this if condition.
  if (NumThreads > WARPSIZE) {
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);
  }
  return BlockThreadId == 0;
#else
  __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
  if (Liveness == __kmpc_impl_all_lanes) // Full warp
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/__kmpc_impl_popc(Liveness),
                              /*LaneId=*/GetThreadIdInBlock() % WARPSIZE);
  else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in an L2
                                    // parallel region may enter here; return
                                    // early.
    return gpu_irregular_simd_reduce(reduce_data, shflFct);

  // When there are more than [warpsize] threads, a block reduction is
  // performed here.
  //
  // Only an L1 parallel region can enter this if condition.
  if (NumThreads > WARPSIZE) {
    uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    uint32_t WarpId = BlockThreadId / WARPSIZE;
    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);

    return BlockThreadId == 0;
  } else if (isRuntimeUninitialized /* Never an L2 parallel region without the OMP runtime */) {
    return BlockThreadId == 0;
  }

  // Get the OMP thread Id. This is different from BlockThreadId in the case of
  // an L2 parallel region.
  return global_tid == 0;
#endif // __CUDA_ARCH__ >= 700
}

EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
  return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size,
                                      reduce_data, shflFct, cpyFct,
                                      isSPMDMode(), isRuntimeUninitialized());
}

EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
    kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
    void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
    kmp_InterWarpCopyFctPtr cpyFct) {
  return nvptx_parallel_reduce_nowait(
      global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
      checkSPMDMode(loc), checkRuntimeUninitialized(loc));
}

EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
  return nvptx_parallel_reduce_nowait(
      global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
      /*isSPMDExecutionMode=*/true, /*isRuntimeUninitialized=*/true);
}

EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
  return nvptx_parallel_reduce_nowait(
      global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
      /*isSPMDExecutionMode=*/false, /*isRuntimeUninitialized=*/true);
}

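// Scratchpad-based teams reduction: the master of every team writes its
// partial result into a global scratchpad, a timestamp counter identifies
// the last team to arrive, and that team reduces all scratchpad entries.
// The thread that returns 1 holds the final result.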
INLINE
static int32_t nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
                                         size_t reduce_size, void *reduce_data,
                                         kmp_ShuffleReductFctPtr shflFct,
                                         kmp_InterWarpCopyFctPtr cpyFct,
                                         kmp_CopyToScratchpadFctPtr scratchFct,
                                         kmp_LoadReduceFctPtr ldFct,
                                         bool isSPMDExecutionMode) {
  uint32_t ThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
  // In non-generic mode all workers participate in the teams reduction.
  // In generic mode only the team master participates in the teams
  // reduction because the workers are waiting for parallel work.
  uint32_t NumThreads =
      isSPMDExecutionMode ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true)
                          : /*Master thread only*/ 1;
  uint32_t TeamId = GetBlockIdInKernel();
  uint32_t NumTeams = GetNumberOfBlocksInKernel();
  SHARED volatile bool IsLastTeam;

  // Team masters of all teams write to the scratchpad.
  if (ThreadId == 0) {
    unsigned int *timestamp = GetTeamsReductionTimestamp();
    char *scratchpad = GetTeamsReductionScratchpad();

    scratchFct(reduce_data, scratchpad, TeamId, NumTeams);
    __kmpc_impl_threadfence();

    // atomicInc increments 'timestamp' and wraps it within [0, NumTeams-1],
    // i.e. it resets 'timestamp' back to 0 once the last team increments
    // this counter.
    unsigned val = __kmpc_atomic_inc(timestamp, NumTeams - 1);
    IsLastTeam = val == NumTeams - 1;
  }

  // We have to wait on the L1 barrier because in GENERIC mode the workers
  // are waiting on barrier 0 for work.
  //
  // If we guard this barrier as follows it leads to deadlock, probably
  // because of a compiler bug: if (!IsGenericMode()) __syncthreads();
  uint16_t SyncWarps = (NumThreads + WARPSIZE - 1) / WARPSIZE;
  __kmpc_impl_named_sync(L1_BARRIER, SyncWarps * WARPSIZE);

  // If this team is not the last, quit.
  if (/* Volatile read by all threads */ !IsLastTeam)
    return 0;

  //
  // Last team processing.
  //

  // Threads in excess of #teams do not participate in the reduction of the
  // scratchpad values.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  uint32_t ActiveThreads = NumThreads;
  if (NumTeams < NumThreads) {
    ActiveThreads =
        (NumTeams < WARPSIZE) ? 1 : NumTeams & ~((uint16_t)WARPSIZE - 1);
  }
  if (ThreadId >= ActiveThreads)
    return 0;

  // Load from scratchpad and reduce.
  char *scratchpad = GetTeamsReductionScratchpad();
  ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0);
  for (uint32_t i = ActiveThreads + ThreadId; i < NumTeams; i += ActiveThreads)
    ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);

  uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
  uint32_t WarpId = ThreadId / WARPSIZE;

  // Reduce across warps to the warp master.
  if ((ActiveThreads % WARPSIZE == 0) ||
      (WarpId < WarpsNeeded - 1)) // Full warp
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (ActiveThreads > 1) // Partial warp but contiguous lanes
    // Only SPMD execution mode comes through this case.
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/ActiveThreads % WARPSIZE,
                              /*LaneId=*/ThreadId % WARPSIZE);

  // When there are more than [warpsize] threads, a block reduction is
  // performed here.
  if (ActiveThreads > WARPSIZE) {
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
  }
#else
  if (ThreadId >= NumTeams)
    return 0;

  // Load from scratchpad and reduce.
  char *scratchpad = GetTeamsReductionScratchpad();
  ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0);
  for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads)
    ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);

  // Reduce across warps to the warp master.
  __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
  if (Liveness == __kmpc_impl_all_lanes) // Full warp
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else // Partial warp but contiguous lanes
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/__kmpc_impl_popc(Liveness),
                              /*LaneId=*/ThreadId % WARPSIZE);

  // When there are more than [warpsize] threads, a block reduction is
  // performed here.
  uint32_t ActiveThreads = NumTeams < NumThreads ? NumTeams : NumThreads;
  if (ActiveThreads > WARPSIZE) {
    uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    uint32_t WarpId = ThreadId / WARPSIZE;
    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
  }
#endif // __CUDA_ARCH__ >= 700

  return ThreadId == 0;
}

EXTERN
int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
                                         size_t reduce_size, void *reduce_data,
                                         kmp_ShuffleReductFctPtr shflFct,
                                         kmp_InterWarpCopyFctPtr cpyFct,
                                         kmp_CopyToScratchpadFctPtr scratchFct,
                                         kmp_LoadReduceFctPtr ldFct) {
  return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
                                   reduce_data, shflFct, cpyFct, scratchFct,
                                   ldFct, isSPMDMode());
}

EXTERN
int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
    kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) {
  return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
                                   reduce_data, shflFct, cpyFct, scratchFct,
                                   ldFct, /*isSPMDExecutionMode=*/true);
}

EXTERN
int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
    kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) {
  return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
                                   reduce_data, shflFct, cpyFct, scratchFct,
                                   ldFct, /*isSPMDExecutionMode=*/false);
}

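// The "simple" teams reduction implements the reduction as a critical
// section: the begin entry point spins on an atomic CAS of 'crit' to acquire
// the lock, and the end entry point releases it after a system-wide memory
// fence.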
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc,
                                                       int32_t global_tid,
                                                       kmp_CriticalName *crit) {
  if (checkSPMDMode(loc) && GetThreadIdInBlock() != 0)
    return 0;
  // The master thread of the team actually does the reduction.
  while (__kmpc_atomic_cas((uint32_t *)crit, 0u, 1u))
    ;
  return 1;
}

EXTERN void
__kmpc_nvptx_teams_end_reduce_nowait_simple(kmp_Ident *loc, int32_t global_tid,
                                            kmp_CriticalName *crit) {
  __kmpc_impl_threadfence_system();
  (void)__kmpc_atomic_exchange((uint32_t *)crit, 0u);
}

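// In generic mode only the team master reaches the teams reduction, so every
// caller is the master; in SPMD mode we must check for the team master
// explicitly.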
INLINE static bool isMaster(kmp_Ident *loc, uint32_t ThreadId) {
  return checkGenericMode(loc) || IsTeamMaster(ThreadId);
}

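// Round 's' down to a multiple of the warp size, with a minimum of 1.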
INLINE static uint32_t roundToWarpsize(uint32_t s) {
  if (s < WARPSIZE)
    return 1;
  return (s & ~(unsigned)(WARPSIZE - 1));
}

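// Global counters used by the buffered teams reduction below: 'Cnt' counts
// the teams that have contributed to the current chunk of the global buffer,
// and 'IterCnt' is the base of the sliding window of team ids allowed to
// write into the buffer (a team proceeds once its id is below
// IterCnt + num_of_records).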
DEVICE static volatile uint32_t IterCnt = 0;
DEVICE static volatile uint32_t Cnt = 0;
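// Buffered teams reduction: team masters write (or reduce) their partial
// results round-robin into the num_of_records slots of 'global_buffer'. The
// last team of the kernel reduces the occupied slots, and its master returns
// 1 so the caller stores the final result; all other threads return 0.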
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
    kmp_Ident *loc, int32_t global_tid, void *global_buffer,
    int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
    kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct,
    kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct,
    kmp_ListGlobalFctPtr glredFct) {

  // Terminate all threads in non-SPMD mode except for the master thread.
  if (checkGenericMode(loc) && GetThreadIdInBlock() != GetMasterThreadID())
    return 0;

  uint32_t ThreadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));

  // In non-generic mode all workers participate in the teams reduction.
  // In generic mode only the team master participates in the teams
  // reduction because the workers are waiting for parallel work.
  uint32_t NumThreads =
      checkSPMDMode(loc) ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true)
                         : /*Master thread only*/ 1;
  uint32_t TeamId = GetBlockIdInKernel();
  uint32_t NumTeams = GetNumberOfBlocksInKernel();
  SHARED unsigned Bound;
  SHARED unsigned ChunkTeamCount;

  // Block progress for teams beyond the current upper limit. We never allow
  // more teams to be in flight than there are slots in the buffer.
  bool IsMaster = isMaster(loc, ThreadId);
  while (IsMaster) {
    // Atomic read
    Bound = __kmpc_atomic_add((uint32_t *)&IterCnt, 0u);
    if (TeamId < Bound + num_of_records)
      break;
  }

  if (IsMaster) {
    int ModBlockId = TeamId % num_of_records;
    if (TeamId < num_of_records)
      lgcpyFct(global_buffer, ModBlockId, reduce_data);
    else
      lgredFct(global_buffer, ModBlockId, reduce_data);
    __kmpc_impl_threadfence_system();

    // Increment the team counter. This counter is incremented by every team
    // in the current chunk of num_of_records teams.
    ChunkTeamCount = __kmpc_atomic_inc((uint32_t *)&Cnt, num_of_records - 1u);
  }
  // Synchronize
  if (checkSPMDMode(loc))
    __kmpc_barrier(loc, global_tid);

  // reduce_data is global or shared so before being reduced within the
  // warp we need to bring it into local memory:
  // local_reduce_data = reduce_data[i]
  //
  // Example for 3 reduction variables a, b, c (of potentially different
  // types):
  //
  // buffer layout (struct of arrays):
  // a, a, ..., a, b, b, ... b, c, c, ... c
  // |__________|
  //     num_of_records
  //
  // local_reduce_data layout (struct):
  // a, b, c
  //
  // Each thread will have a local struct containing the values to be
  // reduced:
  //      1. do reduction within each warp.
  //      2. do reduction across warps.
  //      3. write the final result to the main reduction variable
  //         by returning 1 in the thread holding the reduction result.

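  // For illustration only (hypothetical): with two reduction variables
  // 'int a; double b;' and num_of_records slots, the global buffer is laid
  // out as a struct of arrays, roughly
  //
  //   struct GlobalBuffer { int a[num_of_records]; double b[num_of_records]; };
  //
  // while reduce_data points to the per-thread struct { int a; double b; }.
  // lgcpyFct/lgredFct copy or reduce the local struct into a buffer slot,
  // and glcpyFct/glredFct copy or reduce a buffer slot back into the local
  // struct.
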
  // Check if this is the very last team.
  unsigned NumRecs = __kmpc_impl_min(NumTeams, uint32_t(num_of_records));
  if (ChunkTeamCount == NumTeams - Bound - 1) {
    //
    // Last team processing.
    //
    if (ThreadId >= NumRecs)
      return 0;
    NumThreads = roundToWarpsize(__kmpc_impl_min(NumThreads, NumRecs));
    if (ThreadId >= NumThreads)
      return 0;

    // Load from buffer and reduce.
    glcpyFct(global_buffer, ThreadId, reduce_data);
    for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
      glredFct(global_buffer, i, reduce_data);

    // Reduce across warps to the warp master.
    if (NumThreads > 1) {
      gpu_regular_warp_reduce(reduce_data, shflFct);

      // When there are more than [warpsize] threads, a block reduction is
      // performed here.
      uint32_t ActiveThreads = __kmpc_impl_min(NumRecs, NumThreads);
      if (ActiveThreads > WARPSIZE) {
        uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
        // Gather all the reduced values from each warp
        // to the first warp.
        cpyFct(reduce_data, WarpsNeeded);

        uint32_t WarpId = ThreadId / WARPSIZE;
        if (WarpId == 0)
          gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                    ThreadId);
      }
    }

    if (IsMaster) {
      Cnt = 0;
      IterCnt = 0;
      return 1;
    }
    return 0;
  }
  if (IsMaster && ChunkTeamCount == num_of_records - 1) {
    // Allow the next num_of_records teams to proceed with writing their
    // intermediate results to the global buffer.
    __kmpc_atomic_add((uint32_t *)&IterCnt, uint32_t(num_of_records));
  }

  return 0;
}