//===---- reduction.cu - GPU OpenMP reduction implementation ----- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of reduction with KMPC interface.
//
//===----------------------------------------------------------------------===//

#include "common/omptarget.h"
#include "common/target_atomic.h"
#include "target_impl.h"

EXTERN
void __kmpc_nvptx_end_reduce(int32_t global_tid) {}

EXTERN
void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {}

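// Shuffle a 32-bit value down the (logical) warp: each lane receives the value
// held by the lane 'delta' positions above it within a group of 'size' lanes.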
EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) {
  return __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, val, delta, size);
}

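// 64-bit variant of the shuffle: the value is split into two 32-bit halves,
// each half is shuffled down independently, and the result is repacked.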
EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
  uint32_t lo, hi;
  __kmpc_impl_unpack(val, lo, hi);
  hi = __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, hi, delta, size);
  lo = __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, lo, delta, size);
  return __kmpc_impl_pack(lo, hi);
}

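// Tree reduction over a full warp: the shuffle offset is halved each round so
// that after log2(WARPSIZE) steps lane 0 holds the combined value. The LaneId
// argument is not used by the AlgoVersion 0 shuffle-reduce function.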
INLINE static void gpu_regular_warp_reduce(void *reduce_data,
                                           kmp_ShuffleReductFctPtr shflFct) {
  for (uint32_t mask = WARPSIZE / 2; mask > 0; mask /= 2) {
    shflFct(reduce_data, /*LaneId - not used= */ 0,
            /*Offset = */ mask, /*AlgoVersion=*/0);
  }
}

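// Reduction over 'size' contiguous active lanes (a partial warp). Each round
// folds the upper half of the remaining elements onto the lower half; the
// remaining count is rounded up so an odd leftover element is carried into the
// next round.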
INLINE static void gpu_irregular_warp_reduce(void *reduce_data,
                                             kmp_ShuffleReductFctPtr shflFct,
                                             uint32_t size, uint32_t tid) {
  uint32_t curr_size;
  uint32_t mask;
  curr_size = size;
  mask = curr_size / 2;
  while (mask > 0) {
    shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1);
    curr_size = (curr_size + 1) / 2;
    mask = curr_size / 2;
  }
}

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
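// Pre-Volta only: reduction over dispersed (non-contiguous) active lanes, e.g.
// from an L2 parallel region. Each iteration folds in the value of the next
// higher active lane; returns 1 on the lane that ends up holding the reduced
// value (logical lane 0).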
INLINE static uint32_t
gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
  uint32_t size, remote_id, physical_lane_id;
  physical_lane_id = GetThreadIdInBlock() % WARPSIZE;
  __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
  __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
  uint32_t logical_lane_id = __kmpc_impl_popc(Liveness & lanemask_lt) * 2;
  __kmpc_impl_lanemask_t lanemask_gt = __kmpc_impl_lanemask_gt();
  do {
    Liveness = __kmpc_impl_activemask();
    remote_id = __kmpc_impl_ffs(Liveness & lanemask_gt);
    size = __kmpc_impl_popc(Liveness);
    logical_lane_id /= 2;
    shflFct(reduce_data, /*LaneId =*/logical_lane_id,
            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
  } while (logical_lane_id % 2 == 0 && size > 1);
  return (logical_lane_id == 0);
}
#endif

INLINE
static int32_t nvptx_parallel_reduce_nowait(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
    bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
  uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
  uint32_t NumThreads = GetNumberOfOmpThreads(isSPMDExecutionMode);
  if (NumThreads == 1)
    return 1;
  /*
   * This reduce function handles reduction within a team. It handles
   * parallel regions in both L1 and L2 parallelism levels. It also
   * supports Generic, SPMD, and NoOMP modes.
   *
   * 1. Reduce within a warp.
   * 2. Warp master copies value to warp 0 via shared memory.
   * 3. Warp 0 reduces to a single value.
   * 4. The reduced value is available in the thread that returns 1.
   */

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
  uint32_t WarpId = BlockThreadId / WARPSIZE;

  // Volta execution model:
  // For the Generic execution mode a parallel region has either 1 thread or
  // a multiple of 32 threads. For the SPMD execution mode we may have any
  // number of threads.
  if ((NumThreads % WARPSIZE == 0) || (WarpId < WarpsNeeded - 1))
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (NumThreads > 1) // Only SPMD execution mode comes through this case.
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/NumThreads % WARPSIZE,
                              /*LaneId=*/GetThreadIdInBlock() % WARPSIZE);

  // When we have more threads than the warp size, a block reduction is
  // performed here.
  //
  // Only the L1 parallel region can enter this if condition.
  if (NumThreads > WARPSIZE) {
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);
  }
  return BlockThreadId == 0;
#else
  __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
  if (Liveness == __kmpc_impl_all_lanes) // Full warp
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/__kmpc_impl_popc(Liveness),
                              /*LaneId=*/GetThreadIdInBlock() % WARPSIZE);
  else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2
                                    // parallel region may enter here; return
                                    // early.
    return gpu_irregular_simd_reduce(reduce_data, shflFct);

  // When we have more threads than the warp size, a block reduction is
  // performed here.
  //
  // Only the L1 parallel region can enter this if condition.
  if (NumThreads > WARPSIZE) {
    uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    uint32_t WarpId = BlockThreadId / WARPSIZE;
    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);

    return BlockThreadId == 0;
  } else if (isRuntimeUninitialized /* Never an L2 parallel region without the OMP runtime */) {
    return BlockThreadId == 0;
  }

  // Get the OMP thread Id. This is different from BlockThreadId in the case of
  // an L2 parallel region.
  return global_tid == 0;
#endif // __CUDA_ARCH__ >= 700
}

EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
    kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
    void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
    kmp_InterWarpCopyFctPtr cpyFct) {
  return nvptx_parallel_reduce_nowait(
      global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
      checkSPMDMode(loc), checkRuntimeUninitialized(loc));
}

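// A thread counts as team master if we are in Generic mode (only the master
// thread reaches this point) or if it is the designated team master thread in
// SPMD mode.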
INLINE static bool isMaster(kmp_Ident *loc, uint32_t ThreadId) {
  return checkGenericMode(loc) || IsTeamMaster(ThreadId);
}

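// Round the thread count down to a multiple of the warp size; anything smaller
// than a full warp collapses to a single thread.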
INLINE static uint32_t roundToWarpsize(uint32_t s) {
  if (s < WARPSIZE)
    return 1;
  return (s & ~(unsigned)(WARPSIZE - 1));
}

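// Global counters coordinating the teams reduction below: IterCnt is the id of
// the first team currently allowed to write into the global buffer (advanced
// by num_of_records once a full chunk of teams has contributed), and Cnt
// counts the teams that have contributed to the current chunk.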
DEVICE static volatile uint32_t IterCnt = 0;
DEVICE static volatile uint32_t Cnt = 0;
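// Teams reduction through a global buffer of num_of_records slots: each team
// master copies (first round) or reduces (later rounds) its data into slot
// TeamId % num_of_records; the last contributing team loads the slots back,
// reduces them in parallel, and returns 1 on the thread holding the result.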
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
    kmp_Ident *loc, int32_t global_tid, void *global_buffer,
    int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
    kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct,
    kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct,
    kmp_ListGlobalFctPtr glredFct) {

  // Terminate all threads in non-SPMD mode except for the master thread.
  if (checkGenericMode(loc) && GetThreadIdInBlock() != GetMasterThreadID())
    return 0;

  uint32_t ThreadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));

  // In non-generic mode all workers participate in the teams reduction.
  // In generic mode only the team master participates in the teams
  // reduction because the workers are waiting for parallel work.
  uint32_t NumThreads =
      checkSPMDMode(loc) ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true)
                         : /*Master thread only*/ 1;
  uint32_t TeamId = GetBlockIdInKernel();
  uint32_t NumTeams = GetNumberOfBlocksInKernel();
  static SHARED unsigned Bound;
  static SHARED unsigned ChunkTeamCount;

  // Block progress for teams with an id beyond the current upper limit. At
  // any time only a number of teams less than or equal to the number of slots
  // in the buffer is allowed to proceed.
  bool IsMaster = isMaster(loc, ThreadId);
  while (IsMaster) {
    // Atomic read
    Bound = __kmpc_atomic_add((uint32_t *)&IterCnt, 0u);
    if (TeamId < Bound + num_of_records)
      break;
  }

  if (IsMaster) {
    int ModBlockId = TeamId % num_of_records;
    if (TeamId < num_of_records)
      lgcpyFct(global_buffer, ModBlockId, reduce_data);
    else
      lgredFct(global_buffer, ModBlockId, reduce_data);
    __kmpc_impl_threadfence_system();

    // Increment team counter.
    // This counter is incremented by all teams in the current
    // num_of_records chunk.
    ChunkTeamCount = __kmpc_atomic_inc((uint32_t *)&Cnt, num_of_records - 1u);
  }
  // Synchronize
  if (checkSPMDMode(loc))
    __kmpc_barrier(loc, global_tid);

  // reduce_data is global or shared so before being reduced within the
  // warp we need to bring it in local memory:
  // local_reduce_data = reduce_data[i]
  //
  // Example for 3 reduction variables a, b, c (of potentially different
  // types):
  //
  // buffer layout (struct of arrays):
  // a, a, ..., a, b, b, ... b, c, c, ... c
  // |__________|
  //     num_of_records
  //
  // local_reduce_data layout (struct):
  // a, b, c
  //
  // Each thread will have a local struct containing the values to be
  // reduced:
  //      1. do reduction within each warp.
  //      2. do reduction across warps.
  //      3. write the final result to the main reduction variable
  //         by returning 1 in the thread holding the reduction result.

  // Check if this is the very last team.
  unsigned NumRecs = __kmpc_impl_min(NumTeams, uint32_t(num_of_records));
  if (ChunkTeamCount == NumTeams - Bound - 1) {
    //
    // Last team processing.
    //
    if (ThreadId >= NumRecs)
      return 0;
    NumThreads = roundToWarpsize(__kmpc_impl_min(NumThreads, NumRecs));
    if (ThreadId >= NumThreads)
      return 0;

    // Load from buffer and reduce.
    glcpyFct(global_buffer, ThreadId, reduce_data);
    for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
      glredFct(global_buffer, i, reduce_data);

    // Reduce across warps to the warp master.
    if (NumThreads > 1) {
      gpu_regular_warp_reduce(reduce_data, shflFct);

      // When we have more threads than the warp size, a block reduction is
      // performed here.
      uint32_t ActiveThreads = __kmpc_impl_min(NumRecs, NumThreads);
      if (ActiveThreads > WARPSIZE) {
        uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
        // Gather all the reduced values from each warp
        // to the first warp.
        cpyFct(reduce_data, WarpsNeeded);

        uint32_t WarpId = ThreadId / WARPSIZE;
        if (WarpId == 0)
          gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                    ThreadId);
      }
    }

    if (IsMaster) {
      Cnt = 0;
      IterCnt = 0;
      return 1;
    }
    return 0;
  }
  if (IsMaster && ChunkTeamCount == num_of_records - 1) {
    // Allow the next num_of_records teams to proceed writing their
    // intermediate results to the global buffer.
    __kmpc_atomic_add((uint32_t *)&IterCnt, uint32_t(num_of_records));
  }

  return 0;
}