//===---- reduction.cu - GPU OpenMP reduction implementation ----- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of reduction with KMPC interface.
//
//===----------------------------------------------------------------------===//
#pragma omp declare target

#include "common/omptarget.h"
#include "target_impl.h"

EXTERN
void __kmpc_nvptx_end_reduce(int32_t global_tid) {}

EXTERN
void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {}

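// Shuffle helpers used by the reduction routines below: move a 32-bit value
// (or a 64-bit value, split into two 32-bit halves) `delta` lanes down within
// a warp segment of `size` lanes.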
EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) {
  return __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, val, delta, size);
}

EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
  uint32_t lo, hi;
  __kmpc_impl_unpack(val, lo, hi);
  hi = __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, hi, delta, size);
  lo = __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, lo, delta, size);
  return __kmpc_impl_pack(lo, hi);
}

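// Tree-reduce a fully populated warp: fold lanes together with shuffle
// offsets WARPSIZE/2, WARPSIZE/4, ..., 1 so the combined value ends up in
// lane 0 of the warp.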
INLINE static void gpu_regular_warp_reduce(void *reduce_data,
                                           kmp_ShuffleReductFctPtr shflFct) {
  for (uint32_t mask = WARPSIZE / 2; mask > 0; mask /= 2) {
    shflFct(reduce_data, /*LaneId - not used= */ 0,
            /*Offset = */ mask, /*AlgoVersion=*/0);
  }
}

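// Reduce `size` contiguous active lanes of a partially populated warp; `tid`
// is the caller's lane id within that contiguous group.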
INLINE static void gpu_irregular_warp_reduce(void *reduce_data,
                                             kmp_ShuffleReductFctPtr shflFct,
                                             uint32_t size, uint32_t tid) {
  uint32_t curr_size;
  uint32_t mask;
  curr_size = size;
  mask = curr_size / 2;
  while (mask > 0) {
    shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1);
    curr_size = (curr_size + 1) / 2;
    mask = curr_size / 2;
  }
}

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
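// Reduce a warp whose active lanes are dispersed (non-contiguous). Per the
// caller below, only threads in an L2 parallel region reach this path, and it
// is only compiled for pre-Volta architectures. Returns 1 on the lane that
// ends up holding the reduced value.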
INLINE static uint32_t
gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
  uint32_t size, remote_id, physical_lane_id;
  physical_lane_id = GetThreadIdInBlock() % WARPSIZE;
  __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
  __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
  uint32_t logical_lane_id = __kmpc_impl_popc(Liveness & lanemask_lt) * 2;
  __kmpc_impl_lanemask_t lanemask_gt = __kmpc_impl_lanemask_gt();
  do {
    Liveness = __kmpc_impl_activemask();
    remote_id = __kmpc_impl_ffs(Liveness & lanemask_gt);
    size = __kmpc_impl_popc(Liveness);
    logical_lane_id /= 2;
    shflFct(reduce_data, /*LaneId =*/logical_lane_id,
            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
  } while (logical_lane_id % 2 == 0 && size > 1);
  return (logical_lane_id == 0);
}
#endif

INLINE
static int32_t nvptx_parallel_reduce_nowait(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
    bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
  uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
  uint32_t NumThreads = GetNumberOfOmpThreads(isSPMDExecutionMode);
  if (NumThreads == 1)
    return 1;
  /*
   * This reduce function handles reduction within a team. It handles
   * parallel regions in both L1 and L2 parallelism levels. It also
   * supports Generic, SPMD, and NoOMP modes.
   *
   * 1. Reduce within a warp.
   * 2. Warp master copies value to warp 0 via shared memory.
   * 3. Warp 0 reduces to a single value.
   * 4. The reduced value is available in the thread that returns 1.
   */
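  // Worked example (for illustration): with 96 OpenMP threads, step 1 reduces
  // each of the 3 full warps independently, step 2 copies the 3 partial
  // results into warp 0, and step 3 has warp 0 combine them; the final value
  // is then held by thread 0, which returns 1.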

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
  uint32_t WarpId = BlockThreadId / WARPSIZE;

  // Volta execution model:
  // In the Generic execution mode a parallel region has either 1 thread or a
  // multiple of 32 threads. In the SPMD execution mode it may have any number
  // of threads.
  if ((NumThreads % WARPSIZE == 0) || (WarpId < WarpsNeeded - 1))
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (NumThreads > 1) // Only the SPMD execution mode comes through here.
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/NumThreads % WARPSIZE,
                              /*LaneId=*/GetThreadIdInBlock() % WARPSIZE);

  // When we have more than [warpsize] threads, a block reduction is
  // performed here.
  //
  // Only an L1 parallel region can enter this if-condition.
  if (NumThreads > WARPSIZE) {
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);
  }
  return BlockThreadId == 0;
#else
  __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
  if (Liveness == __kmpc_impl_all_lanes) // Full warp
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/__kmpc_impl_popc(Liveness),
                              /*LaneId=*/GetThreadIdInBlock() % WARPSIZE);
  else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in an L2
                                    // parallel region may enter here; return
                                    // early.
    return gpu_irregular_simd_reduce(reduce_data, shflFct);

  // When we have more than [warpsize] threads, a block reduction is
  // performed here.
  //
  // Only an L1 parallel region can enter this if-condition.
  if (NumThreads > WARPSIZE) {
    uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    uint32_t WarpId = BlockThreadId / WARPSIZE;
    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);

    return BlockThreadId == 0;
  } else if (isRuntimeUninitialized /* Never an L2 parallel region without the OMP runtime */) {
    return BlockThreadId == 0;
  }

  // Get the OMP thread ID. This is different from BlockThreadId in the case
  // of an L2 parallel region.
  return global_tid == 0;
#endif // __CUDA_ARCH__ >= 700
}

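// Entry point for a parallel-region reduction: forwards to the implementation
// above, deriving the execution mode and runtime state from `loc`.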
EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
    kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
    void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
    kmp_InterWarpCopyFctPtr cpyFct) {
  return nvptx_parallel_reduce_nowait(
      global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
      checkSPMDMode(loc), checkRuntimeUninitialized(loc));
}

INLINE static bool isMaster(kmp_Ident *loc, uint32_t ThreadId) {
  return checkGenericMode(loc) || IsTeamMaster(ThreadId);
}

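// Round s down to a multiple of the warp size, but never below 1.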
INLINE static uint32_t roundToWarpsize(uint32_t s) {
  if (s < WARPSIZE)
    return 1;
  return (s & ~(unsigned)(WARPSIZE - 1));
}

INLINE static uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }

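// Counters that coordinate the teams reduction across thread blocks: Cnt
// counts the teams that have contributed to the current chunk of the global
// buffer, and IterCnt gates which teams may start writing (a team proceeds
// once its id is below IterCnt + num_of_records).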
DEVICE static volatile uint32_t IterCnt = 0;
DEVICE static volatile uint32_t Cnt = 0;
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
    kmp_Ident *loc, int32_t global_tid, void *global_buffer,
    int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
    kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct,
    kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct,
    kmp_ListGlobalFctPtr glredFct) {

  // Terminate all threads in non-SPMD mode except for the master thread.
  if (checkGenericMode(loc) && GetThreadIdInBlock() != GetMasterThreadID())
    return 0;

  uint32_t ThreadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));

  // In non-generic mode all workers participate in the teams reduction.
  // In generic mode only the team master participates in the teams
  // reduction because the workers are waiting for parallel work.
  uint32_t NumThreads =
      checkSPMDMode(loc) ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true)
                         : /*Master thread only*/ 1;
  uint32_t TeamId = GetBlockIdInKernel();
  uint32_t NumTeams = GetNumberOfBlocksInKernel();
  static unsigned SHARED(Bound);
  static unsigned SHARED(ChunkTeamCount);

  // Block progress for teams beyond the current upper limit: only as many
  // teams as there are slots in the buffer are ever allowed to proceed at a
  // time.
  bool IsMaster = isMaster(loc, ThreadId);
  while (IsMaster) {
    // Atomic read
    Bound = __kmpc_atomic_add((uint32_t *)&IterCnt, 0u);
    if (TeamId < Bound + num_of_records)
      break;
  }

  if (IsMaster) {
    int ModBlockId = TeamId % num_of_records;
    if (TeamId < num_of_records)
      lgcpyFct(global_buffer, ModBlockId, reduce_data);
    else
      lgredFct(global_buffer, ModBlockId, reduce_data);
    __kmpc_impl_threadfence_system();

    // Increment team counter.
    // This counter is incremented by all teams in the current
    // chunk of num_of_records teams.
    ChunkTeamCount = __kmpc_atomic_inc((uint32_t *)&Cnt, num_of_records - 1u);
  }
  // Synchronize
  if (checkSPMDMode(loc))
    __kmpc_barrier(loc, global_tid);

  // reduce_data is global or shared so before being reduced within the
  // warp we need to bring it into local memory:
  // local_reduce_data = reduce_data[i]
  //
  // Example for 3 reduction variables a, b, c (of potentially different
  // types):
  //
  // buffer layout (struct of arrays):
  // a, a, ..., a, b, b, ... b, c, c, ... c
  // |__________|
  //     num_of_records
  //
  // local_reduce_data layout (struct):
  // a, b, c
  //
  // Each thread will have a local struct containing the values to be
  // reduced:
  //      1. do reduction within each warp.
  //      2. do reduction across warps.
  //      3. write the final result to the main reduction variable
  //         by returning 1 in the thread holding the reduction result.

  // Check if this is the very last team.
  unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
  if (ChunkTeamCount == NumTeams - Bound - 1) {
    //
    // Last team processing.
    //
    if (ThreadId >= NumRecs)
      return 0;
    NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs));
    if (ThreadId >= NumThreads)
      return 0;

    // Load from buffer and reduce.
    glcpyFct(global_buffer, ThreadId, reduce_data);
    for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
      glredFct(global_buffer, i, reduce_data);

    // Reduce across warps to the warp master.
    if (NumThreads > 1) {
      gpu_regular_warp_reduce(reduce_data, shflFct);

      // When we have more than [warpsize] threads, a block reduction is
      // performed here.
      uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
      if (ActiveThreads > WARPSIZE) {
        uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
        // Gather all the reduced values from each warp
        // to the first warp.
        cpyFct(reduce_data, WarpsNeeded);

        uint32_t WarpId = ThreadId / WARPSIZE;
        if (WarpId == 0)
          gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                    ThreadId);
      }
    }

    if (IsMaster) {
      Cnt = 0;
      IterCnt = 0;
      return 1;
    }
    return 0;
  }
  if (IsMaster && ChunkTeamCount == num_of_records - 1) {
    // Allow the next num_of_records teams to proceed writing their
    // intermediate results to the global buffer.
    __kmpc_atomic_add((uint32_t *)&IterCnt, uint32_t(num_of_records));
  }

  return 0;
}

#pragma omp end declare target