1 //===---- reduction.cu - GPU OpenMP reduction implementation ----- CUDA -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the implementation of reduction with KMPC interface.
10 //
11 //===----------------------------------------------------------------------===//
12 #pragma omp declare target
13
14 #include "common/omptarget.h"
15 #include "target_impl.h"
16
// Epilogue for the blocking (non-nowait) parallel reduction entry point.
// Intentionally a no-op on this target: no cleanup is required after the
// reduction completes.
EXTERN
void __kmpc_nvptx_end_reduce(int32_t global_tid) {}
19
// Epilogue for the nowait parallel reduction entry point. Intentionally a
// no-op on this target, mirroring __kmpc_nvptx_end_reduce above.
EXTERN
void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {}
22
// Shuffle a 32-bit value down the warp by 'delta' lanes within a logical
// sub-warp of 'size' lanes, with all lanes participating. Called by
// compiler-generated shuffle-and-reduce code.
EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) {
  return __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, val, delta, size);
}
26
// 64-bit counterpart of __kmpc_shuffle_int32. The 64-bit value is split into
// two 32-bit halves, each half is shuffled down independently by 'delta'
// lanes within a logical sub-warp of 'size' lanes, and the halves are
// recombined on the receiving lane.
EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
  uint32_t LoPart, HiPart;
  __kmpc_impl_unpack(val, LoPart, HiPart);
  LoPart =
      __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, LoPart, delta, size);
  HiPart =
      __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, HiPart, delta, size);
  return __kmpc_impl_pack(LoPart, HiPart);
}
34
// Reduce across a full, fully-active warp: log2(WARPSIZE) shuffle-and-reduce
// rounds with halving offsets. The compiler-generated shflFct performs the
// actual shuffle and combine on reduce_data (AlgoVersion 0 = regular warp;
// the LaneId argument is unused by this algorithm).
INLINE static void gpu_regular_warp_reduce(void *reduce_data,
                                           kmp_ShuffleReductFctPtr shflFct) {
  uint32_t Offset = WARPSIZE / 2;
  while (Offset > 0) {
    shflFct(reduce_data, /*LaneId - not used= */ 0,
            /*Offset = */ Offset, /*AlgoVersion=*/0);
    Offset /= 2;
  }
}
42
// Reduce across the first 'size' contiguous lanes of a warp, where 'size'
// need not be a power of two. Each round folds the upper half of the current
// width onto the lower half; rounding the width up ("+ 1") keeps the middle
// element alive when the current width is odd. 'tid' is the calling lane's
// id within the group, forwarded to the callback (AlgoVersion 1 = irregular
// contiguous warp reduction).
INLINE static void gpu_irregular_warp_reduce(void *reduce_data,
                                             kmp_ShuffleReductFctPtr shflFct,
                                             uint32_t size, uint32_t tid) {
  for (uint32_t width = size, mask = width / 2; mask > 0;
       width = (width + 1) / 2, mask = width / 2) {
    shflFct(reduce_data, /*LaneId=*/tid, /*Offset=*/mask, /*AlgoVersion=*/1);
  }
}
56
// Pre-Volta (< SM70) fallback only: reduce across a warp whose active lanes
// may be non-contiguous (dispersed), e.g. inside an L2 parallel region.
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
// Each round, every active lane pairs with the nearest active lane above it
// (found via ffs on the lanes-greater mask) and pulls that lane's data with
// a physical shuffle offset of (remote_id - 1 - physical_lane_id).
// logical_lane_id starts at twice the lane's rank among the active lanes and
// is halved each round, so after each round only the even-ranked survivors
// keep reducing. Returns 1 only in the lowest-ranked active lane, which then
// holds the reduced value.
INLINE static uint32_t
gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
  uint32_t size, remote_id, physical_lane_id;
  physical_lane_id = GetThreadIdInBlock() % WARPSIZE;
  __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
  __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
  uint32_t logical_lane_id = __kmpc_impl_popc(Liveness & lanemask_lt) * 2;
  __kmpc_impl_lanemask_t lanemask_gt = __kmpc_impl_lanemask_gt();
  do {
    // Re-read the active mask each round: lanes that finished their part
    // drop out of the reduction, shrinking 'size'.
    Liveness = __kmpc_impl_activemask();
    remote_id = __kmpc_impl_ffs(Liveness & lanemask_gt);
    size = __kmpc_impl_popc(Liveness);
    logical_lane_id /= 2;
    shflFct(reduce_data, /*LaneId =*/logical_lane_id,
            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
  } while (logical_lane_id % 2 == 0 && size > 1);
  return (logical_lane_id == 0);
}
#endif
77
// Intra-team (parallel region) reduction, nowait variant. Returns 1 in
// exactly one thread — the one left holding the fully reduced value — and 0
// in all others. num_vars and reduce_size are not read here; the per-variable
// data movement is encapsulated in the compiler-generated callbacks:
// shflFct shuffles-and-combines reduce_data within a warp, cpyFct copies each
// warp's partial result to warp 0 (via shared memory, per step 2 below).
INLINE
static int32_t nvptx_parallel_reduce_nowait(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
    bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
  uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
  uint32_t NumThreads = GetNumberOfOmpThreads(isSPMDExecutionMode);
  // A single-thread team is trivially reduced; the caller already holds the
  // result.
  if (NumThreads == 1)
    return 1;
  /*
   * This reduce function handles reduction within a team. It handles
   * parallel regions in both L1 and L2 parallelism levels. It also
   * supports Generic, SPMD, and NoOMP modes.
   *
   * 1. Reduce within a warp.
   * 2. Warp master copies value to warp 0 via shared memory.
   * 3. Warp 0 reduces to a single value.
   * 4. The reduced value is available in the thread that returns 1.
   */

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
  uint32_t WarpId = BlockThreadId / WARPSIZE;

  // Volta execution model:
  // For the Generic execution mode a parallel region either has 1 thread and
  // beyond that, always a multiple of 32. For the SPMD execution mode we may
  // have any number of threads.
  if ((NumThreads % WARPSIZE == 0) || (WarpId < WarpsNeeded - 1))
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
    // Last, partially-filled warp: reduce only its NumThreads % WARPSIZE
    // occupied lanes.
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/NumThreads % WARPSIZE,
                              /*LaneId=*/GetThreadIdInBlock() % WARPSIZE);

  // When we have more than [warpsize] number of threads
  // a block reduction is performed here.
  //
  // Only L1 parallel region can enter this if condition.
  if (NumThreads > WARPSIZE) {
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    // Warp 0 reduces the WarpsNeeded per-warp partials down to one value.
    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);
  }
  return BlockThreadId == 0;
#else
  // Pre-Volta path: pick the warp-level algorithm from the shape of the
  // currently active lane mask.
  __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
  if (Liveness == __kmpc_impl_all_lanes) // Full warp
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/__kmpc_impl_popc(Liveness),
                              /*LaneId=*/GetThreadIdInBlock() % WARPSIZE);
  else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2
                                    // parallel region may enter here; return
                                    // early.
    return gpu_irregular_simd_reduce(reduce_data, shflFct);

  // When we have more than [warpsize] number of threads
  // a block reduction is performed here.
  //
  // Only L1 parallel region can enter this if condition.
  if (NumThreads > WARPSIZE) {
    uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    uint32_t WarpId = BlockThreadId / WARPSIZE;
    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);

    return BlockThreadId == 0;
  } else if (isRuntimeUninitialized /* Never an L2 parallel region without the OMP runtime */) {
    return BlockThreadId == 0;
  }

  // Get the OMP thread Id. This is different from BlockThreadId in the case of
  // an L2 parallel region.
  return global_tid == 0;
#endif // __CUDA_ARCH__ >= 700
}
165
// Public KMPC entry point for the intra-team nowait reduction. Derives the
// execution-mode flags from the source-location descriptor and forwards
// everything to the shared implementation; returns 1 only in the thread
// holding the reduced value.
EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
    kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
    void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
    kmp_InterWarpCopyFctPtr cpyFct) {
  const bool IsSPMDMode = checkSPMDMode(loc);
  const bool IsRuntimeUninit = checkRuntimeUninitialized(loc);
  return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size,
                                      reduce_data, shflFct, cpyFct, IsSPMDMode,
                                      IsRuntimeUninit);
}
175
// In generic mode every thread reaching the teams reduction is the team
// master (workers were filtered out earlier); otherwise only the thread
// whose id matches the team-master id qualifies.
INLINE static bool isMaster(kmp_Ident *loc, uint32_t ThreadId) {
  if (checkGenericMode(loc))
    return true;
  return IsTeamMaster(ThreadId);
}
179
// Round 's' down to a whole number of warps, except that anything smaller
// than one warp rounds to a single thread (1) rather than to 0.
INLINE static uint32_t roundToWarpsize(uint32_t s) {
  return (s < WARPSIZE) ? 1u : (s & ~(unsigned)(WARPSIZE - 1));
}
185
// Unsigned minimum of two 32-bit values.
INLINE static uint32_t kmpcMin(uint32_t x, uint32_t y) {
  if (x < y)
    return x;
  return y;
}
187
// Device-global counters coordinating the buffered teams reduction in
// __kmpc_nvptx_teams_reduce_nowait_v2 below; 'volatile' forces re-reads in
// its spin-wait loop.
// IterCnt: lower bound of the window of team ids currently allowed to write
// into the global buffer (advanced by num_of_records once a chunk is full).
DEVICE static volatile uint32_t IterCnt = 0;
// Cnt: count of teams that have written into the current
// num_of_records-sized chunk of the buffer (wraps via __kmpc_atomic_inc).
DEVICE static volatile uint32_t Cnt = 0;
// Inter-team ("teams") reduction, nowait variant, using a caller-provided
// global buffer with num_of_records slots per reduction variable (struct-of-
// arrays layout; see the diagram in the body). Each team's master writes
// (first chunk of teams) or reduces (later chunks) its partial result into
// slot TeamId % num_of_records; the team that completes the final chunk then
// reduces the buffer down and returns 1 in exactly one thread, which is
// responsible for storing the final value. The callbacks are compiler
// generated; from their uses here:
//   lgcpyFct / lgredFct — copy / reduce local reduce_data into a buffer slot,
//   glcpyFct / glredFct — copy / reduce a buffer slot into local reduce_data,
//   shflFct  / cpyFct   — intra-warp shuffle-reduce and warp-to-warp-0 copy.
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
    kmp_Ident *loc, int32_t global_tid, void *global_buffer,
    int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
    kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct,
    kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct,
    kmp_ListGlobalFctPtr glredFct) {

  // Terminate all threads in non-SPMD mode except for the master thread.
  if (checkGenericMode(loc) && GetThreadIdInBlock() != GetMasterThreadID())
    return 0;

  uint32_t ThreadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));

  // In non-generic mode all workers participate in the teams reduction.
  // In generic mode only the team master participates in the teams
  // reduction because the workers are waiting for parallel work.
  uint32_t NumThreads =
      checkSPMDMode(loc) ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true)
                         : /*Master thread only*/ 1;
  uint32_t TeamId = GetBlockIdInKernel();
  uint32_t NumTeams = GetNumberOfBlocksInKernel();
  // Team-shared snapshots of the global counters, so the non-master threads
  // of this team can read the values the master observed (after the barrier
  // below).
  static unsigned SHARED(Bound);
  static unsigned SHARED(ChunkTeamCount);

  // Block progress for teams greater than the current upper
  // limit. We always only allow a number of teams less or equal
  // to the number of slots in the buffer.
  bool IsMaster = isMaster(loc, ThreadId);
  while (IsMaster) {
    // Atomic read of IterCnt (add of 0); spin until this team's id falls
    // inside the window of num_of_records teams allowed to use the buffer.
    Bound = __kmpc_atomic_add((uint32_t *)&IterCnt, 0u);
    if (TeamId < Bound + num_of_records)
      break;
  }

  if (IsMaster) {
    int ModBockId = TeamId % num_of_records; // (sic: name is a pre-existing
                                             // typo for "ModBlockId")
    // Teams of the first chunk initialize their slot; later chunks reduce
    // into the already-initialized slot.
    if (TeamId < num_of_records)
      lgcpyFct(global_buffer, ModBockId, reduce_data);
    else
      lgredFct(global_buffer, ModBockId, reduce_data);
    // Make the buffer write visible device-wide before advertising it via
    // the chunk counter below.
    __kmpc_impl_threadfence_system();

    // Increment team counter.
    // This counter is incremented by all teams in the current
    // BUFFER_SIZE chunk. (__kmpc_atomic_inc wraps back to 0 past
    // num_of_records - 1.)
    ChunkTeamCount = __kmpc_atomic_inc((uint32_t *)&Cnt, num_of_records - 1u);
  }
  // Synchronize so the whole team sees the master's Bound/ChunkTeamCount.
  // Only needed in SPMD mode; in generic mode only the master reaches here.
  if (checkSPMDMode(loc))
    __kmpc_barrier(loc, global_tid);

  // reduce_data is global or shared so before being reduced within the
  // warp we need to bring it in local memory:
  // local_reduce_data = reduce_data[i]
  //
  // Example for 3 reduction variables a, b, c (of potentially different
  // types):
  //
  // buffer layout (struct of arrays):
  // a, a, ..., a, b, b, ... b, c, c, ... c
  // |__________|
  // num_of_records
  //
  // local_data_reduce layout (struct):
  // a, b, c
  //
  // Each thread will have a local struct containing the values to be
  // reduced:
  // 1. do reduction within each warp.
  // 2. do reduction across warps.
  // 3. write the final result to the main reduction variable
  // by returning 1 in the thread holding the reduction result.

  // Check if this is the very last team.
  unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
  if (ChunkTeamCount == NumTeams - Bound - 1) {
    //
    // Last team processing.
    //
    // Only the first NumRecs buffer slots hold data; surplus threads exit.
    if (ThreadId >= NumRecs)
      return 0;
    // Participate with a warp-multiple (or single-thread) subset so the
    // warp-level reductions below operate on full warps.
    NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs));
    if (ThreadId >= NumThreads)
      return 0;

    // Load from buffer and reduce. Each participating thread accumulates a
    // NumThreads-strided range of slots into its local reduce_data.
    glcpyFct(global_buffer, ThreadId, reduce_data);
    for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
      glredFct(global_buffer, i, reduce_data);

    // Reduce across warps to the warp master.
    if (NumThreads > 1) {
      gpu_regular_warp_reduce(reduce_data, shflFct);

      // When we have more than [warpsize] number of threads
      // a block reduction is performed here.
      uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
      if (ActiveThreads > WARPSIZE) {
        uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
        // Gather all the reduced values from each warp
        // to the first warp.
        cpyFct(reduce_data, WarpsNeeded);

        uint32_t WarpId = ThreadId / WARPSIZE;
        if (WarpId == 0)
          gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                    ThreadId);
      }
    }

    // Reset the global counters for the next reduction; the master returns 1
    // and is responsible for storing the final reduced value.
    if (IsMaster) {
      Cnt = 0;
      IterCnt = 0;
      return 1;
    }
    return 0;
  }
  if (IsMaster && ChunkTeamCount == num_of_records - 1) {
    // Allow SIZE number of teams to proceed writing their
    // intermediate results to the global buffer.
    __kmpc_atomic_add((uint32_t *)&IterCnt, uint32_t(num_of_records));
  }

  return 0;
}
316
317 #pragma omp end declare target
318