//===---- reduction.cu - GPU OpenMP reduction implementation ----- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of reduction with KMPC interface.
//
//===----------------------------------------------------------------------===//
#pragma omp declare target

#include "common/omptarget.h"
#include "target/shuffle.h"
#include "target_impl.h"

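// The end-of-reduction entry points below are intentionally empty: on this
// target no per-reduction cleanup is required once the reduced value has been
// committed by the thread that was told it holds the result.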
EXTERN
void __kmpc_nvptx_end_reduce(int32_t global_tid) {}

EXTERN
void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {}

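// Reduce a full warp with a shuffle-based tree: every round combines values
// that are `mask` lanes apart and halves `mask`. For WARPSIZE == 32 the
// offsets are 16, 8, 4, 2, 1, and after the final round lane 0 of the warp
// holds the fully reduced value. The actual shuffle-and-combine step is
// delegated to the compiler-generated `shflFct`.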
INLINE static void gpu_regular_warp_reduce(void *reduce_data,
                                           kmp_ShuffleReductFctPtr shflFct) {
  for (uint32_t mask = WARPSIZE / 2; mask > 0; mask /= 2) {
    shflFct(reduce_data, /*LaneId - not used= */ 0,
            /*Offset = */ mask, /*AlgoVersion=*/0);
  }
}

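// Reduce a contiguous group of `size` lanes that need not be a power of two.
// Each round folds values that are `mask` lanes apart; the ceiling update
// (curr_size + 1) / 2 handles group sizes that are not powers of two. The
// group's lane `tid == 0` ends up with the reduced value.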
INLINE static void gpu_irregular_warp_reduce(void *reduce_data,
                                             kmp_ShuffleReductFctPtr shflFct,
                                             uint32_t size, uint32_t tid) {
  uint32_t curr_size;
  uint32_t mask;
  curr_size = size;
  mask = curr_size / 2;
  while (mask > 0) {
    shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1);
    curr_size = (curr_size + 1) / 2;
    mask = curr_size / 2;
  }
}

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
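// Pre-Volta only: reduce an arbitrary (possibly non-contiguous) set of active
// lanes. Each iteration pairs a lane with the next higher active lane, found
// via the ffs of (Liveness & lanemask_gt), and halves the number of logically
// surviving lanes. Returns 1 on the lane whose logical id reaches 0, i.e. the
// lane that holds the final value.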
INLINE static uint32_t
gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
  uint32_t size, remote_id, physical_lane_id;
  physical_lane_id = __kmpc_get_hardware_thread_id_in_block() % WARPSIZE;
  __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
  __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
  uint32_t logical_lane_id = __kmpc_impl_popc(Liveness & lanemask_lt) * 2;
  __kmpc_impl_lanemask_t lanemask_gt = __kmpc_impl_lanemask_gt();
  do {
    Liveness = __kmpc_impl_activemask();
    remote_id = __kmpc_impl_ffs(Liveness & lanemask_gt);
    size = __kmpc_impl_popc(Liveness);
    logical_lane_id /= 2;
    shflFct(reduce_data, /*LaneId =*/logical_lane_id,
            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
  } while (logical_lane_id % 2 == 0 && size > 1);
  return (logical_lane_id == 0);
}
#endif

INLINE
static int32_t nvptx_parallel_reduce_nowait(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
    bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
  uint32_t BlockThreadId = GetLogicalThreadIdInBlock();
  uint32_t NumThreads = GetNumberOfOmpThreads(isSPMDExecutionMode);
  if (NumThreads == 1)
    return 1;
  /*
   * This reduce function handles reduction within a team. It handles
   * parallel regions in both L1 and L2 parallelism levels. It also
   * supports Generic, SPMD, and NoOMP modes.
   *
   * 1. Reduce within a warp.
   * 2. Warp master copies value to warp 0 via shared memory.
   * 3. Warp 0 reduces to a single value.
   * 4. The reduced value is available in the thread that returns 1.
   */

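  // On sm_70+ the warp composition of a parallel region is known statically
  // (full warps, plus at most one partial warp in SPMD mode), so the code can
  // pick between the regular and irregular warp reductions directly. On older
  // architectures the active lanes are classified dynamically via the
  // activemask below.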
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
  uint32_t WarpId = BlockThreadId / WARPSIZE;

  // Volta execution model:
  // In Generic execution mode a parallel region has either 1 thread or a
  // multiple of 32 threads. In SPMD execution mode it may have any number of
  // threads.
  if ((NumThreads % WARPSIZE == 0) || (WarpId < WarpsNeeded - 1))
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (NumThreads > 1) // Only SPMD execution mode comes through this case.
    gpu_irregular_warp_reduce(
        reduce_data, shflFct,
        /*LaneCount=*/NumThreads % WARPSIZE,
        /*LaneId=*/__kmpc_get_hardware_thread_id_in_block() % WARPSIZE);

  // When there are more than [warpsize] threads, a block reduction is
  // performed here.
  //
  // Only an L1 parallel region can enter this if condition.
  if (NumThreads > WARPSIZE) {
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);
  }
  return BlockThreadId == 0;
#else
  __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
  if (Liveness == __kmpc_impl_all_lanes) // Full warp
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
    gpu_irregular_warp_reduce(
        reduce_data, shflFct,
        /*LaneCount=*/__kmpc_impl_popc(Liveness),
        /*LaneId=*/__kmpc_get_hardware_thread_id_in_block() % WARPSIZE);
  else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2
                                    // parallel region may enter here; return
                                    // early.
    return gpu_irregular_simd_reduce(reduce_data, shflFct);

  // When there are more than [warpsize] threads, a block reduction is
  // performed here.
  //
  // Only an L1 parallel region can enter this if condition.
  if (NumThreads > WARPSIZE) {
    uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    uint32_t WarpId = BlockThreadId / WARPSIZE;
    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);

    return BlockThreadId == 0;
  } else if (isRuntimeUninitialized) {
    // Never an L2 parallel region without the OMP runtime.
    return BlockThreadId == 0;
  }

  // Get the OMP thread Id. This is different from BlockThreadId in the case of
  // an L2 parallel region.
  return global_tid == 0;
#endif // __CUDA_ARCH__ >= 700
}

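// Entry point used by compiler-generated code to reduce the privatized copies
// of the reduction variables across the threads of a parallel region. It
// returns 1 on exactly one thread, which then commits the reduced value.
//
// A minimal sketch (illustrative only, not emitted verbatim by any particular
// compiler) of how a caller might use it for `reduction(+:sum)`; the names
// `red_list`, `shuffleAndReduceFn` and `interWarpCopyFn` are placeholders for
// the structures and callbacks the compiler generates:
//
//   struct { int sum; } red_list = {my_partial_sum};
//   int32_t ret = __kmpc_nvptx_parallel_reduce_nowait_v2(
//       loc, gtid, /*num_vars=*/1, /*reduce_size=*/sizeof(int), &red_list,
//       shuffleAndReduceFn, interWarpCopyFn);
//   if (ret == 1) {
//     sum += red_list.sum; // the thread holding the result commits it
//     __kmpc_nvptx_end_reduce_nowait(gtid);
//   }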
EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
    kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
    void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
    kmp_InterWarpCopyFctPtr cpyFct) {
  return nvptx_parallel_reduce_nowait(
      global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
      __kmpc_is_spmd_exec_mode(), isRuntimeUninitialized());
}

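// A thread acts as "master" for the teams reduction if we are in Generic mode
// (only the initial thread reaches this point there) or if it is the team
// master in SPMD mode.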
INLINE static bool isMaster(kmp_Ident *loc, uint32_t ThreadId) {
  return !__kmpc_is_spmd_exec_mode() || IsTeamMaster(ThreadId);
}

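// Round a thread count down to a multiple of the warp size; counts smaller
// than a warp collapse to a single thread.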
INLINE static uint32_t roundToWarpsize(uint32_t s) {
  if (s < WARPSIZE)
    return 1;
  return (s & ~(unsigned)(WARPSIZE - 1));
}

INLINE static uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }

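// Global counters used to pace teams through the buffer: IterCnt holds the id
// of the first team of the chunk currently allowed to use the buffer's
// num_of_records slots, and Cnt counts how many teams of that chunk have
// already deposited their partial result.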
static volatile uint32_t IterCnt = 0;
static volatile uint32_t Cnt = 0;
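// Teams reduction: each team's master copies (first pass over the buffer) or
// reduces (subsequent passes) its partial result into the slot of
// `global_buffer` indexed by TeamId % num_of_records. Teams are throttled so
// that at most num_of_records of them use the buffer at a time. The last team
// to arrive loads the buffer back, reduces it with the warp/block reductions
// above, and returns 1 on its master thread so the caller can commit the
// final value.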
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
    kmp_Ident *loc, int32_t global_tid, void *global_buffer,
    int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
    kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct,
    kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct,
    kmp_ListGlobalFctPtr glredFct) {

  // Terminate all threads in non-SPMD mode except for the master thread.
  if (!__kmpc_is_spmd_exec_mode() &&
      !__kmpc_is_generic_main_thread(__kmpc_get_hardware_thread_id_in_block()))
    return 0;

  uint32_t ThreadId = GetLogicalThreadIdInBlock();

  // In non-generic mode all workers participate in the teams reduction.
  // In generic mode only the team master participates in the teams
  // reduction because the workers are waiting for parallel work.
  uint32_t NumThreads =
      __kmpc_is_spmd_exec_mode()
          ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true)
          : /*Master thread only*/ 1;
  uint32_t TeamId = GetBlockIdInKernel();
  uint32_t NumTeams = __kmpc_get_hardware_num_blocks();
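  // These are written by the team master but read by every thread of the team
  // after the barrier below, hence they live in shared memory.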
  static unsigned SHARED(Bound);
  static unsigned SHARED(ChunkTeamCount);

  // Block the progress of teams beyond the current upper limit: only a number
  // of teams less than or equal to the number of slots in the buffer is
  // allowed to proceed at a time.
  bool IsMaster = isMaster(loc, ThreadId);
  while (IsMaster) {
    // Atomic read
    Bound = __kmpc_atomic_add((uint32_t *)&IterCnt, 0u);
    if (TeamId < Bound + num_of_records)
      break;
  }

  if (IsMaster) {
    int ModBlockId = TeamId % num_of_records;
    if (TeamId < num_of_records)
      lgcpyFct(global_buffer, ModBlockId, reduce_data);
    else
      lgredFct(global_buffer, ModBlockId, reduce_data);
    __kmpc_impl_threadfence_system();

    // Increment team counter.
    // This counter is incremented by all teams in the current
    // BUFFER_SIZE chunk.
    ChunkTeamCount = __kmpc_atomic_inc((uint32_t *)&Cnt, num_of_records - 1u);
  }
  // Synchronize
  if (__kmpc_is_spmd_exec_mode())
    __kmpc_barrier(loc, global_tid);

  // reduce_data is global or shared so before being reduced within the
  // warp we need to bring it into local memory:
  //   local_reduce_data = reduce_data[i]
  //
  // Example for 3 reduction variables a, b, c (of potentially different
  // types):
  //
  // buffer layout (struct of arrays):
  // a, a, ..., a, b, b, ... b, c, c, ... c
  // |__________|
  //   num_of_records
  //
  // local_data_reduce layout (struct):
  // a, b, c
  //
  // Each thread will have a local struct containing the values to be
  // reduced:
  //   1. do reduction within each warp.
  //   2. do reduction across warps.
  //   3. write the final result to the main reduction variable
  //      by returning 1 in the thread holding the reduction result.

  // Check if this is the very last team: ChunkTeamCount holds the value of
  // Cnt before this team's increment, so the team that observes
  // NumTeams - Bound - 1 is the last of all remaining teams.
  unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
  if (ChunkTeamCount == NumTeams - Bound - 1) {
    //
    // Last team processing.
    //
    if (ThreadId >= NumRecs)
      return 0;
    NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs));
    if (ThreadId >= NumThreads)
      return 0;

    // Load from buffer and reduce.
    glcpyFct(global_buffer, ThreadId, reduce_data);
    for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
      glredFct(global_buffer, i, reduce_data);

    // Reduce across warps to the warp master.
    if (NumThreads > 1) {
      gpu_regular_warp_reduce(reduce_data, shflFct);

      // When there are more than [warpsize] threads, a block reduction is
      // performed here.
      uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
      if (ActiveThreads > WARPSIZE) {
        uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
        // Gather all the reduced values from each warp
        // to the first warp.
        cpyFct(reduce_data, WarpsNeeded);

        uint32_t WarpId = ThreadId / WARPSIZE;
        if (WarpId == 0)
          gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                    ThreadId);
      }
    }

    if (IsMaster) {
      Cnt = 0;
      IterCnt = 0;
      return 1;
    }
    return 0;
  }
  if (IsMaster && ChunkTeamCount == num_of_records - 1) {
    // Allow the next num_of_records teams to proceed with writing their
    // intermediate results to the global buffer.
    __kmpc_atomic_add((uint32_t *)&IterCnt, uint32_t(num_of_records));
  }

  return 0;
}

#pragma omp end declare target