//===---- reduction.cu - GPU OpenMP reduction implementation ----- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of reduction with the KMPC interface.
//
//===----------------------------------------------------------------------===//
#pragma omp declare target

#include "common/omptarget.h"
#include "target/shuffle.h"
#include "target_impl.h"

EXTERN
void __kmpc_nvptx_end_reduce(int32_t global_tid) {}

EXTERN
void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {}

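// Full-warp reduction: the shuffle offset starts at WARPSIZE / 2 and is
// halved each step, so log2(WARPSIZE) invocations of the caller-provided
// shuffle-and-reduce callback combine the values of all lanes in the warp.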
INLINE static void gpu_regular_warp_reduce(void *reduce_data,
                                           kmp_ShuffleReductFctPtr shflFct) {
  for (uint32_t mask = WARPSIZE / 2; mask > 0; mask /= 2) {
    shflFct(reduce_data, /*LaneId (not used)=*/0,
            /*Offset=*/mask, /*AlgoVersion=*/0);
  }
}

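// Reduction over `size` contiguous logical lanes, where `size` need not be a
// power of two: the active width is repeatedly halved (rounding up) until
// lane 0 holds the combined value.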
INLINE static void gpu_irregular_warp_reduce(void *reduce_data,
                                             kmp_ShuffleReductFctPtr shflFct,
                                             uint32_t size, uint32_t tid) {
  uint32_t curr_size;
  uint32_t mask;
  curr_size = size;
  mask = curr_size / 2;
  while (mask > 0) {
    shflFct(reduce_data, /*LaneId=*/tid, /*Offset=*/mask, /*AlgoVersion=*/1);
    curr_size = (curr_size + 1) / 2;
    mask = curr_size / 2;
  }
}

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
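// Pre-Volta only: reduce across the currently active lanes of a warp even
// when they are not contiguous. Each iteration folds a lane's value with that
// of the next active lane above it; the function returns 1 only in the
// logical lane that ends up holding the combined result.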
INLINE static uint32_t
gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
  uint32_t size, remote_id, physical_lane_id;
  physical_lane_id = __kmpc_get_hardware_thread_id_in_block() % WARPSIZE;
  __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
  __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
  uint32_t logical_lane_id = __kmpc_impl_popc(Liveness & lanemask_lt) * 2;
  __kmpc_impl_lanemask_t lanemask_gt = __kmpc_impl_lanemask_gt();
  do {
    Liveness = __kmpc_impl_activemask();
    remote_id = __kmpc_impl_ffs(Liveness & lanemask_gt);
    size = __kmpc_impl_popc(Liveness);
    logical_lane_id /= 2;
    shflFct(reduce_data, /*LaneId=*/logical_lane_id,
            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
  } while (logical_lane_id % 2 == 0 && size > 1);
  return (logical_lane_id == 0);
}
#endif

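// Intra-team reduction entry point. shflFct and cpyFct are reduction
// callbacks supplied by the caller (in practice generated by the compiler for
// the specific reduction variables): shflFct shuffles in and combines a
// remote lane's copy of reduce_data, and cpyFct copies each warp's partial
// result into warp 0 via shared memory.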
INLINE
static int32_t nvptx_parallel_reduce_nowait(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
    bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
  uint32_t BlockThreadId = GetLogicalThreadIdInBlock();
  uint32_t NumThreads = GetNumberOfOmpThreads(isSPMDExecutionMode);
  if (NumThreads == 1)
    return 1;
    /*
     * This reduce function handles reduction within a team. It handles
     * parallel regions in both L1 and L2 parallelism levels. It also
     * supports Generic, SPMD, and NoOMP modes.
     *
     * 1. Reduce within a warp.
     * 2. Warp master copies value to warp 0 via shared memory.
     * 3. Warp 0 reduces to a single value.
     * 4. The reduced value is available in the thread that returns 1.
     */

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
  uint32_t WarpId = BlockThreadId / WARPSIZE;

  // Volta execution model:
  // For the Generic execution mode a parallel region either has 1 thread or,
  // beyond that, always a multiple of 32. For the SPMD execution mode we may
  // have any number of threads.
  if ((NumThreads % WARPSIZE == 0) || (WarpId < WarpsNeeded - 1))
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (NumThreads > 1) // Only SPMD execution mode comes through this case.
    gpu_irregular_warp_reduce(
        reduce_data, shflFct,
        /*LaneCount=*/NumThreads % WARPSIZE,
        /*LaneId=*/__kmpc_get_hardware_thread_id_in_block() % WARPSIZE);

  // When there are more than [warpsize] threads, a block reduction is
  // performed here.
  //
  // Only an L1 parallel region can enter this if condition.
  if (NumThreads > WARPSIZE) {
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);
  }
  return BlockThreadId == 0;
#else
  __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
  if (Liveness == __kmpc_impl_all_lanes) // Full warp
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
    gpu_irregular_warp_reduce(
        reduce_data, shflFct,
        /*LaneCount=*/__kmpc_impl_popc(Liveness),
        /*LaneId=*/__kmpc_get_hardware_thread_id_in_block() % WARPSIZE);
  else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in an L2
                                    // parallel region may enter here; return
                                    // early.
    return gpu_irregular_simd_reduce(reduce_data, shflFct);

  // When there are more than [warpsize] threads, a block reduction is
  // performed here.
  //
  // Only an L1 parallel region can enter this if condition.
  if (NumThreads > WARPSIZE) {
    uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    uint32_t WarpId = BlockThreadId / WARPSIZE;
    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);

    return BlockThreadId == 0;
  } else if (isRuntimeUninitialized /* Never an L2 parallel region without the OMP runtime */) {
    return BlockThreadId == 0;
  }

  // Get the OMP thread Id. This is different from BlockThreadId in the case
  // of an L2 parallel region.
  return global_tid == 0;
#endif // __CUDA_ARCH__ >= 700
}

EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
    kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
    void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
    kmp_InterWarpCopyFctPtr cpyFct) {
  return nvptx_parallel_reduce_nowait(
      global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
      __kmpc_is_spmd_exec_mode(), isRuntimeUninitialized());
}

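// In generic (non-SPMD) mode only the single main thread reaches the teams
// reduction, so it is trivially the team master; in SPMD mode the explicit
// team-master check is used.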
INLINE static bool isMaster(kmp_Ident *loc, uint32_t ThreadId) {
  return !__kmpc_is_spmd_exec_mode() || IsTeamMaster(ThreadId);
}

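// Round a thread count down to a multiple of WARPSIZE (minimum 1) so that the
// subsequent shuffle-based reductions only ever operate on full warps.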
INLINE static uint32_t roundToWarpsize(uint32_t s) {
  if (s < WARPSIZE)
    return 1;
  return (s & ~(unsigned)(WARPSIZE - 1));
}

INLINE static uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }

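// Global counters coordinating the teams reduction: IterCnt marks the base of
// the window of teams currently allowed to write to the global buffer (the
// window is num_of_records slots wide), and Cnt counts how many teams of the
// current chunk have contributed their partial result so far.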
static volatile uint32_t IterCnt = 0;
static volatile uint32_t Cnt = 0;
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
    kmp_Ident *loc, int32_t global_tid, void *global_buffer,
    int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
    kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct,
    kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct,
    kmp_ListGlobalFctPtr glredFct) {

  // Terminate all threads in non-SPMD mode except for the master thread.
  if (!__kmpc_is_spmd_exec_mode() &&
      !__kmpc_is_generic_main_thread(__kmpc_get_hardware_thread_id_in_block()))
    return 0;

  uint32_t ThreadId = GetLogicalThreadIdInBlock();

  // In non-generic mode all workers participate in the teams reduction.
  // In generic mode only the team master participates in the teams
  // reduction because the workers are waiting for parallel work.
  uint32_t NumThreads = __kmpc_is_spmd_exec_mode()
                            ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true)
                            : /*Master thread only*/ 1;
  uint32_t TeamId = GetBlockIdInKernel();
  uint32_t NumTeams = __kmpc_get_hardware_num_blocks();
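  // Bound and ChunkTeamCount live in team-shared memory; they are written
  // only by the team master and read by the other threads after the barrier
  // below.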
  static unsigned SHARED(Bound);
  static unsigned SHARED(ChunkTeamCount);

  // Block progress for teams greater than the current upper
  // limit. We only ever allow a number of teams less than or equal
  // to the number of slots in the buffer.
  bool IsMaster = isMaster(loc, ThreadId);
  while (IsMaster) {
    // Atomic read
    Bound = __kmpc_atomic_add((uint32_t *)&IterCnt, 0u);
    if (TeamId < Bound + num_of_records)
      break;
  }

  if (IsMaster) {
    int ModBlockId = TeamId % num_of_records;
    if (TeamId < num_of_records)
      lgcpyFct(global_buffer, ModBlockId, reduce_data);
    else
      lgredFct(global_buffer, ModBlockId, reduce_data);
    __kmpc_impl_threadfence_system();

    // Increment team counter.
    // This counter is incremented by all teams in the current
    // BUFFER_SIZE chunk.
    ChunkTeamCount = __kmpc_atomic_inc((uint32_t *)&Cnt, num_of_records - 1u);
  }
  // Synchronize
  if (__kmpc_is_spmd_exec_mode())
    __kmpc_barrier(loc, global_tid);

  // reduce_data is global or shared so before being reduced within the
  // warp we need to bring it into local memory:
  // local_reduce_data = reduce_data[i]
  //
  // Example for 3 reduction variables a, b, c (of potentially different
  // types):
  //
  // buffer layout (struct of arrays):
  // a, a, ..., a, b, b, ... b, c, c, ... c
  // |__________|
  //     num_of_records
  //
  // local_reduce_data layout (struct):
  // a, b, c
  //
  // Each thread will have a local struct containing the values to be
  // reduced:
  //      1. do reduction within each warp.
  //      2. do reduction across warps.
  //      3. write the final result to the main reduction variable
  //         by returning 1 in the thread holding the reduction result.

  // Check if this is the very last team.
  unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
  if (ChunkTeamCount == NumTeams - Bound - 1) {
    //
    // Last team processing.
    //
    if (ThreadId >= NumRecs)
      return 0;
    NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs));
    if (ThreadId >= NumThreads)
      return 0;

    // Load from buffer and reduce.
    glcpyFct(global_buffer, ThreadId, reduce_data);
    for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
      glredFct(global_buffer, i, reduce_data);

    // Reduce across warps to the warp master.
    if (NumThreads > 1) {
      gpu_regular_warp_reduce(reduce_data, shflFct);

      // When there are more than [warpsize] threads, a block reduction is
      // performed here.
      uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
      if (ActiveThreads > WARPSIZE) {
        uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
        // Gather all the reduced values from each warp
        // to the first warp.
        cpyFct(reduce_data, WarpsNeeded);

        uint32_t WarpId = ThreadId / WARPSIZE;
        if (WarpId == 0)
          gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                    ThreadId);
      }
    }

    if (IsMaster) {
      Cnt = 0;
      IterCnt = 0;
      return 1;
    }
    return 0;
  }
  if (IsMaster && ChunkTeamCount == num_of_records - 1) {
    // Allow the next num_of_records teams to proceed writing their
    // intermediate results to the global buffer.
    __kmpc_atomic_add((uint32_t *)&IterCnt, uint32_t(num_of_records));
  }

  return 0;
}

#pragma omp end declare target