//===---- parallel.cu - GPU OpenMP parallel implementation ------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Parallel implementation on the GPU. Here is the pattern:
//
//    while (not finished) {
//
//    if (master) {
//      sequential code, decide which par loop to do, or if finished
//     __kmpc_kernel_prepare_parallel() // exec by master only
//    }
//    syncthreads // A
//    __kmpc_kernel_parallel() // exec by all
//    if (this thread is included in the parallel) {
//      switch () for all parallel loops
//      __kmpc_kernel_end_parallel() // exec only by threads in parallel
//    }
//
//
//    The reason we don't exec end_parallel for the threads not included
//    in the parallel loop is that for each barrier in the parallel
//    region, these non-included threads will cycle through the
//    syncthread A. Thus they must preserve their current threadId, which
//    is greater than or equal to the number of threads in the team.
//
//    To make a long story short...
//
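//    A minimal sketch of the worker-side loop this pattern implies (the
//    actual loop is generated by the compiler; names below are only
//    illustrative, not the real codegen):
//
//      void workerLoop(kmp_Ident *ident) {
//        void *WorkFn;
//        for (;;) {
//          __kmpc_barrier_simple_spmd(ident, 0);   // syncthreads A
//          bool IsActive = __kmpc_kernel_parallel(&WorkFn);
//          if (!WorkFn)
//            return;                               // master signaled exit
//          if (IsActive) {
//            ((void (*)())WorkFn)();               // run outlined region
//            __kmpc_kernel_end_parallel();
//          }
//          __kmpc_barrier_simple_spmd(ident, 0);   // end-of-region barrier
//        }
//      }
//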
//===----------------------------------------------------------------------===//
#pragma omp declare target

#include "common/omptarget.h"
#include "target_impl.h"

////////////////////////////////////////////////////////////////////////////////
// support for parallel that goes parallel (1 static level only)
////////////////////////////////////////////////////////////////////////////////

INLINE static uint16_t determineNumberOfThreads(uint16_t NumThreadsClause,
                                                uint16_t NThreadsICV,
                                                uint16_t ThreadLimit) {
  uint16_t ThreadsRequested = NThreadsICV;
  if (NumThreadsClause != 0) {
    ThreadsRequested = NumThreadsClause;
  }

  uint16_t ThreadsAvailable = GetNumberOfWorkersInTeam();
  if (ThreadLimit != 0 && ThreadLimit < ThreadsAvailable) {
    ThreadsAvailable = ThreadLimit;
  }

  uint16_t NumThreads = ThreadsAvailable;
  if (ThreadsRequested != 0 && ThreadsRequested < NumThreads) {
    NumThreads = ThreadsRequested;
  }

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  // On Volta and newer architectures we require that all lanes in
  // a warp participate in the parallel region.  Round down to a
  // multiple of WARPSIZE since it is legal to do so in OpenMP.
  if (NumThreads < WARPSIZE) {
    NumThreads = 1;
  } else {
    NumThreads = (NumThreads & ~((uint16_t)WARPSIZE - 1));
  }
#endif

  return NumThreads;
}
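
// Worked example (illustrative, assuming WARPSIZE == 32 and a team of 64
// workers): with NumThreadsClause == 0, NThreadsICV == 50, and
// ThreadLimit == 0, ThreadsRequested is 50 and ThreadsAvailable is 64, so
// NumThreads starts at 50; on sm_70+ it is then rounded down to 32. A
// request below one full warp (say 20) falls back to a single thread there.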

// This routine is always called by the team master.
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn) {
  PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n");

  omptarget_nvptx_workFn = WorkFn;

  // This routine is only called by the team master.  The team master is
  // the first thread of the last warp.  It always has the logical thread
  // id of 0 (since it is a shadow for the first worker thread).
  const int threadId = 0;
  omptarget_nvptx_TaskDescr *currTaskDescr =
      omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
  ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
  ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(),
          "cannot be called in a parallel region.");
  if (currTaskDescr->InParallelRegion()) {
    PRINT0(LD_PAR, "already in parallel: go seq\n");
    return;
  }

  uint16_t &NumThreadsClause =
      omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);

  uint16_t NumThreads =
      determineNumberOfThreads(NumThreadsClause, nThreads, threadLimit);

  if (NumThreadsClause != 0) {
    // Reset the request so it does not propagate to successive #parallel
    // directives.
    NumThreadsClause = 0;
  }

  ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
         (int)NumThreads);
  ASSERT0(LT_FUSSY,
          __kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(),
          "only team master can create parallel");

  // Set the number of threads on the work descriptor.
  omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
  workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr);
  threadsInTeam = NumThreads;
}
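
// In non-SPMD mode the master typically drives workers with a sequence like
// the following (a sketch of what __kmpc_parallel_51 below does; not a
// separate API):
//
//   __kmpc_push_num_threads(ident, gtid, NumThreads); // optional clause
//   __kmpc_kernel_prepare_parallel((void *)wrapper_fn);
//   __kmpc_barrier_simple_spmd(ident, 0); // release the workers
//   __kmpc_barrier_simple_spmd(ident, 0); // wait until they are done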

// All worker threads call this function; those not needed are deactivated.
// WorkFn - returns the outlined work function to execute.
// Returns true if this thread is active (participates in the parallel
// region), false otherwise.
//
// Only the worker threads call this routine.
EXTERN bool __kmpc_kernel_parallel(void **WorkFn) {
  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n");

  // Work function and arguments for L1 parallel region.
  *WorkFn = omptarget_nvptx_workFn;

  // If this is the termination signal from the master, quit early.
  if (!*WorkFn) {
    PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel finished\n");
    return false;
  }

  // Only the worker threads call this routine and the master warp
  // never arrives here.  Therefore, use the nvptx thread id.
  int threadId = __kmpc_get_hardware_thread_id_in_block();
  omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
  // Set to true for workers participating in the parallel region.
  bool isActive = false;
  // Initialize state for active threads.
  if (threadId < threadsInTeam) {
    // Initialize the thread's task descriptor from the work descriptor.
    omptarget_nvptx_TaskDescr *newTaskDescr =
        omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
    ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
    newTaskDescr->CopyFromWorkDescr(workDescr.WorkTaskDescr());
    // Install the new top-level descriptor.
    omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
                                                               newTaskDescr);
    PRINT(LD_PAR,
          "thread will execute parallel region with id %d in a team of "
          "%d threads\n",
          (int)newTaskDescr->ThreadId(), (int)nThreads);

    isActive = true;
  }

  return isActive;
}

EXTERN void __kmpc_kernel_end_parallel() {
  // pop stack
  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_parallel\n");
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");

  // Only the worker threads call this routine and the master warp
  // never arrives here.  Therefore, use the nvptx thread id.
  int threadId = __kmpc_get_hardware_thread_id_in_block();
  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
      threadId, currTaskDescr->GetPrevTaskDescr());
}

////////////////////////////////////////////////////////////////////////////////
// support for parallel that goes sequential
////////////////////////////////////////////////////////////////////////////////

EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) {
  PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n");

  IncParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask());

  if (isRuntimeUninitialized()) {
    ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(),
            "Expected SPMD mode with uninitialized runtime.");
    return;
  }

  // Assume this is only called for a nested parallel region.
  int threadId = GetLogicalThreadIdInBlock();

  // Unlike an actual parallel region, threads in the same team do not share
  // the workTaskDescr in this case, and the number of threads is fixed to 1.

  // Get the current task.
  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
  currTaskDescr->SaveLoopData();

  // Allocate a new task descriptor, copy values from the current one, and
  // link it to the current descriptor as its parent.
  omptarget_nvptx_TaskDescr *newTaskDescr =
      (omptarget_nvptx_TaskDescr *)SafeMalloc(sizeof(omptarget_nvptx_TaskDescr),
                                              "new seq parallel task");
  newTaskDescr->CopyParent(currTaskDescr);

  // Tweak values for the serialized parallel case:
  // - each thread becomes ID 0 in its serialized parallel, and
  // - there is only one thread per team.
  newTaskDescr->ThreadId() = 0;

  // Set the new task descriptor as the top of the stack.
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
                                                             newTaskDescr);
}
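
// For example, a nested parallel region is serialized on the device:
//
//   #pragma omp parallel          // outer region, possibly active
//   #pragma omp parallel          // inner region: takes this path
//   {
//     // omp_get_thread_num() == 0 and omp_get_num_threads() == 1 here,
//     // matching the ThreadId() = 0 tweak above.
//   }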

EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc,
                                           uint32_t global_tid) {
  PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n");

  DecParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask());

  if (isRuntimeUninitialized()) {
    ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(),
            "Expected SPMD mode with uninitialized runtime.");
    return;
  }

  // Pop the task descriptor stack and set the new top.
  int threadId = GetLogicalThreadIdInBlock();
  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
      threadId, currTaskDescr->GetPrevTaskDescr());
  // Free the popped descriptor and restore the enclosing loop state.
  SafeFree(currTaskDescr, "new seq parallel task");
  currTaskDescr = getMyTopTaskDescriptor(threadId);
  currTaskDescr->RestoreLoopData();
}

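// parallelLevel packs an "active" flag and the nesting depth into one byte
// per warp. Assuming OMP_ACTIVE_PARALLEL_LEVEL is a power of two (e.g., 128),
// a warp one level deep inside an active region stores 128 + 1 = 129;
// masking with (OMP_ACTIVE_PARALLEL_LEVEL - 1) recovers the depth, 1.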
NOINLINE EXTERN uint8_t __kmpc_parallel_level() {
  return parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1);
}

// This kmpc call returns the thread id across all teams. Its value is
// cached by the compiler and used when calling the runtime. On nvptx
// it's cheap to recalculate this value so we never use the result
// of this call.
EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) {
  return GetOmpThreadId();
}

////////////////////////////////////////////////////////////////////////////////
// push params
////////////////////////////////////////////////////////////////////////////////

EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid,
                                    int32_t num_threads) {
  PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads);
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
  tid = GetLogicalThreadIdInBlock();
  omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) =
      num_threads;
}

// Do nothing. The host guarantees we started the requested number of
// teams and we only need inspection of gridDim.
EXTERN void __kmpc_push_num_teams(kmp_Ident *loc, int32_t tid,
                                  int32_t num_teams, int32_t thread_limit) {
  PRINT(LD_IO, "call kmpc_push_num_teams %d\n", (int)num_teams);
  ASSERT0(LT_FUSSY, 0, "should never have anything with new teams on device");
}

EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t tid, int proc_bind) {
  PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", (int)proc_bind);
}

////////////////////////////////////////////////////////////////////////////////
// parallel interface
////////////////////////////////////////////////////////////////////////////////

NOINLINE EXTERN void __kmpc_parallel_51(kmp_Ident *ident, kmp_int32 global_tid,
                                        kmp_int32 if_expr,
                                        kmp_int32 num_threads, int proc_bind,
                                        void *fn, void *wrapper_fn, void **args,
                                        size_t nargs) {
  // Handle the serialized case first. It is the same for SPMD and non-SPMD
  // mode, except that in SPMD mode we already incremented the parallel level
  // counter; account for that.
  bool InParallelRegion =
      (__kmpc_parallel_level() > __kmpc_is_spmd_exec_mode());
  if (!if_expr || InParallelRegion) {
    __kmpc_serialized_parallel(ident, global_tid);
    __kmp_invoke_microtask(global_tid, 0, fn, args, nargs);
    __kmpc_end_serialized_parallel(ident, global_tid);
    return;
  }

  if (__kmpc_is_spmd_exec_mode()) {
    __kmp_invoke_microtask(global_tid, 0, fn, args, nargs);
    return;
  }

  // Handle the num_threads clause.
  if (num_threads != -1)
    __kmpc_push_num_threads(ident, global_tid, num_threads);

  __kmpc_kernel_prepare_parallel((void *)wrapper_fn);

  if (nargs) {
    void **GlobalArgs;
    __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
    // TODO: faster memcpy?
#pragma unroll
    for (int I = 0; I < nargs; I++)
      GlobalArgs[I] = args[I];
  }

  // TODO: what if that's a parallel region with a single thread? this is
  // considered not active in the existing implementation.
  bool IsActiveParallelRegion = threadsInTeam != 1;
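  // Ceiling division: a partially filled trailing warp still counts as one.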
  int NumWarps =
      threadsInTeam / WARPSIZE + ((threadsInTeam % WARPSIZE) ? 1 : 0);
  // Increment parallel level for non-SPMD warps.
  for (int I = 0; I < NumWarps; ++I)
    parallelLevel[I] +=
        (1 + (IsActiveParallelRegion ? OMP_ACTIVE_PARALLEL_LEVEL : 0));

  // Master signals work to activate workers.
  __kmpc_barrier_simple_spmd(ident, 0);

  // OpenMP [2.5, Parallel Construct, p.49]
  // There is an implied barrier at the end of a parallel region. After the
  // end of a parallel region, only the master thread of the team resumes
  // execution of the enclosing task region.
  //
  // The master waits at this barrier until all workers are done.
  __kmpc_barrier_simple_spmd(ident, 0);

  // Decrement parallel level for non-SPMD warps.
  for (int I = 0; I < NumWarps; ++I)
    parallelLevel[I] -=
        (1 + (IsActiveParallelRegion ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
  // TODO: Is synchronization needed since out of parallel execution?

  if (nargs)
    __kmpc_end_sharing_variables();

  // TODO: proc_bind is a noop?
  // if (proc_bind != proc_bind_default)
  //  __kmpc_push_proc_bind(ident, global_tid, proc_bind);
}
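
// Illustrative shape of the call sequence the compiler emits for
//   #pragma omp parallel if(C) num_threads(N)
// (schematic only; the names and exact signatures are not the real codegen):
//
//   void outlined(kmp_int32 *gtid, kmp_int32 *btid /*, captured vars */);
//   void wrapper(kmp_int16 lane, kmp_int32 tid);  // unpacks shared args and
//                                                 // calls outlined
//   __kmpc_parallel_51(ident, gtid, C, N /* -1 if absent */, proc_bind,
//                      (void *)outlined, (void *)wrapper, args, nargs);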

#pragma omp end declare target