//===---- parallel.cu - GPU OpenMP parallel implementation ------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Parallel implementation in the GPU. Here is the pattern:
//
//    while (not finished) {
//
//    if (master) {
//      sequential code, decide which par loop to do, or if finished
//      __kmpc_kernel_prepare_parallel() // exec by master only
//    }
//    syncthreads // A
//    __kmpc_kernel_parallel() // exec by all
//    if (this thread is included in the parallel) {
//      switch () for all parallel loops
//      __kmpc_kernel_end_parallel() // exec only by threads in parallel
//    }
//
//
//    The reason we don't exec end_parallel for the threads not included
//    in the parallel loop is that for each barrier in the parallel
//    region, these non-included threads will cycle through the
//    syncthread A. Thus they must preserve their current threadId, which
//    is larger than the number of threads in the team.
//
//    To make a long story short...
//
//===----------------------------------------------------------------------===//
#pragma omp declare target

#include "common/omptarget.h"
#include "target_impl.h"

////////////////////////////////////////////////////////////////////////////////
// support for parallel that goes parallel (1 static level only)
////////////////////////////////////////////////////////////////////////////////

INLINE static uint16_t determineNumberOfThreads(uint16_t NumThreadsClause,
                                                uint16_t NThreadsICV,
                                                uint16_t ThreadLimit) {
  uint16_t ThreadsRequested = NThreadsICV;
  if (NumThreadsClause != 0) {
    ThreadsRequested = NumThreadsClause;
  }

  uint16_t ThreadsAvailable = GetNumberOfWorkersInTeam();
  if (ThreadLimit != 0 && ThreadLimit < ThreadsAvailable) {
    ThreadsAvailable = ThreadLimit;
  }

  uint16_t NumThreads = ThreadsAvailable;
  if (ThreadsRequested != 0 && ThreadsRequested < NumThreads) {
    NumThreads = ThreadsRequested;
  }

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  // On Volta and newer architectures we require that all lanes in
  // a warp participate in the parallel region. Round down to a
  // multiple of WARPSIZE since it is legal to do so in OpenMP.
  if (NumThreads < WARPSIZE) {
    NumThreads = 1;
  } else {
    NumThreads = (NumThreads & ~((uint16_t)WARPSIZE - 1));
  }
#endif

  return NumThreads;
}
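
// Worked example (illustrative, not part of the runtime): assuming
// WARPSIZE == 32 on an sm_70+ target, enough available workers, and no
// thread_limit, a request for 70 threads is rounded down to 64 (two full
// warps), while a request for 20 threads collapses to a single thread
// because 20 < WARPSIZE.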

// This routine is always called by the team master.
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn) {
  PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n");

  omptarget_nvptx_workFn = WorkFn;

  // This routine is only called by the team master. The team master is
  // the first thread of the last warp. It always has the logical thread
  // id of 0 (since it is a shadow for the first worker thread).
  const int threadId = 0;
  omptarget_nvptx_TaskDescr *currTaskDescr =
      omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
  ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
  ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(),
          "cannot be called in a parallel region.");
  if (currTaskDescr->InParallelRegion()) {
    PRINT0(LD_PAR, "already in parallel: go seq\n");
    return;
  }

  uint16_t &NumThreadsClause =
      omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);

  uint16_t NumThreads =
      determineNumberOfThreads(NumThreadsClause, nThreads, threadLimit);

  if (NumThreadsClause != 0) {
    // Reset request to avoid propagating to successive #parallel
    NumThreadsClause = 0;
  }

  ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
         (int)NumThreads);
  ASSERT0(LT_FUSSY,
          __kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(),
          "only team master can create parallel");

  // Set number of threads on work descriptor.
  omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
  workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr);
  threadsInTeam = NumThreads;
}
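
// Illustrative master-side sequence (the real call sites are in
// __kmpc_parallel_51 below; ident and wrapper_fn are its parameters):
//   __kmpc_kernel_prepare_parallel((void *)wrapper_fn); // publish work fn
//   __kmpc_barrier_simple_spmd(ident, 0);               // release workers
//   ... master waits while workers run the region ...
//   __kmpc_barrier_simple_spmd(ident, 0);               // join workers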

// All workers call this function. Deactivate those not needed.
// WorkFn - the outlined work function to execute.
// returns True if this thread is active, else False.
//
// Only the worker threads call this routine.
EXTERN bool __kmpc_kernel_parallel(void **WorkFn) {
  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n");

  // Work function and arguments for L1 parallel region.
  *WorkFn = omptarget_nvptx_workFn;

  // If this is the termination signal from the master, quit early.
  if (!*WorkFn) {
    PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel finished\n");
    return false;
  }

  // Only the worker threads call this routine and the master warp
  // never arrives here. Therefore, use the nvptx thread id.
  int threadId = __kmpc_get_hardware_thread_id_in_block();
  omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
  // Set to true for workers participating in the parallel region.
  bool isActive = false;
  // Initialize state for active threads.
  if (threadId < threadsInTeam) {
    // init task descriptor from the work descriptor
    omptarget_nvptx_TaskDescr *newTaskDescr =
        omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
    ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
    newTaskDescr->CopyFromWorkDescr(workDescr.WorkTaskDescr());
    // install new top descriptor
    omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
                                                               newTaskDescr);
    // init private from int value
    PRINT(LD_PAR,
          "thread will execute parallel region with id %d in a team of "
          "%d threads\n",
          (int)newTaskDescr->ThreadId(), (int)nThreads);

    isActive = true;
  }

  return isActive;
}
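
// Illustrative worker-side use (a sketch of what the generated worker loop
// does with the two entry points above; the cast signature is a placeholder):
//   void *WorkFn;
//   bool IsActive = __kmpc_kernel_parallel(&WorkFn);
//   if (!WorkFn)
//     return;                        // termination signal from the master
//   if (IsActive) {
//     ((void (*)())WorkFn)();        // run the outlined parallel region
//     __kmpc_kernel_end_parallel();  // pop the level-1 task descriptor
//   }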

EXTERN void __kmpc_kernel_end_parallel() {
  // pop stack
  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_parallel\n");
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");

  // Only the worker threads call this routine and the master warp
  // never arrives here. Therefore, use the nvptx thread id.
  int threadId = __kmpc_get_hardware_thread_id_in_block();
  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
      threadId, currTaskDescr->GetPrevTaskDescr());
}

////////////////////////////////////////////////////////////////////////////////
// support for parallel that goes sequential
////////////////////////////////////////////////////////////////////////////////

EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) {
  PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n");

  IncParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask());

  if (isRuntimeUninitialized()) {
    ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(),
            "Expected SPMD mode with uninitialized runtime.");
    return;
  }

  // assume this is only called for nested parallel
  int threadId = GetLogicalThreadIdInBlock();

  // unlike actual parallel, threads in the same team do not share
  // the workTaskDescr in this case and num threads is fixed to 1

  // get current task
  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
  currTaskDescr->SaveLoopData();

  // allocate new task descriptor and copy value from current one, set prev to
  // it
  omptarget_nvptx_TaskDescr *newTaskDescr =
      (omptarget_nvptx_TaskDescr *)SafeMalloc(sizeof(omptarget_nvptx_TaskDescr),
                                              "new seq parallel task");
  newTaskDescr->CopyParent(currTaskDescr);

  // tweak values for serialized parallel case:
  // - each thread becomes ID 0 in its serialized parallel, and
  // - there is only one thread per team
  newTaskDescr->ThreadId() = 0;

  // set new task descriptor as top
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
                                                             newTaskDescr);
}
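
// Illustrative pairing (this mirrors the serialized path taken in
// __kmpc_parallel_51 below): the caller brackets the region body between the
// two entry points, and the body runs on a single thread:
//   __kmpc_serialized_parallel(ident, gtid);
//   __kmp_invoke_microtask(gtid, 0, fn, args, nargs);
//   __kmpc_end_serialized_parallel(ident, gtid);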

EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc,
                                           uint32_t global_tid) {
  PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n");

  DecParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask());

  if (isRuntimeUninitialized()) {
    ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(),
            "Expected SPMD mode with uninitialized runtime.");
    return;
  }

  // pop stack
  int threadId = GetLogicalThreadIdInBlock();
  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
  // set new top
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
      threadId, currTaskDescr->GetPrevTaskDescr());
  // free
  SafeFree(currTaskDescr, "new seq parallel task");
  currTaskDescr = getMyTopTaskDescriptor(threadId);
  currTaskDescr->RestoreLoopData();
}

NOINLINE EXTERN uint8_t __kmpc_parallel_level() {
  return parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1);
}
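
// Encoding example (illustrative; assumes OMP_ACTIVE_PARALLEL_LEVEL is a
// single high-order bit, e.g. 128): parallelLevel[warp] keeps the nesting
// depth in the bits below OMP_ACTIVE_PARALLEL_LEVEL and counts active
// parallel regions in the bits at and above it, so a stored value of
// 128 + 2 reports a level of 2 here while also recording one active region.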

// This kmpc call returns the thread id across all teams. Its value is
// cached by the compiler and used when calling the runtime. On nvptx
// it's cheap to recalculate this value so we never use the result
// of this call.
EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) {
  return GetOmpThreadId();
}

////////////////////////////////////////////////////////////////////////////////
// push params
////////////////////////////////////////////////////////////////////////////////

EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid,
                                    int32_t num_threads) {
  PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads);
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
  tid = GetLogicalThreadIdInBlock();
  omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) =
      num_threads;
}

// Do nothing. The host guarantees we started the requested number of
// teams and we only need inspection of gridDim.

EXTERN void __kmpc_push_num_teams(kmp_Ident *loc, int32_t tid,
                                  int32_t num_teams, int32_t thread_limit) {
  PRINT(LD_IO, "call kmpc_push_num_teams %d\n", (int)num_teams);
  ASSERT0(LT_FUSSY, 0, "should never have anything with new teams on device");
}

EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t tid, int proc_bind) {
  PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", (int)proc_bind);
}

////////////////////////////////////////////////////////////////////////////////
// parallel interface
////////////////////////////////////////////////////////////////////////////////

NOINLINE EXTERN void __kmpc_parallel_51(kmp_Ident *ident, kmp_int32 global_tid,
                                        kmp_int32 if_expr,
                                        kmp_int32 num_threads, int proc_bind,
                                        void *fn, void *wrapper_fn, void **args,
                                        size_t nargs) {
  // Handle the serialized case first. It is the same for SPMD and non-SPMD
  // mode, except that in SPMD mode the parallel level counter has already
  // been incremented; account for that.
  bool InParallelRegion =
      (__kmpc_parallel_level() > __kmpc_is_spmd_exec_mode());
  if (!if_expr || InParallelRegion) {
    __kmpc_serialized_parallel(ident, global_tid);
    __kmp_invoke_microtask(global_tid, 0, fn, args, nargs);
    __kmpc_end_serialized_parallel(ident, global_tid);
    return;
  }

  if (__kmpc_is_spmd_exec_mode()) {
    __kmp_invoke_microtask(global_tid, 0, fn, args, nargs);
    return;
  }

  // Handle the num_threads clause.
  if (num_threads != -1)
    __kmpc_push_num_threads(ident, global_tid, num_threads);

  __kmpc_kernel_prepare_parallel((void *)wrapper_fn);

  if (nargs) {
    void **GlobalArgs;
    __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
    // TODO: faster memcpy?
#pragma unroll
    for (int I = 0; I < nargs; I++)
      GlobalArgs[I] = args[I];
  }

  // TODO: what if that's a parallel region with a single thread? this is
  // considered not active in the existing implementation.
  bool IsActiveParallelRegion = threadsInTeam != 1;
  int NumWarps =
      threadsInTeam / WARPSIZE + ((threadsInTeam % WARPSIZE) ? 1 : 0);
  // Increment parallel level for non-SPMD warps.
  for (int I = 0; I < NumWarps; ++I)
    parallelLevel[I] +=
        (1 + (IsActiveParallelRegion ? OMP_ACTIVE_PARALLEL_LEVEL : 0));

  // Master signals work to activate workers.
  __kmpc_barrier_simple_spmd(ident, 0);

  // OpenMP [2.5, Parallel Construct, p.49]
  // There is an implied barrier at the end of a parallel region. After the
  // end of a parallel region, only the master thread of the team resumes
  // execution of the enclosing task region.
  //
  // The master waits at this barrier until all workers are done.
  __kmpc_barrier_simple_spmd(ident, 0);

  // Decrement parallel level for non-SPMD warps.
  for (int I = 0; I < NumWarps; ++I)
    parallelLevel[I] -=
        (1 + (IsActiveParallelRegion ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
  // TODO: Is synchronization needed since out of parallel execution?

  if (nargs)
    __kmpc_end_sharing_variables();

  // TODO: proc_bind is a noop?
  // if (proc_bind != proc_bind_default)
  //   __kmpc_push_proc_bind(ident, global_tid, proc_bind);
}
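
// Illustrative sketch of the call the compiler emits for a top-level
//   #pragma omp parallel num_threads(N)
// in generic (non-SPMD) mode; the outlined_fn / outlined_fn_wrapper names and
// the shared variables a, b are hypothetical, and ident/gtid come from the
// surrounding codegen:
//   void *args[] = {&a, &b};
//   __kmpc_parallel_51(ident, gtid, /*if_expr=*/1, /*num_threads=*/N,
//                      proc_bind_default, (void *)outlined_fn,
//                      (void *)outlined_fn_wrapper, args, /*nargs=*/2);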

#pragma omp end declare target
