//===---- parallel.cu - NVPTX OpenMP parallel implementation ----- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Parallel implementation on the GPU. Here is the pattern:
//
//    while (not finished) {
//
//      if (master) {
//        sequential code, decide which par loop to do, or if finished
//        __kmpc_kernel_prepare_parallel() // exec by master only
//      }
//      syncthreads // A
//      __kmpc_kernel_parallel() // exec by all
//      if (this thread is included in the parallel) {
//        switch () for all parallel loops
//        __kmpc_kernel_end_parallel() // exec only by threads in parallel
//      }
//
//    }
//
// The reason we don't exec end_parallel for the threads not included in
// the parallel loop is that, for each barrier in the parallel region,
// these non-included threads will cycle through syncthreads A. Thus they
// must preserve their current threadId, which is larger than the number
// of threads in the team.
//
// To make a long story short...
//
//===----------------------------------------------------------------------===//

#include "omptarget-nvptx.h"

typedef struct ConvergentSimdJob {
  omptarget_nvptx_TaskDescr taskDescr;
  omptarget_nvptx_TaskDescr *convHeadTaskDescr;
  uint16_t slimForNextSimd;
} ConvergentSimdJob;

////////////////////////////////////////////////////////////////////////////////
// support for convergent simd (team of threads in a warp only)
////////////////////////////////////////////////////////////////////////////////
EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask,
                                          bool *IsFinal, int32_t *LaneSource,
                                          int32_t *LaneId, int32_t *NumLanes) {
  PRINT0(LD_IO, "call to __kmpc_kernel_convergent_simd\n");
  uint32_t ConvergentMask = Mask;
  int32_t ConvergentSize = __popc(ConvergentMask);
  // Find the next source lane above the previous one that still has work;
  // the region is final once a single candidate remains.
  uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
  *LaneSource += __ffs(WorkRemaining);
  *IsFinal = __popc(WorkRemaining) == 1;
  uint32_t lanemask_lt;
  // %lanemask_lt has one bit set per lane with a lower id than ours.
  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
  *LaneId = __popc(ConvergentMask & lanemask_lt);

  int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
  int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;

  ConvergentSimdJob *job = (ConvergentSimdJob *)buffer;
  int32_t SimdLimit =
      omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId);
  job->slimForNextSimd = SimdLimit;

  int32_t SimdLimitSource = __SHFL_SYNC(Mask, SimdLimit, *LaneSource);
  // Reset simdlimit to avoid propagating to successive #simd.
  if (SimdLimitSource > 0 && threadId == sourceThreadId)
    omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = 0;

  // We cannot have more than the # of convergent threads.
  if (SimdLimitSource > 0)
    *NumLanes = min(ConvergentSize, SimdLimitSource);
  else
    *NumLanes = ConvergentSize;
  ASSERT(LT_FUSSY, *NumLanes > 0, "bad thread request of %d threads",
         (int)*NumLanes);

  // Set to true for lanes participating in the simd region.
  bool isActive = false;
  // Initialize state for active threads.
  if (*LaneId < *NumLanes) {
    omptarget_nvptx_TaskDescr *currTaskDescr =
        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
    omptarget_nvptx_TaskDescr *sourceTaskDescr =
        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(
            sourceThreadId);
    job->convHeadTaskDescr = currTaskDescr;
    // Install top descriptor from the thread for which the lanes are working.
    omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
                                                               sourceTaskDescr);
    isActive = true;
  }

  // requires a memory fence between threads of a warp
  return isActive;
}

EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer) {
  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_simd\n");
  // Pop the stack: restore the simd limit and the previous top descriptor.
  int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
  ConvergentSimdJob *job = (ConvergentSimdJob *)buffer;
  omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) =
      job->slimForNextSimd;
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
      threadId, job->convHeadTaskDescr);
}
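
// Illustrative only (not part of the runtime): a compiler-generated caller is
// expected to drive the pair above in a loop, handing the warp to one source
// lane at a time until every lane's work has been executed. Names such as
// `simd_buffer` and `run_simd_region` are placeholders.
//
//   bool IsFinal = false;
//   int32_t LaneSource = -1;
//   int32_t LaneId, NumLanes;
//   do {
//     bool IsActive = __kmpc_kernel_convergent_simd(
//         simd_buffer, Mask, &IsFinal, &LaneSource, &LaneId, &NumLanes);
//     if (IsActive) {
//       run_simd_region(LaneId, NumLanes); // body of the #simd construct
//       __kmpc_kernel_end_convergent_simd(simd_buffer);
//     }
//   } while (!IsFinal);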

typedef struct ConvergentParallelJob {
  omptarget_nvptx_TaskDescr taskDescr;
  omptarget_nvptx_TaskDescr *convHeadTaskDescr;
  uint16_t tnumForNextPar;
} ConvergentParallelJob;

////////////////////////////////////////////////////////////////////////////////
// support for convergent parallelism (team of threads in a warp only)
////////////////////////////////////////////////////////////////////////////////
EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask,
                                              bool *IsFinal,
                                              int32_t *LaneSource) {
  PRINT0(LD_IO, "call to __kmpc_kernel_convergent_parallel\n");
  uint32_t ConvergentMask = Mask;
  int32_t ConvergentSize = __popc(ConvergentMask);
  // Find the next source lane above the previous one that still has work;
  // the region is final once a single candidate remains.
  uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
  *LaneSource += __ffs(WorkRemaining);
  *IsFinal = __popc(WorkRemaining) == 1;
  uint32_t lanemask_lt;
  // %lanemask_lt has one bit set per lane with a lower id than ours.
  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
  uint32_t OmpId = __popc(ConvergentMask & lanemask_lt);

  int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
  int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;

  ConvergentParallelJob *job = (ConvergentParallelJob *)buffer;
  int32_t NumThreadsClause =
      omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
  job->tnumForNextPar = NumThreadsClause;

  int32_t NumThreadsSource = __SHFL_SYNC(Mask, NumThreadsClause, *LaneSource);
  // Reset numthreads to avoid propagating to successive #parallel.
  if (NumThreadsSource > 0 && threadId == sourceThreadId)
    omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
        0;

  // We cannot have more than the # of convergent threads.
  uint16_t NumThreads;
  if (NumThreadsSource > 0)
    NumThreads = min(ConvergentSize, NumThreadsSource);
  else
    NumThreads = ConvergentSize;
  ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
         (int)NumThreads);

  // Set to true for workers participating in the parallel region.
  bool isActive = false;
  // Initialize state for active threads.
  if (OmpId < NumThreads) {
    // Init L2 task descriptor and storage for the L1 parallel task descriptor.
    omptarget_nvptx_TaskDescr *newTaskDescr = &job->taskDescr;
    ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
    omptarget_nvptx_TaskDescr *currTaskDescr =
        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
    omptarget_nvptx_TaskDescr *sourceTaskDescr =
        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(
            sourceThreadId);
    job->convHeadTaskDescr = currTaskDescr;
    newTaskDescr->CopyConvergentParent(sourceTaskDescr, OmpId, NumThreads);
    // Install new top descriptor.
    omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
                                                               newTaskDescr);
    isActive = true;
  }

  // requires a memory fence between threads of a warp
  return isActive;
}

EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) {
  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n");
  // Pop the stack: restore the previous top descriptor and numthreads value.
  int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
  ConvergentParallelJob *job = (ConvergentParallelJob *)buffer;
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
      threadId, job->convHeadTaskDescr);
  omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
      job->tnumForNextPar;
}
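
// Illustrative only (not part of the runtime): as with convergent simd, a
// compiler-generated caller loops until every source lane's parallel region
// has run. `par_buffer` and `run_parallel_region` are placeholders.
//
//   bool IsFinal = false;
//   int32_t LaneSource = -1;
//   do {
//     bool IsActive = __kmpc_kernel_convergent_parallel(
//         par_buffer, Mask, &IsFinal, &LaneSource);
//     if (IsActive) {
//       run_parallel_region(); // body of the nested #parallel construct
//       __kmpc_kernel_end_convergent_parallel(par_buffer);
//     }
//   } while (!IsFinal);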

////////////////////////////////////////////////////////////////////////////////
// support for parallel that goes parallel (1 static level only)
////////////////////////////////////////////////////////////////////////////////

INLINE static uint16_t determineNumberOfThreads(uint16_t NumThreadsClause,
                                                uint16_t NThreadsICV,
                                                uint16_t ThreadLimit) {
  uint16_t ThreadsRequested = NThreadsICV;
  if (NumThreadsClause != 0) {
    ThreadsRequested = NumThreadsClause;
  }

  uint16_t ThreadsAvailable = GetNumberOfWorkersInTeam();
  if (ThreadLimit != 0 && ThreadLimit < ThreadsAvailable) {
    ThreadsAvailable = ThreadLimit;
  }

  uint16_t NumThreads = ThreadsAvailable;
  if (ThreadsRequested != 0 && ThreadsRequested < NumThreads) {
    NumThreads = ThreadsRequested;
  }

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  // On Volta and newer architectures we require that all lanes in
  // a warp participate in the parallel region. Round down to a
  // multiple of WARPSIZE since it is legal to do so in OpenMP.
  if (NumThreads < WARPSIZE) {
    NumThreads = 1;
  } else {
    NumThreads = (NumThreads & ~((uint16_t)WARPSIZE - 1));
  }
#endif

  return NumThreads;
}
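
// Worked example (illustrative): with a num_threads(50) clause, a default
// nthreads-var, no thread_limit, and 96 workers in the team, the clause wins
// and ThreadsRequested = 50, so NumThreads = min(96, 50) = 50. On sm_70 and
// newer, 50 is then rounded down to a multiple of WARPSIZE (32), giving 32;
// a request below 32 collapses to a single thread.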

// This routine is always called by the team master.
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
                                           int16_t IsOMPRuntimeInitialized) {
  PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n");
  ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized, "Expected initialized runtime.");

  omptarget_nvptx_workFn = WorkFn;

  // This routine is only called by the team master. The team master is
  // the first thread of the last warp. It always has the logical thread
  // id of 0 (since it is a shadow for the first worker thread).
  const int threadId = 0;
  omptarget_nvptx_TaskDescr *currTaskDescr =
      omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
  ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
  ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(),
          "cannot be called in a parallel region.");
  if (currTaskDescr->InParallelRegion()) {
    PRINT0(LD_PAR, "already in parallel: go seq\n");
    return;
  }

  uint16_t &NumThreadsClause =
      omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);

  uint16_t NumThreads =
      determineNumberOfThreads(NumThreadsClause, nThreads, threadLimit);

  if (NumThreadsClause != 0) {
    // Reset request to avoid propagating to successive #parallel.
    NumThreadsClause = 0;
  }

  ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
         (int)NumThreads);
  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
          "only team master can create parallel");

  // Set number of threads on work descriptor.
  omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
  workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr);
  threadsInTeam = NumThreads;
}
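
// Illustrative only: on the master, the generated state machine roughly does
// the following before releasing the workers at syncthreads "A" (see the
// pattern at the top of this file). `IsTeamMaster` and `outlined_fn` are
// placeholders for compiler-generated names.
//
//   if (IsTeamMaster()) {
//     __kmpc_kernel_prepare_parallel((void *)outlined_fn,
//                                    /*IsOMPRuntimeInitialized=*/1);
//   }
//   // syncthreads A: workers pick up the work function.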

// All workers call this function. Deactivate those not needed.
// WorkFn - returns the outlined work function to execute.
// Returns true if this thread is active, else false.
//
// Only the worker threads call this routine.
EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
                                   int16_t IsOMPRuntimeInitialized) {
  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n");

  ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized, "Expected initialized runtime.");

  // Work function and arguments for L1 parallel region.
  *WorkFn = omptarget_nvptx_workFn;

  // If this is the termination signal from the master, quit early.
  if (!*WorkFn) {
    PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel finished\n");
    return false;
  }

  // Only the worker threads call this routine and the master warp
  // never arrives here. Therefore, use the nvptx thread id.
  int threadId = GetThreadIdInBlock();
  omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
  // Set to true for workers participating in the parallel region.
  bool isActive = false;
  // Initialize state for active threads.
  if (threadId < threadsInTeam) {
    // Init the thread's task descriptor from the work descriptor.
    omptarget_nvptx_TaskDescr *newTaskDescr =
        omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
    ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
    newTaskDescr->CopyFromWorkDescr(workDescr.WorkTaskDescr());
    // Install new top descriptor.
    omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
                                                               newTaskDescr);
    PRINT(LD_PAR,
          "thread will execute parallel region with id %d in a team of "
          "%d threads\n",
          (int)newTaskDescr->ThreadId(), (int)nThreads);

    isActive = true;
    IncParallelLevel(threadsInTeam != 1);
  }

  return isActive;
}
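
// Illustrative only: each worker drives the state machine roughly as follows
// (simplified; the real loop is emitted by the compiler):
//
//   for (;;) {
//     // syncthreads A (see pattern at the top of this file)
//     void *WorkFn;
//     bool IsActive = __kmpc_kernel_parallel(&WorkFn,
//                                            /*IsOMPRuntimeInitialized=*/1);
//     if (!WorkFn)
//       break; // termination signal from the master
//     if (IsActive) {
//       ((void (*)())WorkFn)(); // run the outlined parallel region
//       __kmpc_kernel_end_parallel();
//     }
//     // syncthreads before the master prepares the next request
//   }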

EXTERN void __kmpc_kernel_end_parallel() {
  // pop stack
  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_parallel\n");
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");

  // Only the worker threads call this routine and the master warp
  // never arrives here. Therefore, use the nvptx thread id.
  int threadId = GetThreadIdInBlock();
  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
      threadId, currTaskDescr->GetPrevTaskDescr());

  DecParallelLevel(threadsInTeam != 1);
}

////////////////////////////////////////////////////////////////////////////////
// support for parallel that goes sequential
////////////////////////////////////////////////////////////////////////////////

EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) {
  PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n");

  IncParallelLevel(/*ActiveParallel=*/false);

  if (checkRuntimeUninitialized(loc)) {
    ASSERT0(LT_FUSSY, checkSPMDMode(loc),
            "Expected SPMD mode with uninitialized runtime.");
    return;
  }

  // assume this is only called for nested parallel
  int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));

  // unlike actual parallel, threads in the same team do not share
  // the workTaskDescr in this case and num threads is fixed to 1

  // get current task
  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
  currTaskDescr->SaveLoopData();

  // allocate a new task descriptor, copy values from the current one, and
  // set its parent to the current task
  omptarget_nvptx_TaskDescr *newTaskDescr =
      (omptarget_nvptx_TaskDescr *)SafeMalloc(sizeof(omptarget_nvptx_TaskDescr),
                                              "new seq parallel task");
  newTaskDescr->CopyParent(currTaskDescr);

  // tweak values for serialized parallel case:
  // - each thread becomes ID 0 in its serialized parallel, and
  // - there is only one thread per team
  newTaskDescr->ThreadId() = 0;

  // set new task descriptor as top
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
                                                             newTaskDescr);
}

EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc,
                                           uint32_t global_tid) {
  PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n");

  DecParallelLevel(/*ActiveParallel=*/false);

  if (checkRuntimeUninitialized(loc)) {
    ASSERT0(LT_FUSSY, checkSPMDMode(loc),
            "Expected SPMD mode with uninitialized runtime.");
    return;
  }

  // pop stack
  int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
  // set new top
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
      threadId, currTaskDescr->GetPrevTaskDescr());
  // free
  SafeFree(currTaskDescr, (char *)"new seq parallel task");
  currTaskDescr = getMyTopTaskDescriptor(threadId);
  currTaskDescr->RestoreLoopData();
}
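
// Illustrative only: for a nested (serialized) parallel region, the compiler
// emits a bracketed pair around the outlined body; `outlined_fn` and `gtid`
// are placeholders:
//
//   __kmpc_serialized_parallel(loc, gtid);
//   outlined_fn(...); // executed by a "team" of one thread, with thread id 0
//   __kmpc_end_serialized_parallel(loc, gtid);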

EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) {
  PRINT0(LD_IO, "call to __kmpc_parallel_level\n");

  return parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1);
}
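
// Sketch of the encoding the mask above relies on, assuming the convention
// used by IncParallelLevel/DecParallelLevel: the per-warp parallelLevel
// counter keeps the nesting depth in the bits below OMP_ACTIVE_PARALLEL_LEVEL
// (a power of two) and counts active parallel regions in the bits at and
// above it, so:
//
//   depth  = parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1);
//   active = parallelLevel[GetWarpId()] & ~(OMP_ACTIVE_PARALLEL_LEVEL - 1);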

// This kmpc call returns the thread id across all teams. Its value is
// cached by the compiler and used when calling the runtime. On nvptx
// it's cheap to recalculate this value so we never use the result
// of this call.
EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) {
  int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
  return GetOmpThreadId(tid, checkSPMDMode(loc));
}

////////////////////////////////////////////////////////////////////////////////
// push params
////////////////////////////////////////////////////////////////////////////////

EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid,
                                    int32_t num_threads) {
  PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads);
  ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
          "Runtime must be initialized.");
  tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
  omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) =
      num_threads;
}
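
// Illustrative lowering (simplified): for
//
//   #pragma omp parallel num_threads(4)
//
// the compiler emits, before entering the region:
//
//   __kmpc_push_num_threads(loc, gtid, 4);
//   // ... then __kmpc_kernel_prepare_parallel() on the master, which
//   // consumes and resets the request (see above).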

EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t tid,
                                   int32_t simd_limit) {
  PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", (int)simd_limit);
  ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
          "Runtime must be initialized.");
  tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
  omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit;
}
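
// Illustrative lowering (simplified, and an assumption about the clause that
// drives it): a simdlen(8)-style request maps to a limit pushed before the
// convergent simd entry point consumes it:
//
//   __kmpc_push_simd_limit(loc, gtid, 8);
//   // ... then __kmpc_kernel_convergent_simd() reads and resets the limit.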

// Do nothing. The host guarantees we started the requested number of
// teams and we only need inspection of gridDim.

EXTERN void __kmpc_push_num_teams(kmp_Ident *loc, int32_t tid,
                                  int32_t num_teams, int32_t thread_limit) {
  PRINT(LD_IO, "call kmpc_push_num_teams %d\n", (int)num_teams);
  ASSERT0(LT_FUSSY, FALSE,
          "should never have anything with new teams on device");
}

EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t tid,
                                  int proc_bind) {
  PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", (int)proc_bind);
}