//===---- parallel.cu - NVPTX OpenMP parallel implementation ----- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Parallel implementation on the GPU. Here is the pattern:
//
//    while (not finished) {
//
//    if (master) {
//      sequential code, decide which par loop to do, or if finished
//     __kmpc_kernel_prepare_parallel() // exec by master only
//    }
//    syncthreads // A
//    __kmpc_kernel_parallel() // exec by all
//    if (this thread is included in the parallel) {
//      switch () for all parallel loops
//      __kmpc_kernel_end_parallel() // exec only by threads in parallel
//    }
//
//
//    The reason we don't exec end_parallel for the threads not included
//    in the parallel loop is that for each barrier in the parallel
//    region, these non-included threads will cycle through the
//    syncthread A. Thus they must preserve their current threadId, which
//    is larger than the number of threads in the team.
//
//    To make a long story short...
//
//===----------------------------------------------------------------------===//
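
// To make the pattern above concrete, here is a sketch of the state machine a
// generated non-SPMD kernel runs against these entry points. The control flow
// is taken from the comment above; the helper names (IsTeamMaster, NextWorkFn)
// are illustrative only and not part of this interface:
//
//   for (;;) {
//     if (IsTeamMaster()) {
//       // Sequential part: pick the next outlined region, or publish a null
//       // work function to signal termination.
//       __kmpc_kernel_prepare_parallel((void *)NextWorkFn, /*Init=*/1);
//     }
//     __syncthreads(); // barrier A in the pattern above
//     void *WorkFn;
//     bool IsActive = __kmpc_kernel_parallel(&WorkFn, /*Init=*/1);
//     if (!WorkFn)
//       break;                  // termination signal from the master
//     if (IsActive) {
//       ((void (*)())WorkFn)(); // execute outlined body (signature elided)
//       __kmpc_kernel_end_parallel();
//     }
//     __syncthreads();
//   }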

#include "omptarget-nvptx.h"

typedef struct ConvergentSimdJob {
  omptarget_nvptx_TaskDescr taskDescr;
  omptarget_nvptx_TaskDescr *convHeadTaskDescr;
  uint16_t slimForNextSimd;
} ConvergentSimdJob;

////////////////////////////////////////////////////////////////////////////////
// support for convergent simd (team of threads in a warp only)
////////////////////////////////////////////////////////////////////////////////
EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask,
                                          bool *IsFinal, int32_t *LaneSource,
                                          int32_t *LaneId, int32_t *NumLanes) {
  PRINT0(LD_IO, "call to __kmpc_kernel_convergent_simd\n");
  uint32_t ConvergentMask = Mask;
  int32_t ConvergentSize = __popc(ConvergentMask);
  uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
  *LaneSource += __ffs(WorkRemaining);
  *IsFinal = __popc(WorkRemaining) == 1;
  uint32_t lanemask_lt;
  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
  *LaneId = __popc(ConvergentMask & lanemask_lt);
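  // Worked example with illustrative values, assuming the caller initializes
  // *LaneSource to -1 before the first call: with Mask = 0b1011, the first
  // call computes WorkRemaining = 0b1011, so *LaneSource becomes 0 and
  // *IsFinal is false; later calls select lanes 1 and then 3, and the last
  // call sets *IsFinal since only one set bit remains. lanemask_lt has a bit
  // set for every lane below the caller, so *LaneId is the caller's rank
  // among the set bits of ConvergentMask.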

  int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
  int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;

  ConvergentSimdJob *job = (ConvergentSimdJob *)buffer;
  int32_t SimdLimit =
      omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId);
  job->slimForNextSimd = SimdLimit;

  int32_t SimdLimitSource = __SHFL_SYNC(Mask, SimdLimit, *LaneSource);
  // reset simdlimit to avoid propagating to successive #simd
  if (SimdLimitSource > 0 && threadId == sourceThreadId)
    omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = 0;

  // We cannot have more than the # of convergent threads.
  if (SimdLimitSource > 0)
    *NumLanes = min(ConvergentSize, SimdLimitSource);
  else
    *NumLanes = ConvergentSize;
  ASSERT(LT_FUSSY, *NumLanes > 0, "bad thread request of %d threads",
         (int)*NumLanes);

  // Set to true for lanes participating in the simd region.
  bool isActive = false;
  // Initialize state for active threads.
  if (*LaneId < *NumLanes) {
    omptarget_nvptx_TaskDescr *currTaskDescr =
        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
    omptarget_nvptx_TaskDescr *sourceTaskDescr =
        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(
            sourceThreadId);
    job->convHeadTaskDescr = currTaskDescr;
    // install top descriptor from the thread for which the lanes are working.
    omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
                                                               sourceTaskDescr);
    isActive = true;
  }

  // requires a memory fence between threads of a warp
  return isActive;
}

EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer) {
  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_simd\n");
  // pop stack
  int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
  ConvergentSimdJob *job = (ConvergentSimdJob *)buffer;
  omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) =
      job->slimForNextSimd;
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
      threadId, job->convHeadTaskDescr);
}

typedef struct ConvergentParallelJob {
  omptarget_nvptx_TaskDescr taskDescr;
  omptarget_nvptx_TaskDescr *convHeadTaskDescr;
  uint16_t tnumForNextPar;
} ConvergentParallelJob;

////////////////////////////////////////////////////////////////////////////////
// support for convergent parallelism (team of threads in a warp only)
////////////////////////////////////////////////////////////////////////////////
EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask,
                                              bool *IsFinal,
                                              int32_t *LaneSource) {
  PRINT0(LD_IO, "call to __kmpc_kernel_convergent_parallel\n");
  uint32_t ConvergentMask = Mask;
  int32_t ConvergentSize = __popc(ConvergentMask);
  uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
  *LaneSource += __ffs(WorkRemaining);
  *IsFinal = __popc(WorkRemaining) == 1;
  uint32_t lanemask_lt;
  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
  uint32_t OmpId = __popc(ConvergentMask & lanemask_lt);
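  // OmpId is the caller's rank among the convergent lanes, computed the same
  // way LaneId is in __kmpc_kernel_convergent_simd() above.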

  int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
  int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;

  ConvergentParallelJob *job = (ConvergentParallelJob *)buffer;
  int32_t NumThreadsClause =
      omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
  job->tnumForNextPar = NumThreadsClause;

  int32_t NumThreadsSource = __SHFL_SYNC(Mask, NumThreadsClause, *LaneSource);
  // reset numthreads to avoid propagating to successive #parallel
  if (NumThreadsSource > 0 && threadId == sourceThreadId)
    omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
        0;

  // We cannot have more than the # of convergent threads.
  uint16_t NumThreads;
  if (NumThreadsSource > 0)
    NumThreads = min(ConvergentSize, NumThreadsSource);
  else
    NumThreads = ConvergentSize;
  ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
         (int)NumThreads);

  // Set to true for workers participating in the parallel region.
  bool isActive = false;
  // Initialize state for active threads.
  if (OmpId < NumThreads) {
    // init L2 task descriptor and storage for the L1 parallel task descriptor.
    omptarget_nvptx_TaskDescr *newTaskDescr = &job->taskDescr;
    ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
    omptarget_nvptx_TaskDescr *currTaskDescr =
        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
    omptarget_nvptx_TaskDescr *sourceTaskDescr =
        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(
            sourceThreadId);
    job->convHeadTaskDescr = currTaskDescr;
    newTaskDescr->CopyConvergentParent(sourceTaskDescr, OmpId, NumThreads);
    // install new top descriptor
    omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
                                                               newTaskDescr);
    isActive = true;
  }

  // requires a memory fence between threads of a warp
  return isActive;
}

EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) {
  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n");
  // pop stack
  int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
  ConvergentParallelJob *job = (ConvergentParallelJob *)buffer;
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
      threadId, job->convHeadTaskDescr);
  omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
      job->tnumForNextPar;
}

////////////////////////////////////////////////////////////////////////////////
// support for parallel that goes parallel (1 static level only)
////////////////////////////////////////////////////////////////////////////////

INLINE static uint16_t determineNumberOfThreads(uint16_t NumThreadsClause,
                                                uint16_t NThreadsICV,
                                                uint16_t ThreadLimit) {
  uint16_t ThreadsRequested = NThreadsICV;
  if (NumThreadsClause != 0) {
    ThreadsRequested = NumThreadsClause;
  }

  uint16_t ThreadsAvailable = GetNumberOfWorkersInTeam();
  if (ThreadLimit != 0 && ThreadLimit < ThreadsAvailable) {
    ThreadsAvailable = ThreadLimit;
  }

  uint16_t NumThreads = ThreadsAvailable;
  if (ThreadsRequested != 0 && ThreadsRequested < NumThreads) {
    NumThreads = ThreadsRequested;
  }

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  // On Volta and newer architectures we require that all lanes in
  // a warp participate in the parallel region.  Round down to a
  // multiple of WARPSIZE since it is legal to do so in OpenMP.
  if (NumThreads < WARPSIZE) {
    NumThreads = 1;
  } else {
    NumThreads = (NumThreads & ~((uint16_t)WARPSIZE - 1));
  }
#endif

  return NumThreads;
}
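
// Worked example with illustrative numbers: with NumThreadsClause = 0,
// NThreadsICV = 50, ThreadLimit = 0, and 96 workers in the team, we get
// ThreadsRequested = 50 and ThreadsAvailable = 96, so NumThreads = 50; on
// sm_70 and newer this is then rounded down to 32, one full warp.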

// This routine is always called by the team master.
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
                                           int16_t IsOMPRuntimeInitialized) {
  PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n");
  ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized, "Expected initialized runtime.");

  omptarget_nvptx_workFn = WorkFn;

  // This routine is only called by the team master.  The team master is
  // the first thread of the last warp.  It always has the logical thread
  // id of 0 (since it is a shadow for the first worker thread).
  const int threadId = 0;
  omptarget_nvptx_TaskDescr *currTaskDescr =
      omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
  ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
  ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(),
          "cannot be called in a parallel region.");
  if (currTaskDescr->InParallelRegion()) {
    PRINT0(LD_PAR, "already in parallel: go seq\n");
    return;
  }

  uint16_t &NumThreadsClause =
      omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);

  uint16_t NumThreads =
      determineNumberOfThreads(NumThreadsClause, nThreads, threadLimit);

  if (NumThreadsClause != 0) {
    // Reset request to avoid propagating to successive #parallel
    NumThreadsClause = 0;
  }

  ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
         (int)NumThreads);
  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
          "only team master can create parallel");

  // Set number of threads on work descriptor.
  omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
  workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr);
  threadsInTeam = NumThreads;
}

// All workers call this function.  Deactivate those not needed.
// Fn - the outlined work function to execute.
// returns True if this thread is active, else False.
//
// Only the worker threads call this routine.
EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
                                   int16_t IsOMPRuntimeInitialized) {
  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n");

  ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized, "Expected initialized runtime.");

  // Work function and arguments for L1 parallel region.
  *WorkFn = omptarget_nvptx_workFn;

  // If this is the termination signal from the master, quit early.
  if (!*WorkFn) {
    PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel finished\n");
    return false;
  }

  // Only the worker threads call this routine and the master warp
  // never arrives here.  Therefore, use the nvptx thread id.
  int threadId = GetThreadIdInBlock();
  omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
  // Set to true for workers participating in the parallel region.
  bool isActive = false;
  // Initialize state for active threads.
  if (threadId < threadsInTeam) {
    // init task descriptor from the work descriptor
    omptarget_nvptx_TaskDescr *newTaskDescr =
        omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
    ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
    newTaskDescr->CopyFromWorkDescr(workDescr.WorkTaskDescr());
    // install new top descriptor
    omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
                                                               newTaskDescr);
    // init private from int value
    PRINT(LD_PAR,
          "thread will execute parallel region with id %d in a team of "
          "%d threads\n",
          (int)newTaskDescr->ThreadId(), (int)nThreads);

    isActive = true;
    IncParallelLevel(threadsInTeam != 1);
  }

  return isActive;
}

EXTERN void __kmpc_kernel_end_parallel() {
  // pop stack
  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_parallel\n");
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");

  // Only the worker threads call this routine and the master warp
  // never arrives here.  Therefore, use the nvptx thread id.
  int threadId = GetThreadIdInBlock();
  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
      threadId, currTaskDescr->GetPrevTaskDescr());

  DecParallelLevel(threadsInTeam != 1);
}

////////////////////////////////////////////////////////////////////////////////
// support for parallel that goes sequential
////////////////////////////////////////////////////////////////////////////////

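// A nested parallel region is serialized on the device. As a sketch, for
//
//   #pragma omp parallel          // outer region, becomes the active one
//   {
//     #pragma omp parallel        // inner region, runs with a single thread
//     { /* body */ }
//   }
//
// the compiler brackets the inner body with calls to
// __kmpc_serialized_parallel() and __kmpc_end_serialized_parallel().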
EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) {
  PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n");

  IncParallelLevel(/*ActiveParallel=*/false);

  if (checkRuntimeUninitialized(loc)) {
    ASSERT0(LT_FUSSY, checkSPMDMode(loc),
            "Expected SPMD mode with uninitialized runtime.");
    return;
  }

  // assume this is only called for nested parallel
  int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));

  // unlike actual parallel, threads in the same team do not share
  // the workTaskDescr in this case and num threads is fixed to 1

  // get current task
  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
  currTaskDescr->SaveLoopData();

  // allocate new task descriptor and copy value from current one, set prev to
  // it
  omptarget_nvptx_TaskDescr *newTaskDescr =
      (omptarget_nvptx_TaskDescr *)SafeMalloc(sizeof(omptarget_nvptx_TaskDescr),
                                              "new seq parallel task");
  newTaskDescr->CopyParent(currTaskDescr);

  // tweak values for serialized parallel case:
  // - each thread becomes ID 0 in its serialized parallel, and
  // - there is only one thread per team
  newTaskDescr->ThreadId() = 0;

  // set new task descriptor as top
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
                                                             newTaskDescr);
}

EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc,
                                           uint32_t global_tid) {
  PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n");

  DecParallelLevel(/*ActiveParallel=*/false);

  if (checkRuntimeUninitialized(loc)) {
    ASSERT0(LT_FUSSY, checkSPMDMode(loc),
            "Expected SPMD mode with uninitialized runtime.");
    return;
  }

  // pop stack
  int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
  // set new top
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
      threadId, currTaskDescr->GetPrevTaskDescr());
  // free
  SafeFree(currTaskDescr, (char *)"new seq parallel task");
  currTaskDescr = getMyTopTaskDescriptor(threadId);
  currTaskDescr->RestoreLoopData();
}

EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) {
  PRINT0(LD_IO, "call to __kmpc_parallel_level\n");

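  // The per-warp parallelLevel counter also encodes, in its
  // OMP_ACTIVE_PARALLEL_LEVEL bit, whether one of the enclosing regions is
  // active (multi-threaded); mask that bit off to report only the nesting
  // depth.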
  return parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1);
}

// This kmpc call returns the thread id across all teams. Its value is
// cached by the compiler and used when calling the runtime. On nvptx
// it's cheap to recalculate this value so we never use the result
// of this call.
EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) {
  int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
  return GetOmpThreadId(tid, checkSPMDMode(loc));
}

////////////////////////////////////////////////////////////////////////////////
// push params
////////////////////////////////////////////////////////////////////////////////

EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid,
                                    int32_t num_threads) {
  PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads);
  ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
          "Runtime must be initialized.");
  tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
  omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) =
      num_threads;
}
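
// For reference, a sketch of how __kmpc_push_num_threads() is used: for
// "#pragma omp parallel num_threads(N)" the compiler emits a sequence along
// these lines, where __kmpc_kernel_prepare_parallel() later consumes and
// resets the recorded value:
//
//   __kmpc_push_num_threads(loc, tid, N); // record the clause value
//   __kmpc_kernel_prepare_parallel(...);  // master reads NumThreadsClause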

EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t tid,
                                   int32_t simd_limit) {
  PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", (int)simd_limit);
  ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
          "Runtime must be initialized.");
  tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
  omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit;
}

// Do nothing. The host guarantees we started the requested number of
// teams and we only need inspection of gridDim.

EXTERN void __kmpc_push_num_teams(kmp_Ident *loc, int32_t tid,
                                  int32_t num_teams, int32_t thread_limit) {
  PRINT(LD_IO, "call kmpc_push_num_teams %d\n", (int)num_teams);
  ASSERT0(LT_FUSSY, FALSE,
          "should never have anything with new teams on device");
}

EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t tid,
                                  int proc_bind) {
  PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", (int)proc_bind);
}