1 //===---- omptarget.h - OpenMP GPU initialization ---------------- CUDA -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the declarations of all library macros, types,
10 // and functions.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef OMPTARGET_H
15 #define OMPTARGET_H
16 
17 #include "target_impl.h"
18 #include "common/debug.h"     // debug
19 #include "interface.h" // interfaces with omp, compiler, and user
20 #include "common/state-queue.h"
21 #include "common/support.h"
22 
23 #define OMPTARGET_NVPTX_VERSION 1.1
24 
25 // used by the library for the interface with the app
26 #define DISPATCH_FINISHED 0
27 #define DISPATCH_NOTFINISHED 1
28 
29 // used by dynamic scheduling
30 #define FINISHED 0
31 #define NOT_FINISHED 1
32 #define LAST_CHUNK 2
33 
34 #define BARRIER_COUNTER 0
35 #define ORDERED_COUNTER 1
36 
37 // arguments needed for L0 parallelism only.
38 class omptarget_nvptx_SharedArgs {
39 public:
40   // All these methods must be called by the master thread only.
Init()41   INLINE void Init() {
42     args  = buffer;
43     nArgs = MAX_SHARED_ARGS;
44   }
DeInit()45   INLINE void DeInit() {
46     // Free any memory allocated for outlined parallel function with a large
47     // number of arguments.
48     if (nArgs > MAX_SHARED_ARGS) {
49       SafeFree(args, "new extended args");
50       Init();
51     }
52   }
EnsureSize(size_t size)53   INLINE void EnsureSize(size_t size) {
54     if (size > nArgs) {
55       if (nArgs > MAX_SHARED_ARGS) {
56         SafeFree(args, "new extended args");
57       }
58       args = (void **)SafeMalloc(size * sizeof(void *), "new extended args");
59       nArgs = size;
60     }
61   }
62   // Called by all threads.
GetArgs()63   INLINE void **GetArgs() const { return args; };
64 private:
65   // buffer of pre-allocated arguments.
66   void *buffer[MAX_SHARED_ARGS];
67   // pointer to arguments buffer.
68   // starts off as a pointer to 'buffer' but can be dynamically allocated.
69   void **args;
70   // starts off as MAX_SHARED_ARGS but can increase in size.
71   uint32_t nArgs;
72 };
73 
// Team-wide argument buffer instance, placed in shared memory (SHARED).
extern DEVICE SHARED omptarget_nvptx_SharedArgs
    omptarget_nvptx_globalArgs;
76 
77 // Data structure to keep in shared memory that traces the current slot, stack,
78 // and frame pointer as well as the active threads that didn't exit the current
79 // environment.
struct DataSharingStateTy {
  // Current data-sharing slot of each warp.
  __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number];
  // Current stack pointer within the slot, per warp.
  void *StackPtr[DS_Max_Warp_Number];
  // Current frame pointer, per warp. NOTE(review): marked volatile —
  // presumably re-read across divergent threads; confirm in users.
  void * volatile FramePtr[DS_Max_Warp_Number];
  // Mask of threads that did not yet exit the current environment, per warp.
  __kmpc_impl_lanemask_t ActiveThreads[DS_Max_Warp_Number];
};
// Additional worker slot type which is initialized with the default worker slot
// size of 4*32 bytes.
// Layout mirrors __kmpc_data_sharing_slot with inline trailing storage:
// omptarget_nvptx_TeamDescr::RootS casts a pointer to this type directly to
// __kmpc_data_sharing_slot *, so field order must not change.
struct __kmpc_data_sharing_worker_slot_static {
  __kmpc_data_sharing_slot *Next; // next slot in the chain (0 when none)
  __kmpc_data_sharing_slot *Prev; // previous slot in the chain (0 when none)
  // NOTE(review): presumably the saved stack pointer of the previous slot —
  // confirm in the data-sharing implementation.
  void *PrevSlotStackPtr;
  void *DataEnd; // one past the last usable byte of Data (non-inclusive)
  char Data[DS_Worker_Warp_Slot_Size]; // inline data storage
};
// Additional master slot type which is initialized with the default master slot
// size of 4 bytes.
// Layout mirrors __kmpc_data_sharing_slot with inline trailing storage:
// omptarget_nvptx_TeamDescr::RootS casts a pointer to this type directly to
// __kmpc_data_sharing_slot *, so field order must not change.
struct __kmpc_data_sharing_master_slot_static {
  __kmpc_data_sharing_slot *Next; // next slot in the chain (0 when none)
  __kmpc_data_sharing_slot *Prev; // previous slot in the chain (0 when none)
  // NOTE(review): presumably the saved stack pointer of the previous slot —
  // confirm in the data-sharing implementation.
  void *PrevSlotStackPtr;
  void *DataEnd; // one past the last usable byte of Data (non-inclusive)
  char Data[DS_Slot_Size]; // inline data storage
};
// Per-team data-sharing trace (slot/stack/frame/active threads), kept in
// shared memory.
extern DEVICE SHARED DataSharingStateTy DataSharingState;
105 
106 ////////////////////////////////////////////////////////////////////////////////
107 // task ICV and (implicit & explicit) task state
108 
// Descriptor of one (implicit or explicit) task: its ICVs (flags, thread
// id, runtime chunk size), saved loop-dispatch state, and a link to the
// parent task descriptor.
class omptarget_nvptx_TaskDescr {
public:
  // methods for flags
  INLINE omp_sched_t GetRuntimeSched() const;
  INLINE void SetRuntimeSched(omp_sched_t sched);
  // Nonzero if this thread has encountered one or more parallel regions.
  INLINE int InParallelRegion() const { return items.flags & TaskDescr_InPar; }
  // Nonzero if this thread has encountered an L2-or-deeper parallel region.
  INLINE int InL2OrHigherParallelRegion() const {
    return items.flags & TaskDescr_InParL2P;
  }
  // Nonzero if this descriptor holds ICVs for a parallel region
  // (false = explicit task).
  INLINE int IsParallelConstruct() const {
    return items.flags & TaskDescr_IsParConstr;
  }
  INLINE int IsTaskConstruct() const { return !IsParallelConstruct(); }
  // methods for other fields (return references, so callers may also write)
  INLINE uint16_t &ThreadId() { return items.threadId; }
  INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; }
  // Parent link to the enclosing task's descriptor.
  INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; }
  INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) {
    prev = taskDescr;
  }
  // init & copy (implementations live in the inlined-implementation header
  // included at the end of this file)
  INLINE void InitLevelZeroTaskDescr();
  INLINE void InitLevelOneTaskDescr(omptarget_nvptx_TaskDescr *parentTaskDescr);
  INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr);
  INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr);
  INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr);
  INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr);
  INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr);
  INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr);
  INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr,
                                   uint16_t tid, uint16_t tnum);
  INLINE void SaveLoopData();
  INLINE void RestoreLoopData() const;

private:
  // bits for flags: (6 used, 2 free)
  //   3 bits (SchedMask) for runtime schedule
  //   1 bit (InPar) if this thread has encountered one or more parallel region
  //   1 bit (IsParConstr) if ICV for a parallel region (false = explicit task)
  //   1 bit (InParL2+) if this thread has encountered L2 or higher parallel
  //   region
  // Note: bits 0x08 and 0x80 are the two free bits.
  static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4);
  static const uint8_t TaskDescr_InPar = 0x10;
  static const uint8_t TaskDescr_IsParConstr = 0x20;
  static const uint8_t TaskDescr_InParL2P = 0x40;

  // Loop-dispatch state saved/restored by SaveLoopData/RestoreLoopData.
  struct SavedLoopDescr_items {
    int64_t loopUpperBound;
    int64_t nextLowerBound;
    int64_t chunk;
    int64_t stride;
    kmp_sched_t schedule;
  } loopData;

  struct TaskDescr_items {
    uint8_t flags; // 6 bit used (see flag above)
    uint8_t unused;
    uint16_t threadId;         // thread id
    uint64_t runtimeChunkSize; // runtime chunk size
  } items;
  // Enclosing (parent) task descriptor — NOTE(review): presumably null for
  // the level-zero task; confirm in InitLevelZeroTaskDescr.
  omptarget_nvptx_TaskDescr *prev;
};
171 
172 // build on kmp
// Explicit-task descriptor pairing the omptarget task description with the
// kmp one. Field order is load-bearing (see the "must be first/last"
// comments) — NOTE(review): callers presumably convert between pointers to
// the two members; confirm before reordering.
typedef struct omptarget_nvptx_ExplicitTaskDescr {
  omptarget_nvptx_TaskDescr
      taskDescr; // omptarget_nvptx task description (must be first)
  kmp_TaskDescr kmpTaskDescr; // kmp task description (must be last)
} omptarget_nvptx_ExplicitTaskDescr;
178 
179 ////////////////////////////////////////////////////////////////////////////////
180 // Descriptor of a parallel region (worksharing in general)
181 
class omptarget_nvptx_WorkDescr {

public:
  // access to data
  // Task descriptor (ICVs) the master prepares for the workers of the
  // active parallel region.
  INLINE omptarget_nvptx_TaskDescr *WorkTaskDescr() { return &masterTaskICV; }

private:
  // ICV snapshot for the active parallel region — NOTE(review): presumably
  // filled/read via CopyToWorkDescr/CopyFromWorkDescr on
  // omptarget_nvptx_TaskDescr; confirm in the inlined implementations.
  omptarget_nvptx_TaskDescr masterTaskICV;
};
191 
192 ////////////////////////////////////////////////////////////////////////////////
193 
194 class omptarget_nvptx_TeamDescr {
195 public:
196   // access to data
LevelZeroTaskDescr()197   INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() {
198     return &levelZeroTaskDescr;
199   }
WorkDescr()200   INLINE omptarget_nvptx_WorkDescr &WorkDescr() {
201     return workDescrForActiveParallel;
202   }
203 
204   // init
205   INLINE void InitTeamDescr();
206 
RootS(int wid,bool IsMasterThread)207   INLINE __kmpc_data_sharing_slot *RootS(int wid, bool IsMasterThread) {
208     // If this is invoked by the master thread of the master warp then
209     // initialize it with a smaller slot.
210     if (IsMasterThread) {
211       // Do not initialize this slot again if it has already been initalized.
212       if (master_rootS[0].DataEnd == &master_rootS[0].Data[0] + DS_Slot_Size)
213         return 0;
214       // Initialize the pointer to the end of the slot given the size of the
215       // data section. DataEnd is non-inclusive.
216       master_rootS[0].DataEnd = &master_rootS[0].Data[0] + DS_Slot_Size;
217       // We currently do not have a next slot.
218       master_rootS[0].Next = 0;
219       master_rootS[0].Prev = 0;
220       master_rootS[0].PrevSlotStackPtr = 0;
221       return (__kmpc_data_sharing_slot *)&master_rootS[0];
222     }
223     // Do not initialize this slot again if it has already been initalized.
224     if (worker_rootS[wid].DataEnd ==
225         &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size)
226       return 0;
227     // Initialize the pointer to the end of the slot given the size of the data
228     // section. DataEnd is non-inclusive.
229     worker_rootS[wid].DataEnd =
230         &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
231     // We currently do not have a next slot.
232     worker_rootS[wid].Next = 0;
233     worker_rootS[wid].Prev = 0;
234     worker_rootS[wid].PrevSlotStackPtr = 0;
235     return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
236   }
237 
GetPreallocatedSlotAddr(int wid)238   INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) {
239     worker_rootS[wid].DataEnd =
240         &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
241     // We currently do not have a next slot.
242     worker_rootS[wid].Next = 0;
243     worker_rootS[wid].Prev = 0;
244     worker_rootS[wid].PrevSlotStackPtr = 0;
245     return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
246   }
247 
248 private:
249   omptarget_nvptx_TaskDescr
250       levelZeroTaskDescr; // icv for team master initial thread
251   omptarget_nvptx_WorkDescr
252       workDescrForActiveParallel; // one, ONLY for the active par
253 
254   ALIGN(16)
255   __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE];
256   ALIGN(16) __kmpc_data_sharing_master_slot_static master_rootS[1];
257 };
258 
259 ////////////////////////////////////////////////////////////////////////////////
260 // thread private data (struct of arrays for better coalescing)
261 // tid refers here to the global thread id
262 // do not support multiple concurrent kernel a this time
class omptarget_nvptx_ThreadPrivateContext {
public:
  // task
  // Implicit-task descriptor for thread `tid` in the level-one parallel
  // region.
  INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) {
    return &levelOneTaskDescr[tid];
  }
  // Record `taskICV` as the current top-of-stack task ICV for thread `tid`.
  INLINE void SetTopLevelTaskDescr(int tid,
                                   omptarget_nvptx_TaskDescr *taskICV) {
    topTaskDescr[tid] = taskICV;
  }
  INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid) const;
  // parallel
  // Writable slot for `tid`'s thread count of the next parallel region
  // (stored in the nextRegion union).
  INLINE uint16_t &NumThreadsForNextParallel(int tid) {
    return nextRegion.tnum[tid];
  }
  // schedule (for dispatch) — each accessor returns a writable reference to
  // per-thread dispatch state.
  INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; }
  INLINE int64_t &Chunk(int tid) { return chunk[tid]; }
  INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; }
  INLINE int64_t &NextLowerBound(int tid) { return nextLowerBound[tid]; }
  INLINE int64_t &Stride(int tid) { return stride[tid]; }

  // Team-level context embedded in this per-team structure.
  INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; }

  INLINE void InitThreadPrivateContext(int tid);
  // Single shared counter (not per-tid) — NOTE(review): presumably used by
  // dynamic-loop bookkeeping; confirm in the dispatch implementation.
  INLINE uint64_t &Cnt() { return cnt; }

private:
  // team context for this team
  omptarget_nvptx_TeamDescr teamContext;
  // task ICV for implicit threads in the only parallel region
  omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM];
  // pointer where to find the current task ICV (top of the stack)
  omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM];
  union {
    // Only one of the two is live at the same time.
    // parallel
    uint16_t tnum[MAX_THREADS_PER_TEAM];
  } nextRegion;
  // schedule (for dispatch)
  kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for
  int64_t chunk[MAX_THREADS_PER_TEAM];
  int64_t loopUpperBound[MAX_THREADS_PER_TEAM];
  // state for dispatch with dyn/guided OR static (never use both at a time)
  int64_t nextLowerBound[MAX_THREADS_PER_TEAM];
  int64_t stride[MAX_THREADS_PER_TEAM];
  uint64_t cnt;
};
311 
312 /// Memory manager for statically allocated memory.
class omptarget_nvptx_SimpleMemoryManager {
private:
  // Per-SM key table, 128-byte aligned; entries are volatile so updates are
  // re-read by other threads rather than cached.
  ALIGN(128) struct MemDataTy {
    volatile unsigned keys[OMP_STATE_COUNT];
  } MemData[MAX_SM];

  // Map a key to a table index by masking. NOTE(review): this assumes
  // OMP_STATE_COUNT is a power of two — confirm where it is defined.
  INLINE static uint32_t hash(unsigned key) {
    return key & (OMP_STATE_COUNT - 1);
  }

public:
  // Release/Acquire are defined in the inlined-implementation header
  // included at the end of this file.
  INLINE void Release();
  INLINE const void *Acquire(const void *buf, size_t size);
};
327 
328 ////////////////////////////////////////////////////////////////////////////////
329 
330 ////////////////////////////////////////////////////////////////////////////////
331 // global data tables
332 ////////////////////////////////////////////////////////////////////////////////
333 
// Device-global instance of the static memory manager.
extern DEVICE omptarget_nvptx_SimpleMemoryManager
    omptarget_nvptx_simpleMemoryManager;
// Per-team bookkeeping indices for the memory manager — NOTE(review):
// exact semantics defined where these are written; confirm in the .cu files.
extern DEVICE SHARED uint32_t usedMemIdx;
extern DEVICE SHARED uint32_t usedSlotIdx;
// Parallel-nesting level, tracked per warp (one byte per warp in the team).
extern DEVICE SHARED uint8_t
    parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
// Team-wide thread accounting.
extern DEVICE SHARED uint16_t threadLimit;
extern DEVICE SHARED uint16_t threadsInTeam;
extern DEVICE SHARED uint16_t nThreads;
// Pointer to this team's private context (tasks, schedules, loop state).
extern DEVICE SHARED
    omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;

extern DEVICE SHARED uint32_t execution_param;
extern DEVICE SHARED void *ReductionScratchpadPtr;
348 
349 ////////////////////////////////////////////////////////////////////////////////
350 // work function (outlined parallel/simd functions) and arguments.
351 // needed for L1 parallelism only.
352 ////////////////////////////////////////////////////////////////////////////////
353 
// Opaque pointer to an outlined parallel/simd work function.
typedef void *omptarget_nvptx_WorkFn;
// Work function currently published to the team, in shared memory;
// volatile — NOTE(review): presumably re-read across the master/worker
// handshake; confirm in the parallel-region implementation.
extern volatile DEVICE SHARED omptarget_nvptx_WorkFn
    omptarget_nvptx_workFn;
357 
358 ////////////////////////////////////////////////////////////////////////////////
359 // get private data structures
360 ////////////////////////////////////////////////////////////////////////////////
361 
// Accessors for the calling thread's current team, work, and task
// descriptors (implementations in the inlined-implementation header below).
INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor();
INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor();
// Two overloads: select the top task descriptor either by execution mode or
// by an explicit global thread id.
INLINE omptarget_nvptx_TaskDescr *
getMyTopTaskDescriptor(bool isSPMDExecutionMode);
INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId);
367 
368 ////////////////////////////////////////////////////////////////////////////////
369 // inlined implementation
370 ////////////////////////////////////////////////////////////////////////////////
371 
372 #include "common/omptargeti.h"
373 
374 #endif
375