//===---- omptarget.h - OpenMP GPU initialization ---------------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the declarations of all library macros, types,
// and functions.
//
//===----------------------------------------------------------------------===//

#ifndef OMPTARGET_H
#define OMPTARGET_H

#include "common/allocator.h"
#include "common/debug.h" // debug
#include "common/state-queue.h"
#include "common/support.h"
#include "interface.h" // interfaces with omp, compiler, and user
#include "target_impl.h"

#define OMPTARGET_NVPTX_VERSION 1.1

// used by the library for the interface with the app
#define DISPATCH_FINISHED 0
#define DISPATCH_NOTFINISHED 1

// used by dynamic scheduling
#define FINISHED 0
#define NOT_FINISHED 1
#define LAST_CHUNK 2

#define BARRIER_COUNTER 0
#define ORDERED_COUNTER 1

// Worker slot type which is initialized with the default worker slot
// size of 4*32 bytes.
struct __kmpc_data_sharing_slot {
  __kmpc_data_sharing_slot *Next;
  __kmpc_data_sharing_slot *Prev;
  void *PrevSlotStackPtr;
  void *DataEnd;
  char Data[DS_Worker_Warp_Slot_Size];
};
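
// A slot is one node in a per-warp stack of data-sharing storage: Data is the
// payload, DataEnd marks one past its last usable byte, and Next/Prev link to
// further slots if the stack grows (see GetPreallocatedSlotAddr below, which
// resets the statically allocated root slot). Illustration only, assuming a
// freshly reset root slot S:
//
//   size_t usable = (char *)S->DataEnd - &S->Data[0]; // DS_Worker_Warp_Slot_Size
//   // S->Next == nullptr, S->Prev == nullptr, S->PrevSlotStackPtr == nullptr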

////////////////////////////////////////////////////////////////////////////////
// task ICV and (implicit & explicit) task state

class omptarget_nvptx_TaskDescr {
public:
  // methods for flags
  INLINE omp_sched_t GetRuntimeSched() const;
  INLINE void SetRuntimeSched(omp_sched_t sched);
  INLINE int InParallelRegion() const { return items.flags & TaskDescr_InPar; }
  INLINE int InL2OrHigherParallelRegion() const {
    return items.flags & TaskDescr_InParL2P;
  }
  INLINE int IsParallelConstruct() const {
    return items.flags & TaskDescr_IsParConstr;
  }
  INLINE int IsTaskConstruct() const { return !IsParallelConstruct(); }
  // methods for other fields
  INLINE uint16_t &ThreadId() { return items.threadId; }
  INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; }
  INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; }
  INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) {
    prev = taskDescr;
  }
  // init & copy
  INLINE void InitLevelZeroTaskDescr();
  INLINE void InitLevelOneTaskDescr(omptarget_nvptx_TaskDescr *parentTaskDescr);
  INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr);
  INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr);
  INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr);
  INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr);
  INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr);
  INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr);
  INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr,
                                   uint16_t tid, uint16_t tnum);
  INLINE void SaveLoopData();
  INLINE void RestoreLoopData() const;

private:
  // bits for flags: (6 used, 2 free)
  //   3 bits (SchedMask) for runtime schedule
  //   1 bit (InPar) if this thread has encountered one or more parallel regions
  //   1 bit (IsParConstr) if ICV for a parallel region (false = explicit task)
  //   1 bit (InParL2+) if this thread has encountered an L2 or higher parallel
  //   region
  static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4);
  static const uint8_t TaskDescr_InPar = 0x10;
  static const uint8_t TaskDescr_IsParConstr = 0x20;
  static const uint8_t TaskDescr_InParL2P = 0x40;
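
  // Illustration only (not a value necessarily stored by the runtime): a flags
  // byte of 0x32 == (0x2 | TaskDescr_InPar | TaskDescr_IsParConstr) would
  // carry schedule bits 0x2 in the low SchedMask bits and describe the ICVs of
  // a parallel construct for a thread that is inside a parallel region.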

  struct SavedLoopDescr_items {
    int64_t loopUpperBound;
    int64_t nextLowerBound;
    int64_t chunk;
    int64_t stride;
    kmp_sched_t schedule;
  } loopData;

  struct TaskDescr_items {
    uint8_t flags; // 6 bits used (see flags above)
    uint8_t unused;
    uint16_t threadId;         // thread id
    uint64_t runtimeChunkSize; // runtime chunk size
  } items;
  omptarget_nvptx_TaskDescr *prev;
};

// build on kmp
typedef struct omptarget_nvptx_ExplicitTaskDescr {
  omptarget_nvptx_TaskDescr
      taskDescr; // omptarget_nvptx task description (must be first)
  kmp_TaskDescr kmpTaskDescr; // kmp task description (must be last)
} omptarget_nvptx_ExplicitTaskDescr;
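
// Keeping taskDescr first and kmpTaskDescr last presumably lets the runtime
// recover the enclosing descriptor from a kmp_TaskDescr pointer (and vice
// versa) with simple pointer arithmetic; a sketch of that assumption:
//
//   omptarget_nvptx_ExplicitTaskDescr full;
//   kmp_TaskDescr *kmpPart = &full.kmpTaskDescr;
//   // (char *)kmpPart - sizeof(omptarget_nvptx_TaskDescr) == (char *)&full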

////////////////////////////////////////////////////////////////////////////////
// Descriptor of a parallel region (worksharing in general)

class omptarget_nvptx_WorkDescr {

public:
  // access to data
  INLINE omptarget_nvptx_TaskDescr *WorkTaskDescr() { return &masterTaskICV; }

private:
  omptarget_nvptx_TaskDescr masterTaskICV;
};

////////////////////////////////////////////////////////////////////////////////

class omptarget_nvptx_TeamDescr {
public:
  // access to data
  INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() {
    return &levelZeroTaskDescr;
  }
  INLINE omptarget_nvptx_WorkDescr &WorkDescr() {
    return workDescrForActiveParallel;
  }

  // init
  INLINE void InitTeamDescr();

  INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) {
    worker_rootS[wid].DataEnd =
        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
    // We currently do not have a next slot.
    worker_rootS[wid].Next = 0;
    worker_rootS[wid].Prev = 0;
    worker_rootS[wid].PrevSlotStackPtr = 0;
    return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
  }
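  // GetPreallocatedSlotAddr resets the statically allocated root slot of warp
  // `wid` (one of the DS_Max_Warp_Number entries in worker_rootS below) so its
  // usable region spans Data[0 .. DS_Worker_Warp_Slot_Size) and it has no
  // linked neighbors yet.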

private:
  omptarget_nvptx_TaskDescr
      levelZeroTaskDescr; // icv for team master initial thread
  omptarget_nvptx_WorkDescr
      workDescrForActiveParallel; // one, ONLY for the active par

  ALIGN(16)
  __kmpc_data_sharing_slot worker_rootS[DS_Max_Warp_Number];
};

////////////////////////////////////////////////////////////////////////////////
// thread private data (struct of arrays for better coalescing)
// tid refers here to the global thread id
// do not support multiple concurrent kernels at this time
class omptarget_nvptx_ThreadPrivateContext {
public:
  // task
  INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) {
    return &levelOneTaskDescr[tid];
  }
  INLINE void SetTopLevelTaskDescr(int tid,
                                   omptarget_nvptx_TaskDescr *taskICV) {
    topTaskDescr[tid] = taskICV;
  }
  INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid) const;
  // parallel
  INLINE uint16_t &NumThreadsForNextParallel(int tid) {
    return nextRegion.tnum[tid];
  }
  // schedule (for dispatch)
  INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; }
  INLINE int64_t &Chunk(int tid) { return chunk[tid]; }
  INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; }
  INLINE int64_t &NextLowerBound(int tid) { return nextLowerBound[tid]; }
  INLINE int64_t &Stride(int tid) { return stride[tid]; }

  INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; }

  INLINE void InitThreadPrivateContext(int tid);
  INLINE uint64_t &Cnt() { return cnt; }

private:
  // team context for this team
  omptarget_nvptx_TeamDescr teamContext;
  // task ICV for implicit threads in the only parallel region
  omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM];
  // pointer where to find the current task ICV (top of the stack)
  omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM];
  union {
    // Only one of the two is live at the same time.
    // parallel
    uint16_t tnum[MAX_THREADS_PER_TEAM];
  } nextRegion;
  // schedule (for dispatch)
  kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for
  int64_t chunk[MAX_THREADS_PER_TEAM];
  int64_t loopUpperBound[MAX_THREADS_PER_TEAM];
  // state for dispatch with dyn/guided OR static (never use both at the same time)
  int64_t nextLowerBound[MAX_THREADS_PER_TEAM];
  int64_t stride[MAX_THREADS_PER_TEAM];
  uint64_t cnt;
};
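
// The per-thread state above is kept as parallel arrays indexed by thread id,
// so neighboring threads touch neighboring elements (the "struct of arrays
// for better coalescing" noted above). Illustration only, where `ctx` is a
// hypothetical pointer to this context and `tid` the calling thread's id:
//
//   ctx->Chunk(tid) = 128;  // per-thread chunk size for the next dispatch
//   ctx->Stride(tid) = 1;
//   int64_t lb = ctx->NextLowerBound(tid);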

/// Memory manager for statically allocated memory.
class omptarget_nvptx_SimpleMemoryManager {
private:
  struct MemDataTy {
    volatile unsigned keys[OMP_STATE_COUNT];
  } MemData[MAX_SM] ALIGN(128);

  INLINE static uint32_t hash(unsigned key) {
    return key & (OMP_STATE_COUNT - 1);
  }
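
  // Note: masking with (OMP_STATE_COUNT - 1) only computes key % OMP_STATE_COUNT
  // when OMP_STATE_COUNT is a power of two, which the hash above relies on.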

public:
  INLINE void Release();
  INLINE const void *Acquire(const void *buf, size_t size);
};

////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////
// global data tables
////////////////////////////////////////////////////////////////////////////////

extern omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager;
extern uint32_t EXTERN_SHARED(usedMemIdx);
extern uint32_t EXTERN_SHARED(usedSlotIdx);
#if _OPENMP
extern uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc)
#else
extern uint8_t EXTERN_SHARED(parallelLevel)[MAX_THREADS_PER_TEAM / WARPSIZE];
#endif
extern uint16_t EXTERN_SHARED(threadLimit);
extern uint16_t EXTERN_SHARED(threadsInTeam);
extern uint16_t EXTERN_SHARED(nThreads);
extern omptarget_nvptx_ThreadPrivateContext *
    EXTERN_SHARED(omptarget_nvptx_threadPrivateContext);

extern int8_t EXTERN_SHARED(execution_param);
extern void *EXTERN_SHARED(ReductionScratchpadPtr);

////////////////////////////////////////////////////////////////////////////////
// work function (outlined parallel/simd functions) and arguments.
// needed for L1 parallelism only.
////////////////////////////////////////////////////////////////////////////////

typedef void *omptarget_nvptx_WorkFn;
extern omptarget_nvptx_WorkFn EXTERN_SHARED(omptarget_nvptx_workFn);
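
// Presumably the team master publishes the outlined parallel function through
// this shared slot and the worker threads load it from there when they are
// released to execute an L1 parallel region, which is why the slot is only
// needed for L1 parallelism.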

////////////////////////////////////////////////////////////////////////////////
// get private data structures
////////////////////////////////////////////////////////////////////////////////

INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor();
INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor();
INLINE omptarget_nvptx_TaskDescr *
getMyTopTaskDescriptor(bool isSPMDExecutionMode);
INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId);

////////////////////////////////////////////////////////////////////////////////
// inlined implementation
////////////////////////////////////////////////////////////////////////////////

INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __builtin_ffs(x); }
INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __builtin_popcount(x); }
INLINE uint32_t __kmpc_impl_ffs(uint64_t x) { return __builtin_ffsl(x); }
INLINE uint32_t __kmpc_impl_popc(uint64_t x) { return __builtin_popcountl(x); }
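
// Reminder on the builtins wrapped above: __builtin_ffs returns the 1-based
// position of the least significant set bit (0 when the argument is 0), and
// __builtin_popcount returns the number of set bits, e.g.:
//
//   __kmpc_impl_ffs(0b1000u) == 4
//   __kmpc_impl_popc(0b1011u) == 3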

#include "common/omptargeti.h"

#endif