//===---- omptarget.h - OpenMP GPU initialization ---------------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the declarations of all library macros, types,
// and functions.
//
//===----------------------------------------------------------------------===//

#ifndef OMPTARGET_H
#define OMPTARGET_H

#include "target_impl.h"
#include "common/debug.h"       // debug
#include "interface.h"          // interfaces with omp, compiler, and user
#include "common/state-queue.h"
#include "common/support.h"

#define OMPTARGET_NVPTX_VERSION 1.1

// used by the library for the interface with the app
#define DISPATCH_FINISHED 0
#define DISPATCH_NOTFINISHED 1

// used by dynamic scheduling
#define FINISHED 0
#define NOT_FINISHED 1
#define LAST_CHUNK 2

#define BARRIER_COUNTER 0
#define ORDERED_COUNTER 1

// arguments needed for L0 parallelism only.
//
// Holds the argument pointer array passed to an outlined parallel region.
// Starts out backed by the fixed-size 'buffer'; if a region needs more than
// MAX_SHARED_ARGS arguments, a larger array is heap-allocated on demand.
class omptarget_nvptx_SharedArgs {
public:
  // All these methods must be called by the master thread only.

  // Reset to the pre-allocated buffer and its capacity.
  INLINE void Init() {
    args = buffer;
    nArgs = MAX_SHARED_ARGS;
  }
  INLINE void DeInit() {
    // Free any memory allocated for outlined parallel function with a large
    // number of arguments.  nArgs > MAX_SHARED_ARGS implies 'args' no longer
    // points at 'buffer' but at a SafeMalloc'ed array (see EnsureSize).
    if (nArgs > MAX_SHARED_ARGS) {
      SafeFree(args, "new extended args");
      Init();
    }
  }
  // Grow capacity to at least 'size' entries.  Capacity only ever grows;
  // previously stored argument values are NOT copied into the new array.
  INLINE void EnsureSize(size_t size) {
    if (size > nArgs) {
      // Free a previous heap allocation (but never the static 'buffer').
      if (nArgs > MAX_SHARED_ARGS) {
        SafeFree(args, "new extended args");
      }
      args = (void **)SafeMalloc(size * sizeof(void *), "new extended args");
      nArgs = size;
    }
  }
  // Called by all threads.
  INLINE void **GetArgs() const { return args; };
private:
  // buffer of pre-allocated arguments.
  void *buffer[MAX_SHARED_ARGS];
  // pointer to arguments buffer.
  // starts off as a pointer to 'buffer' but can be dynamically allocated.
  void **args;
  // starts off as MAX_SHARED_ARGS but can increase in size.
  uint32_t nArgs;
};

extern DEVICE SHARED omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;

// Data structure to keep in shared memory that traces the current slot, stack,
// and frame pointer as well as the active threads that didn't exit the current
// environment.
struct DataSharingStateTy {
  __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number];
  void *StackPtr[DS_Max_Warp_Number];
  void * volatile FramePtr[DS_Max_Warp_Number];
  __kmpc_impl_lanemask_t ActiveThreads[DS_Max_Warp_Number];
};
// Additional worker slot type which is initialized with the default worker slot
// size of 4*32 bytes.
struct __kmpc_data_sharing_worker_slot_static {
  __kmpc_data_sharing_slot *Next;
  __kmpc_data_sharing_slot *Prev;
  void *PrevSlotStackPtr;
  void *DataEnd;
  char Data[DS_Worker_Warp_Slot_Size];
};
// Additional master slot type which is initialized with the default master slot
// size of 4 bytes.
struct __kmpc_data_sharing_master_slot_static {
  __kmpc_data_sharing_slot *Next;
  __kmpc_data_sharing_slot *Prev;
  void *PrevSlotStackPtr;
  void *DataEnd;
  char Data[DS_Slot_Size];
};
extern DEVICE SHARED DataSharingStateTy DataSharingState;

////////////////////////////////////////////////////////////////////////////////
// task ICV and (implicit & explicit) task state

// Descriptor of one task: its ICVs (schedule, chunk, thread id), flag bits
// encoding where in the parallel-region nesting it sits, and a link to the
// previous descriptor on the per-thread stack (see GetPrevTaskDescr).
class omptarget_nvptx_TaskDescr {
public:
  // methods for flags
  INLINE omp_sched_t GetRuntimeSched() const;
  INLINE void SetRuntimeSched(omp_sched_t sched);
  INLINE int InParallelRegion() const { return items.flags & TaskDescr_InPar; }
  INLINE int InL2OrHigherParallelRegion() const {
    return items.flags & TaskDescr_InParL2P;
  }
  INLINE int IsParallelConstruct() const {
    return items.flags & TaskDescr_IsParConstr;
  }
  // A descriptor is either for a parallel construct or an explicit task.
  INLINE int IsTaskConstruct() const { return !IsParallelConstruct(); }
  // methods for other fields
  INLINE uint16_t &ThreadId() { return items.threadId; }
  INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; }
  INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; }
  INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) {
    prev = taskDescr;
  }
  // init & copy
  INLINE void InitLevelZeroTaskDescr();
  INLINE void InitLevelOneTaskDescr(omptarget_nvptx_TaskDescr *parentTaskDescr);
  INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr);
  INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr);
  INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr);
  INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr);
  INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr);
  INLINE
  void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr);
  INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr,
                                   uint16_t tid, uint16_t tnum);
  INLINE void SaveLoopData();
  INLINE void RestoreLoopData() const;

private:
  // bits for flags: (6 used, 2 free)
  //   3 bits (SchedMask) for runtime schedule
  //   1 bit (InPar) if this thread has encountered one or more parallel region
  //   1 bit (IsParConstr) if ICV for a parallel region (false = explicit task)
  //   1 bit (InParL2+) if this thread has encountered L2 or higher parallel
  //   region
  static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4);
  static const uint8_t TaskDescr_InPar = 0x10;
  static const uint8_t TaskDescr_IsParConstr = 0x20;
  static const uint8_t TaskDescr_InParL2P = 0x40;

  // Loop-dispatch state stashed by SaveLoopData/RestoreLoopData.
  struct SavedLoopDescr_items {
    int64_t loopUpperBound;
    int64_t nextLowerBound;
    int64_t chunk;
    int64_t stride;
    kmp_sched_t schedule;
  } loopData;

  struct TaskDescr_items {
    uint8_t flags; // 6 bit used (see flag above)
    uint8_t unused;
    uint16_t threadId;         // thread id
    uint64_t runtimeChunkSize; // runtime chunk size
  } items;
  omptarget_nvptx_TaskDescr *prev;
};

// build on kmp
typedef struct omptarget_nvptx_ExplicitTaskDescr {
  omptarget_nvptx_TaskDescr
      taskDescr; // omptarget_nvptx task description (must be first)
  kmp_TaskDescr kmpTaskDescr; // kmp task description (must be last)
} omptarget_nvptx_ExplicitTaskDescr;

////////////////////////////////////////////////////////////////////////////////
// Descriptor of a parallel region (worksharing in general)

class omptarget_nvptx_WorkDescr {

public:
  // access to data
  INLINE omptarget_nvptx_TaskDescr *WorkTaskDescr() { return &masterTaskICV; }

private:
  // ICVs the master thread publishes for the workers of the active parallel
  // region (see CopyToWorkDescr/CopyFromWorkDescr above).
  omptarget_nvptx_TaskDescr masterTaskICV;
};

////////////////////////////////////////////////////////////////////////////////

// Per-team state: the team master's level-zero task ICVs, the work descriptor
// for the single active parallel region, and the statically allocated root
// data-sharing slots (one per worker warp plus one for the master).
class omptarget_nvptx_TeamDescr {
public:
  // access to data
  INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() {
    return &levelZeroTaskDescr;
  }
  INLINE omptarget_nvptx_WorkDescr &WorkDescr() {
    return workDescrForActiveParallel;
  }

  // init
  INLINE void InitTeamDescr();

  // Return the root data-sharing slot for warp 'wid' (or the master slot),
  // initializing it on first use.  Returns 0 (null) if the slot was already
  // initialized — detected by DataEnd already holding its final value.
  INLINE __kmpc_data_sharing_slot *RootS(int wid, bool IsMasterThread) {
    // If this is invoked by the master thread of the master warp then
    // initialize it with a smaller slot.
    if (IsMasterThread) {
      // Do not initialize this slot again if it has already been initalized.
      if (master_rootS[0].DataEnd == &master_rootS[0].Data[0] + DS_Slot_Size)
        return 0;
      // Initialize the pointer to the end of the slot given the size of the
      // data section. DataEnd is non-inclusive.
      master_rootS[0].DataEnd = &master_rootS[0].Data[0] + DS_Slot_Size;
      // We currently do not have a next slot.
      master_rootS[0].Next = 0;
      master_rootS[0].Prev = 0;
      master_rootS[0].PrevSlotStackPtr = 0;
      return (__kmpc_data_sharing_slot *)&master_rootS[0];
    }
    // Do not initialize this slot again if it has already been initalized.
    if (worker_rootS[wid].DataEnd ==
        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size)
      return 0;
    // Initialize the pointer to the end of the slot given the size of the data
    // section. DataEnd is non-inclusive.
    worker_rootS[wid].DataEnd =
        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
    // We currently do not have a next slot.
    worker_rootS[wid].Next = 0;
    worker_rootS[wid].Prev = 0;
    worker_rootS[wid].PrevSlotStackPtr = 0;
    return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
  }

  // Like RootS for a worker warp, but unconditionally (re)initializes the
  // slot and always returns it — no already-initialized check.
  INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) {
    worker_rootS[wid].DataEnd =
        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
    // We currently do not have a next slot.
    worker_rootS[wid].Next = 0;
    worker_rootS[wid].Prev = 0;
    worker_rootS[wid].PrevSlotStackPtr = 0;
    return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
  }

private:
  omptarget_nvptx_TaskDescr
      levelZeroTaskDescr; // icv for team master initial thread
  omptarget_nvptx_WorkDescr
      workDescrForActiveParallel; // one, ONLY for the active par

  // Root data-sharing slots: one per worker warp, one for the master thread.
  ALIGN(16)
  __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE];
  ALIGN(16) __kmpc_data_sharing_master_slot_static master_rootS[1];
};

////////////////////////////////////////////////////////////////////////////////
// thread private data (struct of arrays for better coalescing)
// tid refers here to the global thread id
// do not support multiple concurrent kernel a this time
class omptarget_nvptx_ThreadPrivateContext {
public:
  // task
  INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) {
    return &levelOneTaskDescr[tid];
  }
  INLINE void SetTopLevelTaskDescr(int tid,
                                   omptarget_nvptx_TaskDescr *taskICV) {
    topTaskDescr[tid] = taskICV;
  }
  INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid) const;
  // parallel
  INLINE uint16_t &NumThreadsForNextParallel(int tid) {
    return nextRegion.tnum[tid];
  }
  // schedule (for dispatch); each accessor returns a mutable reference into
  // the per-thread state arrays below.
  INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; }
  INLINE int64_t &Chunk(int tid) { return chunk[tid]; }
  INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; }
  INLINE int64_t &NextLowerBound(int tid) { return nextLowerBound[tid]; }
  INLINE int64_t &Stride(int tid) { return stride[tid]; }

  INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; }

  INLINE void InitThreadPrivateContext(int tid);
  INLINE uint64_t &Cnt() { return cnt; }

private:
  // team context for this team
  omptarget_nvptx_TeamDescr teamContext;
  // task ICV for implicit threads in the only parallel region
  omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM];
  // pointer where to find the current task ICV (top of the stack)
  omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM];
  union {
    // Only one of the two is live at the same time.
    // parallel
    uint16_t tnum[MAX_THREADS_PER_TEAM];
  } nextRegion;
  // schedule (for dispatch)
  kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for
  int64_t chunk[MAX_THREADS_PER_TEAM];
  int64_t loopUpperBound[MAX_THREADS_PER_TEAM];
  // state for dispatch with dyn/guided OR static (never use both at a time)
  int64_t nextLowerBound[MAX_THREADS_PER_TEAM];
  int64_t stride[MAX_THREADS_PER_TEAM];
  uint64_t cnt;
};

/// Memory manager for statically allocated memory.
class omptarget_nvptx_SimpleMemoryManager {
private:
  // One key table per SM; 128-byte alignment keeps each entry on its own
  // cache line.  'volatile' because keys are polled/updated across blocks.
  ALIGN(128) struct MemDataTy {
    volatile unsigned keys[OMP_STATE_COUNT];
  } MemData[MAX_SM];

  // Map a key to a slot index.  NOTE(review): the mask is only a valid
  // modulo if OMP_STATE_COUNT is a power of two — confirm where it is defined.
  INLINE static uint32_t hash(unsigned key) {
    return key & (OMP_STATE_COUNT - 1);
  }

public:
  INLINE void Release();
  INLINE const void *Acquire(const void *buf, size_t size);
};

////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////
// global data tables
////////////////////////////////////////////////////////////////////////////////

extern DEVICE omptarget_nvptx_SimpleMemoryManager
    omptarget_nvptx_simpleMemoryManager;
extern DEVICE SHARED uint32_t usedMemIdx;
extern DEVICE SHARED uint32_t usedSlotIdx;
// One parallel-level counter per warp in the team.
extern DEVICE SHARED uint8_t
    parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
extern DEVICE SHARED uint16_t threadLimit;
extern DEVICE SHARED uint16_t threadsInTeam;
extern DEVICE SHARED uint16_t nThreads;
extern DEVICE SHARED
    omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;

extern DEVICE SHARED uint32_t execution_param;
extern DEVICE SHARED void *ReductionScratchpadPtr;

////////////////////////////////////////////////////////////////////////////////
// work function (outlined parallel/simd functions) and arguments.
// needed for L1 parallelism only.
////////////////////////////////////////////////////////////////////////////////

typedef void *omptarget_nvptx_WorkFn;
// Volatile: written by the master and polled by worker threads.
extern volatile DEVICE SHARED omptarget_nvptx_WorkFn omptarget_nvptx_workFn;

////////////////////////////////////////////////////////////////////////////////
// get private data structures
////////////////////////////////////////////////////////////////////////////////

INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor();
INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor();
INLINE omptarget_nvptx_TaskDescr *
getMyTopTaskDescriptor(bool isSPMDExecutionMode);
INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId);

////////////////////////////////////////////////////////////////////////////////
// inlined implementation
////////////////////////////////////////////////////////////////////////////////

#include "common/omptargeti.h"

#endif