1 //===---- omptargeti.h - OpenMP GPU initialization --------------- CUDA -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
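// This file contains the inline definitions of the task descriptor, thread
// private context, team descriptor and simple memory manager member
// functions, together with the helper routines used to reach the per-thread
// and per-team descriptors.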
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "common/target_atomic.h"
15 
16 ////////////////////////////////////////////////////////////////////////////////
17 // Task Descriptor
18 ////////////////////////////////////////////////////////////////////////////////
19 
GetRuntimeSched()20 INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() const {
21   // sched starts from 1..4; encode it as 0..3; so add 1 here
22   uint8_t rc = (items.flags & TaskDescr_SchedMask) + 1;
23   return (omp_sched_t)rc;
24 }
25 
SetRuntimeSched(omp_sched_t sched)26 INLINE void omptarget_nvptx_TaskDescr::SetRuntimeSched(omp_sched_t sched) {
27   // sched starts from 1..4; encode it as 0..3; so sub 1 here
28   uint8_t val = ((uint8_t)sched) - 1;
29   // clear current sched
30   items.flags &= ~TaskDescr_SchedMask;
31   // set new sched
32   items.flags |= val;
33 }
34 
35 INLINE void
InitLevelZeroTaskDescr()36 omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() {
37   // slow method
38   // flag:
39   //   default sched is static,
40   //   dyn is off (unused now anyway, but may need to sample from host ?)
41   //   not in parallel
42 
43   items.flags = 0;
44   items.threadId = 0;         // is master
45   items.runtimeChunkSize = 1; // preferred chunking statik with chunk 1
46 }
47 
48 // This is called when all threads are started together in SPMD mode.
49 // OMP directives include target parallel, target distribute parallel for, etc.
InitLevelOneTaskDescr(omptarget_nvptx_TaskDescr * parentTaskDescr)50 INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr(
51     omptarget_nvptx_TaskDescr *parentTaskDescr) {
52   // slow method
53   // flag:
54   //   default sched is static,
55   //   dyn is off (unused now anyway, but may need to sample from host ?)
56   //   in L1 parallel
57 
58   items.flags =
59       TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
60   items.threadId =
61       GetThreadIdInBlock(); // get ids from cuda (only called for 1st level)
62   items.runtimeChunkSize = 1; // preferred chunking statik with chunk 1
63   prev = parentTaskDescr;
64 }
65 
CopyData(omptarget_nvptx_TaskDescr * sourceTaskDescr)66 INLINE void omptarget_nvptx_TaskDescr::CopyData(
67     omptarget_nvptx_TaskDescr *sourceTaskDescr) {
68   items = sourceTaskDescr->items;
69 }
70 
71 INLINE void
Copy(omptarget_nvptx_TaskDescr * sourceTaskDescr)72 omptarget_nvptx_TaskDescr::Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr) {
73   CopyData(sourceTaskDescr);
74   prev = sourceTaskDescr->prev;
75 }
76 
CopyParent(omptarget_nvptx_TaskDescr * parentTaskDescr)77 INLINE void omptarget_nvptx_TaskDescr::CopyParent(
78     omptarget_nvptx_TaskDescr *parentTaskDescr) {
79   CopyData(parentTaskDescr);
80   prev = parentTaskDescr;
81 }
82 
CopyForExplicitTask(omptarget_nvptx_TaskDescr * parentTaskDescr)83 INLINE void omptarget_nvptx_TaskDescr::CopyForExplicitTask(
84     omptarget_nvptx_TaskDescr *parentTaskDescr) {
85   CopyParent(parentTaskDescr);
86   items.flags = items.flags & ~TaskDescr_IsParConstr;
87   ASSERT0(LT_FUSSY, IsTaskConstruct(), "expected task");
88 }
89 
CopyToWorkDescr(omptarget_nvptx_TaskDescr * masterTaskDescr)90 INLINE void omptarget_nvptx_TaskDescr::CopyToWorkDescr(
91     omptarget_nvptx_TaskDescr *masterTaskDescr) {
92   CopyParent(masterTaskDescr);
93   // overwrite specific items;
94   items.flags |=
95       TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
96 }
97 
CopyFromWorkDescr(omptarget_nvptx_TaskDescr * workTaskDescr)98 INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr(
99     omptarget_nvptx_TaskDescr *workTaskDescr) {
100   Copy(workTaskDescr);
101   //
102   // overwrite specific items;
103   //
104   // The threadID should be GetThreadIdInBlock() % GetMasterThreadID().
105   // This is so that the serial master (first lane in the master warp)
106   // gets a threadId of 0.
107   // However, we know that this function is always called in a parallel
108   // region where only workers are active.  The serial master thread
109   // never enters this region.  When a parallel region is executed serially,
110   // the threadId is set to 0 elsewhere and the kmpc_serialized_* functions
111   // are called, which never activate this region.
112   items.threadId =
113       GetThreadIdInBlock(); // get ids from cuda (only called for 1st level)
114 }
115 
CopyConvergentParent(omptarget_nvptx_TaskDescr * parentTaskDescr,uint16_t tid,uint16_t tnum)116 INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent(
117     omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum) {
118   CopyParent(parentTaskDescr);
119   items.flags |= TaskDescr_InParL2P; // In L2+ parallelism
120   items.threadId = tid;
121 }
122 
SaveLoopData()123 INLINE void omptarget_nvptx_TaskDescr::SaveLoopData() {
124   loopData.loopUpperBound =
125       omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId);
126   loopData.nextLowerBound =
127       omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId);
128   loopData.schedule =
129       omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId);
130   loopData.chunk = omptarget_nvptx_threadPrivateContext->Chunk(items.threadId);
131   loopData.stride =
132       omptarget_nvptx_threadPrivateContext->Stride(items.threadId);
133 }
134 
RestoreLoopData()135 INLINE void omptarget_nvptx_TaskDescr::RestoreLoopData() const {
136   omptarget_nvptx_threadPrivateContext->Chunk(items.threadId) = loopData.chunk;
137   omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId) =
138       loopData.loopUpperBound;
139   omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId) =
140       loopData.nextLowerBound;
141   omptarget_nvptx_threadPrivateContext->Stride(items.threadId) =
142       loopData.stride;
143   omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId) =
144       loopData.schedule;
145 }
146 
147 ////////////////////////////////////////////////////////////////////////////////
148 // Thread Private Context
149 ////////////////////////////////////////////////////////////////////////////////
150 
151 INLINE omptarget_nvptx_TaskDescr *
GetTopLevelTaskDescr(int tid)152 omptarget_nvptx_ThreadPrivateContext::GetTopLevelTaskDescr(int tid) const {
153   ASSERT0(
154       LT_FUSSY, tid < MAX_THREADS_PER_TEAM,
155       "Getting top level, tid is larger than allocated data structure size");
156   return topTaskDescr[tid];
157 }
158 
159 INLINE void
InitThreadPrivateContext(int tid)160 omptarget_nvptx_ThreadPrivateContext::InitThreadPrivateContext(int tid) {
161   // levelOneTaskDescr is init when starting the parallel region
162   // top task descr is NULL (team master version will be fixed separately)
163   topTaskDescr[tid] = NULL;
164   // no num threads value has been pushed
165   nextRegion.tnum[tid] = 0;
166   // the following don't need to be init here; they are init when using dyn
167   // sched
168   // current_Event, events_Number, chunk, num_Iterations, schedule
169 }
170 
171 ////////////////////////////////////////////////////////////////////////////////
172 // Team Descriptor
173 ////////////////////////////////////////////////////////////////////////////////
174 
InitTeamDescr()175 INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr() {
176   levelZeroTaskDescr.InitLevelZeroTaskDescr();
177 }
178 
179 ////////////////////////////////////////////////////////////////////////////////
180 // Get private data structure for thread
181 ////////////////////////////////////////////////////////////////////////////////
182 
183 // Utility routines for CUDA threads
getMyTeamDescriptor()184 INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor() {
185   return omptarget_nvptx_threadPrivateContext->TeamContext();
186 }
187 
getMyWorkDescriptor()188 INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor() {
189   omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
190   return currTeamDescr.WorkDescr();
191 }
192 
getMyTopTaskDescriptor(int threadId)193 INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) {
194   return omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
195 }
196 
197 INLINE omptarget_nvptx_TaskDescr *
getMyTopTaskDescriptor(bool isSPMDExecutionMode)198 getMyTopTaskDescriptor(bool isSPMDExecutionMode) {
199   return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock(isSPMDExecutionMode));
200 }
201 
202 ////////////////////////////////////////////////////////////////////////////////
203 // Memory management runtime functions.
204 ////////////////////////////////////////////////////////////////////////////////
205 
Release()206 INLINE void omptarget_nvptx_SimpleMemoryManager::Release() {
207   ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
208           "SlotIdx is too big or uninitialized.");
209   ASSERT0(LT_FUSSY, usedMemIdx < OMP_STATE_COUNT,
210           "MemIdx is too big or uninitialized.");
211   MemDataTy &MD = MemData[usedSlotIdx];
212   __kmpc_atomic_exchange((unsigned *)&MD.keys[usedMemIdx], 0u);
213 }
214 
Acquire(const void * buf,size_t size)215 INLINE const void *omptarget_nvptx_SimpleMemoryManager::Acquire(const void *buf,
216                                                                 size_t size) {
217   ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
218           "SlotIdx is too big or uninitialized.");
219   const unsigned sm = usedSlotIdx;
220   MemDataTy &MD = MemData[sm];
221   unsigned i = hash(GetBlockIdInKernel());
222   while (__kmpc_atomic_cas((unsigned *)&MD.keys[i], 0u, 1u) != 0) {
223     i = hash(i + 1);
224   }
225   usedSlotIdx = sm;
226   usedMemIdx = i;
227   return static_cast<const char *>(buf) + (sm * OMP_STATE_COUNT + i) * size;
228 }
229