1 //===----- data_sharing.cu - OpenMP GPU data sharing ------------- CUDA -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the implementation of data sharing environments
10 //
11 //===----------------------------------------------------------------------===//
12 #pragma omp declare target
13
14 #include "common/omptarget.h"
15 #include "target_impl.h"
16
// Returns true iff the calling thread is the master thread of a non-SPMD
// region. In SPMD mode there is no distinguished master, so this is always
// false there.
INLINE static bool IsMasterThread(bool isSPMDExecutionMode) {
  if (isSPMDExecutionMode)
    return false;
  return GetThreadIdInBlock() == GetMasterThreadID();
}
21
22 ////////////////////////////////////////////////////////////////////////////////
23 // Runtime functions for trunk data sharing scheme.
24 ////////////////////////////////////////////////////////////////////////////////
25
// Shared initialization path for both execution modes: aim every warp's
// slot and stack pointers at that warp's preallocated shared-memory slot.
INLINE static void data_sharing_init_stack_common() {
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
  omptarget_nvptx_TeamDescr &teamDescr =
      omptarget_nvptx_threadPrivateContext->TeamContext();

  for (int WarpID = 0; WarpID < DS_Max_Warp_Number; ++WarpID) {
    __kmpc_data_sharing_slot *RootSlot =
        teamDescr.GetPreallocatedSlotAddr(WarpID);
    // The stack pointer starts at the first free byte of the root slot.
    DataSharingState.SlotPtr[WarpID] = RootSlot;
    DataSharingState.StackPtr[WarpID] = (void *)&RootSlot->Data[0];
  }
}
37
// Initialize the data-sharing stack for a non-SPMD kernel. Must be invoked
// exactly once per team, by the MASTER thread only, at the beginning of a
// data-sharing context (this coincides with kernel initialization).
EXTERN void __kmpc_data_sharing_init_stack() {
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
  // Point the per-warp stack pointers at the statically allocated
  // shared-memory slots (each slot is pre-determined to be 256 bytes),
  // then set up the global argument buffer.
  data_sharing_init_stack_common();
  omptarget_nvptx_globalArgs.Init();
}
50
// Initialize the data-sharing stack for an SPMD kernel. Must be invoked once
// per team at the beginning of a data-sharing context (this coincides with
// kernel initialization). All threads reach this call; only thread 0 writes
// the per-warp stack state.
EXTERN void __kmpc_data_sharing_init_stack_spmd() {
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
  // Thread 0 points the per-warp stack pointers at the statically allocated
  // shared-memory slots (each slot is pre-determined to be 256 bytes).
  if (GetThreadIdInBlock() == 0)
    data_sharing_init_stack_common();

  // Publish the initialized state to the rest of the block.
  __kmpc_impl_threadfence_block();
}
64
// Allocate PushSize bytes (rounded up to 8-byte alignment) on the calling
// warp's data-sharing stack and return a pointer to the start of the new
// frame. Only the warp master does the bookkeeping; the frame pointer is
// then broadcast to all active lanes via warp shuffles, so every lane
// returns the same address.
INLINE static void* data_sharing_push_stack_common(size_t PushSize) {
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");

  // Only warp active master threads manage the stack.
  bool IsWarpMaster = (GetThreadIdInBlock() % WARPSIZE) == 0;

  // Add worst-case padding to DataSize so that future stack allocations are
  // correctly aligned.
  const size_t Alignment = 8;
  PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment;

  // Frame pointer must be visible to all workers in the same warp.
  const unsigned WID = GetWarpId();
  void *FrameP = 0;
  // Mask of lanes currently executing; used below to broadcast the frame
  // pointer from lane 0 to the other active lanes.
  __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();

  if (IsWarpMaster) {
    // SlotP will point to either the shared memory slot or an existing
    // global memory slot.
    __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
    void *&StackP = DataSharingState.StackPtr[WID];

    // Check if we have room for the data in the current slot.
    const uintptr_t StartAddress = (uintptr_t)StackP;
    const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd;
    const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t)PushSize;

    // If we requested more data than there is room for in the rest
    // of the slot then we need to either re-use the next slot, if one exists,
    // or create a new slot.
    if (EndAddress < RequestedEndAddress) {
      __kmpc_data_sharing_slot *NewSlot = 0;
      size_t NewSize = PushSize;

      // Allocate at least the default size for each type of slot.
      // Master is a special case and even though there is only one thread,
      // it can share more things with the workers. For uniformity, it uses
      // the full size of a worker warp slot.
      size_t DefaultSlotSize = DS_Worker_Warp_Slot_Size;
      if (DefaultSlotSize > NewSize)
        NewSize = DefaultSlotSize;
      NewSlot = (__kmpc_data_sharing_slot *) SafeMalloc(
          sizeof(__kmpc_data_sharing_slot) + NewSize,
          "Global memory slot allocation.");

      // Link the new slot into the chain and remember where the old stack
      // pointer was so the pop can restore it when this slot empties.
      NewSlot->Next = 0;
      NewSlot->Prev = SlotP;
      NewSlot->PrevSlotStackPtr = StackP;
      NewSlot->DataEnd = &NewSlot->Data[0] + NewSize;

      // Make previous slot point to the newly allocated slot.
      SlotP->Next = NewSlot;
      // The current slot becomes the new slot.
      SlotP = NewSlot;
      // The stack pointer always points to the next free stack frame.
      StackP = &NewSlot->Data[0] + PushSize;
      // The frame pointer always points to the beginning of the frame.
      FrameP = DataSharingState.FramePtr[WID] = &NewSlot->Data[0];
    } else {
      // Add the data chunk to the current slot. The frame pointer is set to
      // point to the start of the new frame held in StackP.
      FrameP = DataSharingState.FramePtr[WID] = StackP;
      // Reset stack pointer to the requested address.
      StackP = (void *)RequestedEndAddress;
    }
  }
  // Get address from lane 0.
  // The pointer is broadcast 32 bits at a time: lane 0 holds the valid
  // value; when pointers are 64-bit the high word needs a second shuffle.
  int *FP = (int *)&FrameP;
  FP[0] = __kmpc_impl_shfl_sync(CurActive, FP[0], 0);
  if (sizeof(FrameP) == 8)
    FP[1] = __kmpc_impl_shfl_sync(CurActive, FP[1], 0);

  return FrameP;
}
139
// Push a frame of exactly DataSize bytes onto the warp's data-sharing stack
// (no per-lane multiplication is applied here). The UseSharedMemory hint is
// not consulted by this entry point.
EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t DataSize,
                                                      int16_t UseSharedMemory) {
  void *Frame = data_sharing_push_stack_common(DataSize);
  return Frame;
}
144
// Push storage for globalized variables onto the data-sharing stack and
// return the start of the calling thread's portion of the frame.
//
// A (non-SPMD) master thread only needs room for its own variables, while a
// worker thread — a warp master at this point — reserves one DataSize chunk
// for every lane of its warp. By default the storage lives in global
// memory; the UseSharedMemory hint is forwarded unchanged to callers'
// expectations but is not consulted on this path.
EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize,
                                            int16_t UseSharedMemory) {
  // TODO: change WARPSIZE to the number of active threads in the warp.
  size_t PushSize;
  if (isRuntimeUninitialized() || IsMasterThread(isSPMDMode()))
    PushSize = DataSize;
  else
    PushSize = WARPSIZE * DataSize;

  // The warp-wide frame starts here; each lane's chunk is offset by its
  // lane id times DataSize.
  uintptr_t FrameBase =
      (uintptr_t)data_sharing_push_stack_common(PushSize);
  return (void *)(FrameBase + (uintptr_t)(GetLaneId() * DataSize));
}
170
// Pop the frame starting at FrameStart off the calling warp's data-sharing
// stack and free any global-memory slot that becomes empty as a result.
// Freed slots are returned immediately rather than cached.
EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");

  // Make all writes into the frame visible block-wide before its storage is
  // recycled.
  __kmpc_impl_threadfence_block();

  // Only warp masters maintain the stack bookkeeping.
  if (GetThreadIdInBlock() % WARPSIZE != 0)
    return;

  const unsigned WarpID = GetWarpId();

  // Current slot for this warp.
  __kmpc_data_sharing_slot *&Slot = DataSharingState.SlotPtr[WarpID];
  // Pointer to the next free byte of stack.
  void *&Stack = DataSharingState.StackPtr[WarpID];

  // Rewind the stack pointer to the start of the popped frame.
  Stack = FrameStart;

  // If this pop emptied a dynamically allocated slot, unlink and free it,
  // restoring the previous slot and its saved stack pointer.
  if (Stack == &Slot->Data[0] && Slot->Prev) {
    __kmpc_data_sharing_slot *Dead = Slot;
    Stack = Dead->PrevSlotStackPtr;
    Slot = Dead->Prev;
    SafeFree(Dead, "Free slot.");
    Slot->Next = 0;
  }
}
208
// Open a data-sharing context: grow the global argument buffer to hold
// nArgs shared-variable references and hand the caller its base address so
// it can fill in the references that will be passed to other threads.
// In L0 data sharing this is called by the master thread.
// In L1 data sharing this is called by an active warp master thread.
EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) {
  // EnsureSize must run before the buffer address is published.
  omptarget_nvptx_globalArgs.EnsureSize(nArgs);
  *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs();
}
218
// Close a data-sharing context opened by __kmpc_begin_sharing_variables.
// Only the list of shared-variable references is torn down; the underlying
// global storage of the variables themselves is left untouched.
// In L0 data sharing this is called by the master thread.
// In L1 data sharing this is called by an active warp master thread.
EXTERN void __kmpc_end_sharing_variables() {
  omptarget_nvptx_globalArgs.DeInit();
}
228
// Hand a worker the list of references to the globalized variables, in the
// order the sharing thread stored them, so they can be forwarded to the
// outlined parallel function. Called by all workers.
EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) {
  *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs();
}
237
// Acquire the team's statically sized globalization frame and store its
// address in *frame. The compiler pre-allocates this storage to implement
// globalization of variables in target, teams and distribute regions.
//
// If is_shared is set, buf already points at compiler-provided shared
// memory and is used directly. Otherwise the frame is obtained from the
// simple (global-memory) manager: in SPMD mode thread 0 acquires it and all
// threads synchronize before use; in non-SPMD mode only the target master
// thread may call this, and a fence publishes the acquisition.
EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
                                          const void *buf, size_t size,
                                          int16_t is_shared,
                                          const void **frame) {
  if (is_shared) {
    *frame = buf;
  } else if (isSPMDExecutionMode) {
    if (GetThreadIdInBlock() == 0)
      *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
    // Everyone waits for thread 0's acquisition before touching the frame.
    __kmpc_impl_syncthreads();
  } else {
    ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
            "Must be called only in the target master thread.");
    *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
    __kmpc_impl_threadfence();
  }
}
262
// Release the team's static globalization frame previously obtained through
// __kmpc_get_team_static_memory. A shared-memory frame (is_shared) is
// compiler-managed and needs no release.
EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
                                              int16_t is_shared) {
  if (is_shared)
    return;

  if (isSPMDExecutionMode) {
    // Wait until every thread is done with the frame, then thread 0 returns
    // it to the manager.
    __kmpc_impl_syncthreads();
    if (GetThreadIdInBlock() == 0)
      omptarget_nvptx_simpleMemoryManager.Release();
    return;
  }

  // Non-SPMD: flush outstanding writes, then the master releases the frame.
  __kmpc_impl_threadfence();
  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
          "Must be called only in the target master thread.");
  omptarget_nvptx_simpleMemoryManager.Release();
}
279
280 #pragma omp end declare target
281