1 //===----- data_sharing.cu - OpenMP GPU data sharing ------------- CUDA -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the implementation of data sharing environments
10 //
11 //===----------------------------------------------------------------------===//
12 #pragma omp declare target
13 
14 #include "common/omptarget.h"
15 #include "target_impl.h"
16 
17 // Return true if this is the master thread.
IsMasterThread(bool isSPMDExecutionMode)18 INLINE static bool IsMasterThread(bool isSPMDExecutionMode) {
19   return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock();
20 }
21 
22 ////////////////////////////////////////////////////////////////////////////////
23 // Runtime functions for trunk data sharing scheme.
24 ////////////////////////////////////////////////////////////////////////////////
25 
data_sharing_init_stack_common()26 INLINE static void data_sharing_init_stack_common() {
27   ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
28   omptarget_nvptx_TeamDescr *teamDescr =
29       &omptarget_nvptx_threadPrivateContext->TeamContext();
30 
31   for (int WID = 0; WID < DS_Max_Warp_Number; WID++) {
32     __kmpc_data_sharing_slot *RootS = teamDescr->GetPreallocatedSlotAddr(WID);
33     DataSharingState.SlotPtr[WID] = RootS;
34     DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
35   }
36 }
37 
38 // Initialize data sharing data structure. This function needs to be called
39 // once at the beginning of a data sharing context (coincides with the kernel
40 // initialization). This function is called only by the MASTER thread of each
41 // team in non-SPMD mode.
__kmpc_data_sharing_init_stack()42 EXTERN void __kmpc_data_sharing_init_stack() {
43   ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
44   // This function initializes the stack pointer with the pointer to the
45   // statically allocated shared memory slots. The size of a shared memory
46   // slot is pre-determined to be 256 bytes.
47   data_sharing_init_stack_common();
48   omptarget_nvptx_globalArgs.Init();
49 }
50 
51 // Initialize data sharing data structure. This function needs to be called
52 // once at the beginning of a data sharing context (coincides with the kernel
53 // initialization). This function is called in SPMD mode only.
__kmpc_data_sharing_init_stack_spmd()54 EXTERN void __kmpc_data_sharing_init_stack_spmd() {
55   ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
56   // This function initializes the stack pointer with the pointer to the
57   // statically allocated shared memory slots. The size of a shared memory
58   // slot is pre-determined to be 256 bytes.
59   if (GetThreadIdInBlock() == 0)
60     data_sharing_init_stack_common();
61 
62   __kmpc_impl_threadfence_block();
63 }
64 
data_sharing_push_stack_common(size_t PushSize)65 INLINE static void* data_sharing_push_stack_common(size_t PushSize) {
66   ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
67 
68   // Only warp active master threads manage the stack.
69   bool IsWarpMaster = (GetThreadIdInBlock() % WARPSIZE) == 0;
70 
71   // Add worst-case padding to DataSize so that future stack allocations are
72   // correctly aligned.
73   const size_t Alignment = 8;
74   PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment;
75 
76   // Frame pointer must be visible to all workers in the same warp.
77   const unsigned WID = GetWarpId();
78   void *FrameP = 0;
79   __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();
80 
81   if (IsWarpMaster) {
82     // SlotP will point to either the shared memory slot or an existing
83     // global memory slot.
84     __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
85     void *&StackP = DataSharingState.StackPtr[WID];
86 
87     // Check if we have room for the data in the current slot.
88     const uintptr_t StartAddress = (uintptr_t)StackP;
89     const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd;
90     const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t)PushSize;
91 
92     // If we requested more data than there is room for in the rest
93     // of the slot then we need to either re-use the next slot, if one exists,
94     // or create a new slot.
95     if (EndAddress < RequestedEndAddress) {
96       __kmpc_data_sharing_slot *NewSlot = 0;
97       size_t NewSize = PushSize;
98 
99       // Allocate at least the default size for each type of slot.
100       // Master is a special case and even though there is only one thread,
101       // it can share more things with the workers. For uniformity, it uses
102       // the full size of a worker warp slot.
103       size_t DefaultSlotSize = DS_Worker_Warp_Slot_Size;
104       if (DefaultSlotSize > NewSize)
105         NewSize = DefaultSlotSize;
106       NewSlot = (__kmpc_data_sharing_slot *) SafeMalloc(
107           sizeof(__kmpc_data_sharing_slot) + NewSize,
108           "Global memory slot allocation.");
109 
110       NewSlot->Next = 0;
111       NewSlot->Prev = SlotP;
112       NewSlot->PrevSlotStackPtr = StackP;
113       NewSlot->DataEnd = &NewSlot->Data[0] + NewSize;
114 
115       // Make previous slot point to the newly allocated slot.
116       SlotP->Next = NewSlot;
117       // The current slot becomes the new slot.
118       SlotP = NewSlot;
119       // The stack pointer always points to the next free stack frame.
120       StackP = &NewSlot->Data[0] + PushSize;
121       // The frame pointer always points to the beginning of the frame.
122       FrameP = DataSharingState.FramePtr[WID] = &NewSlot->Data[0];
123     } else {
124       // Add the data chunk to the current slot. The frame pointer is set to
125       // point to the start of the new frame held in StackP.
126       FrameP = DataSharingState.FramePtr[WID] = StackP;
127       // Reset stack pointer to the requested address.
128       StackP = (void *)RequestedEndAddress;
129     }
130   }
131   // Get address from lane 0.
132   int *FP = (int *)&FrameP;
133   FP[0] = __kmpc_impl_shfl_sync(CurActive, FP[0], 0);
134   if (sizeof(FrameP) == 8)
135     FP[1] = __kmpc_impl_shfl_sync(CurActive, FP[1], 0);
136 
137   return FrameP;
138 }
139 
__kmpc_data_sharing_coalesced_push_stack(size_t DataSize,int16_t UseSharedMemory)140 EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t DataSize,
141                                                       int16_t UseSharedMemory) {
142   return data_sharing_push_stack_common(DataSize);
143 }
144 
145 // Called at the time of the kernel initialization. This is used to initilize
146 // the list of references to shared variables and to pre-allocate global storage
147 // for holding the globalized variables.
148 //
149 // By default the globalized variables are stored in global memory. If the
150 // UseSharedMemory is set to true, the runtime will attempt to use shared memory
151 // as long as the size requested fits the pre-allocated size.
__kmpc_data_sharing_push_stack(size_t DataSize,int16_t UseSharedMemory)152 EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize,
153                                             int16_t UseSharedMemory) {
154   // Compute the total memory footprint of the requested data.
155   // The master thread requires a stack only for itself. A worker
156   // thread (which at this point is a warp master) will require
157   // space for the variables of each thread in the warp,
158   // i.e. one DataSize chunk per warp lane.
159   // TODO: change WARPSIZE to the number of active threads in the warp.
160   size_t PushSize = (isRuntimeUninitialized() || IsMasterThread(isSPMDMode()))
161                         ? DataSize
162                         : WARPSIZE * DataSize;
163 
164   // Compute the start address of the frame of each thread in the warp.
165   uintptr_t FrameStartAddress =
166       (uintptr_t) data_sharing_push_stack_common(PushSize);
167   FrameStartAddress += (uintptr_t) (GetLaneId() * DataSize);
168   return (void *)FrameStartAddress;
169 }
170 
171 // Pop the stack and free any memory which can be reclaimed.
172 //
173 // When the pop operation removes the last global memory slot,
174 // reclaim all outstanding global memory slots since it is
175 // likely we have reached the end of the kernel.
__kmpc_data_sharing_pop_stack(void * FrameStart)176 EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
177   ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
178 
179   __kmpc_impl_threadfence_block();
180 
181   if (GetThreadIdInBlock() % WARPSIZE == 0) {
182     unsigned WID = GetWarpId();
183 
184     // Current slot
185     __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
186 
187     // Pointer to next available stack.
188     void *&StackP = DataSharingState.StackPtr[WID];
189 
190     // Pop the frame.
191     StackP = FrameStart;
192 
193     // If the current slot is empty, we need to free the slot after the
194     // pop.
195     bool SlotEmpty = (StackP == &SlotP->Data[0]);
196 
197     if (SlotEmpty && SlotP->Prev) {
198       // Before removing the slot we need to reset StackP.
199       StackP = SlotP->PrevSlotStackPtr;
200 
201       // Remove the slot.
202       SlotP = SlotP->Prev;
203       SafeFree(SlotP->Next, "Free slot.");
204       SlotP->Next = 0;
205     }
206   }
207 }
208 
209 // Begin a data sharing context. Maintain a list of references to shared
210 // variables. This list of references to shared variables will be passed
211 // to one or more threads.
212 // In L0 data sharing this is called by master thread.
213 // In L1 data sharing this is called by active warp master thread.
__kmpc_begin_sharing_variables(void *** GlobalArgs,size_t nArgs)214 EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) {
215   omptarget_nvptx_globalArgs.EnsureSize(nArgs);
216   *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs();
217 }
218 
219 // End a data sharing context. There is no need to have a list of refs
220 // to shared variables because the context in which those variables were
221 // shared has now ended. This should clean-up the list of references only
222 // without affecting the actual global storage of the variables.
223 // In L0 data sharing this is called by master thread.
224 // In L1 data sharing this is called by active warp master thread.
__kmpc_end_sharing_variables()225 EXTERN void __kmpc_end_sharing_variables() {
226   omptarget_nvptx_globalArgs.DeInit();
227 }
228 
229 // This function will return a list of references to global variables. This
230 // is how the workers will get a reference to the globalized variable. The
231 // members of this list will be passed to the outlined parallel function
232 // preserving the order.
233 // Called by all workers.
__kmpc_get_shared_variables(void *** GlobalArgs)234 EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) {
235   *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs();
236 }
237 
238 // This function is used to init static memory manager. This manager is used to
239 // manage statically allocated global memory. This memory is allocated by the
240 // compiler and used to correctly implement globalization of the variables in
241 // target, teams and distribute regions.
__kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,const void * buf,size_t size,int16_t is_shared,const void ** frame)242 EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
243                                           const void *buf, size_t size,
244                                           int16_t is_shared,
245                                           const void **frame) {
246   if (is_shared) {
247     *frame = buf;
248     return;
249   }
250   if (isSPMDExecutionMode) {
251     if (GetThreadIdInBlock() == 0) {
252       *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
253     }
254     __kmpc_impl_syncthreads();
255     return;
256   }
257   ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
258           "Must be called only in the target master thread.");
259   *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
260   __kmpc_impl_threadfence();
261 }
262 
__kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,int16_t is_shared)263 EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
264                                               int16_t is_shared) {
265   if (is_shared)
266     return;
267   if (isSPMDExecutionMode) {
268     __kmpc_impl_syncthreads();
269     if (GetThreadIdInBlock() == 0) {
270       omptarget_nvptx_simpleMemoryManager.Release();
271     }
272     return;
273   }
274   __kmpc_impl_threadfence();
275   ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
276           "Must be called only in the target master thread.");
277   omptarget_nvptx_simpleMemoryManager.Release();
278 }
279 
280 #pragma omp end declare target
281