1 //===----- data_sharing.cu - OpenMP GPU data sharing ------------- CUDA -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the implementation of data sharing environments
10 //
11 //===----------------------------------------------------------------------===//
12 #include "common/omptarget.h"
13 #include "target_impl.h"
14 
// Tell whether the calling thread is the team master thread.
// In SPMD execution mode the answer is always false; otherwise the calling
// thread's id within the block is compared against the master thread id.
INLINE static bool IsMasterThread(bool isSPMDExecutionMode) {
  if (isSPMDExecutionMode)
    return false;
  return GetMasterThreadID() == GetThreadIdInBlock();
}
19 
20 ////////////////////////////////////////////////////////////////////////////////
21 // Runtime functions for trunk data sharing scheme.
22 ////////////////////////////////////////////////////////////////////////////////
23 
// Reset the per-warp data sharing stacks: for every warp index the slot
// pointer is set to the slot preallocated in the team descriptor and the
// stack pointer to the first byte of that slot's data area.
// NOTE(review): the loop bound is WARPSIZE although the index is used as a
// warp id -- confirm this matches the number of preallocated slots and the
// capacity of DataSharingState.SlotPtr / StackPtr on every target.
INLINE static void data_sharing_init_stack_common() {
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
  omptarget_nvptx_TeamDescr &Team =
      omptarget_nvptx_threadPrivateContext->TeamContext();

  int Warp = 0;
  while (Warp < WARPSIZE) {
    __kmpc_data_sharing_slot *Root = Team.GetPreallocatedSlotAddr(Warp);
    DataSharingState.SlotPtr[Warp] = Root;
    DataSharingState.StackPtr[Warp] = (void *)&Root->Data[0];
    ++Warp;
  }
}
35 
// Set up the data sharing state at the start of a data sharing context
// (i.e. at kernel initialization). Intended to be called only by the MASTER
// thread of each team in non-SPMD mode.
//
// The per-warp stack pointers are reset to the preallocated slots and the
// list of shared-variable references (omptarget_nvptx_globalArgs) is
// initialized.
EXTERN void __kmpc_data_sharing_init_stack() {
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");

  // Stack pointers first, then the shared-argument list.
  data_sharing_init_stack_common();
  omptarget_nvptx_globalArgs.Init();
}
48 
// Set up the data sharing state at the start of a data sharing context
// (i.e. at kernel initialization) when running in SPMD mode.
//
// Only the thread with id 0 in the block resets the per-warp stack pointers;
// every calling thread then issues a block-level memory fence.
EXTERN void __kmpc_data_sharing_init_stack_spmd() {
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");

  const bool IsFirstThread = GetThreadIdInBlock() == 0;
  if (IsFirstThread) {
    data_sharing_init_stack_common();
  }

  __kmpc_impl_threadfence_block();
}
62 
// Reserve PushSize bytes (rounded up to a multiple of 8) on the data sharing
// stack of the calling thread's warp and return the start of the new frame.
//
// Only the thread whose id is a multiple of WARPSIZE updates the stack state;
// the resulting frame address is then broadcast from lane 0 to the lanes that
// were active on entry. When the current slot cannot hold the request, a new
// slot is allocated with SafeMalloc and linked after the current one.
INLINE static void *data_sharing_push_stack_common(size_t PushSize) {
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");

  // The first lane of each warp is the only one that manages the stack.
  const bool OwnsStack = (GetThreadIdInBlock() % WARPSIZE) == 0;

  // Pad the request so that subsequent frames stay 8-byte aligned.
  const size_t Alignment = 8;
  PushSize = (PushSize + (Alignment - 1)) & ~(Alignment - 1);

  const unsigned Warp = GetWarpId();
  void *Frame = 0;
  // Capture the active lanes before diverging; the same mask is used for the
  // broadcast below.
  __kmpc_impl_lanemask_t ActiveLanes = __kmpc_impl_activemask();

  if (OwnsStack) {
    // References into the shared state so updates are written back directly.
    __kmpc_data_sharing_slot *&CurSlot = DataSharingState.SlotPtr[Warp];
    void *&Top = DataSharingState.StackPtr[Warp];

    const uintptr_t FrameBegin = (uintptr_t)Top;
    const uintptr_t SlotEnd = (uintptr_t)CurSlot->DataEnd;
    const uintptr_t FrameEnd = FrameBegin + (uintptr_t)PushSize;

    if (FrameEnd <= SlotEnd) {
      // The request fits in the current slot: the frame starts at the old
      // stack top and the stack top advances past it.
      Frame = DataSharingState.FramePtr[Warp] = Top;
      Top = (void *)FrameEnd;
    } else {
      // Not enough room left: allocate a fresh slot that holds at least the
      // default worker-warp slot size.
      const size_t DefaultSize = DS_Worker_Warp_Slot_Size;
      const size_t Capacity = (DefaultSize > PushSize) ? DefaultSize : PushSize;

      __kmpc_data_sharing_slot *Fresh = (__kmpc_data_sharing_slot *)SafeMalloc(
          sizeof(__kmpc_data_sharing_slot) + Capacity,
          "Global memory slot allocation.");

      // Remember where to resume in the previous slot once this one is popped.
      Fresh->Next = 0;
      Fresh->Prev = CurSlot;
      Fresh->PrevSlotStackPtr = Top;
      Fresh->DataEnd = &Fresh->Data[0] + Capacity;

      // Link the new slot after the current one and make it current.
      CurSlot->Next = Fresh;
      CurSlot = Fresh;

      // The stack top sits right after the new frame, which itself starts at
      // the beginning of the new slot's data area.
      Top = &Fresh->Data[0] + PushSize;
      Frame = DataSharingState.FramePtr[Warp] = &Fresh->Data[0];
    }
  }

  // Broadcast the frame address from lane 0, one 32-bit word at a time; the
  // second word is only exchanged when pointers are 8 bytes wide.
  int *Words = (int *)&Frame;
  Words[0] = __kmpc_impl_shfl_sync(ActiveLanes, Words[0], 0);
  if (sizeof(Frame) == 8)
    Words[1] = __kmpc_impl_shfl_sync(ActiveLanes, Words[1], 0);

  return Frame;
}
137 
// Push a single frame of DataSize bytes for the calling warp and return its
// start address. UseSharedMemory is accepted but not consulted here.
EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t DataSize,
                                                      int16_t UseSharedMemory) {
  void *Frame = data_sharing_push_stack_common(DataSize);
  return Frame;
}
142 
// Push storage for globalized variables onto the calling warp's data sharing
// stack and return the address of the calling thread's own chunk.
//
// When the runtime is uninitialized or the caller is the master thread, a
// single DataSize chunk is reserved. Otherwise one DataSize chunk per warp
// lane (WARPSIZE chunks) is reserved. The returned address is the frame start
// offset by GetLaneId() * DataSize.
//
// UseSharedMemory is accepted but not consulted by this implementation.
// TODO: change WARPSIZE to the number of active threads in the warp.
EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize,
                                            int16_t UseSharedMemory) {
  // Default to a chunk for every lane; shrink to one chunk for the master
  // thread or when the runtime is not initialized.
  size_t PushSize = WARPSIZE * DataSize;
  if (isRuntimeUninitialized() || IsMasterThread(isSPMDMode()))
    PushSize = DataSize;

  // Start of the warp-wide frame, then this lane's slice inside it.
  const uintptr_t WarpFrame =
      (uintptr_t)data_sharing_push_stack_common(PushSize);
  const uintptr_t LaneOffset = (uintptr_t)(GetLaneId() * DataSize);
  return (void *)(WarpFrame + LaneOffset);
}
168 
// Pop the frame starting at FrameStart from the calling warp's data sharing
// stack.
//
// After a block-level memory fence, the thread whose id is a multiple of
// WARPSIZE moves the stack pointer back to FrameStart. If that leaves the
// current slot empty and the slot has a predecessor, the slot is released
// with SafeFree and the predecessor (with its saved stack pointer) becomes
// current again.
EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");

  __kmpc_impl_threadfence_block();

  // Only the first lane of each warp manages the stack.
  if (GetThreadIdInBlock() % WARPSIZE != 0)
    return;

  const unsigned Warp = GetWarpId();
  __kmpc_data_sharing_slot *&CurSlot = DataSharingState.SlotPtr[Warp];
  void *&Top = DataSharingState.StackPtr[Warp];

  // Drop the frame.
  Top = FrameStart;

  // Nothing to reclaim unless the slot is now empty and is not the first slot
  // in the chain.
  if (Top != &CurSlot->Data[0] || !CurSlot->Prev)
    return;

  // Resume where the previous slot left off before this slot was pushed.
  Top = CurSlot->PrevSlotStackPtr;

  // Step back to the previous slot and release the one just emptied.
  CurSlot = CurSlot->Prev;
  SafeFree(CurSlot->Next, "Free slot.");
  CurSlot->Next = 0;
}
206 
// Open a data sharing context: make sure the list of references to shared
// variables can hold nArgs entries and hand that list back through
// GlobalArgs so it can be filled in and passed to one or more threads.
// In L0 data sharing this is called by the master thread.
// In L1 data sharing this is called by the active warp master thread.
EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) {
  omptarget_nvptx_globalArgs.EnsureSize(nArgs);
  void **Args = omptarget_nvptx_globalArgs.GetArgs();
  *GlobalArgs = Args;
}
216 
// Close a data sharing context. The list of references to shared variables is
// no longer needed once the context has ended, so it is de-initialized here;
// this is meant to clean up the reference list only, not the storage of the
// variables themselves.
// In L0 data sharing this is called by the master thread.
// In L1 data sharing this is called by the active warp master thread.
EXTERN void __kmpc_end_sharing_variables() {
  omptarget_nvptx_globalArgs.DeInit();
}
226 
// Hand back, through GlobalArgs, the current list of references to shared
// variables. Workers use it to reach the globalized variables; the entries
// are passed to the outlined parallel function in the same order.
// Called by all workers.
EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) {
  void **Args = omptarget_nvptx_globalArgs.GetArgs();
  *GlobalArgs = Args;
}
235 
// Obtain the team's statically allocated memory used for globalization of
// variables in target, teams and distribute regions, and store its address in
// *frame.
//
//  - is_shared != 0: *frame is set to buf directly.
//  - SPMD mode: the thread with id 0 acquires the memory from the simple
//    memory manager, then all threads meet at a block barrier.
//  - otherwise: the caller (expected to be the target master thread) acquires
//    the memory and issues a memory fence.
EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
                                          const void *buf, size_t size,
                                          int16_t is_shared,
                                          const void **frame) {
  if (is_shared) {
    *frame = buf;
    return;
  }

  if (!isSPMDExecutionMode) {
    ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
            "Must be called only in the target master thread.");
    *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
    __kmpc_impl_threadfence();
    return;
  }

  // SPMD: a single thread acquires, everybody synchronizes afterwards.
  if (GetThreadIdInBlock() == 0)
    *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
  __kmpc_impl_syncthreads();
}
260 
// Give back the team static memory previously obtained with
// __kmpc_get_team_static_memory.
//
//  - is_shared != 0: nothing to do.
//  - SPMD mode: all threads meet at a block barrier, then the thread with
//    id 0 releases the memory through the simple memory manager.
//  - otherwise: after a memory fence the caller (expected to be the target
//    master thread) releases the memory.
EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
                                              int16_t is_shared) {
  if (is_shared)
    return;

  if (!isSPMDExecutionMode) {
    __kmpc_impl_threadfence();
    ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
            "Must be called only in the target master thread.");
    omptarget_nvptx_simpleMemoryManager.Release();
    return;
  }

  // SPMD: wait for every thread before a single thread releases.
  __kmpc_impl_syncthreads();
  if (GetThreadIdInBlock() == 0)
    omptarget_nvptx_simpleMemoryManager.Release();
}
277 
278