1 //===----- data_sharing.cu - OpenMP GPU data sharing ------------- CUDA -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the implementation of data sharing environments
10 //
11 //===----------------------------------------------------------------------===//
12 #pragma omp declare target
13 
14 #include "common/omptarget.h"
15 #include "target/shuffle.h"
16 #include "target_impl.h"
17 
18 ////////////////////////////////////////////////////////////////////////////////
19 // Runtime functions for trunk data sharing scheme.
20 ////////////////////////////////////////////////////////////////////////////////
21 
22 static constexpr unsigned MinBytes = 8;
23 
24 static constexpr unsigned Alignment = 8;
25 
26 /// External symbol to access dynamic shared memory.
27 extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment)));
28 #pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)
29 
/// Return the base address of the dynamic shared-memory buffer configured for
/// this kernel launch.
EXTERN void *__kmpc_get_dynamic_shared() { return &DynamicSharedBuffer[0]; }
31 
/// User-facing alias of __kmpc_get_dynamic_shared: exposes the dynamic
/// shared-memory buffer to OpenMP programs.
EXTERN void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }
35 
/// Per-thread bump-pointer "stacks" used to back globalized (shared)
/// variables. Each of the NumThreads lanes owns BytesPerThread bytes of
/// scratch storage in Data and a one-byte offset in Usage that tracks how
/// much of that storage is currently allocated.
template <unsigned BPerThread, unsigned NThreads = MAX_THREADS_PER_TEAM>
struct alignas(32) ThreadStackTy {
  static constexpr unsigned BytesPerThread = BPerThread;
  static constexpr unsigned NumThreads = NThreads;
  // Warps needed to cover NumThreads, rounded up.
  static constexpr unsigned NumWarps = (NThreads + WARPSIZE - 1) / WARPSIZE;

  // Data[TID] is thread TID's scratch region; Usage[TID] is the bump-pointer
  // offset into it. NOTE(review): Usage is unsigned char, so BytesPerThread
  // must stay <= 255 for the offsets not to wrap -- both instantiations in
  // this file (64 and 8 bytes) satisfy that.
  unsigned char Data[NumThreads][BytesPerThread];
  unsigned char Usage[NumThreads];
};
45 
46 [[clang::loader_uninitialized]] ThreadStackTy<MinBytes * 8, 1> MainSharedStack;
47 #pragma omp allocate(MainSharedStack) allocator(omp_pteam_mem_alloc)
48 
49 [[clang::loader_uninitialized]] ThreadStackTy<MinBytes,
50                                               MAX_THREADS_PER_TEAM / 4>
51     WorkerSharedStack;
52 #pragma omp allocate(WorkerSharedStack) allocator(omp_pteam_mem_alloc)
53 
/// Compute the number of bytes a request of \p Bytes occupies on the
/// shared-memory stacks: the size rounded up to the next multiple of
/// MinBytes. Alloc and free must agree on this value so the Usage bump
/// pointer is advanced and rewound by the same amount.
static inline size_t sharedStackSlotSize(size_t Bytes) {
  // Bug fix: the previous expression `Bytes + (Bytes % MinBytes)` did not
  // produce a multiple of MinBytes (e.g. 9 -> 10), so subsequent stack
  // allocations could be misaligned. Round up properly instead.
  return (Bytes + MinBytes - 1) / MinBytes * MinBytes;
}

/// Allocate \p Bytes of storage to be shared with threads of nested parallel
/// regions. The generic-mode main thread allocates from MainSharedStack,
/// the first WorkerSharedStack.NumThreads workers from WorkerSharedStack,
/// and requests that do not fit fall back to global-memory malloc.
EXTERN void *__kmpc_alloc_shared(size_t Bytes) {
  size_t AlignedBytes = sharedStackSlotSize(Bytes);
  int TID = __kmpc_get_hardware_thread_id_in_block();
  if (__kmpc_is_generic_main_thread(TID)) {
    // Main thread alone, use shared memory if space is available.
    if (MainSharedStack.Usage[0] + AlignedBytes <= MainSharedStack.BytesPerThread) {
      void *Ptr = &MainSharedStack.Data[0][MainSharedStack.Usage[0]];
      MainSharedStack.Usage[0] += AlignedBytes;
      return Ptr;
    }
  } else if (TID < WorkerSharedStack.NumThreads) {
    if (WorkerSharedStack.Usage[TID] + AlignedBytes <= WorkerSharedStack.BytesPerThread) {
      void *Ptr = &WorkerSharedStack.Data[TID][WorkerSharedStack.Usage[TID]];
      WorkerSharedStack.Usage[TID] += AlignedBytes;
      return Ptr;
    }
  }
  // Fallback to malloc
  return SafeMalloc(Bytes, "AllocGlobalFallback");
}

/// Release storage obtained from __kmpc_alloc_shared. Per-thread frees must
/// happen in LIFO order: the bump pointer is simply rewound by the slot
/// size. Pointers outside the shared stacks came from the malloc fallback
/// and are freed through SafeFree.
EXTERN void __kmpc_free_shared(void *Ptr, size_t Bytes) {
  size_t AlignedBytes = sharedStackSlotSize(Bytes);
  int TID = __kmpc_get_hardware_thread_id_in_block();
  if (__kmpc_is_generic_main_thread(TID)) {
    // Only rewind when the pointer actually lies inside the main stack.
    if (Ptr >= &MainSharedStack.Data[0][0] &&
        Ptr < &MainSharedStack.Data[MainSharedStack.NumThreads][0]) {
      MainSharedStack.Usage[0] -= AlignedBytes;
      return;
    }
  } else if (TID < WorkerSharedStack.NumThreads) {
    if (Ptr >= &WorkerSharedStack.Data[0][0] &&
        Ptr < &WorkerSharedStack.Data[WorkerSharedStack.NumThreads][0]) {
      // Bug fix: the original re-declared a shadowing `TID` here; reuse the
      // one computed above.
      WorkerSharedStack.Usage[TID] -= AlignedBytes;
      return;
    }
  }
  SafeFree(Ptr, "FreeGlobalFallback");
}
94 
/// Reset the bump pointers of both shared-memory stacks. Must run before any
/// __kmpc_alloc_shared call in the team.
EXTERN void __kmpc_data_sharing_init_stack() {
  // Usage is indexed per thread, so iterate NumThreads entries. (The
  // previous code iterated NumWarps for MainSharedStack, which only worked
  // because that stack is instantiated with a single thread, making
  // NumWarps == NumThreads == 1.)
  for (unsigned I = 0; I < MainSharedStack.NumThreads; ++I)
    MainSharedStack.Usage[I] = 0;
  for (unsigned I = 0; I < WorkerSharedStack.NumThreads; ++I)
    WorkerSharedStack.Usage[I] = 0;
}
101 
102 /// Allocate storage in shared memory to communicate arguments from the main
103 /// thread to the workers in generic mode. If we exceed
104 /// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
105 #define NUM_SHARED_VARIABLES_IN_SHARED_MEM 64
106 
107 [[clang::loader_uninitialized]] static void
108     *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
109 #pragma omp allocate(SharedMemVariableSharingSpace)                            \
110     allocator(omp_pteam_mem_alloc)
111 [[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
112 #pragma omp allocate(SharedMemVariableSharingSpacePtr)                         \
113     allocator(omp_pteam_mem_alloc)
114 
115 // Begin a data sharing context. Maintain a list of references to shared
116 // variables. This list of references to shared variables will be passed
117 // to one or more threads.
118 // In L0 data sharing this is called by master thread.
119 // In L1 data sharing this is called by active warp master thread.
// Begin a data sharing context. Maintain a list of references to shared
// variables. This list of references to shared variables will be passed
// to one or more threads.
// In L0 data sharing this is called by master thread.
// In L1 data sharing this is called by active warp master thread.
EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) {
  // Small argument lists fit in the preallocated team-shared array; larger
  // ones get a dynamically allocated buffer that is released in
  // __kmpc_end_sharing_variables.
  SharedMemVariableSharingSpacePtr =
      (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM)
          ? &SharedMemVariableSharingSpace[0]
          : (void **)SafeMalloc(nArgs * sizeof(void *), "new extended args");
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}
129 
130 // End a data sharing context. There is no need to have a list of refs
131 // to shared variables because the context in which those variables were
132 // shared has now ended. This should clean-up the list of references only
133 // without affecting the actual global storage of the variables.
134 // In L0 data sharing this is called by master thread.
135 // In L1 data sharing this is called by active warp master thread.
// End a data sharing context. There is no need to have a list of refs
// to shared variables because the context in which those variables were
// shared has now ended. This should clean-up the list of references only
// without affecting the actual global storage of the variables.
// In L0 data sharing this is called by master thread.
// In L1 data sharing this is called by active warp master thread.
EXTERN void __kmpc_end_sharing_variables() {
  void **Space = SharedMemVariableSharingSpacePtr;
  // Only a heap-allocated extended buffer needs releasing; the preallocated
  // team-shared array persists.
  if (Space != &SharedMemVariableSharingSpace[0])
    SafeFree(Space, "new extended args");
}
140 
141 // This function will return a list of references to global variables. This
142 // is how the workers will get a reference to the globalized variable. The
143 // members of this list will be passed to the outlined parallel function
144 // preserving the order.
145 // Called by all workers.
// This function will return a list of references to global variables. This
// is how the workers will get a reference to the globalized variable. The
// members of this list will be passed to the outlined parallel function
// preserving the order.
// Called by all workers.
EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) {
  // Hand back the sharing space selected by __kmpc_begin_sharing_variables.
  void **Current = SharedMemVariableSharingSpacePtr;
  *GlobalArgs = Current;
}
149 
150 // This function is used to init static memory manager. This manager is used to
151 // manage statically allocated global memory. This memory is allocated by the
152 // compiler and used to correctly implement globalization of the variables in
153 // target, teams and distribute regions.
EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
                                          const void *buf, size_t size,
                                          int16_t is_shared,
                                          const void **frame) {
  // If the compiler already placed the buffer in shared memory, use it as the
  // frame directly; no memory-manager involvement needed.
  if (is_shared) {
    *frame = buf;
    return;
  }
  if (isSPMDExecutionMode) {
    // SPMD mode: all threads call in together. Thread 0 acquires the frame;
    // the block-wide barrier ensures every thread observes the updated
    // *frame before proceeding.
    if (__kmpc_get_hardware_thread_id_in_block() == 0) {
      *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
    }
    __kmpc_impl_syncthreads();
    return;
  }
  // Generic mode: only the target master thread may call this.
  ASSERT0(LT_FUSSY,
          __kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(),
          "Must be called only in the target master thread.");
  // Publish the acquired frame with a device-wide fence so worker threads
  // see it once they are released.
  *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
  __kmpc_impl_threadfence();
}
175 
/// Counterpart of __kmpc_get_team_static_memory: release the team's static
/// memory frame, mirroring the synchronization of the acquire path.
EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
                                              int16_t is_shared) {
  // Shared-memory frames were never acquired from the manager; nothing to do.
  if (is_shared)
    return;
  if (isSPMDExecutionMode) {
    // SPMD mode: barrier FIRST so no thread is still using the frame, then
    // thread 0 releases it.
    __kmpc_impl_syncthreads();
    if (__kmpc_get_hardware_thread_id_in_block() == 0) {
      omptarget_nvptx_simpleMemoryManager.Release();
    }
    return;
  }
  // Generic mode: fence before releasing so prior writes to the frame are
  // visible; only the target master thread may call this.
  __kmpc_impl_threadfence();
  ASSERT0(LT_FUSSY,
          __kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(),
          "Must be called only in the target master thread.");
  omptarget_nvptx_simpleMemoryManager.Release();
}
193 
194 #pragma omp end declare target
195