//===----- data_sharing.cu - OpenMP GPU data sharing ------------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of data sharing environments
//
//===----------------------------------------------------------------------===//
12 #pragma omp declare target
13
14 #include "common/omptarget.h"
15 #include "target/shuffle.h"
16 #include "target_impl.h"
17
18 ////////////////////////////////////////////////////////////////////////////////
19 // Runtime functions for trunk data sharing scheme.
20 ////////////////////////////////////////////////////////////////////////////////
21
/// Minimum granularity (in bytes) of a shared-stack allocation; requests in
/// __kmpc_alloc_shared/__kmpc_free_shared are sized in terms of this unit.
static constexpr unsigned MinBytes = 8;

/// Alignment (in bytes) required for the dynamic shared-memory buffer below.
static constexpr unsigned Alignment = 8;

/// External symbol to access dynamic shared memory.
extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment)));
#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)
29
__kmpc_get_dynamic_shared()30 EXTERN void *__kmpc_get_dynamic_shared() { return DynamicSharedBuffer; }
31
/// User-visible alias for __kmpc_get_dynamic_shared; returns the same
/// dynamic shared-memory base pointer.
EXTERN void *llvm_omp_get_dynamic_shared() {
  void *Buffer = __kmpc_get_dynamic_shared();
  return Buffer;
}
35
/// Fixed-size bump-pointer stacks used by __kmpc_alloc_shared: each of the
/// NThreads threads owns a BPerThread-byte slab in Data plus a one-byte
/// usage cursor in Usage.
template <unsigned BPerThread, unsigned NThreads = MAX_THREADS_PER_TEAM>
struct alignas(32) ThreadStackTy {
  static constexpr unsigned BytesPerThread = BPerThread;
  static constexpr unsigned NumThreads = NThreads;
  // Warps needed to cover NumThreads threads (ceiling division).
  static constexpr unsigned NumWarps = (NThreads + WARPSIZE - 1) / WARPSIZE;

  // Per-thread storage slabs.
  unsigned char Data[NumThreads][BytesPerThread];
  // Per-thread byte offset of the next free byte in Data[i]; 0 means empty.
  // NOTE(review): unsigned char limits BytesPerThread to <= 255 — confirm.
  unsigned char Usage[NumThreads];
};
45
// Stack for the generic-mode main thread: a single slot of MinBytes * 8 bytes.
[[clang::loader_uninitialized]] ThreadStackTy<MinBytes * 8, 1> MainSharedStack;
#pragma omp allocate(MainSharedStack) allocator(omp_pteam_mem_alloc)

// Stacks for worker threads: MinBytes per thread, for a quarter of the
// maximal team size (larger thread ids fall back to malloc in the allocator).
[[clang::loader_uninitialized]] ThreadStackTy<MinBytes,
                                              MAX_THREADS_PER_TEAM / 4>
    WorkerSharedStack;
#pragma omp allocate(WorkerSharedStack) allocator(omp_pteam_mem_alloc)
53
/// Allocate \p Bytes of "globalized" storage for the calling thread, from
/// team-shared memory if the thread's fixed-size stack has room, otherwise
/// from the heap. The returned pointer must be released with
/// __kmpc_free_shared passing the same \p Bytes.
EXTERN void *__kmpc_alloc_shared(size_t Bytes) {
  // Round the request up to the next multiple of MinBytes so the bump
  // cursor stays MinBytes-aligned. The previous expression,
  // Bytes + (Bytes % MinBytes), did not round up to a multiple of MinBytes
  // (e.g. 9 -> 10), so follow-up allocations could be misaligned. MinBytes
  // is a power of two, so a mask suffices. Must mirror __kmpc_free_shared.
  size_t AlignedBytes = (Bytes + MinBytes - 1) & ~static_cast<size_t>(MinBytes - 1);
  int TID = __kmpc_get_hardware_thread_id_in_block();
  if (__kmpc_is_generic_main_thread(TID)) {
    // Main thread alone, use shared memory if space is available.
    if (MainSharedStack.Usage[0] + AlignedBytes <= MainSharedStack.BytesPerThread) {
      void *Ptr = &MainSharedStack.Data[0][MainSharedStack.Usage[0]];
      MainSharedStack.Usage[0] += AlignedBytes;
      return Ptr;
    }
  } else if (TID < WorkerSharedStack.NumThreads) {
    // Worker thread with a dedicated slab: bump-allocate if it fits.
    if (WorkerSharedStack.Usage[TID] + AlignedBytes <= WorkerSharedStack.BytesPerThread) {
      void *Ptr = &WorkerSharedStack.Data[TID][WorkerSharedStack.Usage[TID]];
      WorkerSharedStack.Usage[TID] += AlignedBytes;
      return Ptr;
    }
  }
  // Fallback to malloc when the shared stack is full or the thread id is
  // outside the covered range.
  return SafeMalloc(Bytes, "AllocGlobalFallback");
}
74
/// Release storage obtained from __kmpc_alloc_shared. \p Bytes must match
/// the size passed at allocation time: pointers inside a shared stack are
/// popped by rewinding the bump cursor, anything else is heap-freed.
EXTERN void __kmpc_free_shared(void *Ptr, size_t Bytes) {
  // Round up exactly as __kmpc_alloc_shared does so the cursor rewinds by
  // the amount that was reserved (the old Bytes + (Bytes % MinBytes) did
  // not round to a multiple of MinBytes).
  size_t AlignedBytes = (Bytes + MinBytes - 1) & ~static_cast<size_t>(MinBytes - 1);
  int TID = __kmpc_get_hardware_thread_id_in_block();
  if (__kmpc_is_generic_main_thread(TID)) {
    // Pop from the main-thread stack iff the pointer lies inside its storage.
    if (Ptr >= &MainSharedStack.Data[0][0] &&
        Ptr < &MainSharedStack.Data[MainSharedStack.NumThreads][0]) {
      MainSharedStack.Usage[0] -= AlignedBytes;
      return;
    }
  } else if (TID < WorkerSharedStack.NumThreads) {
    // Pop from this worker's slab iff the pointer lies inside the worker
    // stack storage. (A redundant shadowing redeclaration of TID was
    // removed here; the outer TID is already the hardware thread id.)
    if (Ptr >= &WorkerSharedStack.Data[0][0] &&
        Ptr < &WorkerSharedStack.Data[WorkerSharedStack.NumThreads][0]) {
      WorkerSharedStack.Usage[TID] -= AlignedBytes;
      return;
    }
  }
  // Not stack memory: it came from the malloc fallback path.
  SafeFree(Ptr, "FreeGlobalFallback");
}
94
/// Reset the usage cursors of both globalization stacks to empty.
EXTERN void __kmpc_data_sharing_init_stack() {
  // Usage has NumThreads entries, so bound the loop by NumThreads. The old
  // code iterated NumWarps for the main stack, which only coincidentally
  // matched because MainSharedStack is instantiated with NThreads == 1
  // (making NumWarps == NumThreads == 1); behavior is unchanged today.
  for (unsigned i = 0; i < MainSharedStack.NumThreads; ++i)
    MainSharedStack.Usage[i] = 0;
  for (unsigned i = 0; i < WorkerSharedStack.NumThreads; ++i)
    WorkerSharedStack.Usage[i] = 0;
}
101
/// Allocate storage in shared memory to communicate arguments from the main
/// thread to the workers in generic mode. If we exceed
/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
#define NUM_SHARED_VARIABLES_IN_SHARED_MEM 64

// Statically reserved team-shared argument buffer.
[[clang::loader_uninitialized]] static void
    *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
#pragma omp allocate(SharedMemVariableSharingSpace) \
    allocator(omp_pteam_mem_alloc)
// Points at the static buffer above, or at a malloc'ed extension when the
// argument count exceeds NUM_SHARED_VARIABLES_IN_SHARED_MEM.
[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
#pragma omp allocate(SharedMemVariableSharingSpacePtr) \
    allocator(omp_pteam_mem_alloc)
114
// Begin a data sharing context: publish storage for nArgs variable
// references that will be handed to one or more threads. Small argument
// lists live in the statically reserved shared-memory buffer; larger ones
// are malloc'ed.
// In L0 data sharing this is called by master thread.
// In L1 data sharing this is called by active warp master thread.
EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) {
  if (nArgs > NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
    SharedMemVariableSharingSpacePtr =
        (void **)SafeMalloc(nArgs * sizeof(void *), "new extended args");
  } else {
    SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
  }
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}
129
// End a data sharing context: release the reference list if it was
// heap-allocated. The globalized variables themselves are untouched.
// In L0 data sharing this is called by master thread.
// In L1 data sharing this is called by active warp master thread.
EXTERN void __kmpc_end_sharing_variables() {
  void **Current = SharedMemVariableSharingSpacePtr;
  // The static shared-memory buffer needs no cleanup.
  if (Current == &SharedMemVariableSharingSpace[0])
    return;
  SafeFree(Current, "new extended args");
}
140
// Hand workers the current list of references to globalized variables, in
// the order the master published them with __kmpc_begin_sharing_variables.
// Called by all workers.
EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) {
  void **Args = SharedMemVariableSharingSpacePtr;
  *GlobalArgs = Args;
}
149
// This function is used to init static memory manager. This manager is used to
// manage statically allocated global memory. This memory is allocated by the
// compiler and used to correctly implement globalization of the variables in
// target, teams and distribute regions.
EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
                                          const void *buf, size_t size,
                                          int16_t is_shared,
                                          const void **frame) {
  // Buffer already lives in shared memory: use it directly, no acquisition.
  if (is_shared) {
    *frame = buf;
    return;
  }
  if (isSPMDExecutionMode) {
    // SPMD mode: thread 0 acquires the frame; the block-wide barrier makes
    // the written *frame visible to every other thread before any of them
    // proceeds.
    if (__kmpc_get_hardware_thread_id_in_block() == 0) {
      *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
    }
    __kmpc_impl_syncthreads();
    return;
  }
  // Generic mode: only the target master thread may acquire. The fence
  // orders the *frame store before later accesses by other threads.
  ASSERT0(LT_FUSSY,
          __kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(),
          "Must be called only in the target master thread.");
  *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
  __kmpc_impl_threadfence();
}
175
/// Release the team frame acquired by __kmpc_get_team_static_memory.
/// No-op when the frame was a shared-memory buffer (nothing was acquired).
EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
                                              int16_t is_shared) {
  if (is_shared)
    return;
  if (isSPMDExecutionMode) {
    // SPMD mode: barrier first so no thread is still using the frame when
    // thread 0 releases it.
    __kmpc_impl_syncthreads();
    if (__kmpc_get_hardware_thread_id_in_block() == 0) {
      omptarget_nvptx_simpleMemoryManager.Release();
    }
    return;
  }
  // Generic mode: fence before releasing so prior frame accesses are
  // ordered before the release; only the target master may call this.
  __kmpc_impl_threadfence();
  ASSERT0(LT_FUSSY,
          __kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(),
          "Must be called only in the target master thread.");
  omptarget_nvptx_simpleMemoryManager.Release();
}
193
194 #pragma omp end declare target
195