1 //===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Parallel implementation in the GPU. Here is the pattern:
10 //
11 // while (not finished) {
12 //
13 // if (master) {
14 // sequential code, decide which par loop to do, or if finished
15 // __kmpc_kernel_prepare_parallel() // exec by master only
16 // }
17 // syncthreads // A
18 // __kmpc_kernel_parallel() // exec by all
19 // if (this thread is included in the parallel) {
20 // switch () for all parallel loops
21 // __kmpc_kernel_end_parallel() // exec only by threads in parallel
22 // }
23 //
24 //
// The reason we don't execute end_parallel for the threads that are not
// included in the parallel loop is that, for each barrier in the parallel
// region, these non-included threads will cycle through the syncthreads
// marked A above. Thus they must preserve their current thread id, which
// lies outside the range of thread ids that are part of the team.
30 //
31 // To make a long story short...
32 //
33 //===----------------------------------------------------------------------===//
34
35 #include "Debug.h"
36 #include "Interface.h"
37 #include "Mapping.h"
38 #include "State.h"
39 #include "Synchronization.h"
40 #include "Types.h"
41 #include "Utils.h"
42
43 using namespace _OMP;
44
45 #pragma omp declare target
46
47 namespace {
48
determineNumberOfThreads(int32_t NumThreadsClause)49 uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
50 uint32_t NThreadsICV =
51 NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
52 uint32_t NumThreads = mapping::getBlockSize();
53
54 if (NThreadsICV != 0 && NThreadsICV < NumThreads)
55 NumThreads = NThreadsICV;
56
57 // Round down to a multiple of WARPSIZE since it is legal to do so in OpenMP.
58 if (NumThreads < mapping::getWarpSize())
59 NumThreads = 1;
60 else
61 NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));
62
63 return NumThreads;
64 }
65
/// Invoke an outlined parallel function, unwrapping its arguments (up to 32).
///
/// The `case` labels of the switch are provided by the included file
/// "generated_microtask_cases.gen"; any value of \p nargs that is not matched
/// there prints a diagnostic and traps.
///
/// NOTE(review): the generated cases presumably refer to the parameters below
/// by name (global_tid, bound_tid, fn, args) -- confirm before renaming any.
///
/// \param global_tid First thread-id value handed to the cases.
/// \param bound_tid  Second thread-id value handed to the cases.
/// \param fn         Type-erased pointer to the outlined function.
/// \param args       Array of pointer arguments for \p fn.
/// \param nargs      Number of entries in \p args; selects the case.
void invokeMicrotask(int32_t global_tid, int32_t bound_tid, void *fn,
                     void **args, int64_t nargs) {
  switch (nargs) {
#include "generated_microtask_cases.gen"
  default:
    // No generated case matched this argument count: report and abort.
    PRINT("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
    __builtin_trap();
  }
}
76
77 } // namespace
78
79 extern "C" {
80
__kmpc_parallel_51(IdentTy * ident,int32_t,int32_t if_expr,int32_t num_threads,int proc_bind,void * fn,void * wrapper_fn,void ** args,int64_t nargs)81 void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
82 int32_t num_threads, int proc_bind, void *fn,
83 void *wrapper_fn, void **args, int64_t nargs) {
84
85 uint32_t TId = mapping::getThreadIdInBlock();
86 // Handle the serialized case first, same for SPMD/non-SPMD.
87 if (OMP_UNLIKELY(!if_expr || icv::Level)) {
88 __kmpc_serialized_parallel(ident, TId);
89 invokeMicrotask(TId, 0, fn, args, nargs);
90 __kmpc_end_serialized_parallel(ident, TId);
91 return;
92 }
93
94 uint32_t NumThreads = determineNumberOfThreads(num_threads);
95 if (mapping::isSPMDMode()) {
96 synchronize::threads();
97 {
98 state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads,
99 1u, TId == 0);
100 state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0);
101 state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0);
102 synchronize::threads();
103
104 if (TId < NumThreads)
105 invokeMicrotask(TId, 0, fn, args, nargs);
106 synchronize::threads();
107 }
108 return;
109 }
110
111 // We do *not* create a new data environment because all threads in the team
112 // that are active are now running this parallel region. They share the
113 // TeamState, which has an increase level-var and potentially active-level
114 // set, but they do not have individual ThreadStates yet. If they ever
115 // modify the ICVs beyond this point a ThreadStates will be allocated.
116
117 bool IsActiveParallelRegion = NumThreads > 1;
118 if (!IsActiveParallelRegion) {
119 state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true);
120 invokeMicrotask(TId, 0, fn, args, nargs);
121 return;
122 }
123
124 void **GlobalArgs = nullptr;
125 if (nargs) {
126 __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
127 #pragma unroll
128 for (int I = 0; I < nargs; I++)
129 GlobalArgs[I] = args[I];
130 }
131
132 {
133 state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads,
134 1u, true);
135 state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
136 (void *)nullptr, true);
137 state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true);
138 state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true);
139
140 // Master signals work to activate workers.
141 synchronize::threads();
142 // Master waits for workers to signal.
143 synchronize::threads();
144 }
145
146 if (nargs)
147 __kmpc_end_sharing_variables();
148 }
149
150 __attribute__((noinline)) bool
__kmpc_kernel_parallel(ParallelRegionFnTy * WorkFn)151 __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
152 // Work function and arguments for L1 parallel region.
153 *WorkFn = state::ParallelRegionFn;
154
155 // If this is the termination signal from the master, quit early.
156 if (!*WorkFn)
157 return false;
158
159 // Set to true for workers participating in the parallel region.
160 uint32_t TId = mapping::getThreadIdInBlock();
161 bool ThreadIsActive = TId < state::ParallelTeamSize;
162 return ThreadIsActive;
163 }
164
__kmpc_kernel_end_parallel()165 __attribute__((noinline)) void __kmpc_kernel_end_parallel() {
166 // In case we have modified an ICV for this thread before a ThreadState was
167 // created. We drop it now to not contaminate the next parallel region.
168 ASSERT(!mapping::isSPMDMode());
169 uint32_t TId = mapping::getThreadIdInBlock();
170 state::resetStateForThread(TId);
171 ASSERT(!mapping::isSPMDMode());
172 }
173
/// Begin a serialized parallel region for the calling thread: enter a new data
/// environment via state::enterDataEnvironment() and then increment the
/// `icv::Level` ICV. Neither parameter is read by this implementation.
void __kmpc_serialized_parallel(IdentTy *, uint32_t TId) {
  state::enterDataEnvironment();
  // The increment happens after the new data environment has been entered.
  ++icv::Level;
}
178
/// End a serialized parallel region for the calling thread: leave the data
/// environment via state::exitDataEnvironment() and then decrement the
/// `icv::Level` ICV. Neither parameter is read by this implementation.
void __kmpc_end_serialized_parallel(IdentTy *, uint32_t TId) {
  state::exitDataEnvironment();
  // NOTE(review): the decrement runs after the data environment was exited,
  // which is not the mirror image of the enter-then-increment order used when
  // the serialized region begins -- confirm against the State implementation
  // that this modifies the intended copy of the ICV.
  --icv::Level;
}
183
__kmpc_parallel_level(IdentTy *,uint32_t)184 uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); }
185
__kmpc_global_thread_num(IdentTy *)186 int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); }
187
/// Store \p NumThreads into the `icv::NThreads` ICV. The first two parameters
/// are ignored.
///
/// determineNumberOfThreads() reads this ICV when it is called with a
/// `num_threads` clause value of -1.
void __kmpc_push_num_threads(IdentTy *, int32_t, int32_t NumThreads) {
  icv::NThreads = NumThreads;
}
191
/// Intentionally empty: this implementation ignores all of its arguments and
/// has no effect.
void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
                           int32_t thread_limit) {}
194
/// Intentionally empty: this implementation ignores all of its arguments and
/// has no effect.
void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {}
196 }
197
198 #pragma omp end declare target
199