1*13fbcb42Sjoerg //===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
2*13fbcb42Sjoerg //
3*13fbcb42Sjoerg // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*13fbcb42Sjoerg // See https://llvm.org/LICENSE.txt for license information.
5*13fbcb42Sjoerg // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*13fbcb42Sjoerg //
7*13fbcb42Sjoerg //===----------------------------------------------------------------------===//
8*13fbcb42Sjoerg //
9*13fbcb42Sjoerg // This provides a generalized class for OpenMP runtime code generation
10*13fbcb42Sjoerg // specialized by GPU targets NVPTX and AMDGCN.
11*13fbcb42Sjoerg //
12*13fbcb42Sjoerg //===----------------------------------------------------------------------===//
13*13fbcb42Sjoerg
14*13fbcb42Sjoerg #include "CGOpenMPRuntimeGPU.h"
15*13fbcb42Sjoerg #include "CGOpenMPRuntimeNVPTX.h"
16*13fbcb42Sjoerg #include "CodeGenFunction.h"
17*13fbcb42Sjoerg #include "clang/AST/Attr.h"
18*13fbcb42Sjoerg #include "clang/AST/DeclOpenMP.h"
19*13fbcb42Sjoerg #include "clang/AST/StmtOpenMP.h"
20*13fbcb42Sjoerg #include "clang/AST/StmtVisitor.h"
21*13fbcb42Sjoerg #include "clang/Basic/Cuda.h"
22*13fbcb42Sjoerg #include "llvm/ADT/SmallPtrSet.h"
23*13fbcb42Sjoerg #include "llvm/Frontend/OpenMP/OMPGridValues.h"
24*13fbcb42Sjoerg #include "llvm/IR/IntrinsicsNVPTX.h"
25*13fbcb42Sjoerg
26*13fbcb42Sjoerg using namespace clang;
27*13fbcb42Sjoerg using namespace CodeGen;
28*13fbcb42Sjoerg using namespace llvm::omp;
29*13fbcb42Sjoerg
30*13fbcb42Sjoerg namespace {
31*13fbcb42Sjoerg /// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
32*13fbcb42Sjoerg class NVPTXActionTy final : public PrePostActionTy {
33*13fbcb42Sjoerg llvm::FunctionCallee EnterCallee = nullptr;
34*13fbcb42Sjoerg ArrayRef<llvm::Value *> EnterArgs;
35*13fbcb42Sjoerg llvm::FunctionCallee ExitCallee = nullptr;
36*13fbcb42Sjoerg ArrayRef<llvm::Value *> ExitArgs;
37*13fbcb42Sjoerg bool Conditional = false;
38*13fbcb42Sjoerg llvm::BasicBlock *ContBlock = nullptr;
39*13fbcb42Sjoerg
40*13fbcb42Sjoerg public:
NVPTXActionTy(llvm::FunctionCallee EnterCallee,ArrayRef<llvm::Value * > EnterArgs,llvm::FunctionCallee ExitCallee,ArrayRef<llvm::Value * > ExitArgs,bool Conditional=false)41*13fbcb42Sjoerg NVPTXActionTy(llvm::FunctionCallee EnterCallee,
42*13fbcb42Sjoerg ArrayRef<llvm::Value *> EnterArgs,
43*13fbcb42Sjoerg llvm::FunctionCallee ExitCallee,
44*13fbcb42Sjoerg ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
45*13fbcb42Sjoerg : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
46*13fbcb42Sjoerg ExitArgs(ExitArgs), Conditional(Conditional) {}
Enter(CodeGenFunction & CGF)47*13fbcb42Sjoerg void Enter(CodeGenFunction &CGF) override {
48*13fbcb42Sjoerg llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
49*13fbcb42Sjoerg if (Conditional) {
50*13fbcb42Sjoerg llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
51*13fbcb42Sjoerg auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
52*13fbcb42Sjoerg ContBlock = CGF.createBasicBlock("omp_if.end");
53*13fbcb42Sjoerg // Generate the branch (If-stmt)
54*13fbcb42Sjoerg CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
55*13fbcb42Sjoerg CGF.EmitBlock(ThenBlock);
56*13fbcb42Sjoerg }
57*13fbcb42Sjoerg }
Done(CodeGenFunction & CGF)58*13fbcb42Sjoerg void Done(CodeGenFunction &CGF) {
59*13fbcb42Sjoerg // Emit the rest of blocks/branches
60*13fbcb42Sjoerg CGF.EmitBranch(ContBlock);
61*13fbcb42Sjoerg CGF.EmitBlock(ContBlock, true);
62*13fbcb42Sjoerg }
Exit(CodeGenFunction & CGF)63*13fbcb42Sjoerg void Exit(CodeGenFunction &CGF) override {
64*13fbcb42Sjoerg CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
65*13fbcb42Sjoerg }
66*13fbcb42Sjoerg };
67*13fbcb42Sjoerg
68*13fbcb42Sjoerg /// A class to track the execution mode when codegening directives within
69*13fbcb42Sjoerg /// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
70*13fbcb42Sjoerg /// to the target region and used by containing directives such as 'parallel'
71*13fbcb42Sjoerg /// to emit optimized code.
72*13fbcb42Sjoerg class ExecutionRuntimeModesRAII {
73*13fbcb42Sjoerg private:
74*13fbcb42Sjoerg CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
75*13fbcb42Sjoerg CGOpenMPRuntimeGPU::EM_Unknown;
76*13fbcb42Sjoerg CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
77*13fbcb42Sjoerg bool SavedRuntimeMode = false;
78*13fbcb42Sjoerg bool *RuntimeMode = nullptr;
79*13fbcb42Sjoerg
80*13fbcb42Sjoerg public:
81*13fbcb42Sjoerg /// Constructor for Non-SPMD mode.
ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode & ExecMode)82*13fbcb42Sjoerg ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode)
83*13fbcb42Sjoerg : ExecMode(ExecMode) {
84*13fbcb42Sjoerg SavedExecMode = ExecMode;
85*13fbcb42Sjoerg ExecMode = CGOpenMPRuntimeGPU::EM_NonSPMD;
86*13fbcb42Sjoerg }
87*13fbcb42Sjoerg /// Constructor for SPMD mode.
ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode & ExecMode,bool & RuntimeMode,bool FullRuntimeMode)88*13fbcb42Sjoerg ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
89*13fbcb42Sjoerg bool &RuntimeMode, bool FullRuntimeMode)
90*13fbcb42Sjoerg : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) {
91*13fbcb42Sjoerg SavedExecMode = ExecMode;
92*13fbcb42Sjoerg SavedRuntimeMode = RuntimeMode;
93*13fbcb42Sjoerg ExecMode = CGOpenMPRuntimeGPU::EM_SPMD;
94*13fbcb42Sjoerg RuntimeMode = FullRuntimeMode;
95*13fbcb42Sjoerg }
~ExecutionRuntimeModesRAII()96*13fbcb42Sjoerg ~ExecutionRuntimeModesRAII() {
97*13fbcb42Sjoerg ExecMode = SavedExecMode;
98*13fbcb42Sjoerg if (RuntimeMode)
99*13fbcb42Sjoerg *RuntimeMode = SavedRuntimeMode;
100*13fbcb42Sjoerg }
101*13fbcb42Sjoerg };
102*13fbcb42Sjoerg
/// GPU Configuration: This information can be derived from cuda registers,
/// however, providing compile time constants helps generate more efficient
/// code. For all practical purposes this is fine because the configuration
/// is the same for all known NVPTX architectures.
enum MachineConfiguration : unsigned {
  /// See "llvm/Frontend/OpenMP/OMPGridValues.h" for various related target
  /// specific Grid Values like GV_Warp_Size, GV_Warp_Size_Log2,
  /// and GV_Warp_Size_Log2_Mask.

  /// Global memory alignment (in bytes) applied to globalized variables for
  /// performance.
  GlobalMemoryAlignment = 128,

  /// Maximal size (in bytes) of the shared memory buffer.
  SharedMemorySize = 128,
};
118*13fbcb42Sjoerg
getPrivateItem(const Expr * RefExpr)119*13fbcb42Sjoerg static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
120*13fbcb42Sjoerg RefExpr = RefExpr->IgnoreParens();
121*13fbcb42Sjoerg if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
122*13fbcb42Sjoerg const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
123*13fbcb42Sjoerg while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
124*13fbcb42Sjoerg Base = TempASE->getBase()->IgnoreParenImpCasts();
125*13fbcb42Sjoerg RefExpr = Base;
126*13fbcb42Sjoerg } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
127*13fbcb42Sjoerg const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
128*13fbcb42Sjoerg while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
129*13fbcb42Sjoerg Base = TempOASE->getBase()->IgnoreParenImpCasts();
130*13fbcb42Sjoerg while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
131*13fbcb42Sjoerg Base = TempASE->getBase()->IgnoreParenImpCasts();
132*13fbcb42Sjoerg RefExpr = Base;
133*13fbcb42Sjoerg }
134*13fbcb42Sjoerg RefExpr = RefExpr->IgnoreParenImpCasts();
135*13fbcb42Sjoerg if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
136*13fbcb42Sjoerg return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
137*13fbcb42Sjoerg const auto *ME = cast<MemberExpr>(RefExpr);
138*13fbcb42Sjoerg return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
139*13fbcb42Sjoerg }
140*13fbcb42Sjoerg
141*13fbcb42Sjoerg
buildRecordForGlobalizedVars(ASTContext & C,ArrayRef<const ValueDecl * > EscapedDecls,ArrayRef<const ValueDecl * > EscapedDeclsForTeams,llvm::SmallDenseMap<const ValueDecl *,const FieldDecl * > & MappedDeclsFields,int BufSize)142*13fbcb42Sjoerg static RecordDecl *buildRecordForGlobalizedVars(
143*13fbcb42Sjoerg ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
144*13fbcb42Sjoerg ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
145*13fbcb42Sjoerg llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
146*13fbcb42Sjoerg &MappedDeclsFields, int BufSize) {
147*13fbcb42Sjoerg using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
148*13fbcb42Sjoerg if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
149*13fbcb42Sjoerg return nullptr;
150*13fbcb42Sjoerg SmallVector<VarsDataTy, 4> GlobalizedVars;
151*13fbcb42Sjoerg for (const ValueDecl *D : EscapedDecls)
152*13fbcb42Sjoerg GlobalizedVars.emplace_back(
153*13fbcb42Sjoerg CharUnits::fromQuantity(std::max(
154*13fbcb42Sjoerg C.getDeclAlign(D).getQuantity(),
155*13fbcb42Sjoerg static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
156*13fbcb42Sjoerg D);
157*13fbcb42Sjoerg for (const ValueDecl *D : EscapedDeclsForTeams)
158*13fbcb42Sjoerg GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
159*13fbcb42Sjoerg llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
160*13fbcb42Sjoerg return L.first > R.first;
161*13fbcb42Sjoerg });
162*13fbcb42Sjoerg
163*13fbcb42Sjoerg // Build struct _globalized_locals_ty {
164*13fbcb42Sjoerg // /* globalized vars */[WarSize] align (max(decl_align,
165*13fbcb42Sjoerg // GlobalMemoryAlignment))
166*13fbcb42Sjoerg // /* globalized vars */ for EscapedDeclsForTeams
167*13fbcb42Sjoerg // };
168*13fbcb42Sjoerg RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
169*13fbcb42Sjoerg GlobalizedRD->startDefinition();
170*13fbcb42Sjoerg llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
171*13fbcb42Sjoerg EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
172*13fbcb42Sjoerg for (const auto &Pair : GlobalizedVars) {
173*13fbcb42Sjoerg const ValueDecl *VD = Pair.second;
174*13fbcb42Sjoerg QualType Type = VD->getType();
175*13fbcb42Sjoerg if (Type->isLValueReferenceType())
176*13fbcb42Sjoerg Type = C.getPointerType(Type.getNonReferenceType());
177*13fbcb42Sjoerg else
178*13fbcb42Sjoerg Type = Type.getNonReferenceType();
179*13fbcb42Sjoerg SourceLocation Loc = VD->getLocation();
180*13fbcb42Sjoerg FieldDecl *Field;
181*13fbcb42Sjoerg if (SingleEscaped.count(VD)) {
182*13fbcb42Sjoerg Field = FieldDecl::Create(
183*13fbcb42Sjoerg C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
184*13fbcb42Sjoerg C.getTrivialTypeSourceInfo(Type, SourceLocation()),
185*13fbcb42Sjoerg /*BW=*/nullptr, /*Mutable=*/false,
186*13fbcb42Sjoerg /*InitStyle=*/ICIS_NoInit);
187*13fbcb42Sjoerg Field->setAccess(AS_public);
188*13fbcb42Sjoerg if (VD->hasAttrs()) {
189*13fbcb42Sjoerg for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
190*13fbcb42Sjoerg E(VD->getAttrs().end());
191*13fbcb42Sjoerg I != E; ++I)
192*13fbcb42Sjoerg Field->addAttr(*I);
193*13fbcb42Sjoerg }
194*13fbcb42Sjoerg } else {
195*13fbcb42Sjoerg llvm::APInt ArraySize(32, BufSize);
196*13fbcb42Sjoerg Type = C.getConstantArrayType(Type, ArraySize, nullptr, ArrayType::Normal,
197*13fbcb42Sjoerg 0);
198*13fbcb42Sjoerg Field = FieldDecl::Create(
199*13fbcb42Sjoerg C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
200*13fbcb42Sjoerg C.getTrivialTypeSourceInfo(Type, SourceLocation()),
201*13fbcb42Sjoerg /*BW=*/nullptr, /*Mutable=*/false,
202*13fbcb42Sjoerg /*InitStyle=*/ICIS_NoInit);
203*13fbcb42Sjoerg Field->setAccess(AS_public);
204*13fbcb42Sjoerg llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
205*13fbcb42Sjoerg static_cast<CharUnits::QuantityType>(
206*13fbcb42Sjoerg GlobalMemoryAlignment)));
207*13fbcb42Sjoerg Field->addAttr(AlignedAttr::CreateImplicit(
208*13fbcb42Sjoerg C, /*IsAlignmentExpr=*/true,
209*13fbcb42Sjoerg IntegerLiteral::Create(C, Align,
210*13fbcb42Sjoerg C.getIntTypeForBitwidth(32, /*Signed=*/0),
211*13fbcb42Sjoerg SourceLocation()),
212*13fbcb42Sjoerg {}, AttributeCommonInfo::AS_GNU, AlignedAttr::GNU_aligned));
213*13fbcb42Sjoerg }
214*13fbcb42Sjoerg GlobalizedRD->addDecl(Field);
215*13fbcb42Sjoerg MappedDeclsFields.try_emplace(VD, Field);
216*13fbcb42Sjoerg }
217*13fbcb42Sjoerg GlobalizedRD->completeDefinition();
218*13fbcb42Sjoerg return GlobalizedRD;
219*13fbcb42Sjoerg }
220*13fbcb42Sjoerg
/// Get the list of variables that can escape their declaration context.
class CheckVarsEscapingDeclContext final
    : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
  CodeGenFunction &CGF;
  // Escaped declarations with fixed-size types; candidates for globalization.
  llvm::SetVector<const ValueDecl *> EscapedDecls;
  // Escaped declarations with variably modified types; their size is only
  // known at runtime, so they are tracked separately.
  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
  // Escaped parameters that were captured by value.
  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
  // Record holding all globalized variables; built lazily by
  // buildRecordForGlobalizedVars().
  RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  // When set, any visited DeclRefExpr marks its decl as escaped (set while
  // visiting address-of expressions, array-to-pointer decay, lvalue call
  // arguments, and reference initializers).
  bool AllEscaped = false;
  // True while analyzing the inner parallel part of a combined
  // 'distribute parallel ...' construct.
  bool IsForCombinedParallelRegion = false;

  // Record VD as escaped unless it is excluded: declare-target variables,
  // variables with a user-specified allocator, and (outside combined
  // regions) reference-typed variables are never globalized.
  void markAsEscaped(const ValueDecl *VD) {
    // Do not globalize declare target variables.
    if (!isa<VarDecl>(VD) ||
        OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
      return;
    VD = cast<ValueDecl>(VD->getCanonicalDecl());
    // Use user-specified allocation.
    if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
      return;
    // Variables captured by value must be globalized.
    if (auto *CSI = CGF.CapturedStmtInfo) {
      if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
        // Check if need to capture the variable that was already captured by
        // value in the outer region.
        if (!IsForCombinedParallelRegion) {
          if (!FD->hasAttrs())
            return;
          const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
          if (!Attr)
            return;
          if (((Attr->getCaptureKind() != OMPC_map) &&
               !isOpenMPPrivate(Attr->getCaptureKind())) ||
              ((Attr->getCaptureKind() == OMPC_map) &&
               !FD->getType()->isAnyPointerType()))
            return;
        }
        if (!FD->getType()->isReferenceType()) {
          assert(!VD->getType()->isVariablyModifiedType() &&
                 "Parameter captured by value with variably modified type");
          EscapedParameters.insert(VD);
        } else if (!IsForCombinedParallelRegion) {
          return;
        }
      }
    }
    if ((!CGF.CapturedStmtInfo ||
         (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
        VD->getType()->isReferenceType())
      // Do not globalize variables with reference type.
      return;
    if (VD->getType()->isVariablyModifiedType())
      EscapedVariableLengthDecls.insert(VD);
    else
      EscapedDecls.insert(VD);
  }

  // Analyze a declaration: references escape immediately, and a variable's
  // initializer is visited (with AllEscaped set for reference inits).
  void VisitValueDecl(const ValueDecl *VD) {
    if (VD->getType()->isLValueReferenceType())
      markAsEscaped(VD);
    if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
      if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = VD->getType()->isLValueReferenceType();
        Visit(VarD->getInit());
        AllEscaped = SavedAllEscaped;
      }
    }
  }
  // Analyze the captures of an OpenMP captured statement. For combined
  // parallel regions, by-reference captures escape only when the variable is
  // firstprivate/lastprivate in the combined construct.
  void VisitOpenMPCapturedStmt(const CapturedStmt *S,
                               ArrayRef<OMPClause *> Clauses,
                               bool IsCombinedParallelRegion) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
        if (IsCombinedParallelRegion) {
          // Check if the variable is privatized in the combined construct and
          // those private copies must be shared in the inner parallel
          // directive.
          IsForCombinedParallelRegion = false;
          for (const OMPClause *C : Clauses) {
            if (!isOpenMPPrivate(C->getClauseKind()) ||
                C->getClauseKind() == OMPC_reduction ||
                C->getClauseKind() == OMPC_linear ||
                C->getClauseKind() == OMPC_private)
              continue;
            ArrayRef<const Expr *> Vars;
            if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
              Vars = PC->getVarRefs();
            else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
              Vars = PC->getVarRefs();
            else
              llvm_unreachable("Unexpected clause.");
            for (const auto *E : Vars) {
              const Decl *D =
                  cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
              if (D == VD->getCanonicalDecl()) {
                IsForCombinedParallelRegion = true;
                break;
              }
            }
            if (IsForCombinedParallelRegion)
              break;
          }
        }
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
        IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
      }
    }
  }

  // Build the globalization record from the escaped decls collected so far.
  // In a teams (TTD) region all decls are team-scoped; otherwise they are
  // parallel-scoped and sized per warp.
  void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
    assert(!GlobalizedRD &&
           "Record for globalized variables is built already.");
    ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
    unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
    if (IsInTTDRegion)
      EscapedDeclsForTeams = EscapedDecls.getArrayRef();
    else
      EscapedDeclsForParallel = EscapedDecls.getArrayRef();
    GlobalizedRD = ::buildRecordForGlobalizedVars(
        CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
        MappedDeclsFields, WarpSize);
  }

public:
  CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
                               ArrayRef<const ValueDecl *> TeamsReductions)
      : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
  }
  virtual ~CheckVarsEscapingDeclContext() = default;
  void VisitDeclStmt(const DeclStmt *S) {
    if (!S)
      return;
    for (const Decl *D : S->decls())
      if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
        VisitValueDecl(VD);
  }
  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
    if (!D)
      return;
    if (!D->hasAssociatedStmt())
      return;
    if (const auto *S =
            dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
      // Do not analyze directives that do not actually require capturing,
      // like `omp for` or `omp simd` directives.
      llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
      getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
      if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
        VisitStmt(S->getCapturedStmt());
        return;
      }
      VisitOpenMPCapturedStmt(
          S, D->clauses(),
          CaptureRegions.back() == OMPD_parallel &&
              isOpenMPDistributeDirective(D->getDirectiveKind()));
    }
  }
  void VisitCapturedStmt(const CapturedStmt *S) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
      }
    }
  }
  void VisitLambdaExpr(const LambdaExpr *E) {
    if (!E)
      return;
    for (const LambdaCapture &C : E->captures()) {
      if (C.capturesVariable()) {
        if (C.getCaptureKind() == LCK_ByRef) {
          const ValueDecl *VD = C.getCapturedVar();
          markAsEscaped(VD);
          if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
            VisitValueDecl(VD);
        }
      }
    }
  }
  void VisitBlockExpr(const BlockExpr *E) {
    if (!E)
      return;
    for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
      if (C.isByRef()) {
        const VarDecl *VD = C.getVariable();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
          VisitValueDecl(VD);
      }
    }
  }
  void VisitCallExpr(const CallExpr *E) {
    if (!E)
      return;
    // Lvalue arguments may be written through by the callee: treat any decl
    // referenced inside them as escaped.
    for (const Expr *Arg : E->arguments()) {
      if (!Arg)
        continue;
      if (Arg->isLValue()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = true;
        Visit(Arg);
        AllEscaped = SavedAllEscaped;
      } else {
        Visit(Arg);
      }
    }
    Visit(E->getCallee());
  }
  void VisitDeclRefExpr(const DeclRefExpr *E) {
    if (!E)
      return;
    const ValueDecl *VD = E->getDecl();
    if (AllEscaped)
      markAsEscaped(VD);
    if (isa<OMPCapturedExprDecl>(VD))
      VisitValueDecl(VD);
    else if (const auto *VarD = dyn_cast<VarDecl>(VD))
      if (VarD->isInitCapture())
        VisitValueDecl(VD);
  }
  void VisitUnaryOperator(const UnaryOperator *E) {
    if (!E)
      return;
    // Taking the address of a variable lets it escape.
    if (E->getOpcode() == UO_AddrOf) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
    if (!E)
      return;
    // Array-to-pointer decay exposes the array's address, so the underlying
    // declaration escapes.
    if (E->getCastKind() == CK_ArrayToPointerDecay) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitExpr(const Expr *E) {
    if (!E)
      return;
    bool SavedAllEscaped = AllEscaped;
    // Rvalue expressions cannot propagate an address; clear the flag while
    // visiting their children.
    if (!E->isLValue())
      AllEscaped = false;
    for (const Stmt *Child : E->children())
      if (Child)
        Visit(Child);
    AllEscaped = SavedAllEscaped;
  }
  void VisitStmt(const Stmt *S) {
    if (!S)
      return;
    for (const Stmt *Child : S->children())
      if (Child)
        Visit(Child);
  }

  /// Returns the record that handles all the escaped local variables and used
  /// instead of their original storage.
  const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
    if (!GlobalizedRD)
      buildRecordForGlobalizedVars(IsInTTDRegion);
    return GlobalizedRD;
  }

  /// Returns the field in the globalized record for the escaped variable.
  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
    assert(GlobalizedRD &&
           "Record for globalized variables must be generated already.");
    auto I = MappedDeclsFields.find(VD);
    if (I == MappedDeclsFields.end())
      return nullptr;
    return I->getSecond();
  }

  /// Returns the list of the escaped local variables/parameters.
  ArrayRef<const ValueDecl *> getEscapedDecls() const {
    return EscapedDecls.getArrayRef();
  }

  /// Checks if the escaped local variable is actually a parameter passed by
  /// value.
  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
    return EscapedParameters;
  }

  /// Returns the list of the escaped variables with the variably modified
  /// types.
  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
    return EscapedVariableLengthDecls.getArrayRef();
  }
};
531*13fbcb42Sjoerg } // anonymous namespace
532*13fbcb42Sjoerg
533*13fbcb42Sjoerg /// Get the id of the warp in the block.
534*13fbcb42Sjoerg /// We assume that the warp size is 32, which is always the case
535*13fbcb42Sjoerg /// on the NVPTX device, to generate more efficient code.
getNVPTXWarpID(CodeGenFunction & CGF)536*13fbcb42Sjoerg static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
537*13fbcb42Sjoerg CGBuilderTy &Bld = CGF.Builder;
538*13fbcb42Sjoerg unsigned LaneIDBits =
539*13fbcb42Sjoerg CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size_Log2);
540*13fbcb42Sjoerg auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
541*13fbcb42Sjoerg return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
542*13fbcb42Sjoerg }
543*13fbcb42Sjoerg
544*13fbcb42Sjoerg /// Get the id of the current lane in the Warp.
545*13fbcb42Sjoerg /// We assume that the warp size is 32, which is always the case
546*13fbcb42Sjoerg /// on the NVPTX device, to generate more efficient code.
getNVPTXLaneID(CodeGenFunction & CGF)547*13fbcb42Sjoerg static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
548*13fbcb42Sjoerg CGBuilderTy &Bld = CGF.Builder;
549*13fbcb42Sjoerg unsigned LaneIDMask = CGF.getContext().getTargetInfo().getGridValue(
550*13fbcb42Sjoerg llvm::omp::GV_Warp_Size_Log2_Mask);
551*13fbcb42Sjoerg auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
552*13fbcb42Sjoerg return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
553*13fbcb42Sjoerg "nvptx_lane_id");
554*13fbcb42Sjoerg }
555*13fbcb42Sjoerg
556*13fbcb42Sjoerg /// Get the value of the thread_limit clause in the teams directive.
557*13fbcb42Sjoerg /// For the 'generic' execution mode, the runtime encodes thread_limit in
558*13fbcb42Sjoerg /// the launch parameters, always starting thread_limit+warpSize threads per
559*13fbcb42Sjoerg /// CTA. The threads in the last warp are reserved for master execution.
560*13fbcb42Sjoerg /// For the 'spmd' execution mode, all threads in a CTA are part of the team.
getThreadLimit(CodeGenFunction & CGF,bool IsInSPMDExecutionMode=false)561*13fbcb42Sjoerg static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
562*13fbcb42Sjoerg bool IsInSPMDExecutionMode = false) {
563*13fbcb42Sjoerg CGBuilderTy &Bld = CGF.Builder;
564*13fbcb42Sjoerg auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
565*13fbcb42Sjoerg llvm::Value *ThreadLimit = nullptr;
566*13fbcb42Sjoerg if (IsInSPMDExecutionMode)
567*13fbcb42Sjoerg ThreadLimit = RT.getGPUNumThreads(CGF);
568*13fbcb42Sjoerg else {
569*13fbcb42Sjoerg llvm::Value *GPUNumThreads = RT.getGPUNumThreads(CGF);
570*13fbcb42Sjoerg llvm::Value *GPUWarpSize = RT.getGPUWarpSize(CGF);
571*13fbcb42Sjoerg ThreadLimit = Bld.CreateNUWSub(GPUNumThreads, GPUWarpSize, "thread_limit");
572*13fbcb42Sjoerg }
573*13fbcb42Sjoerg assert(ThreadLimit != nullptr && "Expected non-null ThreadLimit");
574*13fbcb42Sjoerg return ThreadLimit;
575*13fbcb42Sjoerg }
576*13fbcb42Sjoerg
577*13fbcb42Sjoerg /// Get the thread id of the OMP master thread.
578*13fbcb42Sjoerg /// The master thread id is the first thread (lane) of the last warp in the
579*13fbcb42Sjoerg /// GPU block. Warp size is assumed to be some power of 2.
580*13fbcb42Sjoerg /// Thread id is 0 indexed.
581*13fbcb42Sjoerg /// E.g: If NumThreads is 33, master id is 32.
582*13fbcb42Sjoerg /// If NumThreads is 64, master id is 32.
583*13fbcb42Sjoerg /// If NumThreads is 1024, master id is 992.
getMasterThreadID(CodeGenFunction & CGF)584*13fbcb42Sjoerg static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
585*13fbcb42Sjoerg CGBuilderTy &Bld = CGF.Builder;
586*13fbcb42Sjoerg auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
587*13fbcb42Sjoerg llvm::Value *NumThreads = RT.getGPUNumThreads(CGF);
588*13fbcb42Sjoerg // We assume that the warp size is a power of 2.
589*13fbcb42Sjoerg llvm::Value *Mask = Bld.CreateNUWSub(RT.getGPUWarpSize(CGF), Bld.getInt32(1));
590*13fbcb42Sjoerg
591*13fbcb42Sjoerg llvm::Value *NumThreadsSubOne = Bld.CreateNUWSub(NumThreads, Bld.getInt32(1));
592*13fbcb42Sjoerg return Bld.CreateAnd(NumThreadsSubOne, Bld.CreateNot(Mask), "master_tid");
593*13fbcb42Sjoerg }
594*13fbcb42Sjoerg
/// Set up the per-target-region worker state: compute the nullary function
/// info for the worker and immediately create the (placeholder-named) worker
/// function in the module.
CGOpenMPRuntimeGPU::WorkerFunctionState::WorkerFunctionState(
    CodeGenModule &CGM, SourceLocation Loc)
    : WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()),
      Loc(Loc) {
  createWorkerFunction(CGM);
}
601*13fbcb42Sjoerg
/// Create the worker llvm::Function for this state. The function takes no
/// arguments, has internal linkage, and carries a placeholder name that is
/// later replaced with "<entry>_worker" once the target entry is known.
void CGOpenMPRuntimeGPU::WorkerFunctionState::createWorkerFunction(
    CodeGenModule &CGM) {
  // Create an worker function with no arguments.

  WorkerFn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      /*placeholder=*/"_worker", &CGM.getModule());
  // Apply the usual internal-function attributes and mark the worker as
  // non-recursive to aid optimization.
  CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, CGFI);
  WorkerFn->setDoesNotRecurse();
}
612*13fbcb42Sjoerg
/// Return the execution mode (SPMD or non-SPMD/generic) currently being
/// emitted for, as tracked by CurrentExecutionMode.
CGOpenMPRuntimeGPU::ExecutionMode
CGOpenMPRuntimeGPU::getExecutionMode() const {
  return CurrentExecutionMode;
}
617*13fbcb42Sjoerg
618*13fbcb42Sjoerg static CGOpenMPRuntimeGPU::DataSharingMode
getDataSharingMode(CodeGenModule & CGM)619*13fbcb42Sjoerg getDataSharingMode(CodeGenModule &CGM) {
620*13fbcb42Sjoerg return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeGPU::CUDA
621*13fbcb42Sjoerg : CGOpenMPRuntimeGPU::Generic;
622*13fbcb42Sjoerg }
623*13fbcb42Sjoerg
/// Check for inner (nested) SPMD construct, if any
///
/// For a 'target' or 'target teams' directive \p D, inspect the single
/// directive nested immediately inside it (ignoring captured-statement and
/// container wrappers) and report whether that nesting permits SPMD code
/// generation, i.e. whether a 'parallel' construct is found directly below
/// (or, for plain 'target', one level further below a nested 'teams').
static bool hasNestedSPMDDirective(ASTContext &Ctx,
                                   const OMPExecutableDirective &D) {
  // Drill down to the single statement nested inside D's innermost
  // captured statement.
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);

  if (const auto *NestedDir =
          dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      // 'target' with a directly nested parallel construct is SPMD.
      if (isOpenMPParallelDirective(DKind))
        return true;
      if (DKind == OMPD_teams) {
        // 'target' -> 'teams': look one level deeper for a parallel
        // construct.
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPParallelDirective(DKind))
            return true;
        }
      }
      return false;
    case OMPD_target_teams:
      // 'target teams' is SPMD iff the nested directive is a parallel one.
      return isOpenMPParallelDirective(DKind);
    // Every other directive kind is not a valid outer directive for this
    // query; reaching one of these cases is a caller bug.
    case OMPD_target_simd:
    case OMPD_target_parallel:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_master:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_allocate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_depobj:
    case OMPD_scan:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_variant:
    case OMPD_begin_declare_variant:
    case OMPD_end_declare_variant:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_declare_mapper:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_master_taskloop:
    case OMPD_master_taskloop_simd:
    case OMPD_parallel_master_taskloop:
    case OMPD_parallel_master_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
    default:
      llvm_unreachable("Unexpected directive.");
    }
  }

  // No nested executable directive found.
  return false;
}
726*13fbcb42Sjoerg
/// Determine whether the target directive \p D can be code-generated in SPMD
/// execution mode (all threads active from the start), rather than the
/// generic master/worker scheme. Combined target+parallel forms are always
/// SPMD; plain 'target'/'target teams' depend on what is nested inside.
static bool supportsSPMDExecutionMode(ASTContext &Ctx,
                                      const OMPExecutableDirective &D) {
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
    // SPMD-ness depends on the nested construct.
    return hasNestedSPMDDirective(Ctx, D);
  case OMPD_target_parallel:
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
  case OMPD_target_simd:
  case OMPD_target_teams_distribute_simd:
    // Combined directives with parallel/simd are emitted in SPMD mode.
    return true;
  case OMPD_target_teams_distribute:
    return false;
  // Non-target directives are never passed to this query; reaching one of
  // these cases is a caller bug.
  case OMPD_parallel:
  case OMPD_for:
  case OMPD_parallel_for:
  case OMPD_parallel_master:
  case OMPD_parallel_sections:
  case OMPD_for_simd:
  case OMPD_parallel_for_simd:
  case OMPD_cancel:
  case OMPD_cancellation_point:
  case OMPD_ordered:
  case OMPD_threadprivate:
  case OMPD_allocate:
  case OMPD_task:
  case OMPD_simd:
  case OMPD_sections:
  case OMPD_section:
  case OMPD_single:
  case OMPD_master:
  case OMPD_critical:
  case OMPD_taskyield:
  case OMPD_barrier:
  case OMPD_taskwait:
  case OMPD_taskgroup:
  case OMPD_atomic:
  case OMPD_flush:
  case OMPD_depobj:
  case OMPD_scan:
  case OMPD_teams:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_variant:
  case OMPD_begin_declare_variant:
  case OMPD_end_declare_variant:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_declare_mapper:
  case OMPD_taskloop:
  case OMPD_taskloop_simd:
  case OMPD_master_taskloop:
  case OMPD_master_taskloop_simd:
  case OMPD_parallel_master_taskloop:
  case OMPD_parallel_master_taskloop_simd:
  case OMPD_requires:
  case OMPD_unknown:
  default:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}
806*13fbcb42Sjoerg
807*13fbcb42Sjoerg /// Check if the directive is loops based and has schedule clause at all or has
808*13fbcb42Sjoerg /// static scheduling.
hasStaticScheduling(const OMPExecutableDirective & D)809*13fbcb42Sjoerg static bool hasStaticScheduling(const OMPExecutableDirective &D) {
810*13fbcb42Sjoerg assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
811*13fbcb42Sjoerg isOpenMPLoopDirective(D.getDirectiveKind()) &&
812*13fbcb42Sjoerg "Expected loop-based directive.");
813*13fbcb42Sjoerg return !D.hasClausesOfKind<OMPOrderedClause>() &&
814*13fbcb42Sjoerg (!D.hasClausesOfKind<OMPScheduleClause>() ||
815*13fbcb42Sjoerg llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
816*13fbcb42Sjoerg [](const OMPScheduleClause *C) {
817*13fbcb42Sjoerg return C->getScheduleKind() == OMPC_SCHEDULE_static;
818*13fbcb42Sjoerg }));
819*13fbcb42Sjoerg }
820*13fbcb42Sjoerg
/// Check for inner (nested) lightweight runtime construct, if any
///
/// Walks the directives nested inside the SPMD-mode directive \p D (one
/// level for combined forms, up to three levels for plain 'target' through
/// 'teams' and 'parallel') looking for a statically-scheduled worksharing
/// loop or a 'simd' construct, which allow the lightweight (reduced)
/// runtime to be used.
static bool hasNestedLightweightDirective(ASTContext &Ctx,
                                          const OMPExecutableDirective &D) {
  assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
  // Drill down to the single statement nested inside D's innermost
  // captured statement.
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);

  if (const auto *NestedDir =
          dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      // Directly nested parallel worksharing loop with static scheduling.
      if (isOpenMPParallelDirective(DKind) &&
          isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
          hasStaticScheduling(*NestedDir))
        return true;
      if (DKind == OMPD_teams_distribute_simd || DKind == OMPD_simd)
        return true;
      if (DKind == OMPD_parallel) {
        // 'target' -> 'parallel': look for a static worksharing loop inside.
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
        }
      } else if (DKind == OMPD_teams) {
        // 'target' -> 'teams': check the next level down.
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          // 'teams' -> combined parallel worksharing loop, static schedule.
          if (isOpenMPParallelDirective(DKind) &&
              isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
          if (DKind == OMPD_parallel) {
            // 'teams' -> 'parallel' -> static worksharing loop.
            Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
                /*IgnoreCaptured=*/true);
            if (!Body)
              return false;
            ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
            if (const auto *NND =
                    dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
              DKind = NND->getDirectiveKind();
              if (isOpenMPWorksharingDirective(DKind) &&
                  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
                return true;
            }
          }
        }
      }
      return false;
    case OMPD_target_teams:
      // Directly nested parallel worksharing loop with static scheduling.
      if (isOpenMPParallelDirective(DKind) &&
          isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
          hasStaticScheduling(*NestedDir))
        return true;
      if (DKind == OMPD_distribute_simd || DKind == OMPD_simd)
        return true;
      if (DKind == OMPD_parallel) {
        // 'target teams' -> 'parallel' -> static worksharing loop.
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
        }
      }
      return false;
    case OMPD_target_parallel:
      if (DKind == OMPD_simd)
        return true;
      // Nested worksharing loop with static scheduling.
      return isOpenMPWorksharingDirective(DKind) &&
             isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
    // Every other directive kind is not a valid outer directive for this
    // query; reaching one of these cases is a caller bug.
    case OMPD_target_teams_distribute:
    case OMPD_target_simd:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_master:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_allocate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_depobj:
    case OMPD_scan:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_variant:
    case OMPD_begin_declare_variant:
    case OMPD_end_declare_variant:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_declare_mapper:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_master_taskloop:
    case OMPD_master_taskloop_simd:
    case OMPD_parallel_master_taskloop:
    case OMPD_parallel_master_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
    default:
      llvm_unreachable("Unexpected directive.");
    }
  }

  // No nested executable directive found.
  return false;
}
981*13fbcb42Sjoerg
/// Checks if the construct supports lightweight runtime. It must be SPMD
/// construct + inner loop-based construct with static scheduling.
static bool supportsLightweightRuntime(ASTContext &Ctx,
                                       const OMPExecutableDirective &D) {
  // The lightweight runtime is only available in SPMD mode.
  if (!supportsSPMDExecutionMode(Ctx, D))
    return false;
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
  case OMPD_target_parallel:
    // Depends on the construct nested inside.
    return hasNestedLightweightDirective(Ctx, D);
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
    // (Last|First)-privates must be shared in parallel region.
    return hasStaticScheduling(D);
  case OMPD_target_simd:
  case OMPD_target_teams_distribute_simd:
    return true;
  case OMPD_target_teams_distribute:
    return false;
  // Non-target directives are never passed to this query; reaching one of
  // these cases is a caller bug.
  case OMPD_parallel:
  case OMPD_for:
  case OMPD_parallel_for:
  case OMPD_parallel_master:
  case OMPD_parallel_sections:
  case OMPD_for_simd:
  case OMPD_parallel_for_simd:
  case OMPD_cancel:
  case OMPD_cancellation_point:
  case OMPD_ordered:
  case OMPD_threadprivate:
  case OMPD_allocate:
  case OMPD_task:
  case OMPD_simd:
  case OMPD_sections:
  case OMPD_section:
  case OMPD_single:
  case OMPD_master:
  case OMPD_critical:
  case OMPD_taskyield:
  case OMPD_barrier:
  case OMPD_taskwait:
  case OMPD_taskgroup:
  case OMPD_atomic:
  case OMPD_flush:
  case OMPD_depobj:
  case OMPD_scan:
  case OMPD_teams:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_variant:
  case OMPD_begin_declare_variant:
  case OMPD_end_declare_variant:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_declare_mapper:
  case OMPD_taskloop:
  case OMPD_taskloop_simd:
  case OMPD_master_taskloop:
  case OMPD_master_taskloop_simd:
  case OMPD_parallel_master_taskloop:
  case OMPD_parallel_master_taskloop_simd:
  case OMPD_requires:
  case OMPD_unknown:
  default:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}
1067*13fbcb42Sjoerg
/// Emit the target region \p D as a non-SPMD (generic, master/worker) kernel.
/// Sets up the entry/worker state, arranges for the generic entry header and
/// footer to be emitted around the region body, reserves the globalized
/// memory slot, emits the outlined entry function, and finally emits the
/// worker function named after the entry.
void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
                                           StringRef ParentName,
                                           llvm::Function *&OutlinedFn,
                                           llvm::Constant *&OutlinedFnID,
                                           bool IsOffloadEntry,
                                           const RegionCodeGenTy &CodeGen) {
  // RAII guard: restores the previous execution mode on scope exit.
  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode);
  EntryFunctionState EST;
  WorkerFunctionState WST(CGM, D.getBeginLoc());
  // Reset per-kernel bookkeeping for parallel work and wrapper functions.
  Work.clear();
  WrapperFunctionsMap.clear();

  // Emit target region as a standalone region.
  // Pre/post action that brackets the region body with the non-SPMD entry
  // header and footer.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeGPU::EntryFunctionState &EST;
    CGOpenMPRuntimeGPU::WorkerFunctionState &WST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
                         CGOpenMPRuntimeGPU::WorkerFunctionState &WST)
        : EST(EST), WST(WST) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      RT.emitNonSPMDEntryHeader(CGF, EST, WST);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitNonSPMDEntryFooter(CGF, EST);
    }
  } Action(EST, WST);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  // Reserve place for the globalized memory.
  GlobalizedRecords.emplace_back();
  // Lazily create the shared-address-space pointer used for kernel-static
  // globalized data; one per module.
  if (!KernelStaticGlobalized) {
    KernelStaticGlobalized = new llvm::GlobalVariable(
        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage,
        llvm::UndefValue::get(CGM.VoidPtrTy),
        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
        llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
  }
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;

  // Now change the name of the worker function to correspond to this target
  // region's entry function.
  WST.WorkerFn->setName(Twine(OutlinedFn->getName(), "_worker"));

  // Create the worker function
  emitWorkerFunction(WST);
}
1127*13fbcb42Sjoerg
// Setup NVPTX threads for master-worker OpenMP scheme.
//
// Emits the entry-block control flow of a generic-mode kernel: worker
// threads (tid < thread_limit) call the worker function and exit; the master
// thread (tid == master_tid) initializes the runtime and the data-sharing
// stack and then falls through to the region body; all other threads exit.
void CGOpenMPRuntimeGPU::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
                                                EntryFunctionState &EST,
                                                WorkerFunctionState &WST) {
  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  // Threads below the thread limit are workers.
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  llvm::Value *GPUThreadID = RT.getGPUThreadID(CGF);
  llvm::Value *ThreadLimit = getThreadLimit(CGF);
  llvm::Value *IsWorker = Bld.CreateICmpULT(GPUThreadID, ThreadLimit);
  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);

  // Workers run the worker loop and then jump to the common exit.
  CGF.EmitBlock(WorkerBB);
  emitCall(CGF, WST.Loc, WST.WorkerFn);
  CGF.EmitBranch(EST.ExitBB);

  // Only the master thread id proceeds to the master block; the rest exit.
  CGF.EmitBlock(MasterCheckBB);
  GPUThreadID = RT.getGPUThreadID(CGF);
  llvm::Value *MasterThreadID = getMasterThreadID(CGF);
  llvm::Value *IsMaster = Bld.CreateICmpEQ(GPUThreadID, MasterThreadID);
  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);

  CGF.EmitBlock(MasterBB);
  IsInTargetMasterThreadRegion = true;
  // SEQUENTIAL (MASTER) REGION START
  // First action in sequential region:
  // Initialize the state of the OpenMP runtime library on the GPU.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {getThreadLimit(CGF),
                         Bld.getInt16(/*RequiresOMPRuntime=*/1)};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_kernel_init),
                      Args);

  // For data sharing, we need to initialize the stack.
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
      CGM.getModule(), OMPRTL___kmpc_data_sharing_init_stack));

  emitGenericVarsProlog(CGF, WST.Loc);
}
1173*13fbcb42Sjoerg
/// Emit the exit sequence of a generic-mode kernel: tear down globalized
/// variables, signal kernel termination to the workers via
/// __kmpc_kernel_deinit, synchronize the CTA, and branch to the shared exit
/// block.
void CGOpenMPRuntimeGPU::emitNonSPMDEntryFooter(CodeGenFunction &CGF,
                                                EntryFunctionState &EST) {
  IsInTargetMasterThreadRegion = false;
  if (!CGF.HaveInsertPoint())
    return;

  emitGenericVarsEpilog(CGF);

  // The exit block may not exist if the region body already cleared it.
  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
  CGF.EmitBranch(TerminateBB);

  CGF.EmitBlock(TerminateBB);
  // Signal termination condition.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_kernel_deinit),
                      Args);
  // Barrier to terminate worker threads.
  syncCTAThreads(CGF);
  // Master thread jumps to exit point.
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  // Reset so a subsequent footer emission starts fresh.
  EST.ExitBB = nullptr;
}
1203*13fbcb42Sjoerg
/// Emit a target region kernel in SPMD mode, where all threads execute the
/// region in lockstep from the start (no dedicated master/worker split).
///
/// \param D              The target directive being emitted.
/// \param ParentName     Mangled name of the enclosing host function; used to
///                       derive the kernel name (must be non-empty).
/// \param OutlinedFn     [out] The generated kernel function.
/// \param OutlinedFnID   [out] Opaque ID registered for offload bookkeeping.
/// \param IsOffloadEntry Whether an offload entry must be created.
/// \param CodeGen        Callback that emits the region body.
void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
                                        StringRef ParentName,
                                        llvm::Function *&OutlinedFn,
                                        llvm::Constant *&OutlinedFnID,
                                        bool IsOffloadEntry,
                                        const RegionCodeGenTy &CodeGen) {
  // RAII-switch the execution mode to SPMD for the duration of this emission.
  // The full runtime is required either when forced by the user
  // (-fopenmp-cuda-force-full-runtime) or when the directive uses features
  // the lightweight runtime cannot support.
  ExecutionRuntimeModesRAII ModeRAII(
      CurrentExecutionMode, RequiresFullRuntime,
      CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
          !supportsLightweightRuntime(CGM.getContext(), D));
  EntryFunctionState EST;

  // Emit target region as a standalone region. The pre/post action brackets
  // the region body with the SPMD entry header/footer and suppresses the
  // default location/thread-id setup inside the region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeGPU &RT;
    CGOpenMPRuntimeGPU::EntryFunctionState &EST;
    const OMPExecutableDirective &D;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
                         CGOpenMPRuntimeGPU::EntryFunctionState &EST,
                         const OMPExecutableDirective &D)
        : RT(RT), EST(EST), D(D) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitSPMDEntryHeader(CGF, EST, D);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitSPMDEntryFooter(CGF, EST);
    }
  } Action(*this, EST, D);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  // Reserve place for the globalized memory.
  GlobalizedRecords.emplace_back();
  if (!KernelStaticGlobalized) {
    // Lazily create the per-module shared-memory pointer used for statically
    // globalized variables; lives in the CUDA shared address space.
    KernelStaticGlobalized = new llvm::GlobalVariable(
        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage,
        llvm::UndefValue::get(CGM.VoidPtrTy),
        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
        llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
  }
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;
}
1254*13fbcb42Sjoerg
/// Emit the prologue of an SPMD-mode kernel: call __kmpc_spmd_kernel_init,
/// optionally initialize the data-sharing stack, and branch into the
/// region body. Sets up EST.ExitBB for the matching footer.
void CGOpenMPRuntimeGPU::emitSPMDEntryHeader(
    CodeGenFunction &CGF, EntryFunctionState &EST,
    const OMPExecutableDirective &D) {
  CGBuilderTy &Bld = CGF.Builder;

  // Setup BBs in entry function.
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  // Initialize the runtime with the thread limit and whether the full
  // (vs. lightweight) runtime is required.
  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true),
                         /*RequiresOMPRuntime=*/
                         Bld.getInt16(RequiresFullRuntime ? 1 : 0)};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_spmd_kernel_init),
                      Args);

  if (RequiresFullRuntime) {
    // For data sharing, we need to initialize the stack.
    CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
        CGM.getModule(), OMPRTL___kmpc_data_sharing_init_stack_spmd));
  }

  CGF.EmitBranch(ExecuteBB);

  CGF.EmitBlock(ExecuteBB);

  // In SPMD mode every thread runs the "master" code path of the region.
  IsInTargetMasterThreadRegion = true;
}
1283*13fbcb42Sjoerg
/// Emit the epilogue of an SPMD-mode kernel: call
/// __kmpc_spmd_kernel_deinit_v2 from all threads and fall through to the
/// exit block created by the header. Resets EST.ExitBB afterwards.
void CGOpenMPRuntimeGPU::emitSPMDEntryFooter(CodeGenFunction &CGF,
                                             EntryFunctionState &EST) {
  IsInTargetMasterThreadRegion = false;
  // Nothing to emit if the region body already terminated the block.
  if (!CGF.HaveInsertPoint())
    return;

  // The header normally creates ExitBB; recreate it defensively if it was
  // consumed (e.g. by an earlier emission path).
  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
  CGF.EmitBranch(OMPDeInitBB);

  CGF.EmitBlock(OMPDeInitBB);
  // DeInitialize the OMP state in the runtime; called by all active threads.
  llvm::Value *Args[] = {/*RequiresOMPRuntime=*/
                         CGF.Builder.getInt16(RequiresFullRuntime ? 1 : 0)};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_spmd_kernel_deinit_v2),
                      Args);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}
1308*13fbcb42Sjoerg
1309*13fbcb42Sjoerg // Create a unique global variable to indicate the execution mode of this target
1310*13fbcb42Sjoerg // region. The execution mode is either 'generic', or 'spmd' depending on the
1311*13fbcb42Sjoerg // target directive. This variable is picked up by the offload library to setup
1312*13fbcb42Sjoerg // the device appropriately before kernel launch. If the execution mode is
1313*13fbcb42Sjoerg // 'generic', the runtime reserves one warp for the master, otherwise, all
1314*13fbcb42Sjoerg // warps participate in parallel work.
setPropertyExecutionMode(CodeGenModule & CGM,StringRef Name,bool Mode)1315*13fbcb42Sjoerg static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
1316*13fbcb42Sjoerg bool Mode) {
1317*13fbcb42Sjoerg auto *GVMode =
1318*13fbcb42Sjoerg new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
1319*13fbcb42Sjoerg llvm::GlobalValue::WeakAnyLinkage,
1320*13fbcb42Sjoerg llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
1321*13fbcb42Sjoerg Twine(Name, "_exec_mode"));
1322*13fbcb42Sjoerg CGM.addCompilerUsedGlobal(GVMode);
1323*13fbcb42Sjoerg }
1324*13fbcb42Sjoerg
emitWorkerFunction(WorkerFunctionState & WST)1325*13fbcb42Sjoerg void CGOpenMPRuntimeGPU::emitWorkerFunction(WorkerFunctionState &WST) {
1326*13fbcb42Sjoerg ASTContext &Ctx = CGM.getContext();
1327*13fbcb42Sjoerg
1328*13fbcb42Sjoerg CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
1329*13fbcb42Sjoerg CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, WST.CGFI, {},
1330*13fbcb42Sjoerg WST.Loc, WST.Loc);
1331*13fbcb42Sjoerg emitWorkerLoop(CGF, WST);
1332*13fbcb42Sjoerg CGF.FinishFunction();
1333*13fbcb42Sjoerg }
1334*13fbcb42Sjoerg
/// Emit the worker state machine for a generic-mode kernel.
///
/// The workers enter this loop and wait for parallel work from the master.
/// When the master encounters a parallel region it sets up the work + variable
/// arguments, and wakes up the workers. The workers first check to see if
/// they are required for the parallel region, i.e., within the # of requested
/// parallel threads. The activated workers load the variable arguments and
/// execute the parallel work.
void CGOpenMPRuntimeGPU::emitWorkerLoop(CodeGenFunction &CGF,
                                        WorkerFunctionState &WST) {
  CGBuilderTy &Bld = CGF.Builder;

  // State-machine blocks: await work -> select workers -> execute ->
  // terminate -> barrier -> back to await; exit leaves the kernel.
  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");

  CGF.EmitBranch(AwaitBB);

  // Workers wait for work from master.
  CGF.EmitBlock(AwaitBB);
  // Wait for parallel work
  syncCTAThreads(CGF);

  // work_fn receives the outlined function pointer published by the master;
  // exec_status records whether this worker participates in the region.
  Address WorkFn =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
  Address ExecStatus =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));

  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {WorkFn.getPointer()};
  llvm::Value *Ret =
      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                              CGM.getModule(), OMPRTL___kmpc_kernel_parallel),
                          Args);
  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);

  // On termination condition (workid == 0), exit loop.
  llvm::Value *WorkID = Bld.CreateLoad(WorkFn);
  llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate");
  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);

  // Activate requested workers.
  CGF.EmitBlock(SelectWorkersBB);
  llvm::Value *IsActive =
      Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);

  // Signal start of parallel region.
  CGF.EmitBlock(ExecuteBB);
  // Skip initialization.
  setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);

  // Process work items: outlined parallel functions. Emit a compare-and-call
  // chain over all statically known parallel regions (Work) so direct calls
  // can be used instead of an indirect call.
  for (llvm::Function *W : Work) {
    // Try to match this outlined function.
    llvm::Value *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);

    llvm::Value *WorkFnMatch =
        Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");

    llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
    llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
    Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);

    // Execute this outlined function.
    CGF.EmitBlock(ExecuteFNBB);

    // Insert call to work function via shared wrapper. The shared
    // wrapper takes two arguments:
    //   - the parallelism level;
    //   - the thread ID;
    emitCall(CGF, WST.Loc, W,
             {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});

    // Go to end of parallel region.
    CGF.EmitBranch(TerminateBB);

    CGF.EmitBlock(CheckNextBB);
  }
  // Default case: call to outlined function through pointer if the target
  // region makes a declare target call that may contain an orphaned parallel
  // directive.
  auto *ParallelFnTy =
      llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty},
                              /*isVarArg=*/false);
  llvm::Value *WorkFnCast =
      Bld.CreateBitCast(WorkID, ParallelFnTy->getPointerTo());
  // Insert call to work function via shared wrapper. The shared
  // wrapper takes two arguments:
  //   - the parallelism level;
  //   - the thread ID;
  emitCall(CGF, WST.Loc, {ParallelFnTy, WorkFnCast},
           {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
  // Go to end of parallel region.
  CGF.EmitBranch(TerminateBB);

  // Signal end of parallel region.
  CGF.EmitBlock(TerminateBB);
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_kernel_end_parallel),
                      llvm::None);
  CGF.EmitBranch(BarrierBB);

  // All active and inactive workers wait at a barrier after parallel region.
  CGF.EmitBlock(BarrierBB);
  // Barrier after parallel region.
  syncCTAThreads(CGF);
  CGF.EmitBranch(AwaitBB);

  // Exit target region.
  CGF.EmitBlock(ExitBB);
  // Skip initialization.
  clearLocThreadIdInsertPt(CGF);
}
1455*13fbcb42Sjoerg
createOffloadEntry(llvm::Constant * ID,llvm::Constant * Addr,uint64_t Size,int32_t,llvm::GlobalValue::LinkageTypes)1456*13fbcb42Sjoerg void CGOpenMPRuntimeGPU::createOffloadEntry(llvm::Constant *ID,
1457*13fbcb42Sjoerg llvm::Constant *Addr,
1458*13fbcb42Sjoerg uint64_t Size, int32_t,
1459*13fbcb42Sjoerg llvm::GlobalValue::LinkageTypes) {
1460*13fbcb42Sjoerg // TODO: Add support for global variables on the device after declare target
1461*13fbcb42Sjoerg // support.
1462*13fbcb42Sjoerg if (!isa<llvm::Function>(Addr))
1463*13fbcb42Sjoerg return;
1464*13fbcb42Sjoerg llvm::Module &M = CGM.getModule();
1465*13fbcb42Sjoerg llvm::LLVMContext &Ctx = CGM.getLLVMContext();
1466*13fbcb42Sjoerg
1467*13fbcb42Sjoerg // Get "nvvm.annotations" metadata node
1468*13fbcb42Sjoerg llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
1469*13fbcb42Sjoerg
1470*13fbcb42Sjoerg llvm::Metadata *MDVals[] = {
1471*13fbcb42Sjoerg llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"),
1472*13fbcb42Sjoerg llvm::ConstantAsMetadata::get(
1473*13fbcb42Sjoerg llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
1474*13fbcb42Sjoerg // Append metadata to nvvm.annotations
1475*13fbcb42Sjoerg MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
1476*13fbcb42Sjoerg }
1477*13fbcb42Sjoerg
emitTargetOutlinedFunction(const OMPExecutableDirective & D,StringRef ParentName,llvm::Function * & OutlinedFn,llvm::Constant * & OutlinedFnID,bool IsOffloadEntry,const RegionCodeGenTy & CodeGen)1478*13fbcb42Sjoerg void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
1479*13fbcb42Sjoerg const OMPExecutableDirective &D, StringRef ParentName,
1480*13fbcb42Sjoerg llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
1481*13fbcb42Sjoerg bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
1482*13fbcb42Sjoerg if (!IsOffloadEntry) // Nothing to do.
1483*13fbcb42Sjoerg return;
1484*13fbcb42Sjoerg
1485*13fbcb42Sjoerg assert(!ParentName.empty() && "Invalid target region parent name!");
1486*13fbcb42Sjoerg
1487*13fbcb42Sjoerg bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
1488*13fbcb42Sjoerg if (Mode)
1489*13fbcb42Sjoerg emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1490*13fbcb42Sjoerg CodeGen);
1491*13fbcb42Sjoerg else
1492*13fbcb42Sjoerg emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1493*13fbcb42Sjoerg CodeGen);
1494*13fbcb42Sjoerg
1495*13fbcb42Sjoerg setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
1496*13fbcb42Sjoerg }
1497*13fbcb42Sjoerg
namespace {
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
/// Enum for accessing the reserved_2 field of the ident_t struct.
enum ModeFlagsTy : unsigned {
  /// Bit set to 1 when in SPMD mode.
  KMP_IDENT_SPMD_MODE = 0x01,
  /// Bit set to 1 when a simplified runtime is used.
  KMP_IDENT_SIMPLE_RT_MODE = 0x02,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/KMP_IDENT_SIMPLE_RT_MODE)
};

/// Special mode Undefined. Is the combination of Non-SPMD mode + SimpleRuntime.
static const ModeFlagsTy UndefinedMode =
    (~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE;
} // anonymous namespace
1513*13fbcb42Sjoerg
getDefaultLocationReserved2Flags() const1514*13fbcb42Sjoerg unsigned CGOpenMPRuntimeGPU::getDefaultLocationReserved2Flags() const {
1515*13fbcb42Sjoerg switch (getExecutionMode()) {
1516*13fbcb42Sjoerg case EM_SPMD:
1517*13fbcb42Sjoerg if (requiresFullRuntime())
1518*13fbcb42Sjoerg return KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE);
1519*13fbcb42Sjoerg return KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE;
1520*13fbcb42Sjoerg case EM_NonSPMD:
1521*13fbcb42Sjoerg assert(requiresFullRuntime() && "Expected full runtime.");
1522*13fbcb42Sjoerg return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE);
1523*13fbcb42Sjoerg case EM_Unknown:
1524*13fbcb42Sjoerg return UndefinedMode;
1525*13fbcb42Sjoerg }
1526*13fbcb42Sjoerg llvm_unreachable("Unknown flags are requested.");
1527*13fbcb42Sjoerg }
1528*13fbcb42Sjoerg
/// Construct the GPU OpenMP runtime. Uses "_" / "$" as the name
/// separator/suffix for runtime-mangled symbols. This runtime is only valid
/// when compiling device code.
CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
    : CGOpenMPRuntime(CGM, "_", "$") {
  if (!CGM.getLangOpts().OpenMPIsDevice)
    llvm_unreachable("OpenMP NVPTX can only handle device code.");
}
1534*13fbcb42Sjoerg
emitProcBindClause(CodeGenFunction & CGF,ProcBindKind ProcBind,SourceLocation Loc)1535*13fbcb42Sjoerg void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
1536*13fbcb42Sjoerg ProcBindKind ProcBind,
1537*13fbcb42Sjoerg SourceLocation Loc) {
1538*13fbcb42Sjoerg // Do nothing in case of SPMD mode and L0 parallel.
1539*13fbcb42Sjoerg if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
1540*13fbcb42Sjoerg return;
1541*13fbcb42Sjoerg
1542*13fbcb42Sjoerg CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
1543*13fbcb42Sjoerg }
1544*13fbcb42Sjoerg
emitNumThreadsClause(CodeGenFunction & CGF,llvm::Value * NumThreads,SourceLocation Loc)1545*13fbcb42Sjoerg void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
1546*13fbcb42Sjoerg llvm::Value *NumThreads,
1547*13fbcb42Sjoerg SourceLocation Loc) {
1548*13fbcb42Sjoerg // Do nothing in case of SPMD mode and L0 parallel.
1549*13fbcb42Sjoerg if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
1550*13fbcb42Sjoerg return;
1551*13fbcb42Sjoerg
1552*13fbcb42Sjoerg CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
1553*13fbcb42Sjoerg }
1554*13fbcb42Sjoerg
/// Emit code for the 'num_teams'/'thread_limit' clauses on the device.
/// Deliberately a no-op here: this override suppresses the base-class
/// emission (presumably these clauses are honored at kernel-launch time on
/// the host side — verify against the host codegen path).
void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
                                            const Expr *NumTeams,
                                            const Expr *ThreadLimit,
                                            SourceLocation Loc) {}
1559*13fbcb42Sjoerg
/// Outline a 'parallel' region for the GPU. Saves and restores the
/// in-TTD/in-master-thread state around the outlining, and — in generic
/// (non-SPMD) mode, for non-nested regions — also creates a data-sharing
/// wrapper that the worker state machine can invoke.
///
/// \returns The outlined parallel function.
llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  // Emit target region as a standalone region. The action tracks whether we
  // are inside a parallel region (to detect nesting) and restores the prior
  // value on exit.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    bool &IsInParallelRegion;
    bool PrevIsInParallelRegion;

  public:
    NVPTXPrePostActionTy(bool &IsInParallelRegion)
        : IsInParallelRegion(IsInParallelRegion) {}
    void Enter(CodeGenFunction &CGF) override {
      PrevIsInParallelRegion = IsInParallelRegion;
      IsInParallelRegion = true;
    }
    void Exit(CodeGenFunction &CGF) override {
      IsInParallelRegion = PrevIsInParallelRegion;
    }
  } Action(IsInParallelRegion);
  CodeGen.setAction(Action);
  // The outlined function itself is not part of the teams/target-data region
  // or the master-thread code path, so clear these flags for its emission and
  // restore them afterwards.
  bool PrevIsInTTDRegion = IsInTTDRegion;
  IsInTTDRegion = false;
  bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
  IsInTargetMasterThreadRegion = false;
  auto *OutlinedFun =
      cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
          D, ThreadIDVar, InnermostKind, CodeGen));
  IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
  IsInTTDRegion = PrevIsInTTDRegion;
  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD &&
      !IsInParallelRegion) {
    // Generic mode, outermost parallel: the workers call the region through a
    // wrapper that unpacks the shared arguments.
    llvm::Function *WrapperFun =
        createParallelDataSharingWrapper(OutlinedFun, D);
    WrapperFunctionsMap[OutlinedFun] = WrapperFun;
  }

  return OutlinedFun;
}
1598*13fbcb42Sjoerg
/// Get list of lastprivate variables from the teams distribute ... or
/// teams {distribute ...} directives.
///
/// Appends the lastprivate variables of the distribute directive associated
/// with \p D to \p Vars. \p D must be a teams directive. If \p D itself is
/// not a combined teams-distribute directive, the single nested statement is
/// inspected; only if that statement is a non-distribute OpenMP directive is
/// the search abandoned. NOTE: if no single nested child is found, Dir
/// intentionally remains \p D and its lastprivate clauses (if any) are
/// collected.
static void
getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
                             llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
         "expected teams directive.");
  const OMPExecutableDirective *Dir = &D;
  if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
    // Look through captured statements and containers for the single nested
    // directive.
    if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
            Ctx,
            D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
                /*IgnoreCaptured=*/true))) {
      Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
      if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
        Dir = nullptr;
    }
  }
  if (!Dir)
    return;
  // Collect every variable referenced by a lastprivate clause.
  for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
    for (const Expr *E : C->getVarRefs())
      Vars.push_back(getPrivateItem(E));
  }
}
1624*13fbcb42Sjoerg
1625*13fbcb42Sjoerg /// Get list of reduction variables from the teams ... directives.
1626*13fbcb42Sjoerg static void
getTeamsReductionVars(ASTContext & Ctx,const OMPExecutableDirective & D,llvm::SmallVectorImpl<const ValueDecl * > & Vars)1627*13fbcb42Sjoerg getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
1628*13fbcb42Sjoerg llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
1629*13fbcb42Sjoerg assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
1630*13fbcb42Sjoerg "expected teams directive.");
1631*13fbcb42Sjoerg for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
1632*13fbcb42Sjoerg for (const Expr *E : C->privates())
1633*13fbcb42Sjoerg Vars.push_back(getPrivateItem(E));
1634*13fbcb42Sjoerg }
1635*13fbcb42Sjoerg }
1636*13fbcb42Sjoerg
/// Outline a 'teams' region for the GPU. Depending on the execution mode,
/// variables that must outlive a single thread (distribute lastprivates in
/// SPMD mode, team reductions in generic mode) are "globalized": collected
/// into a record so they can be placed in memory visible to the whole team.
///
/// \returns The outlined teams function.
llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  SourceLocation Loc = D.getBeginLoc();

  const RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  unsigned WarpSize = CGM.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
  // Globalize team reductions variable unconditionally in all modes.
  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
    // SPMD: only distribute lastprivates need globalization; build the
    // record immediately.
    getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
    if (!LastPrivatesReductions.empty()) {
      GlobalizedRD = ::buildRecordForGlobalizedVars(
          CGM.getContext(), llvm::None, LastPrivatesReductions,
          MappedDeclsFields, WarpSize);
    }
  } else if (!LastPrivatesReductions.empty()) {
    // Generic mode: defer — stash the captured decl and the variables in
    // TeamAndReductions for later processing.
    assert(!TeamAndReductions.first &&
           "Previous team declaration is not expected.");
    TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
    std::swap(TeamAndReductions.second, LastPrivatesReductions);
  }

  // Emit target region as a standalone region. The action registers the
  // globalized record for the emitted function and brackets the body with
  // the generic-vars prolog/epilog.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    SourceLocation &Loc;
    const RecordDecl *GlobalizedRD;
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields;

  public:
    NVPTXPrePostActionTy(
        SourceLocation &Loc, const RecordDecl *GlobalizedRD,
        llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
            &MappedDeclsFields)
        : Loc(Loc), GlobalizedRD(GlobalizedRD),
          MappedDeclsFields(MappedDeclsFields) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &Rt =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      if (GlobalizedRD) {
        // Record the globalized-variable mapping for the current function so
        // later variable lookups resolve to record fields.
        auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
        I->getSecond().GlobalRecord = GlobalizedRD;
        I->getSecond().MappedParams =
            std::make_unique<CodeGenFunction::OMPMapVars>();
        DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
        for (const auto &Pair : MappedDeclsFields) {
          assert(Pair.getFirst()->isCanonicalDecl() &&
                 "Expected canonical declaration");
          Data.insert(std::make_pair(Pair.getFirst(),
                                     MappedVarData(Pair.getSecond(),
                                                   /*IsOnePerTeam=*/true)));
        }
      }
      Rt.emitGenericVarsProlog(CGF, Loc);
    }
    void Exit(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericVarsEpilog(CGF);
    }
  } Action(Loc, GlobalizedRD, MappedDeclsFields);
  CodeGen.setAction(Action);
  llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
      D, ThreadIDVar, InnermostKind, CodeGen);

  return OutlinedFun;
}
1707*13fbcb42Sjoerg
emitGenericVarsProlog(CodeGenFunction & CGF,SourceLocation Loc,bool WithSPMDCheck)1708*13fbcb42Sjoerg void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
1709*13fbcb42Sjoerg SourceLocation Loc,
1710*13fbcb42Sjoerg bool WithSPMDCheck) {
1711*13fbcb42Sjoerg if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
1712*13fbcb42Sjoerg getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1713*13fbcb42Sjoerg return;
1714*13fbcb42Sjoerg
1715*13fbcb42Sjoerg CGBuilderTy &Bld = CGF.Builder;
1716*13fbcb42Sjoerg
1717*13fbcb42Sjoerg const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1718*13fbcb42Sjoerg if (I == FunctionGlobalizedDecls.end())
1719*13fbcb42Sjoerg return;
1720*13fbcb42Sjoerg if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
1721*13fbcb42Sjoerg QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
1722*13fbcb42Sjoerg QualType SecGlobalRecTy;
1723*13fbcb42Sjoerg
1724*13fbcb42Sjoerg // Recover pointer to this function's global record. The runtime will
1725*13fbcb42Sjoerg // handle the specifics of the allocation of the memory.
1726*13fbcb42Sjoerg // Use actual memory size of the record including the padding
1727*13fbcb42Sjoerg // for alignment purposes.
1728*13fbcb42Sjoerg unsigned Alignment =
1729*13fbcb42Sjoerg CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
1730*13fbcb42Sjoerg unsigned GlobalRecordSize =
1731*13fbcb42Sjoerg CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity();
1732*13fbcb42Sjoerg GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
1733*13fbcb42Sjoerg
1734*13fbcb42Sjoerg llvm::PointerType *GlobalRecPtrTy =
1735*13fbcb42Sjoerg CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo();
1736*13fbcb42Sjoerg llvm::Value *GlobalRecCastAddr;
1737*13fbcb42Sjoerg llvm::Value *IsTTD = nullptr;
1738*13fbcb42Sjoerg if (!IsInTTDRegion &&
1739*13fbcb42Sjoerg (WithSPMDCheck ||
1740*13fbcb42Sjoerg getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
1741*13fbcb42Sjoerg llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
1742*13fbcb42Sjoerg llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd");
1743*13fbcb42Sjoerg llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
1744*13fbcb42Sjoerg if (I->getSecond().SecondaryGlobalRecord.hasValue()) {
1745*13fbcb42Sjoerg llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1746*13fbcb42Sjoerg llvm::Value *ThreadID = getThreadID(CGF, Loc);
1747*13fbcb42Sjoerg llvm::Value *PL = CGF.EmitRuntimeCall(
1748*13fbcb42Sjoerg OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
1749*13fbcb42Sjoerg OMPRTL___kmpc_parallel_level),
1750*13fbcb42Sjoerg {RTLoc, ThreadID});
1751*13fbcb42Sjoerg IsTTD = Bld.CreateIsNull(PL);
1752*13fbcb42Sjoerg }
1753*13fbcb42Sjoerg llvm::Value *IsSPMD = Bld.CreateIsNotNull(
1754*13fbcb42Sjoerg CGF.EmitNounwindRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1755*13fbcb42Sjoerg CGM.getModule(), OMPRTL___kmpc_is_spmd_exec_mode)));
1756*13fbcb42Sjoerg Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB);
1757*13fbcb42Sjoerg // There is no need to emit line number for unconditional branch.
1758*13fbcb42Sjoerg (void)ApplyDebugLocation::CreateEmpty(CGF);
1759*13fbcb42Sjoerg CGF.EmitBlock(SPMDBB);
1760*13fbcb42Sjoerg Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy),
1761*13fbcb42Sjoerg CharUnits::fromQuantity(Alignment));
1762*13fbcb42Sjoerg CGF.EmitBranch(ExitBB);
1763*13fbcb42Sjoerg // There is no need to emit line number for unconditional branch.
1764*13fbcb42Sjoerg (void)ApplyDebugLocation::CreateEmpty(CGF);
1765*13fbcb42Sjoerg CGF.EmitBlock(NonSPMDBB);
1766*13fbcb42Sjoerg llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize);
1767*13fbcb42Sjoerg if (const RecordDecl *SecGlobalizedVarsRecord =
1768*13fbcb42Sjoerg I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) {
1769*13fbcb42Sjoerg SecGlobalRecTy =
1770*13fbcb42Sjoerg CGM.getContext().getRecordType(SecGlobalizedVarsRecord);
1771*13fbcb42Sjoerg
1772*13fbcb42Sjoerg // Recover pointer to this function's global record. The runtime will
1773*13fbcb42Sjoerg // handle the specifics of the allocation of the memory.
1774*13fbcb42Sjoerg // Use actual memory size of the record including the padding
1775*13fbcb42Sjoerg // for alignment purposes.
1776*13fbcb42Sjoerg unsigned Alignment =
1777*13fbcb42Sjoerg CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity();
1778*13fbcb42Sjoerg unsigned GlobalRecordSize =
1779*13fbcb42Sjoerg CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity();
1780*13fbcb42Sjoerg GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
1781*13fbcb42Sjoerg Size = Bld.CreateSelect(
1782*13fbcb42Sjoerg IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size);
1783*13fbcb42Sjoerg }
1784*13fbcb42Sjoerg // TODO: allow the usage of shared memory to be controlled by
1785*13fbcb42Sjoerg // the user, for now, default to global.
1786*13fbcb42Sjoerg llvm::Value *GlobalRecordSizeArg[] = {
1787*13fbcb42Sjoerg Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
1788*13fbcb42Sjoerg llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
1789*13fbcb42Sjoerg OMPBuilder.getOrCreateRuntimeFunction(
1790*13fbcb42Sjoerg CGM.getModule(), OMPRTL___kmpc_data_sharing_coalesced_push_stack),
1791*13fbcb42Sjoerg GlobalRecordSizeArg);
1792*13fbcb42Sjoerg GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1793*13fbcb42Sjoerg GlobalRecValue, GlobalRecPtrTy);
1794*13fbcb42Sjoerg CGF.EmitBlock(ExitBB);
1795*13fbcb42Sjoerg auto *Phi = Bld.CreatePHI(GlobalRecPtrTy,
1796*13fbcb42Sjoerg /*NumReservedValues=*/2, "_select_stack");
1797*13fbcb42Sjoerg Phi->addIncoming(RecPtr.getPointer(), SPMDBB);
1798*13fbcb42Sjoerg Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB);
1799*13fbcb42Sjoerg GlobalRecCastAddr = Phi;
1800*13fbcb42Sjoerg I->getSecond().GlobalRecordAddr = Phi;
1801*13fbcb42Sjoerg I->getSecond().IsInSPMDModeFlag = IsSPMD;
1802*13fbcb42Sjoerg } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) {
1803*13fbcb42Sjoerg assert(GlobalizedRecords.back().Records.size() < 2 &&
1804*13fbcb42Sjoerg "Expected less than 2 globalized records: one for target and one "
1805*13fbcb42Sjoerg "for teams.");
1806*13fbcb42Sjoerg unsigned Offset = 0;
1807*13fbcb42Sjoerg for (const RecordDecl *RD : GlobalizedRecords.back().Records) {
1808*13fbcb42Sjoerg QualType RDTy = CGM.getContext().getRecordType(RD);
1809*13fbcb42Sjoerg unsigned Alignment =
1810*13fbcb42Sjoerg CGM.getContext().getTypeAlignInChars(RDTy).getQuantity();
1811*13fbcb42Sjoerg unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity();
1812*13fbcb42Sjoerg Offset =
1813*13fbcb42Sjoerg llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment);
1814*13fbcb42Sjoerg }
1815*13fbcb42Sjoerg unsigned Alignment =
1816*13fbcb42Sjoerg CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
1817*13fbcb42Sjoerg Offset = llvm::alignTo(Offset, Alignment);
1818*13fbcb42Sjoerg GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord);
1819*13fbcb42Sjoerg ++GlobalizedRecords.back().RegionCounter;
1820*13fbcb42Sjoerg if (GlobalizedRecords.back().Records.size() == 1) {
1821*13fbcb42Sjoerg assert(KernelStaticGlobalized &&
1822*13fbcb42Sjoerg "Kernel static pointer must be initialized already.");
1823*13fbcb42Sjoerg auto *UseSharedMemory = new llvm::GlobalVariable(
1824*13fbcb42Sjoerg CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
1825*13fbcb42Sjoerg llvm::GlobalValue::InternalLinkage, nullptr,
1826*13fbcb42Sjoerg "_openmp_static_kernel$is_shared");
1827*13fbcb42Sjoerg UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
1828*13fbcb42Sjoerg QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
1829*13fbcb42Sjoerg /*DestWidth=*/16, /*Signed=*/0);
1830*13fbcb42Sjoerg llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
1831*13fbcb42Sjoerg Address(UseSharedMemory,
1832*13fbcb42Sjoerg CGM.getContext().getTypeAlignInChars(Int16Ty)),
1833*13fbcb42Sjoerg /*Volatile=*/false, Int16Ty, Loc);
1834*13fbcb42Sjoerg auto *StaticGlobalized = new llvm::GlobalVariable(
1835*13fbcb42Sjoerg CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false,
1836*13fbcb42Sjoerg llvm::GlobalValue::CommonLinkage, nullptr);
1837*13fbcb42Sjoerg auto *RecSize = new llvm::GlobalVariable(
1838*13fbcb42Sjoerg CGM.getModule(), CGM.SizeTy, /*isConstant=*/true,
1839*13fbcb42Sjoerg llvm::GlobalValue::InternalLinkage, nullptr,
1840*13fbcb42Sjoerg "_openmp_static_kernel$size");
1841*13fbcb42Sjoerg RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
1842*13fbcb42Sjoerg llvm::Value *Ld = CGF.EmitLoadOfScalar(
1843*13fbcb42Sjoerg Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false,
1844*13fbcb42Sjoerg CGM.getContext().getSizeType(), Loc);
1845*13fbcb42Sjoerg llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1846*13fbcb42Sjoerg KernelStaticGlobalized, CGM.VoidPtrPtrTy);
1847*13fbcb42Sjoerg llvm::Value *GlobalRecordSizeArg[] = {
1848*13fbcb42Sjoerg llvm::ConstantInt::get(
1849*13fbcb42Sjoerg CGM.Int16Ty,
1850*13fbcb42Sjoerg getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 1 : 0),
1851*13fbcb42Sjoerg StaticGlobalized, Ld, IsInSharedMemory, ResAddr};
1852*13fbcb42Sjoerg CGF.EmitRuntimeCall(
1853*13fbcb42Sjoerg OMPBuilder.getOrCreateRuntimeFunction(
1854*13fbcb42Sjoerg CGM.getModule(), OMPRTL___kmpc_get_team_static_memory),
1855*13fbcb42Sjoerg GlobalRecordSizeArg);
1856*13fbcb42Sjoerg GlobalizedRecords.back().Buffer = StaticGlobalized;
1857*13fbcb42Sjoerg GlobalizedRecords.back().RecSize = RecSize;
1858*13fbcb42Sjoerg GlobalizedRecords.back().UseSharedMemory = UseSharedMemory;
1859*13fbcb42Sjoerg GlobalizedRecords.back().Loc = Loc;
1860*13fbcb42Sjoerg }
1861*13fbcb42Sjoerg assert(KernelStaticGlobalized && "Global address must be set already.");
1862*13fbcb42Sjoerg Address FrameAddr = CGF.EmitLoadOfPointer(
1863*13fbcb42Sjoerg Address(KernelStaticGlobalized, CGM.getPointerAlign()),
1864*13fbcb42Sjoerg CGM.getContext()
1865*13fbcb42Sjoerg .getPointerType(CGM.getContext().VoidPtrTy)
1866*13fbcb42Sjoerg .castAs<PointerType>());
1867*13fbcb42Sjoerg llvm::Value *GlobalRecValue =
1868*13fbcb42Sjoerg Bld.CreateConstInBoundsGEP(FrameAddr, Offset).getPointer();
1869*13fbcb42Sjoerg I->getSecond().GlobalRecordAddr = GlobalRecValue;
1870*13fbcb42Sjoerg I->getSecond().IsInSPMDModeFlag = nullptr;
1871*13fbcb42Sjoerg GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1872*13fbcb42Sjoerg GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo());
1873*13fbcb42Sjoerg } else {
1874*13fbcb42Sjoerg // TODO: allow the usage of shared memory to be controlled by
1875*13fbcb42Sjoerg // the user, for now, default to global.
1876*13fbcb42Sjoerg bool UseSharedMemory =
1877*13fbcb42Sjoerg IsInTTDRegion && GlobalRecordSize <= SharedMemorySize;
1878*13fbcb42Sjoerg llvm::Value *GlobalRecordSizeArg[] = {
1879*13fbcb42Sjoerg llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
1880*13fbcb42Sjoerg CGF.Builder.getInt16(UseSharedMemory ? 1 : 0)};
1881*13fbcb42Sjoerg llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
1882*13fbcb42Sjoerg OMPBuilder.getOrCreateRuntimeFunction(
1883*13fbcb42Sjoerg CGM.getModule(),
1884*13fbcb42Sjoerg IsInTTDRegion ? OMPRTL___kmpc_data_sharing_push_stack
1885*13fbcb42Sjoerg : OMPRTL___kmpc_data_sharing_coalesced_push_stack),
1886*13fbcb42Sjoerg GlobalRecordSizeArg);
1887*13fbcb42Sjoerg GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1888*13fbcb42Sjoerg GlobalRecValue, GlobalRecPtrTy);
1889*13fbcb42Sjoerg I->getSecond().GlobalRecordAddr = GlobalRecValue;
1890*13fbcb42Sjoerg I->getSecond().IsInSPMDModeFlag = nullptr;
1891*13fbcb42Sjoerg }
1892*13fbcb42Sjoerg LValue Base =
1893*13fbcb42Sjoerg CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy);
1894*13fbcb42Sjoerg
1895*13fbcb42Sjoerg // Emit the "global alloca" which is a GEP from the global declaration
1896*13fbcb42Sjoerg // record using the pointer returned by the runtime.
1897*13fbcb42Sjoerg LValue SecBase;
1898*13fbcb42Sjoerg decltype(I->getSecond().LocalVarData)::const_iterator SecIt;
1899*13fbcb42Sjoerg if (IsTTD) {
1900*13fbcb42Sjoerg SecIt = I->getSecond().SecondaryLocalVarData->begin();
1901*13fbcb42Sjoerg llvm::PointerType *SecGlobalRecPtrTy =
1902*13fbcb42Sjoerg CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo();
1903*13fbcb42Sjoerg SecBase = CGF.MakeNaturalAlignPointeeAddrLValue(
1904*13fbcb42Sjoerg Bld.CreatePointerBitCastOrAddrSpaceCast(
1905*13fbcb42Sjoerg I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy),
1906*13fbcb42Sjoerg SecGlobalRecTy);
1907*13fbcb42Sjoerg }
1908*13fbcb42Sjoerg for (auto &Rec : I->getSecond().LocalVarData) {
1909*13fbcb42Sjoerg bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1910*13fbcb42Sjoerg llvm::Value *ParValue;
1911*13fbcb42Sjoerg if (EscapedParam) {
1912*13fbcb42Sjoerg const auto *VD = cast<VarDecl>(Rec.first);
1913*13fbcb42Sjoerg LValue ParLVal =
1914*13fbcb42Sjoerg CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
1915*13fbcb42Sjoerg ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
1916*13fbcb42Sjoerg }
1917*13fbcb42Sjoerg LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD);
1918*13fbcb42Sjoerg // Emit VarAddr basing on lane-id if required.
1919*13fbcb42Sjoerg QualType VarTy;
1920*13fbcb42Sjoerg if (Rec.second.IsOnePerTeam) {
1921*13fbcb42Sjoerg VarTy = Rec.second.FD->getType();
1922*13fbcb42Sjoerg } else {
1923*13fbcb42Sjoerg Address Addr = VarAddr.getAddress(CGF);
1924*13fbcb42Sjoerg llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP(
1925*13fbcb42Sjoerg Addr.getElementType(), Addr.getPointer(),
1926*13fbcb42Sjoerg {Bld.getInt32(0), getNVPTXLaneID(CGF)});
1927*13fbcb42Sjoerg VarTy =
1928*13fbcb42Sjoerg Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType();
1929*13fbcb42Sjoerg VarAddr = CGF.MakeAddrLValue(
1930*13fbcb42Sjoerg Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy,
1931*13fbcb42Sjoerg AlignmentSource::Decl);
1932*13fbcb42Sjoerg }
1933*13fbcb42Sjoerg Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
1934*13fbcb42Sjoerg if (!IsInTTDRegion &&
1935*13fbcb42Sjoerg (WithSPMDCheck ||
1936*13fbcb42Sjoerg getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
1937*13fbcb42Sjoerg assert(I->getSecond().IsInSPMDModeFlag &&
1938*13fbcb42Sjoerg "Expected unknown execution mode or required SPMD check.");
1939*13fbcb42Sjoerg if (IsTTD) {
1940*13fbcb42Sjoerg assert(SecIt->second.IsOnePerTeam &&
1941*13fbcb42Sjoerg "Secondary glob data must be one per team.");
1942*13fbcb42Sjoerg LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD);
1943*13fbcb42Sjoerg VarAddr.setAddress(
1944*13fbcb42Sjoerg Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(CGF),
1945*13fbcb42Sjoerg VarAddr.getPointer(CGF)),
1946*13fbcb42Sjoerg VarAddr.getAlignment()));
1947*13fbcb42Sjoerg Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
1948*13fbcb42Sjoerg }
1949*13fbcb42Sjoerg Address GlobalPtr = Rec.second.PrivateAddr;
1950*13fbcb42Sjoerg Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName());
1951*13fbcb42Sjoerg Rec.second.PrivateAddr = Address(
1952*13fbcb42Sjoerg Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag,
1953*13fbcb42Sjoerg LocalAddr.getPointer(), GlobalPtr.getPointer()),
1954*13fbcb42Sjoerg LocalAddr.getAlignment());
1955*13fbcb42Sjoerg }
1956*13fbcb42Sjoerg if (EscapedParam) {
1957*13fbcb42Sjoerg const auto *VD = cast<VarDecl>(Rec.first);
1958*13fbcb42Sjoerg CGF.EmitStoreOfScalar(ParValue, VarAddr);
1959*13fbcb42Sjoerg I->getSecond().MappedParams->setVarAddr(CGF, VD,
1960*13fbcb42Sjoerg VarAddr.getAddress(CGF));
1961*13fbcb42Sjoerg }
1962*13fbcb42Sjoerg if (IsTTD)
1963*13fbcb42Sjoerg ++SecIt;
1964*13fbcb42Sjoerg }
1965*13fbcb42Sjoerg }
1966*13fbcb42Sjoerg for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
1967*13fbcb42Sjoerg // Recover pointer to this function's global record. The runtime will
1968*13fbcb42Sjoerg // handle the specifics of the allocation of the memory.
1969*13fbcb42Sjoerg // Use actual memory size of the record including the padding
1970*13fbcb42Sjoerg // for alignment purposes.
1971*13fbcb42Sjoerg CGBuilderTy &Bld = CGF.Builder;
1972*13fbcb42Sjoerg llvm::Value *Size = CGF.getTypeSize(VD->getType());
1973*13fbcb42Sjoerg CharUnits Align = CGM.getContext().getDeclAlign(VD);
1974*13fbcb42Sjoerg Size = Bld.CreateNUWAdd(
1975*13fbcb42Sjoerg Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
1976*13fbcb42Sjoerg llvm::Value *AlignVal =
1977*13fbcb42Sjoerg llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
1978*13fbcb42Sjoerg Size = Bld.CreateUDiv(Size, AlignVal);
1979*13fbcb42Sjoerg Size = Bld.CreateNUWMul(Size, AlignVal);
1980*13fbcb42Sjoerg // TODO: allow the usage of shared memory to be controlled by
1981*13fbcb42Sjoerg // the user, for now, default to global.
1982*13fbcb42Sjoerg llvm::Value *GlobalRecordSizeArg[] = {
1983*13fbcb42Sjoerg Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
1984*13fbcb42Sjoerg llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
1985*13fbcb42Sjoerg OMPBuilder.getOrCreateRuntimeFunction(
1986*13fbcb42Sjoerg CGM.getModule(), OMPRTL___kmpc_data_sharing_coalesced_push_stack),
1987*13fbcb42Sjoerg GlobalRecordSizeArg);
1988*13fbcb42Sjoerg llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1989*13fbcb42Sjoerg GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo());
1990*13fbcb42Sjoerg LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(),
1991*13fbcb42Sjoerg CGM.getContext().getDeclAlign(VD),
1992*13fbcb42Sjoerg AlignmentSource::Decl);
1993*13fbcb42Sjoerg I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
1994*13fbcb42Sjoerg Base.getAddress(CGF));
1995*13fbcb42Sjoerg I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue);
1996*13fbcb42Sjoerg }
1997*13fbcb42Sjoerg I->getSecond().MappedParams->apply(CGF);
1998*13fbcb42Sjoerg }
1999*13fbcb42Sjoerg
// Emits the cleanup ("epilog") for variables globalized by the matching
// prolog: restores the original mapped-parameter addresses and releases any
// runtime-managed stack or team-static memory that was pushed for them.
// \param WithSPMDCheck if true, cleanup must check at runtime whether the
// code executes in SPMD mode (used when the mode is not known statically).
void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
                                               bool WithSPMDCheck) {
  // Nothing was globalized unless we are in generic data-sharing mode or
  // compiling a region whose execution mode is SPMD.
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
      getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    return;

  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I != FunctionGlobalizedDecls.end()) {
    // Restore the pre-globalization addresses of the mapped parameters first.
    I->getSecond().MappedParams->restore(CGF);
    if (!CGF.HaveInsertPoint())
      return;
    // Pop variable-length allocations in reverse order of their pushes.
    for (llvm::Value *Addr :
         llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
      CGF.EmitRuntimeCall(
          OMPBuilder.getOrCreateRuntimeFunction(
              CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
          Addr);
    }
    if (I->getSecond().GlobalRecordAddr) {
      if (!IsInTTDRegion &&
          (WithSPMDCheck ||
           getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
        // Execution mode unknown at compile time: branch on the SPMD flag
        // stored by the prolog and pop the stack only on the non-SPMD path
        // (the SPMD path used a null record pointer, so nothing was pushed).
        CGBuilderTy &Bld = CGF.Builder;
        llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
        llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
        Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB);
        // There is no need to emit line number for unconditional branch.
        // NOTE(review): this discarded temporary is destroyed at the end of
        // the statement, so it may not actually suppress the debug location
        // for the following EmitBlock - confirm intent.
        (void)ApplyDebugLocation::CreateEmpty(CGF);
        CGF.EmitBlock(NonSPMDBB);
        CGF.EmitRuntimeCall(
            OMPBuilder.getOrCreateRuntimeFunction(
                CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
            CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr));
        CGF.EmitBlock(ExitBB);
      } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) {
        // Statically allocated team/target record: decrement the nesting
        // counter; when the outermost region finishes, ask the runtime to
        // restore the team static memory.
        assert(GlobalizedRecords.back().RegionCounter > 0 &&
               "region counter must be > 0.");
        --GlobalizedRecords.back().RegionCounter;
        // Emit the restore function only in the target region.
        if (GlobalizedRecords.back().RegionCounter == 0) {
          // Reload the "is shared" flag recorded by the prolog so the runtime
          // releases the matching kind of memory.
          QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
              /*DestWidth=*/16, /*Signed=*/0);
          llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
              Address(GlobalizedRecords.back().UseSharedMemory,
                      CGM.getContext().getTypeAlignInChars(Int16Ty)),
              /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc);
          llvm::Value *Args[] = {
              llvm::ConstantInt::get(
                  CGM.Int16Ty,
                  getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 1 : 0),
              IsInSharedMemory};
          CGF.EmitRuntimeCall(
              OMPBuilder.getOrCreateRuntimeFunction(
                  CGM.getModule(), OMPRTL___kmpc_restore_team_static_memory),
              Args);
        }
      } else {
        // Statically known dynamic allocation: pop the record pushed by the
        // prolog.
        CGF.EmitRuntimeCall(
            OMPBuilder.getOrCreateRuntimeFunction(
                CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
            I->getSecond().GlobalRecordAddr);
      }
    }
  }
}
2065*13fbcb42Sjoerg
emitTeamsCall(CodeGenFunction & CGF,const OMPExecutableDirective & D,SourceLocation Loc,llvm::Function * OutlinedFn,ArrayRef<llvm::Value * > CapturedVars)2066*13fbcb42Sjoerg void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
2067*13fbcb42Sjoerg const OMPExecutableDirective &D,
2068*13fbcb42Sjoerg SourceLocation Loc,
2069*13fbcb42Sjoerg llvm::Function *OutlinedFn,
2070*13fbcb42Sjoerg ArrayRef<llvm::Value *> CapturedVars) {
2071*13fbcb42Sjoerg if (!CGF.HaveInsertPoint())
2072*13fbcb42Sjoerg return;
2073*13fbcb42Sjoerg
2074*13fbcb42Sjoerg Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
2075*13fbcb42Sjoerg /*Name=*/".zero.addr");
2076*13fbcb42Sjoerg CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2077*13fbcb42Sjoerg llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2078*13fbcb42Sjoerg OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
2079*13fbcb42Sjoerg OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2080*13fbcb42Sjoerg OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2081*13fbcb42Sjoerg emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
2082*13fbcb42Sjoerg }
2083*13fbcb42Sjoerg
// Emits a parallel region call for the GPU runtime: packs pointers to the
// captured variables into a stack array of void* and calls
// __kmpc_parallel_51, letting the runtime decide how to run the outlined
// function (the if-clause value is passed as an i32).
void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
                                          SourceLocation Loc,
                                          llvm::Function *OutlinedFn,
                                          ArrayRef<llvm::Value *> CapturedVars,
                                          const Expr *IfCond) {
  if (!CGF.HaveInsertPoint())
    return;

  // Action is required by the RegionCodeGenTy signature but unused here.
  auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars,
                        IfCond](CodeGenFunction &CGF, PrePostActionTy &Action) {
    CGBuilderTy &Bld = CGF.Builder;
    // If a wrapper was generated for this outlined function, pass it as the
    // region identifier and queue it for the worker loop; otherwise the
    // identifier stays null.
    llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
    llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
    if (WFn) {
      ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
      // Remember for post-processing in worker loop.
      Work.emplace_back(WFn);
    }
    llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);

    // Create a private scope that will globalize the arguments
    // passed from the outside of the target region.
    // TODO: Is that needed?
    CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);

    // Stack array holding one void* per captured variable.
    Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
        llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
        "captured_vars_addrs");
    // There's something to share.
    if (!CapturedVars.empty()) {
      // Prepare for parallel region. Indicate the outlined function.
      ASTContext &Ctx = CGF.getContext();
      unsigned Idx = 0;
      for (llvm::Value *V : CapturedVars) {
        Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
        llvm::Value *PtrV;
        // Integer captures are smuggled through the void* slot via
        // inttoptr; everything else is pointer-cast.
        if (V->getType()->isIntegerTy())
          PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
        else
          PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
        CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
                              Ctx.getPointerType(Ctx.VoidPtrTy));
        ++Idx;
      }
    }

    // Without an if-clause the region is unconditionally parallel (1).
    llvm::Value *IfCondVal = nullptr;
    if (IfCond)
      IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
                                    /* isSigned */ false);
    else
      IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);

    assert(IfCondVal && "Expected a value");
    llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
    // The two -1 arguments are sentinels - presumably "unspecified"
    // num_threads and proc_bind per __kmpc_parallel_51's signature; confirm
    // against the device runtime's declaration.
    llvm::Value *Args[] = {
        RTLoc,
        getThreadID(CGF, Loc),
        IfCondVal,
        llvm::ConstantInt::get(CGF.Int32Ty, -1),
        llvm::ConstantInt::get(CGF.Int32Ty, -1),
        FnPtr,
        ID,
        Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(),
                                   CGF.VoidPtrPtrTy),
        llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
    CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                            CGM.getModule(), OMPRTL___kmpc_parallel_51),
                        Args);
  };

  RegionCodeGenTy RCG(ParallelGen);
  RCG(CGF);
}
2158*13fbcb42Sjoerg
syncCTAThreads(CodeGenFunction & CGF)2159*13fbcb42Sjoerg void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
2160*13fbcb42Sjoerg // Always emit simple barriers!
2161*13fbcb42Sjoerg if (!CGF.HaveInsertPoint())
2162*13fbcb42Sjoerg return;
2163*13fbcb42Sjoerg // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
2164*13fbcb42Sjoerg // This function does not use parameters, so we can emit just default values.
2165*13fbcb42Sjoerg llvm::Value *Args[] = {
2166*13fbcb42Sjoerg llvm::ConstantPointerNull::get(
2167*13fbcb42Sjoerg cast<llvm::PointerType>(getIdentTyPointerTy())),
2168*13fbcb42Sjoerg llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
2169*13fbcb42Sjoerg CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
2170*13fbcb42Sjoerg CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
2171*13fbcb42Sjoerg Args);
2172*13fbcb42Sjoerg }
2173*13fbcb42Sjoerg
emitBarrierCall(CodeGenFunction & CGF,SourceLocation Loc,OpenMPDirectiveKind Kind,bool,bool)2174*13fbcb42Sjoerg void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
2175*13fbcb42Sjoerg SourceLocation Loc,
2176*13fbcb42Sjoerg OpenMPDirectiveKind Kind, bool,
2177*13fbcb42Sjoerg bool) {
2178*13fbcb42Sjoerg // Always emit simple barriers!
2179*13fbcb42Sjoerg if (!CGF.HaveInsertPoint())
2180*13fbcb42Sjoerg return;
2181*13fbcb42Sjoerg // Build call __kmpc_cancel_barrier(loc, thread_id);
2182*13fbcb42Sjoerg unsigned Flags = getDefaultFlagsForBarriers(Kind);
2183*13fbcb42Sjoerg llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
2184*13fbcb42Sjoerg getThreadID(CGF, Loc)};
2185*13fbcb42Sjoerg
2186*13fbcb42Sjoerg CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
2187*13fbcb42Sjoerg CGM.getModule(), OMPRTL___kmpc_barrier),
2188*13fbcb42Sjoerg Args);
2189*13fbcb42Sjoerg }
2190*13fbcb42Sjoerg
// Emits a critical region for the GPU. The region is serialized across the
// team with a counter loop: each thread spins until the counter equals its
// team-local id, executes the body exactly once, then all threads in the
// warp reconverge and the counter is advanced. This serializes the body one
// thread at a time while keeping every thread in the loop (required for
// lockstep warp execution).
void CGOpenMPRuntimeGPU::emitCriticalRegion(
    CodeGenFunction &CGF, StringRef CriticalName,
    const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
    const Expr *Hint) {
  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
  llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
  llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");

  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());

  // Get the mask of active threads in the warp.
  llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
      CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
  // Fetch team-local id of the thread.
  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);

  // Get the width of the team.
  llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);

  // Initialize the counter variable for the loop.
  QualType Int32Ty =
      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
  Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
  LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
                        /*isInit=*/true);

  // Block checks if loop counter exceeds upper bound.
  CGF.EmitBlock(LoopBB);
  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
  CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);

  // Block tests which single thread should execute region, and which threads
  // should go straight to synchronisation point.
  CGF.EmitBlock(TestBB);
  CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
  llvm::Value *CmpThreadToCounter =
      CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
  CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);

  // Block emits the body of the critical region.
  CGF.EmitBlock(BodyBB);

  // Output the critical statement.
  CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
                                      Hint);

  // After the body surrounded by the critical region, the single executing
  // thread will jump to the synchronisation point.
  // Block waits for all threads in current team to finish then increments the
  // counter variable and returns to the loop.
  CGF.EmitBlock(SyncBB);
  // Reconverge active threads in the warp.
  (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                                CGM.getModule(), OMPRTL___kmpc_syncwarp),
                            Mask);

  // Advance the turn counter so the next thread id gets to run the body.
  llvm::Value *IncCounterVal =
      CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
  CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
  CGF.EmitBranch(LoopBB);

  // Block that is reached when all threads in the team complete the region.
  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
}
2259*13fbcb42Sjoerg
2260*13fbcb42Sjoerg /// Cast value to the specified type.
castValueToType(CodeGenFunction & CGF,llvm::Value * Val,QualType ValTy,QualType CastTy,SourceLocation Loc)2261*13fbcb42Sjoerg static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
2262*13fbcb42Sjoerg QualType ValTy, QualType CastTy,
2263*13fbcb42Sjoerg SourceLocation Loc) {
2264*13fbcb42Sjoerg assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
2265*13fbcb42Sjoerg "Cast type must sized.");
2266*13fbcb42Sjoerg assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
2267*13fbcb42Sjoerg "Val type must sized.");
2268*13fbcb42Sjoerg llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
2269*13fbcb42Sjoerg if (ValTy == CastTy)
2270*13fbcb42Sjoerg return Val;
2271*13fbcb42Sjoerg if (CGF.getContext().getTypeSizeInChars(ValTy) ==
2272*13fbcb42Sjoerg CGF.getContext().getTypeSizeInChars(CastTy))
2273*13fbcb42Sjoerg return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
2274*13fbcb42Sjoerg if (CastTy->isIntegerType() && ValTy->isIntegerType())
2275*13fbcb42Sjoerg return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
2276*13fbcb42Sjoerg CastTy->hasSignedIntegerRepresentation());
2277*13fbcb42Sjoerg Address CastItem = CGF.CreateMemTemp(CastTy);
2278*13fbcb42Sjoerg Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2279*13fbcb42Sjoerg CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
2280*13fbcb42Sjoerg CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
2281*13fbcb42Sjoerg LValueBaseInfo(AlignmentSource::Type),
2282*13fbcb42Sjoerg TBAAAccessInfo());
2283*13fbcb42Sjoerg return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
2284*13fbcb42Sjoerg LValueBaseInfo(AlignmentSource::Type),
2285*13fbcb42Sjoerg TBAAAccessInfo());
2286*13fbcb42Sjoerg }
2287*13fbcb42Sjoerg
2288*13fbcb42Sjoerg /// This function creates calls to one of two shuffle functions to copy
2289*13fbcb42Sjoerg /// variables between lanes in a warp.
createRuntimeShuffleFunction(CodeGenFunction & CGF,llvm::Value * Elem,QualType ElemType,llvm::Value * Offset,SourceLocation Loc)2290*13fbcb42Sjoerg static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
2291*13fbcb42Sjoerg llvm::Value *Elem,
2292*13fbcb42Sjoerg QualType ElemType,
2293*13fbcb42Sjoerg llvm::Value *Offset,
2294*13fbcb42Sjoerg SourceLocation Loc) {
2295*13fbcb42Sjoerg CodeGenModule &CGM = CGF.CGM;
2296*13fbcb42Sjoerg CGBuilderTy &Bld = CGF.Builder;
2297*13fbcb42Sjoerg CGOpenMPRuntimeGPU &RT =
2298*13fbcb42Sjoerg *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
2299*13fbcb42Sjoerg llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();
2300*13fbcb42Sjoerg
2301*13fbcb42Sjoerg CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2302*13fbcb42Sjoerg assert(Size.getQuantity() <= 8 &&
2303*13fbcb42Sjoerg "Unsupported bitwidth in shuffle instruction.");
2304*13fbcb42Sjoerg
2305*13fbcb42Sjoerg RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
2306*13fbcb42Sjoerg ? OMPRTL___kmpc_shuffle_int32
2307*13fbcb42Sjoerg : OMPRTL___kmpc_shuffle_int64;
2308*13fbcb42Sjoerg
2309*13fbcb42Sjoerg // Cast all types to 32- or 64-bit values before calling shuffle routines.
2310*13fbcb42Sjoerg QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
2311*13fbcb42Sjoerg Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
2312*13fbcb42Sjoerg llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
2313*13fbcb42Sjoerg llvm::Value *WarpSize =
2314*13fbcb42Sjoerg Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
2315*13fbcb42Sjoerg
2316*13fbcb42Sjoerg llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
2317*13fbcb42Sjoerg OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
2318*13fbcb42Sjoerg {ElemCast, Offset, WarpSize});
2319*13fbcb42Sjoerg
2320*13fbcb42Sjoerg return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
2321*13fbcb42Sjoerg }
2322*13fbcb42Sjoerg
/// Shuffle the element at \p SrcAddr in from a lane \p Offset away and store
/// the result to \p DestAddr.
///
/// The element of type \p ElemType is decomposed into integer chunks of
/// 8/4/2/1 bytes (the widths the runtime shuffle entry points can handle);
/// each chunk is shuffled via createRuntimeShuffleFunction and written to the
/// destination.  When more than one chunk of a given width remains, an IR
/// loop is emitted instead of a straight-line copy.
static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
                            Address DestAddr, QualType ElemType,
                            llvm::Value *Offset, SourceLocation Loc) {
  CGBuilderTy &Bld = CGF.Builder;

  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
  // Create the loop over the big sized data.
  // ptr = (void*)Elem;
  // ptrEnd = (void*) Elem + 1;
  // Step = 8;
  // while (ptr + Step < ptrEnd)
  //   shuffle((int64_t)*ptr);
  // Step = 4;
  // while (ptr + Step < ptrEnd)
  //   shuffle((int32_t)*ptr);
  // ...
  Address ElemPtr = DestAddr;
  Address Ptr = SrcAddr;
  // One-past-the-end of the source element; used as the loop bound below.
  Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
      Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy);
  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
    // Skip chunk widths larger than what is left to copy.
    if (Size < CharUnits::fromQuantity(IntSize))
      continue;
    QualType IntType = CGF.getContext().getIntTypeForBitwidth(
        CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
        /*Signed=*/1);
    llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
    // Reinterpret both cursors as pointers to the current chunk type.
    Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo());
    ElemPtr =
        Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo());
    if (Size.getQuantity() / IntSize > 1) {
      // More than one chunk of this width: emit a pre-condition/body/exit
      // loop with PHIs that advance the source and destination cursors.
      llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
      llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
      llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
      CGF.EmitBlock(PreCondBB);
      llvm::PHINode *PhiSrc =
          Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
      PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
      llvm::PHINode *PhiDest =
          Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
      PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
      // Rebind the cursors to the PHIs so the loop body below advances them.
      Ptr = Address(PhiSrc, Ptr.getAlignment());
      ElemPtr = Address(PhiDest, ElemPtr.getAlignment());
      // Loop condition: at least IntSize bytes remain before PtrEnd.
      llvm::Value *PtrDiff = Bld.CreatePtrDiff(
          PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast(
                                   Ptr.getPointer(), CGF.VoidPtrTy));
      Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
                       ThenBB, ExitBB);
      CGF.EmitBlock(ThenBB);
      // Shuffle one chunk and store it to the destination.
      llvm::Value *Res = createRuntimeShuffleFunction(
          CGF,
          CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
                               LValueBaseInfo(AlignmentSource::Type),
                               TBAAAccessInfo()),
          IntType, Offset, Loc);
      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
                            LValueBaseInfo(AlignmentSource::Type),
                            TBAAAccessInfo());
      // Advance both cursors and feed the next iteration's PHIs.
      Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
      Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
      PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
      PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
      CGF.EmitBranch(PreCondBB);
      CGF.EmitBlock(ExitBB);
    } else {
      // Exactly one chunk of this width: straight-line shuffle + store.
      llvm::Value *Res = createRuntimeShuffleFunction(
          CGF,
          CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
                               LValueBaseInfo(AlignmentSource::Type),
                               TBAAAccessInfo()),
          IntType, Offset, Loc);
      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
                            LValueBaseInfo(AlignmentSource::Type),
                            TBAAAccessInfo());
      Ptr = Bld.CreateConstGEP(Ptr, 1);
      ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
    }
    // Whatever did not fit in chunks of this width is handled by the
    // smaller widths on the next iterations.
    Size = Size % IntSize;
  }
}
2404*13fbcb42Sjoerg
namespace {
/// Direction/kind of a Reduce-list element copy performed by
/// emitReductionListCopy below.
enum CopyAction : unsigned {
  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
  // the warp using shuffle instructions.
  RemoteLaneToThread,
  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
  ThreadCopy,
  // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
  ThreadToScratchpad,
  // ScratchpadToThread: Copy from a scratchpad array in global memory
  // containing team-reduced data to a thread's stack.
  ScratchpadToThread,
};
} // namespace
2419*13fbcb42Sjoerg
/// Optional operands for emitReductionListCopy; fields that do not apply to
/// the chosen CopyAction are passed as nullptr.
struct CopyOptionsTy {
  // Lane offset used by the RemoteLaneToThread shuffle copy.
  llvm::Value *RemoteLaneOffset;
  // Element index into the scratchpad (ThreadToScratchpad/ScratchpadToThread).
  llvm::Value *ScratchpadIndex;
  // Number of elements per scratchpad row, used when advancing the base
  // pointer between elements.
  llvm::Value *ScratchpadWidth;
};
2425*13fbcb42Sjoerg
2426*13fbcb42Sjoerg /// Emit instructions to copy a Reduce list, which contains partially
2427*13fbcb42Sjoerg /// aggregated values, in the specified direction.
/// Emit instructions to copy a Reduce list, which contains partially
/// aggregated values, in the specified direction.
///
/// \param Action      Kind of copy (see CopyAction).
/// \param CGF         Function being emitted into.
/// \param ReductionArrayTy Type of the Reduce list array (currently unused
///                    by the body; kept for interface stability).
/// \param Privates    One expression per reduction element, supplying the
///                    element types and source locations.
/// \param SrcBase     Base address of the source list (or scratchpad).
/// \param DestBase    Base address of the destination list (or scratchpad).
/// \param CopyOptions Extra operands required by some actions; see
///                    CopyOptionsTy.
static void emitReductionListCopy(
    CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
    ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
    CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {

  CodeGenModule &CGM = CGF.CGM;
  ASTContext &C = CGM.getContext();
  CGBuilderTy &Bld = CGF.Builder;

  llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
  llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
  llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;

  // Iterates, element-by-element, through the source Reduce list and
  // make a copy.
  unsigned Idx = 0;
  unsigned Size = Privates.size();
  for (const Expr *Private : Privates) {
    Address SrcElementAddr = Address::invalid();
    Address DestElementAddr = Address::invalid();
    Address DestElementPtrAddr = Address::invalid();
    // Should we shuffle in an element from a remote lane?
    bool ShuffleInElement = false;
    // Set to true to update the pointer in the dest Reduce list to a
    // newly created element.
    bool UpdateDestListPtr = false;
    // Increment the src or dest pointer to the scratchpad, for each
    // new element.
    bool IncrScratchpadSrc = false;
    bool IncrScratchpadDest = false;

    // Step 1: compute source and destination addresses for this element,
    // which depends on the direction of the copy.
    switch (Action) {
    case RemoteLaneToThread: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
      SrcElementAddr = CGF.EmitLoadOfPointer(
          SrcElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());

      // Step 1.2: Create a temporary to store the element in the destination
      // Reduce list.
      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
      DestElementAddr =
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
      ShuffleInElement = true;
      UpdateDestListPtr = true;
      break;
    }
    case ThreadCopy: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
      SrcElementAddr = CGF.EmitLoadOfPointer(
          SrcElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());

      // Step 1.2: Get the address for dest element. The destination
      // element has already been created on the thread's stack.
      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
      DestElementAddr = CGF.EmitLoadOfPointer(
          DestElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());
      break;
    }
    case ThreadToScratchpad: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
      SrcElementAddr = CGF.EmitLoadOfPointer(
          SrcElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());

      // Step 1.2: Get the address for dest element:
      // address = base + index * ElementSizeInChars.
      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
      llvm::Value *CurrentOffset =
          Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
      llvm::Value *ScratchPadElemAbsolutePtrVal =
          Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
      ScratchPadElemAbsolutePtrVal =
          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
      DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
                                C.getTypeAlignInChars(Private->getType()));
      IncrScratchpadDest = true;
      break;
    }
    case ScratchpadToThread: {
      // Step 1.1: Get the address for the src element in the scratchpad.
      // address = base + index * ElementSizeInChars.
      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
      llvm::Value *CurrentOffset =
          Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
      llvm::Value *ScratchPadElemAbsolutePtrVal =
          Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
      ScratchPadElemAbsolutePtrVal =
          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
      SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
                               C.getTypeAlignInChars(Private->getType()));
      IncrScratchpadSrc = true;

      // Step 1.2: Create a temporary to store the element in the destination
      // Reduce list.
      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
      DestElementAddr =
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
      UpdateDestListPtr = true;
      break;
    }
    }

    // Regardless of src and dest of copy, we emit the load of src
    // element as this is required in all directions
    SrcElementAddr = Bld.CreateElementBitCast(
        SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
    DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
                                               SrcElementAddr.getElementType());

    // Step 2: copy the element, either by shuffling it in from the remote
    // lane or by a plain load/store keyed on the evaluation kind.
    // Now that all active lanes have read the element in the
    // Reduce list, shuffle over the value from the remote lane.
    if (ShuffleInElement) {
      shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
                      RemoteLaneOffset, Private->getExprLoc());
    } else {
      switch (CGF.getEvaluationKind(Private->getType())) {
      case TEK_Scalar: {
        llvm::Value *Elem = CGF.EmitLoadOfScalar(
            SrcElementAddr, /*Volatile=*/false, Private->getType(),
            Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
            TBAAAccessInfo());
        // Store the source element value to the dest element address.
        CGF.EmitStoreOfScalar(
            Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
            LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
        break;
      }
      case TEK_Complex: {
        CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
            CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
            Private->getExprLoc());
        CGF.EmitStoreOfComplex(
            Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
            /*isInit=*/false);
        break;
      }
      case TEK_Aggregate:
        CGF.EmitAggregateCopy(
            CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
            CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
            Private->getType(), AggValueSlot::DoesNotOverlap);
        break;
      }
    }

    // Step 3.1: Modify reference in dest Reduce list as needed.
    // Modifying the reference in Reduce list to point to the newly
    // created element.  The element is live in the current function
    // scope and that of functions it invokes (i.e., reduce_function).
    // RemoteReduceData[i] = (void*)&RemoteElem
    if (UpdateDestListPtr) {
      CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
                                DestElementAddr.getPointer(), CGF.VoidPtrTy),
                            DestElementPtrAddr, /*Volatile=*/false,
                            C.VoidPtrTy);
    }

    // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
    // address of the next element in scratchpad memory, unless we're currently
    // processing the last one.  Memory alignment is also taken care of here.
    if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
      llvm::Value *ScratchpadBasePtr =
          IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
      ScratchpadBasePtr = Bld.CreateNUWAdd(
          ScratchpadBasePtr,
          Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));

      // Take care of global memory alignment for performance
      // (round the base pointer up to the next GlobalMemoryAlignment
      // boundary: ((p - 1) / Align + 1) * Align).
      ScratchpadBasePtr = Bld.CreateNUWSub(
          ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
      ScratchpadBasePtr = Bld.CreateUDiv(
          ScratchpadBasePtr,
          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
      ScratchpadBasePtr = Bld.CreateNUWAdd(
          ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
      ScratchpadBasePtr = Bld.CreateNUWMul(
          ScratchpadBasePtr,
          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));

      if (IncrScratchpadDest)
        DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
      else /* IncrScratchpadSrc = true */
        SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
    }

    ++Idx;
  }
}
2623*13fbcb42Sjoerg
2624*13fbcb42Sjoerg /// This function emits a helper that gathers Reduce lists from the first
2625*13fbcb42Sjoerg /// lane of every active warp to lanes in the first warp.
2626*13fbcb42Sjoerg ///
2627*13fbcb42Sjoerg /// void inter_warp_copy_func(void* reduce_data, num_warps)
2628*13fbcb42Sjoerg /// shared smem[warp_size];
2629*13fbcb42Sjoerg /// For all data entries D in reduce_data:
2630*13fbcb42Sjoerg /// sync
2631*13fbcb42Sjoerg /// If (I am the first lane in each warp)
2632*13fbcb42Sjoerg /// Copy my local D to smem[warp_id]
2633*13fbcb42Sjoerg /// sync
2634*13fbcb42Sjoerg /// if (I am the first warp)
2635*13fbcb42Sjoerg /// Copy smem[thread_id] to my local D
/// Emit (and return) the helper function that gathers Reduce lists from the
/// first lane of every active warp to lanes in the first warp, using a
/// shared-memory transfer buffer and barriers between the two phases.
///
/// \param Privates One expression per reduction element.
/// \param ReductionArrayTy Type of the Reduce list array argument.
/// \param Loc Source location attributed to the generated code.
/// \returns the generated internal function
///          `_omp_reduction_inter_warp_copy_func(void *reduce_data,
///                                               int num_warps)`.
static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
                                              ArrayRef<const Expr *> Privates,
                                              QualType ReductionArrayTy,
                                              SourceLocation Loc) {
  ASTContext &C = CGM.getContext();
  llvm::Module &M = CGM.getModule();

  // ReduceList: thread local Reduce list.
  // At the stage of the computation when this function is called, partially
  // aggregated values reside in the first lane of every active warp.
  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  // NumWarps: number of warps active in the parallel region.  This could
  // be smaller than 32 (max warps in a CTA) for partial block reduction.
  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                C.getIntTypeForBitwidth(32, /* Signed */ true),
                                ImplicitParamDecl::Other);
  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&NumWarpsArg);

  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
                                    llvm::GlobalValue::InternalLinkage,
                                    "_omp_reduction_inter_warp_copy_func", &M);
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setDoesNotRecurse();
  CodeGenFunction CGF(CGM);
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

  CGBuilderTy &Bld = CGF.Builder;

  // This array is used as a medium to transfer, one reduce element at a time,
  // the data from the first lane of every warp to lanes in the first warp
  // in order to perform the final step of a reduction in a parallel region
  // (reduction across warps).  The array is placed in NVPTX __shared__ memory
  // for reduced latency, as well as to have a distinct copy for concurrently
  // executing target regions.  The array is declared with common linkage so
  // as to be shared across compilation units.
  StringRef TransferMediumName =
      "__openmp_nvptx_data_transfer_temporary_storage";
  llvm::GlobalVariable *TransferMedium =
      M.getGlobalVariable(TransferMediumName);
  unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
  if (!TransferMedium) {
    // First use in this module: create the shared transfer buffer, one i32
    // slot per warp.
    auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
    unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
    TransferMedium = new llvm::GlobalVariable(
        M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
        llvm::UndefValue::get(Ty), TransferMediumName,
        /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
        SharedAddressSpace);
    CGM.addCompilerUsedGlobal(TransferMedium);
  }

  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  // Get the CUDA thread id of the current OpenMP thread on the GPU.
  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
  // nvptx_lane_id = nvptx_id % warpsize
  llvm::Value *LaneID = getNVPTXLaneID(CGF);
  // nvptx_warp_id = nvptx_id / warpsize
  llvm::Value *WarpID = getNVPTXWarpID(CGF);

  // Load the Reduce-list argument and cast it to its array type.
  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address LocalReduceList(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(
              AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
              LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());

  unsigned Idx = 0;
  for (const Expr *Private : Privates) {
    //
    // Warp master copies reduce element to transfer medium in __shared__
    // memory.
    //
    unsigned RealTySize =
        C.getTypeSizeInChars(Private->getType())
            .alignTo(C.getTypeAlignInChars(Private->getType()))
            .getQuantity();
    // Copy the element through the 4-byte transfer slots in progressively
    // smaller chunks (4/2/1 bytes) until the whole element is moved.
    for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) {
      unsigned NumIters = RealTySize / TySize;
      if (NumIters == 0)
        continue;
      QualType CType = C.getIntTypeForBitwidth(
          C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
      llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
      CharUnits Align = CharUnits::fromQuantity(TySize);
      llvm::Value *Cnt = nullptr;
      Address CntAddr = Address::invalid();
      llvm::BasicBlock *PrecondBB = nullptr;
      llvm::BasicBlock *ExitBB = nullptr;
      if (NumIters > 1) {
        // Multiple chunks of this width: emit a counter-driven loop
        // (Cnt in [0, NumIters)) around the two copy phases below.
        CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
        CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
                              /*Volatile=*/false, C.IntTy);
        PrecondBB = CGF.createBasicBlock("precond");
        ExitBB = CGF.createBasicBlock("exit");
        llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
        // There is no need to emit line number for unconditional branch.
        (void)ApplyDebugLocation::CreateEmpty(CGF);
        CGF.EmitBlock(PrecondBB);
        Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
        llvm::Value *Cmp =
            Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
        Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
        CGF.EmitBlock(BodyBB);
      }
      // kmpc_barrier.
      CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
                                             /*EmitChecks=*/false,
                                             /*ForceSimpleCall=*/true);
      llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
      llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
      llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");

      // if (lane_id == 0)
      llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
      Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
      CGF.EmitBlock(ThenBB);

      // Reduce element = LocalReduceList[i]
      Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
      llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
          ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
      // elemptr = ((CopyType*)(elemptrptr)) + I
      Address ElemPtr = Address(ElemPtrPtr, Align);
      ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
      if (NumIters > 1) {
        ElemPtr = Address(Bld.CreateGEP(ElemPtr.getPointer(), Cnt),
                          ElemPtr.getAlignment());
      }

      // Get pointer to location in transfer medium.
      // MediumPtr = &medium[warp_id]
      llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
          TransferMedium->getValueType(), TransferMedium,
          {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
      Address MediumPtr(MediumPtrVal, Align);
      // Casting to actual data type.
      // MediumPtr = (CopyType*)MediumPtrAddr;
      MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType);

      // elem = *elemptr
      //*MediumPtr = elem
      llvm::Value *Elem = CGF.EmitLoadOfScalar(
          ElemPtr, /*Volatile=*/false, CType, Loc,
          LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
      // Store the source element value to the dest element address.
      // The store is volatile so it is not reordered/elided across the
      // barriers separating the two phases.
      CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
                            LValueBaseInfo(AlignmentSource::Type),
                            TBAAAccessInfo());

      Bld.CreateBr(MergeBB);

      CGF.EmitBlock(ElseBB);
      Bld.CreateBr(MergeBB);

      CGF.EmitBlock(MergeBB);

      // kmpc_barrier.
      CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
                                             /*EmitChecks=*/false,
                                             /*ForceSimpleCall=*/true);

      //
      // Warp 0 copies reduce element from transfer medium.
      //
      llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
      llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
      llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");

      Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
      llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
          AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);

      // Up to 32 threads in warp 0 are active.
      llvm::Value *IsActiveThread =
          Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
      Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);

      CGF.EmitBlock(W0ThenBB);

      // SrcMediumPtr = &medium[tid]
      llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
          TransferMedium->getValueType(), TransferMedium,
          {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
      Address SrcMediumPtr(SrcMediumPtrVal, Align);
      // SrcMediumVal = *SrcMediumPtr;
      SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType);

      // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
      Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
      llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
          TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
      Address TargetElemPtr = Address(TargetElemPtrVal, Align);
      TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
      if (NumIters > 1) {
        TargetElemPtr = Address(Bld.CreateGEP(TargetElemPtr.getPointer(), Cnt),
                                TargetElemPtr.getAlignment());
      }

      // *TargetElemPtr = SrcMediumVal;
      llvm::Value *SrcMediumValue =
          CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
      CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
                            CType);
      Bld.CreateBr(W0MergeBB);

      CGF.EmitBlock(W0ElseBB);
      Bld.CreateBr(W0MergeBB);

      CGF.EmitBlock(W0MergeBB);

      if (NumIters > 1) {
        // Increment the loop counter and branch back to the precondition.
        Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
        CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
        CGF.EmitBranch(PrecondBB);
        (void)ApplyDebugLocation::CreateEmpty(CGF);
        CGF.EmitBlock(ExitBB);
      }
      // Remainder is handled by the next, smaller chunk width.
      RealTySize %= TySize;
    }
    ++Idx;
  }

  CGF.FinishFunction();
  return Fn;
}
2868*13fbcb42Sjoerg
2869*13fbcb42Sjoerg /// Emit a helper that reduces data across two OpenMP threads (lanes)
2870*13fbcb42Sjoerg /// in the same warp. It uses shuffle instructions to copy over data from
2871*13fbcb42Sjoerg /// a remote lane's stack. The reduction algorithm performed is specified
2872*13fbcb42Sjoerg /// by the fourth parameter.
2873*13fbcb42Sjoerg ///
2874*13fbcb42Sjoerg /// Algorithm Versions.
2875*13fbcb42Sjoerg /// Full Warp Reduce (argument value 0):
2876*13fbcb42Sjoerg /// This algorithm assumes that all 32 lanes are active and gathers
2877*13fbcb42Sjoerg /// data from these 32 lanes, producing a single resultant value.
2878*13fbcb42Sjoerg /// Contiguous Partial Warp Reduce (argument value 1):
2879*13fbcb42Sjoerg /// This algorithm assumes that only a *contiguous* subset of lanes
2880*13fbcb42Sjoerg /// are active. This happens for the last warp in a parallel region
2881*13fbcb42Sjoerg /// when the user specified num_threads is not an integer multiple of
2882*13fbcb42Sjoerg /// 32. This contiguous subset always starts with the zeroth lane.
2883*13fbcb42Sjoerg /// Partial Warp Reduce (argument value 2):
2884*13fbcb42Sjoerg /// This algorithm gathers data from any number of lanes at any position.
2885*13fbcb42Sjoerg /// All reduced values are stored in the lowest possible lane. The set
2886*13fbcb42Sjoerg /// of problems every algorithm addresses is a super set of those
2887*13fbcb42Sjoerg /// addressable by algorithms with a lower version number. Overhead
2888*13fbcb42Sjoerg /// increases as algorithm version increases.
2889*13fbcb42Sjoerg ///
2890*13fbcb42Sjoerg /// Terminology
2891*13fbcb42Sjoerg /// Reduce element:
2892*13fbcb42Sjoerg /// Reduce element refers to the individual data field with primitive
2893*13fbcb42Sjoerg /// data types to be combined and reduced across threads.
2894*13fbcb42Sjoerg /// Reduce list:
2895*13fbcb42Sjoerg /// Reduce list refers to a collection of local, thread-private
2896*13fbcb42Sjoerg /// reduce elements.
2897*13fbcb42Sjoerg /// Remote Reduce list:
2898*13fbcb42Sjoerg /// Remote Reduce list refers to a collection of remote (relative to
2899*13fbcb42Sjoerg /// the current thread) reduce elements.
2900*13fbcb42Sjoerg ///
2901*13fbcb42Sjoerg /// We distinguish between three states of threads that are important to
2902*13fbcb42Sjoerg /// the implementation of this function.
2903*13fbcb42Sjoerg /// Alive threads:
2904*13fbcb42Sjoerg /// Threads in a warp executing the SIMT instruction, as distinguished from
2905*13fbcb42Sjoerg /// threads that are inactive due to divergent control flow.
2906*13fbcb42Sjoerg /// Active threads:
2907*13fbcb42Sjoerg /// The minimal set of threads that has to be alive upon entry to this
2908*13fbcb42Sjoerg /// function. The computation is correct iff active threads are alive.
2909*13fbcb42Sjoerg /// Some threads are alive but they are not active because they do not
2910*13fbcb42Sjoerg /// contribute to the computation in any useful manner. Turning them off
2911*13fbcb42Sjoerg /// may introduce control flow overheads without any tangible benefits.
2912*13fbcb42Sjoerg /// Effective threads:
2913*13fbcb42Sjoerg /// In order to comply with the argument requirements of the shuffle
2914*13fbcb42Sjoerg /// function, we must keep all lanes holding data alive. But at most
2915*13fbcb42Sjoerg /// half of them perform value aggregation; we refer to this half of
2916*13fbcb42Sjoerg /// threads as effective. The other half is simply handing off their
2917*13fbcb42Sjoerg /// data.
2918*13fbcb42Sjoerg ///
2919*13fbcb42Sjoerg /// Procedure
2920*13fbcb42Sjoerg /// Value shuffle:
2921*13fbcb42Sjoerg /// In this step active threads transfer data from higher lane positions
2922*13fbcb42Sjoerg /// in the warp to lower lane positions, creating Remote Reduce list.
2923*13fbcb42Sjoerg /// Value aggregation:
2924*13fbcb42Sjoerg /// In this step, effective threads combine their thread local Reduce list
2925*13fbcb42Sjoerg /// with Remote Reduce list and store the result in the thread local
2926*13fbcb42Sjoerg /// Reduce list.
2927*13fbcb42Sjoerg /// Value copy:
2928*13fbcb42Sjoerg /// In this step, we deal with the assumption made by algorithm 2
2929*13fbcb42Sjoerg /// (i.e. contiguity assumption). When we have an odd number of lanes
2930*13fbcb42Sjoerg /// active, say 2k+1, only k threads will be effective and therefore k
2931*13fbcb42Sjoerg /// new values will be produced. However, the Reduce list owned by the
2932*13fbcb42Sjoerg /// (2k+1)th thread is ignored in the value aggregation. Therefore
2933*13fbcb42Sjoerg /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
2934*13fbcb42Sjoerg /// that the contiguity assumption still holds.
emitShuffleAndReduceFunction(CodeGenModule & CGM,ArrayRef<const Expr * > Privates,QualType ReductionArrayTy,llvm::Function * ReduceFn,SourceLocation Loc)2935*13fbcb42Sjoerg static llvm::Function *emitShuffleAndReduceFunction(
2936*13fbcb42Sjoerg CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2937*13fbcb42Sjoerg QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
2938*13fbcb42Sjoerg ASTContext &C = CGM.getContext();
2939*13fbcb42Sjoerg
2940*13fbcb42Sjoerg // Thread local Reduce list used to host the values of data to be reduced.
2941*13fbcb42Sjoerg ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2942*13fbcb42Sjoerg C.VoidPtrTy, ImplicitParamDecl::Other);
2943*13fbcb42Sjoerg // Current lane id; could be logical.
2944*13fbcb42Sjoerg ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
2945*13fbcb42Sjoerg ImplicitParamDecl::Other);
2946*13fbcb42Sjoerg // Offset of the remote source lane relative to the current lane.
2947*13fbcb42Sjoerg ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2948*13fbcb42Sjoerg C.ShortTy, ImplicitParamDecl::Other);
2949*13fbcb42Sjoerg // Algorithm version. This is expected to be known at compile time.
2950*13fbcb42Sjoerg ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2951*13fbcb42Sjoerg C.ShortTy, ImplicitParamDecl::Other);
2952*13fbcb42Sjoerg FunctionArgList Args;
2953*13fbcb42Sjoerg Args.push_back(&ReduceListArg);
2954*13fbcb42Sjoerg Args.push_back(&LaneIDArg);
2955*13fbcb42Sjoerg Args.push_back(&RemoteLaneOffsetArg);
2956*13fbcb42Sjoerg Args.push_back(&AlgoVerArg);
2957*13fbcb42Sjoerg
// Create the internal helper void(void*, short, short, short)
// "_omp_reduction_shuffle_and_reduce_func" and begin emitting its body.
2958*13fbcb42Sjoerg const CGFunctionInfo &CGFI =
2959*13fbcb42Sjoerg CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2960*13fbcb42Sjoerg auto *Fn = llvm::Function::Create(
2961*13fbcb42Sjoerg CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2962*13fbcb42Sjoerg "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
2963*13fbcb42Sjoerg CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2964*13fbcb42Sjoerg Fn->setDoesNotRecurse();
2965*13fbcb42Sjoerg
2966*13fbcb42Sjoerg CodeGenFunction CGF(CGM);
2967*13fbcb42Sjoerg CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2968*13fbcb42Sjoerg
2969*13fbcb42Sjoerg CGBuilderTy &Bld = CGF.Builder;
2970*13fbcb42Sjoerg
// Unpack the four arguments: the reduce-list pointer (cast to a pointer to
// the reduction array type) and the three short control values.
2971*13fbcb42Sjoerg Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2972*13fbcb42Sjoerg Address LocalReduceList(
2973*13fbcb42Sjoerg Bld.CreatePointerBitCastOrAddrSpaceCast(
2974*13fbcb42Sjoerg CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2975*13fbcb42Sjoerg C.VoidPtrTy, SourceLocation()),
2976*13fbcb42Sjoerg CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2977*13fbcb42Sjoerg CGF.getPointerAlign());
2978*13fbcb42Sjoerg
2979*13fbcb42Sjoerg Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
2980*13fbcb42Sjoerg llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
2981*13fbcb42Sjoerg AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2982*13fbcb42Sjoerg
2983*13fbcb42Sjoerg Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
2984*13fbcb42Sjoerg llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
2985*13fbcb42Sjoerg AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2986*13fbcb42Sjoerg
2987*13fbcb42Sjoerg Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
2988*13fbcb42Sjoerg llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
2989*13fbcb42Sjoerg AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2990*13fbcb42Sjoerg
2991*13fbcb42Sjoerg // Create a local thread-private variable to host the Reduce list
2992*13fbcb42Sjoerg // from a remote lane.
2993*13fbcb42Sjoerg Address RemoteReduceList =
2994*13fbcb42Sjoerg CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
2995*13fbcb42Sjoerg
2996*13fbcb42Sjoerg // This loop iterates through the list of reduce elements and copies,
2997*13fbcb42Sjoerg // element by element, from a remote lane in the warp to RemoteReduceList,
2998*13fbcb42Sjoerg // hosted on the thread's stack.
2999*13fbcb42Sjoerg emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
3000*13fbcb42Sjoerg LocalReduceList, RemoteReduceList,
3001*13fbcb42Sjoerg {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
3002*13fbcb42Sjoerg /*ScratchpadIndex=*/nullptr,
3003*13fbcb42Sjoerg /*ScratchpadWidth=*/nullptr});
3004*13fbcb42Sjoerg
3005*13fbcb42Sjoerg // The actions to be performed on the Remote Reduce list is dependent
3006*13fbcb42Sjoerg // on the algorithm version.
3007*13fbcb42Sjoerg //
3008*13fbcb42Sjoerg // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3009*13fbcb42Sjoerg // LaneId % 2 == 0 && Offset > 0):
3010*13fbcb42Sjoerg // do the reduction value aggregation
3011*13fbcb42Sjoerg //
3012*13fbcb42Sjoerg // The thread local variable Reduce list is mutated in place to host the
3013*13fbcb42Sjoerg // reduced data, which is the aggregated value produced from local and
3014*13fbcb42Sjoerg // remote lanes.
3015*13fbcb42Sjoerg //
3016*13fbcb42Sjoerg // Note that AlgoVer is expected to be a constant integer known at compile
3017*13fbcb42Sjoerg // time.
3018*13fbcb42Sjoerg // When AlgoVer==0, the first conjunction evaluates to true, making
3019*13fbcb42Sjoerg // the entire predicate true during compile time.
3020*13fbcb42Sjoerg // When AlgoVer==1, the second conjunction has only the second part to be
3021*13fbcb42Sjoerg // evaluated during runtime. Other conjunctions evaluates to false
3022*13fbcb42Sjoerg // during compile time.
3023*13fbcb42Sjoerg // When AlgoVer==2, the third conjunction has only the second part to be
3024*13fbcb42Sjoerg // evaluated during runtime. Other conjunctions evaluates to false
3025*13fbcb42Sjoerg // during compile time.
3026*13fbcb42Sjoerg llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
3027*13fbcb42Sjoerg
3028*13fbcb42Sjoerg llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3029*13fbcb42Sjoerg llvm::Value *CondAlgo1 = Bld.CreateAnd(
3030*13fbcb42Sjoerg Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
3031*13fbcb42Sjoerg
// "LaneId % 2 == 0" is emitted as a bit test of the low bit being zero.
3032*13fbcb42Sjoerg llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
3033*13fbcb42Sjoerg llvm::Value *CondAlgo2 = Bld.CreateAnd(
3034*13fbcb42Sjoerg Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
3035*13fbcb42Sjoerg CondAlgo2 = Bld.CreateAnd(
3036*13fbcb42Sjoerg CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
3037*13fbcb42Sjoerg
3038*13fbcb42Sjoerg llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
3039*13fbcb42Sjoerg CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
3040*13fbcb42Sjoerg
// Guarded aggregation: lanes satisfying the predicate call ReduceFn, the
// rest branch straight to the merge block.
3041*13fbcb42Sjoerg llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3042*13fbcb42Sjoerg llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3043*13fbcb42Sjoerg llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3044*13fbcb42Sjoerg Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
3045*13fbcb42Sjoerg
3046*13fbcb42Sjoerg CGF.EmitBlock(ThenBB);
3047*13fbcb42Sjoerg // reduce_function(LocalReduceList, RemoteReduceList)
3048*13fbcb42Sjoerg llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3049*13fbcb42Sjoerg LocalReduceList.getPointer(), CGF.VoidPtrTy);
3050*13fbcb42Sjoerg llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3051*13fbcb42Sjoerg RemoteReduceList.getPointer(), CGF.VoidPtrTy);
3052*13fbcb42Sjoerg CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3053*13fbcb42Sjoerg CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
3054*13fbcb42Sjoerg Bld.CreateBr(MergeBB);
3055*13fbcb42Sjoerg
3056*13fbcb42Sjoerg CGF.EmitBlock(ElseBB);
3057*13fbcb42Sjoerg Bld.CreateBr(MergeBB);
3058*13fbcb42Sjoerg
3059*13fbcb42Sjoerg CGF.EmitBlock(MergeBB);
3060*13fbcb42Sjoerg
3061*13fbcb42Sjoerg // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3062*13fbcb42Sjoerg // Reduce list.
3063*13fbcb42Sjoerg Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3064*13fbcb42Sjoerg llvm::Value *CondCopy = Bld.CreateAnd(
3065*13fbcb42Sjoerg Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
3066*13fbcb42Sjoerg
// This is the "value copy" step described in the header comment: it keeps
// the contiguity assumption of algorithm 1 intact.
3067*13fbcb42Sjoerg llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
3068*13fbcb42Sjoerg llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
3069*13fbcb42Sjoerg llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
3070*13fbcb42Sjoerg Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3071*13fbcb42Sjoerg
3072*13fbcb42Sjoerg CGF.EmitBlock(CpyThenBB);
3073*13fbcb42Sjoerg emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
3074*13fbcb42Sjoerg RemoteReduceList, LocalReduceList);
3075*13fbcb42Sjoerg Bld.CreateBr(CpyMergeBB);
3076*13fbcb42Sjoerg
3077*13fbcb42Sjoerg CGF.EmitBlock(CpyElseBB);
3078*13fbcb42Sjoerg Bld.CreateBr(CpyMergeBB);
3079*13fbcb42Sjoerg
3080*13fbcb42Sjoerg CGF.EmitBlock(CpyMergeBB);
3081*13fbcb42Sjoerg
3082*13fbcb42Sjoerg CGF.FinishFunction();
3083*13fbcb42Sjoerg return Fn;
3084*13fbcb42Sjoerg }
3085*13fbcb42Sjoerg
3086*13fbcb42Sjoerg /// This function emits a helper that copies all the reduction variables from
3087*13fbcb42Sjoerg /// the team into the provided global buffer for the reduction variables.
3088*13fbcb42Sjoerg ///
3089*13fbcb42Sjoerg /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
3090*13fbcb42Sjoerg /// For all data entries D in reduce_data:
3091*13fbcb42Sjoerg /// Copy local D to buffer.D[Idx]
emitListToGlobalCopyFunction(CodeGenModule & CGM,ArrayRef<const Expr * > Privates,QualType ReductionArrayTy,SourceLocation Loc,const RecordDecl * TeamReductionRec,const llvm::SmallDenseMap<const ValueDecl *,const FieldDecl * > & VarFieldMap)3092*13fbcb42Sjoerg static llvm::Value *emitListToGlobalCopyFunction(
3093*13fbcb42Sjoerg CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3094*13fbcb42Sjoerg QualType ReductionArrayTy, SourceLocation Loc,
3095*13fbcb42Sjoerg const RecordDecl *TeamReductionRec,
3096*13fbcb42Sjoerg const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3097*13fbcb42Sjoerg &VarFieldMap) {
3098*13fbcb42Sjoerg ASTContext &C = CGM.getContext();
3099*13fbcb42Sjoerg
3100*13fbcb42Sjoerg // Buffer: global reduction buffer.
3101*13fbcb42Sjoerg ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3102*13fbcb42Sjoerg C.VoidPtrTy, ImplicitParamDecl::Other);
3103*13fbcb42Sjoerg // Idx: index of the buffer.
3104*13fbcb42Sjoerg ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3105*13fbcb42Sjoerg ImplicitParamDecl::Other);
3106*13fbcb42Sjoerg // ReduceList: thread local Reduce list.
3107*13fbcb42Sjoerg ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3108*13fbcb42Sjoerg C.VoidPtrTy, ImplicitParamDecl::Other);
3109*13fbcb42Sjoerg FunctionArgList Args;
3110*13fbcb42Sjoerg Args.push_back(&BufferArg);
3111*13fbcb42Sjoerg Args.push_back(&IdxArg);
3112*13fbcb42Sjoerg Args.push_back(&ReduceListArg);
3113*13fbcb42Sjoerg
// Create the internal helper void(void*, int, void*)
// "_omp_reduction_list_to_global_copy_func" and begin emitting its body.
3114*13fbcb42Sjoerg const CGFunctionInfo &CGFI =
3115*13fbcb42Sjoerg CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3116*13fbcb42Sjoerg auto *Fn = llvm::Function::Create(
3117*13fbcb42Sjoerg CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3118*13fbcb42Sjoerg "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
3119*13fbcb42Sjoerg CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3120*13fbcb42Sjoerg Fn->setDoesNotRecurse();
3121*13fbcb42Sjoerg CodeGenFunction CGF(CGM);
3122*13fbcb42Sjoerg CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3123*13fbcb42Sjoerg
3124*13fbcb42Sjoerg CGBuilderTy &Bld = CGF.Builder;
3125*13fbcb42Sjoerg
// Cast the opaque reduce-list argument to a pointer to the reduction array.
3126*13fbcb42Sjoerg Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3127*13fbcb42Sjoerg Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3128*13fbcb42Sjoerg Address LocalReduceList(
3129*13fbcb42Sjoerg Bld.CreatePointerBitCastOrAddrSpaceCast(
3130*13fbcb42Sjoerg CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3131*13fbcb42Sjoerg C.VoidPtrTy, Loc),
3132*13fbcb42Sjoerg CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3133*13fbcb42Sjoerg CGF.getPointerAlign());
// View the opaque buffer argument as a pointer to the team-reduction record.
3134*13fbcb42Sjoerg QualType StaticTy = C.getRecordType(TeamReductionRec);
3135*13fbcb42Sjoerg llvm::Type *LLVMReductionsBufferTy =
3136*13fbcb42Sjoerg CGM.getTypes().ConvertTypeForMem(StaticTy);
3137*13fbcb42Sjoerg llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3138*13fbcb42Sjoerg CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3139*13fbcb42Sjoerg LLVMReductionsBufferTy->getPointerTo());
// Index pair {0, Idx} used below to address element Idx within each record
// field (each field appears to hold one slot per buffer entry).
3140*13fbcb42Sjoerg llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3141*13fbcb42Sjoerg CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3142*13fbcb42Sjoerg /*Volatile=*/false, C.IntTy,
3143*13fbcb42Sjoerg Loc)};
3144*13fbcb42Sjoerg unsigned Idx = 0;
3145*13fbcb42Sjoerg for (const Expr *Private : Privates) {
3146*13fbcb42Sjoerg // Reduce element = LocalReduceList[i]
3147*13fbcb42Sjoerg Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3148*13fbcb42Sjoerg llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3149*13fbcb42Sjoerg ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3150*13fbcb42Sjoerg // elemptr = ((CopyType*)(elemptrptr)) + I
3151*13fbcb42Sjoerg ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3152*13fbcb42Sjoerg ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
3153*13fbcb42Sjoerg Address ElemPtr =
3154*13fbcb42Sjoerg Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
3155*13fbcb42Sjoerg const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
3156*13fbcb42Sjoerg // Global = Buffer.VD[Idx];
3157*13fbcb42Sjoerg const FieldDecl *FD = VarFieldMap.lookup(VD);
3158*13fbcb42Sjoerg LValue GlobLVal = CGF.EmitLValueForField(
3159*13fbcb42Sjoerg CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3160*13fbcb42Sjoerg Address GlobAddr = GlobLVal.getAddress(CGF);
3161*13fbcb42Sjoerg llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
3162*13fbcb42Sjoerg GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
3163*13fbcb42Sjoerg GlobLVal.setAddress(Address(BufferPtr, GlobAddr.getAlignment()));
// Copy the local element into its globalized slot, dispatching on the
// element's evaluation kind (scalar / complex / aggregate).
3164*13fbcb42Sjoerg switch (CGF.getEvaluationKind(Private->getType())) {
3165*13fbcb42Sjoerg case TEK_Scalar: {
3166*13fbcb42Sjoerg llvm::Value *V = CGF.EmitLoadOfScalar(
3167*13fbcb42Sjoerg ElemPtr, /*Volatile=*/false, Private->getType(), Loc,
3168*13fbcb42Sjoerg LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
3169*13fbcb42Sjoerg CGF.EmitStoreOfScalar(V, GlobLVal);
3170*13fbcb42Sjoerg break;
3171*13fbcb42Sjoerg }
3172*13fbcb42Sjoerg case TEK_Complex: {
3173*13fbcb42Sjoerg CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
3174*13fbcb42Sjoerg CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
3175*13fbcb42Sjoerg CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
3176*13fbcb42Sjoerg break;
3177*13fbcb42Sjoerg }
3178*13fbcb42Sjoerg case TEK_Aggregate:
3179*13fbcb42Sjoerg CGF.EmitAggregateCopy(GlobLVal,
3180*13fbcb42Sjoerg CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3181*13fbcb42Sjoerg Private->getType(), AggValueSlot::DoesNotOverlap);
3182*13fbcb42Sjoerg break;
3183*13fbcb42Sjoerg }
3184*13fbcb42Sjoerg ++Idx;
3185*13fbcb42Sjoerg }
3186*13fbcb42Sjoerg
3187*13fbcb42Sjoerg CGF.FinishFunction();
3188*13fbcb42Sjoerg return Fn;
3189*13fbcb42Sjoerg }
3190*13fbcb42Sjoerg
3191*13fbcb42Sjoerg /// This function emits a helper that reduces all the reduction variables from
3192*13fbcb42Sjoerg /// the team into the provided global buffer for the reduction variables.
3193*13fbcb42Sjoerg ///
3194*13fbcb42Sjoerg /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
3195*13fbcb42Sjoerg /// void *GlobPtrs[];
3196*13fbcb42Sjoerg /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
3197*13fbcb42Sjoerg /// ...
3198*13fbcb42Sjoerg /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
3199*13fbcb42Sjoerg /// reduce_function(GlobPtrs, reduce_data);
emitListToGlobalReduceFunction(CodeGenModule & CGM,ArrayRef<const Expr * > Privates,QualType ReductionArrayTy,SourceLocation Loc,const RecordDecl * TeamReductionRec,const llvm::SmallDenseMap<const ValueDecl *,const FieldDecl * > & VarFieldMap,llvm::Function * ReduceFn)3200*13fbcb42Sjoerg static llvm::Value *emitListToGlobalReduceFunction(
3201*13fbcb42Sjoerg CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3202*13fbcb42Sjoerg QualType ReductionArrayTy, SourceLocation Loc,
3203*13fbcb42Sjoerg const RecordDecl *TeamReductionRec,
3204*13fbcb42Sjoerg const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3205*13fbcb42Sjoerg &VarFieldMap,
3206*13fbcb42Sjoerg llvm::Function *ReduceFn) {
3207*13fbcb42Sjoerg ASTContext &C = CGM.getContext();
3208*13fbcb42Sjoerg
3209*13fbcb42Sjoerg // Buffer: global reduction buffer.
3210*13fbcb42Sjoerg ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3211*13fbcb42Sjoerg C.VoidPtrTy, ImplicitParamDecl::Other);
3212*13fbcb42Sjoerg // Idx: index of the buffer.
3213*13fbcb42Sjoerg ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3214*13fbcb42Sjoerg ImplicitParamDecl::Other);
3215*13fbcb42Sjoerg // ReduceList: thread local Reduce list.
3216*13fbcb42Sjoerg ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3217*13fbcb42Sjoerg C.VoidPtrTy, ImplicitParamDecl::Other);
3218*13fbcb42Sjoerg FunctionArgList Args;
3219*13fbcb42Sjoerg Args.push_back(&BufferArg);
3220*13fbcb42Sjoerg Args.push_back(&IdxArg);
3221*13fbcb42Sjoerg Args.push_back(&ReduceListArg);
3222*13fbcb42Sjoerg
// Create the internal helper void(void*, int, void*)
// "_omp_reduction_list_to_global_reduce_func" and begin emitting its body.
3223*13fbcb42Sjoerg const CGFunctionInfo &CGFI =
3224*13fbcb42Sjoerg CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3225*13fbcb42Sjoerg auto *Fn = llvm::Function::Create(
3226*13fbcb42Sjoerg CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3227*13fbcb42Sjoerg "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
3228*13fbcb42Sjoerg CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3229*13fbcb42Sjoerg Fn->setDoesNotRecurse();
3230*13fbcb42Sjoerg CodeGenFunction CGF(CGM);
3231*13fbcb42Sjoerg CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3232*13fbcb42Sjoerg
3233*13fbcb42Sjoerg CGBuilderTy &Bld = CGF.Builder;
3234*13fbcb42Sjoerg
// View the opaque buffer argument as a pointer to the team-reduction record.
3235*13fbcb42Sjoerg Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3236*13fbcb42Sjoerg QualType StaticTy = C.getRecordType(TeamReductionRec);
3237*13fbcb42Sjoerg llvm::Type *LLVMReductionsBufferTy =
3238*13fbcb42Sjoerg CGM.getTypes().ConvertTypeForMem(StaticTy);
3239*13fbcb42Sjoerg llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3240*13fbcb42Sjoerg CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3241*13fbcb42Sjoerg LLVMReductionsBufferTy->getPointerTo());
3242*13fbcb42Sjoerg
3243*13fbcb42Sjoerg // 1. Build a list of reduction variables.
3244*13fbcb42Sjoerg // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3245*13fbcb42Sjoerg Address ReductionList =
3246*13fbcb42Sjoerg CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3247*13fbcb42Sjoerg auto IPriv = Privates.begin();
3248*13fbcb42Sjoerg llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3249*13fbcb42Sjoerg CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3250*13fbcb42Sjoerg /*Volatile=*/false, C.IntTy,
3251*13fbcb42Sjoerg Loc)};
3252*13fbcb42Sjoerg unsigned Idx = 0;
// For each private, record the address of its globalized slot
// (buffer.field[Idx]) in the local pointer list; VLA element counts are
// stashed right after their pointer as an int disguised as a void*.
3253*13fbcb42Sjoerg for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
3254*13fbcb42Sjoerg Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3255*13fbcb42Sjoerg // Global = Buffer.VD[Idx];
3256*13fbcb42Sjoerg const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
3257*13fbcb42Sjoerg const FieldDecl *FD = VarFieldMap.lookup(VD);
3258*13fbcb42Sjoerg LValue GlobLVal = CGF.EmitLValueForField(
3259*13fbcb42Sjoerg CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3260*13fbcb42Sjoerg Address GlobAddr = GlobLVal.getAddress(CGF);
3261*13fbcb42Sjoerg llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
3262*13fbcb42Sjoerg GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
3263*13fbcb42Sjoerg llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
3264*13fbcb42Sjoerg CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
3265*13fbcb42Sjoerg if ((*IPriv)->getType()->isVariablyModifiedType()) {
3266*13fbcb42Sjoerg // Store array size.
3267*13fbcb42Sjoerg ++Idx;
3268*13fbcb42Sjoerg Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3269*13fbcb42Sjoerg llvm::Value *Size = CGF.Builder.CreateIntCast(
3270*13fbcb42Sjoerg CGF.getVLASize(
3271*13fbcb42Sjoerg CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3272*13fbcb42Sjoerg .NumElts,
3273*13fbcb42Sjoerg CGF.SizeTy, /*isSigned=*/false);
3274*13fbcb42Sjoerg CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3275*13fbcb42Sjoerg Elem);
3276*13fbcb42Sjoerg }
3277*13fbcb42Sjoerg }
3278*13fbcb42Sjoerg
3279*13fbcb42Sjoerg // Call reduce_function(GlobalReduceList, ReduceList)
3280*13fbcb42Sjoerg llvm::Value *GlobalReduceList =
3281*13fbcb42Sjoerg CGF.EmitCastToVoidPtr(ReductionList.getPointer());
3282*13fbcb42Sjoerg Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3283*13fbcb42Sjoerg llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
3284*13fbcb42Sjoerg AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
3285*13fbcb42Sjoerg CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3286*13fbcb42Sjoerg CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
3287*13fbcb42Sjoerg CGF.FinishFunction();
3288*13fbcb42Sjoerg return Fn;
3289*13fbcb42Sjoerg }
3290*13fbcb42Sjoerg
3291*13fbcb42Sjoerg /// This function emits a helper that copies all the reduction variables from
3292*13fbcb42Sjoerg /// the provided global buffer back into the team's local reduction variables.
3293*13fbcb42Sjoerg ///
3294*13fbcb42Sjoerg /// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
3295*13fbcb42Sjoerg /// For all data entries D in reduce_data:
3296*13fbcb42Sjoerg /// Copy buffer.D[Idx] to local D;
emitGlobalToListCopyFunction(CodeGenModule & CGM,ArrayRef<const Expr * > Privates,QualType ReductionArrayTy,SourceLocation Loc,const RecordDecl * TeamReductionRec,const llvm::SmallDenseMap<const ValueDecl *,const FieldDecl * > & VarFieldMap)3297*13fbcb42Sjoerg static llvm::Value *emitGlobalToListCopyFunction(
3298*13fbcb42Sjoerg CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3299*13fbcb42Sjoerg QualType ReductionArrayTy, SourceLocation Loc,
3300*13fbcb42Sjoerg const RecordDecl *TeamReductionRec,
3301*13fbcb42Sjoerg const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3302*13fbcb42Sjoerg &VarFieldMap) {
3303*13fbcb42Sjoerg ASTContext &C = CGM.getContext();
3304*13fbcb42Sjoerg
3305*13fbcb42Sjoerg // Buffer: global reduction buffer.
3306*13fbcb42Sjoerg ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3307*13fbcb42Sjoerg C.VoidPtrTy, ImplicitParamDecl::Other);
3308*13fbcb42Sjoerg // Idx: index of the buffer.
3309*13fbcb42Sjoerg ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3310*13fbcb42Sjoerg ImplicitParamDecl::Other);
3311*13fbcb42Sjoerg // ReduceList: thread local Reduce list.
3312*13fbcb42Sjoerg ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3313*13fbcb42Sjoerg C.VoidPtrTy, ImplicitParamDecl::Other);
3314*13fbcb42Sjoerg FunctionArgList Args;
3315*13fbcb42Sjoerg Args.push_back(&BufferArg);
3316*13fbcb42Sjoerg Args.push_back(&IdxArg);
3317*13fbcb42Sjoerg Args.push_back(&ReduceListArg);
3318*13fbcb42Sjoerg
3319*13fbcb42Sjoerg const CGFunctionInfo &CGFI =
3320*13fbcb42Sjoerg CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3321*13fbcb42Sjoerg auto *Fn = llvm::Function::Create(
3322*13fbcb42Sjoerg CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3323*13fbcb42Sjoerg "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
3324*13fbcb42Sjoerg CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3325*13fbcb42Sjoerg Fn->setDoesNotRecurse();
3326*13fbcb42Sjoerg CodeGenFunction CGF(CGM);
3327*13fbcb42Sjoerg CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3328*13fbcb42Sjoerg
3329*13fbcb42Sjoerg CGBuilderTy &Bld = CGF.Builder;
3330*13fbcb42Sjoerg
3331*13fbcb42Sjoerg Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3332*13fbcb42Sjoerg Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3333*13fbcb42Sjoerg Address LocalReduceList(
3334*13fbcb42Sjoerg Bld.CreatePointerBitCastOrAddrSpaceCast(
3335*13fbcb42Sjoerg CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3336*13fbcb42Sjoerg C.VoidPtrTy, Loc),
3337*13fbcb42Sjoerg CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3338*13fbcb42Sjoerg CGF.getPointerAlign());
3339*13fbcb42Sjoerg QualType StaticTy = C.getRecordType(TeamReductionRec);
3340*13fbcb42Sjoerg llvm::Type *LLVMReductionsBufferTy =
3341*13fbcb42Sjoerg CGM.getTypes().ConvertTypeForMem(StaticTy);
3342*13fbcb42Sjoerg llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3343*13fbcb42Sjoerg CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3344*13fbcb42Sjoerg LLVMReductionsBufferTy->getPointerTo());
3345*13fbcb42Sjoerg
3346*13fbcb42Sjoerg llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3347*13fbcb42Sjoerg CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3348*13fbcb42Sjoerg /*Volatile=*/false, C.IntTy,
3349*13fbcb42Sjoerg Loc)};
3350*13fbcb42Sjoerg unsigned Idx = 0;
3351*13fbcb42Sjoerg for (const Expr *Private : Privates) {
3352*13fbcb42Sjoerg // Reduce element = LocalReduceList[i]
3353*13fbcb42Sjoerg Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3354*13fbcb42Sjoerg llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3355*13fbcb42Sjoerg ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3356*13fbcb42Sjoerg // elemptr = ((CopyType*)(elemptrptr)) + I
3357*13fbcb42Sjoerg ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3358*13fbcb42Sjoerg ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
3359*13fbcb42Sjoerg Address ElemPtr =
3360*13fbcb42Sjoerg Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
3361*13fbcb42Sjoerg const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
3362*13fbcb42Sjoerg // Global = Buffer.VD[Idx];
3363*13fbcb42Sjoerg const FieldDecl *FD = VarFieldMap.lookup(VD);
3364*13fbcb42Sjoerg LValue GlobLVal = CGF.EmitLValueForField(
3365*13fbcb42Sjoerg CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3366*13fbcb42Sjoerg Address GlobAddr = GlobLVal.getAddress(CGF);
3367*13fbcb42Sjoerg llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
3368*13fbcb42Sjoerg GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
3369*13fbcb42Sjoerg GlobLVal.setAddress(Address(BufferPtr, GlobAddr.getAlignment()));
3370*13fbcb42Sjoerg switch (CGF.getEvaluationKind(Private->getType())) {
3371*13fbcb42Sjoerg case TEK_Scalar: {
3372*13fbcb42Sjoerg llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
3373*13fbcb42Sjoerg CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(),
3374*13fbcb42Sjoerg LValueBaseInfo(AlignmentSource::Type),
3375*13fbcb42Sjoerg TBAAAccessInfo());
3376*13fbcb42Sjoerg break;
3377*13fbcb42Sjoerg }
3378*13fbcb42Sjoerg case TEK_Complex: {
3379*13fbcb42Sjoerg CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
3380*13fbcb42Sjoerg CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3381*13fbcb42Sjoerg /*isInit=*/false);
3382*13fbcb42Sjoerg break;
3383*13fbcb42Sjoerg }
3384*13fbcb42Sjoerg case TEK_Aggregate:
3385*13fbcb42Sjoerg CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3386*13fbcb42Sjoerg GlobLVal, Private->getType(),
3387*13fbcb42Sjoerg AggValueSlot::DoesNotOverlap);
3388*13fbcb42Sjoerg break;
3389*13fbcb42Sjoerg }
3390*13fbcb42Sjoerg ++Idx;
3391*13fbcb42Sjoerg }
3392*13fbcb42Sjoerg
3393*13fbcb42Sjoerg CGF.FinishFunction();
3394*13fbcb42Sjoerg return Fn;
3395*13fbcb42Sjoerg }
3396*13fbcb42Sjoerg
/// This function emits a helper that reduces the per-team values stored in
/// the global reduction buffer into the provided thread-local reduce list:
/// it builds a list of pointers into slot 'Idx' of the buffer and invokes
/// the compiler-generated reduction function on (reduce_data, GlobPtrs).
///
/// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
/// void *GlobPtrs[];
/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
/// ...
/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
/// reduce_function(reduce_data, GlobPtrs);
static llvm::Value *emitGlobalToListReduceFunction(
    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
    QualType ReductionArrayTy, SourceLocation Loc,
    const RecordDecl *TeamReductionRec,
    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &VarFieldMap,
    llvm::Function *ReduceFn) {
  ASTContext &C = CGM.getContext();

  // Build the implicit parameter list of the helper:
  // Buffer: global reduction buffer.
  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                              C.VoidPtrTy, ImplicitParamDecl::Other);
  // Idx: index of the buffer.
  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
                           ImplicitParamDecl::Other);
  // ReduceList: thread local Reduce list.
  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  FunctionArgList Args;
  Args.push_back(&BufferArg);
  Args.push_back(&IdxArg);
  Args.push_back(&ReduceListArg);

  // Create the internal-linkage helper function and start emitting its body.
  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setDoesNotRecurse();
  CodeGenFunction CGF(CGM);
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

  CGBuilderTy &Bld = CGF.Builder;

  // Cast the incoming 'void *buffer' argument to a pointer to the record
  // type built for the globalized reduction variables.
  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
  QualType StaticTy = C.getRecordType(TeamReductionRec);
  llvm::Type *LLVMReductionsBufferTy =
      CGM.getTypes().ConvertTypeForMem(StaticTy);
  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
      LLVMReductionsBufferTy->getPointerTo());

  // 1. Build a list of reduction variables.
  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
  Address ReductionList =
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
  auto IPriv = Privates.begin();
  // GEP indices {0, Idx}: select element 'Idx' within each field of the
  // buffer record.
  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
                         CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
                                              /*Volatile=*/false, C.IntTy,
                                              Loc)};
  unsigned Idx = 0;
  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
    // Global = Buffer.VD[Idx];
    const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
    const FieldDecl *FD = VarFieldMap.lookup(VD);
    LValue GlobLVal = CGF.EmitLValueForField(
        CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
    Address GlobAddr = GlobLVal.getAddress(CGF);
    llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
        GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
    // Store the address of the buffer element into the reduce-list slot.
    llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
    CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
    if ((*IPriv)->getType()->isVariablyModifiedType()) {
      // Store array size. VLA-typed reductions occupy two consecutive
      // slots: the data pointer and, encoded via inttoptr, the element
      // count of the variable-length array.
      ++Idx;
      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
      llvm::Value *Size = CGF.Builder.CreateIntCast(
          CGF.getVLASize(
              CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
              .NumElts,
          CGF.SizeTy, /*isSigned=*/false);
      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
                              Elem);
    }
  }

  // Call reduce_function(ReduceList, GlobalReduceList)
  llvm::Value *GlobalReduceList =
      CGF.EmitCastToVoidPtr(ReductionList.getPointer());
  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
      AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
      CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
  CGF.FinishFunction();
  return Fn;
}
3496*13fbcb42Sjoerg
3497*13fbcb42Sjoerg ///
3498*13fbcb42Sjoerg /// Design of OpenMP reductions on the GPU
3499*13fbcb42Sjoerg ///
3500*13fbcb42Sjoerg /// Consider a typical OpenMP program with one or more reduction
3501*13fbcb42Sjoerg /// clauses:
3502*13fbcb42Sjoerg ///
3503*13fbcb42Sjoerg /// float foo;
3504*13fbcb42Sjoerg /// double bar;
3505*13fbcb42Sjoerg /// #pragma omp target teams distribute parallel for \
3506*13fbcb42Sjoerg /// reduction(+:foo) reduction(*:bar)
3507*13fbcb42Sjoerg /// for (int i = 0; i < N; i++) {
3508*13fbcb42Sjoerg /// foo += A[i]; bar *= B[i];
3509*13fbcb42Sjoerg /// }
3510*13fbcb42Sjoerg ///
3511*13fbcb42Sjoerg /// where 'foo' and 'bar' are reduced across all OpenMP threads in
3512*13fbcb42Sjoerg /// all teams. In our OpenMP implementation on the NVPTX device an
3513*13fbcb42Sjoerg /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
3514*13fbcb42Sjoerg /// within a team are mapped to CUDA threads within a threadblock.
3515*13fbcb42Sjoerg /// Our goal is to efficiently aggregate values across all OpenMP
3516*13fbcb42Sjoerg /// threads such that:
3517*13fbcb42Sjoerg ///
3518*13fbcb42Sjoerg /// - the compiler and runtime are logically concise, and
3519*13fbcb42Sjoerg /// - the reduction is performed efficiently in a hierarchical
3520*13fbcb42Sjoerg /// manner as follows: within OpenMP threads in the same warp,
3521*13fbcb42Sjoerg /// across warps in a threadblock, and finally across teams on
3522*13fbcb42Sjoerg /// the NVPTX device.
3523*13fbcb42Sjoerg ///
3524*13fbcb42Sjoerg /// Introduction to Decoupling
3525*13fbcb42Sjoerg ///
3526*13fbcb42Sjoerg /// We would like to decouple the compiler and the runtime so that the
3527*13fbcb42Sjoerg /// latter is ignorant of the reduction variables (number, data types)
3528*13fbcb42Sjoerg /// and the reduction operators. This allows a simpler interface
3529*13fbcb42Sjoerg /// and implementation while still attaining good performance.
3530*13fbcb42Sjoerg ///
3531*13fbcb42Sjoerg /// Pseudocode for the aforementioned OpenMP program generated by the
3532*13fbcb42Sjoerg /// compiler is as follows:
3533*13fbcb42Sjoerg ///
3534*13fbcb42Sjoerg /// 1. Create private copies of reduction variables on each OpenMP
3535*13fbcb42Sjoerg /// thread: 'foo_private', 'bar_private'
3536*13fbcb42Sjoerg /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
3537*13fbcb42Sjoerg /// to it and writes the result in 'foo_private' and 'bar_private'
3538*13fbcb42Sjoerg /// respectively.
3539*13fbcb42Sjoerg /// 3. Call the OpenMP runtime on the GPU to reduce within a team
3540*13fbcb42Sjoerg /// and store the result on the team master:
3541*13fbcb42Sjoerg ///
3542*13fbcb42Sjoerg /// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
3543*13fbcb42Sjoerg /// reduceData, shuffleReduceFn, interWarpCpyFn)
3544*13fbcb42Sjoerg ///
3545*13fbcb42Sjoerg /// where:
3546*13fbcb42Sjoerg /// struct ReduceData {
3547*13fbcb42Sjoerg /// double *foo;
3548*13fbcb42Sjoerg /// double *bar;
3549*13fbcb42Sjoerg /// } reduceData
3550*13fbcb42Sjoerg /// reduceData.foo = &foo_private
3551*13fbcb42Sjoerg /// reduceData.bar = &bar_private
3552*13fbcb42Sjoerg ///
3553*13fbcb42Sjoerg /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
3554*13fbcb42Sjoerg /// auxiliary functions generated by the compiler that operate on
3555*13fbcb42Sjoerg /// variables of type 'ReduceData'. They aid the runtime perform
3556*13fbcb42Sjoerg /// algorithmic steps in a data agnostic manner.
3557*13fbcb42Sjoerg ///
3558*13fbcb42Sjoerg /// 'shuffleReduceFn' is a pointer to a function that reduces data
3559*13fbcb42Sjoerg /// of type 'ReduceData' across two OpenMP threads (lanes) in the
3560*13fbcb42Sjoerg /// same warp. It takes the following arguments as input:
3561*13fbcb42Sjoerg ///
3562*13fbcb42Sjoerg /// a. variable of type 'ReduceData' on the calling lane,
3563*13fbcb42Sjoerg /// b. its lane_id,
3564*13fbcb42Sjoerg /// c. an offset relative to the current lane_id to generate a
3565*13fbcb42Sjoerg /// remote_lane_id. The remote lane contains the second
3566*13fbcb42Sjoerg /// variable of type 'ReduceData' that is to be reduced.
3567*13fbcb42Sjoerg /// d. an algorithm version parameter determining which reduction
3568*13fbcb42Sjoerg /// algorithm to use.
3569*13fbcb42Sjoerg ///
3570*13fbcb42Sjoerg /// 'shuffleReduceFn' retrieves data from the remote lane using
3571*13fbcb42Sjoerg /// efficient GPU shuffle intrinsics and reduces, using the
3572*13fbcb42Sjoerg /// algorithm specified by the 4th parameter, the two operands
3573*13fbcb42Sjoerg /// element-wise. The result is written to the first operand.
3574*13fbcb42Sjoerg ///
3575*13fbcb42Sjoerg /// Different reduction algorithms are implemented in different
3576*13fbcb42Sjoerg /// runtime functions, all calling 'shuffleReduceFn' to perform
3577*13fbcb42Sjoerg /// the essential reduction step. Therefore, based on the 4th
3578*13fbcb42Sjoerg /// parameter, this function behaves slightly differently to
3579*13fbcb42Sjoerg /// cooperate with the runtime to ensure correctness under
3580*13fbcb42Sjoerg /// different circumstances.
3581*13fbcb42Sjoerg ///
3582*13fbcb42Sjoerg /// 'InterWarpCpyFn' is a pointer to a function that transfers
3583*13fbcb42Sjoerg /// reduced variables across warps. It tunnels, through CUDA
3584*13fbcb42Sjoerg /// shared memory, the thread-private data of type 'ReduceData'
3585*13fbcb42Sjoerg /// from lane 0 of each warp to a lane in the first warp.
3586*13fbcb42Sjoerg /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
3587*13fbcb42Sjoerg /// The last team writes the global reduced value to memory.
3588*13fbcb42Sjoerg ///
3589*13fbcb42Sjoerg /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
3590*13fbcb42Sjoerg /// reduceData, shuffleReduceFn, interWarpCpyFn,
3591*13fbcb42Sjoerg /// scratchpadCopyFn, loadAndReduceFn)
3592*13fbcb42Sjoerg ///
3593*13fbcb42Sjoerg /// 'scratchpadCopyFn' is a helper that stores reduced
3594*13fbcb42Sjoerg /// data from the team master to a scratchpad array in
3595*13fbcb42Sjoerg /// global memory.
3596*13fbcb42Sjoerg ///
3597*13fbcb42Sjoerg /// 'loadAndReduceFn' is a helper that loads data from
3598*13fbcb42Sjoerg /// the scratchpad array and reduces it with the input
3599*13fbcb42Sjoerg /// operand.
3600*13fbcb42Sjoerg ///
3601*13fbcb42Sjoerg /// These compiler generated functions hide address
3602*13fbcb42Sjoerg /// calculation and alignment information from the runtime.
3603*13fbcb42Sjoerg /// 5. if ret == 1:
3604*13fbcb42Sjoerg /// The team master of the last team stores the reduced
3605*13fbcb42Sjoerg /// result to the globals in memory.
3606*13fbcb42Sjoerg /// foo += reduceData.foo; bar *= reduceData.bar
3607*13fbcb42Sjoerg ///
3608*13fbcb42Sjoerg ///
3609*13fbcb42Sjoerg /// Warp Reduction Algorithms
3610*13fbcb42Sjoerg ///
3611*13fbcb42Sjoerg /// On the warp level, we have three algorithms implemented in the
3612*13fbcb42Sjoerg /// OpenMP runtime depending on the number of active lanes:
3613*13fbcb42Sjoerg ///
3614*13fbcb42Sjoerg /// Full Warp Reduction
3615*13fbcb42Sjoerg ///
3616*13fbcb42Sjoerg /// The reduce algorithm within a warp where all lanes are active
3617*13fbcb42Sjoerg /// is implemented in the runtime as follows:
3618*13fbcb42Sjoerg ///
3619*13fbcb42Sjoerg /// full_warp_reduce(void *reduce_data,
3620*13fbcb42Sjoerg /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3621*13fbcb42Sjoerg /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
3622*13fbcb42Sjoerg /// ShuffleReduceFn(reduce_data, 0, offset, 0);
3623*13fbcb42Sjoerg /// }
3624*13fbcb42Sjoerg ///
3625*13fbcb42Sjoerg /// The algorithm completes in log(2, WARPSIZE) steps.
3626*13fbcb42Sjoerg ///
3627*13fbcb42Sjoerg /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
3628*13fbcb42Sjoerg /// not used therefore we save instructions by not retrieving lane_id
3629*13fbcb42Sjoerg /// from the corresponding special registers. The 4th parameter, which
3630*13fbcb42Sjoerg /// represents the version of the algorithm being used, is set to 0 to
3631*13fbcb42Sjoerg /// signify full warp reduction.
3632*13fbcb42Sjoerg ///
3633*13fbcb42Sjoerg /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3634*13fbcb42Sjoerg ///
3635*13fbcb42Sjoerg /// #reduce_elem refers to an element in the local lane's data structure
3636*13fbcb42Sjoerg /// #remote_elem is retrieved from a remote lane
3637*13fbcb42Sjoerg /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3638*13fbcb42Sjoerg /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
3639*13fbcb42Sjoerg ///
3640*13fbcb42Sjoerg /// Contiguous Partial Warp Reduction
3641*13fbcb42Sjoerg ///
3642*13fbcb42Sjoerg /// This reduce algorithm is used within a warp where only the first
3643*13fbcb42Sjoerg /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
3644*13fbcb42Sjoerg /// number of OpenMP threads in a parallel region is not a multiple of
3645*13fbcb42Sjoerg /// WARPSIZE. The algorithm is implemented in the runtime as follows:
3646*13fbcb42Sjoerg ///
3647*13fbcb42Sjoerg /// void
3648*13fbcb42Sjoerg /// contiguous_partial_reduce(void *reduce_data,
3649*13fbcb42Sjoerg /// kmp_ShuffleReductFctPtr ShuffleReduceFn,
3650*13fbcb42Sjoerg /// int size, int lane_id) {
3651*13fbcb42Sjoerg /// int curr_size;
3652*13fbcb42Sjoerg /// int offset;
3653*13fbcb42Sjoerg /// curr_size = size;
/// offset = curr_size/2;
/// while (offset>0) {
3656*13fbcb42Sjoerg /// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
3657*13fbcb42Sjoerg /// curr_size = (curr_size+1)/2;
3658*13fbcb42Sjoerg /// offset = curr_size/2;
3659*13fbcb42Sjoerg /// }
3660*13fbcb42Sjoerg /// }
3661*13fbcb42Sjoerg ///
3662*13fbcb42Sjoerg /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3663*13fbcb42Sjoerg ///
3664*13fbcb42Sjoerg /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3665*13fbcb42Sjoerg /// if (lane_id < offset)
3666*13fbcb42Sjoerg /// reduce_elem = reduce_elem REDUCE_OP remote_elem
3667*13fbcb42Sjoerg /// else
3668*13fbcb42Sjoerg /// reduce_elem = remote_elem
3669*13fbcb42Sjoerg ///
3670*13fbcb42Sjoerg /// This algorithm assumes that the data to be reduced are located in a
3671*13fbcb42Sjoerg /// contiguous subset of lanes starting from the first. When there is
3672*13fbcb42Sjoerg /// an odd number of active lanes, the data in the last lane is not
/// aggregated with any other lane's data but is instead copied over.
3674*13fbcb42Sjoerg ///
3675*13fbcb42Sjoerg /// Dispersed Partial Warp Reduction
3676*13fbcb42Sjoerg ///
3677*13fbcb42Sjoerg /// This algorithm is used within a warp when any discontiguous subset of
3678*13fbcb42Sjoerg /// lanes are active. It is used to implement the reduction operation
3679*13fbcb42Sjoerg /// across lanes in an OpenMP simd region or in a nested parallel region.
3680*13fbcb42Sjoerg ///
3681*13fbcb42Sjoerg /// void
3682*13fbcb42Sjoerg /// dispersed_partial_reduce(void *reduce_data,
3683*13fbcb42Sjoerg /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3684*13fbcb42Sjoerg /// int size, remote_id;
3685*13fbcb42Sjoerg /// int logical_lane_id = number_of_active_lanes_before_me() * 2;
3686*13fbcb42Sjoerg /// do {
3687*13fbcb42Sjoerg /// remote_id = next_active_lane_id_right_after_me();
/// # the above function returns 0 if no active lane
3689*13fbcb42Sjoerg /// # is present right after the current lane.
3690*13fbcb42Sjoerg /// size = number_of_active_lanes_in_this_warp();
3691*13fbcb42Sjoerg /// logical_lane_id /= 2;
3692*13fbcb42Sjoerg /// ShuffleReduceFn(reduce_data, logical_lane_id,
3693*13fbcb42Sjoerg /// remote_id-1-threadIdx.x, 2);
3694*13fbcb42Sjoerg /// } while (logical_lane_id % 2 == 0 && size > 1);
3695*13fbcb42Sjoerg /// }
3696*13fbcb42Sjoerg ///
3697*13fbcb42Sjoerg /// There is no assumption made about the initial state of the reduction.
3698*13fbcb42Sjoerg /// Any number of lanes (>=1) could be active at any position. The reduction
3699*13fbcb42Sjoerg /// result is returned in the first active lane.
3700*13fbcb42Sjoerg ///
3701*13fbcb42Sjoerg /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3702*13fbcb42Sjoerg ///
3703*13fbcb42Sjoerg /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3704*13fbcb42Sjoerg /// if (lane_id % 2 == 0 && offset > 0)
3705*13fbcb42Sjoerg /// reduce_elem = reduce_elem REDUCE_OP remote_elem
3706*13fbcb42Sjoerg /// else
3707*13fbcb42Sjoerg /// reduce_elem = remote_elem
3708*13fbcb42Sjoerg ///
3709*13fbcb42Sjoerg ///
3710*13fbcb42Sjoerg /// Intra-Team Reduction
3711*13fbcb42Sjoerg ///
3712*13fbcb42Sjoerg /// This function, as implemented in the runtime call
3713*13fbcb42Sjoerg /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
3714*13fbcb42Sjoerg /// threads in a team. It first reduces within a warp using the
3715*13fbcb42Sjoerg /// aforementioned algorithms. We then proceed to gather all such
3716*13fbcb42Sjoerg /// reduced values at the first warp.
3717*13fbcb42Sjoerg ///
3718*13fbcb42Sjoerg /// The runtime makes use of the function 'InterWarpCpyFn', which copies
3719*13fbcb42Sjoerg /// data from each of the "warp master" (zeroth lane of each warp, where
3720*13fbcb42Sjoerg /// warp-reduced data is held) to the zeroth warp. This step reduces (in
3721*13fbcb42Sjoerg /// a mathematical sense) the problem of reduction across warp masters in
3722*13fbcb42Sjoerg /// a block to the problem of warp reduction.
3723*13fbcb42Sjoerg ///
3724*13fbcb42Sjoerg ///
3725*13fbcb42Sjoerg /// Inter-Team Reduction
3726*13fbcb42Sjoerg ///
3727*13fbcb42Sjoerg /// Once a team has reduced its data to a single value, it is stored in
3728*13fbcb42Sjoerg /// a global scratchpad array. Since each team has a distinct slot, this
3729*13fbcb42Sjoerg /// can be done without locking.
3730*13fbcb42Sjoerg ///
3731*13fbcb42Sjoerg /// The last team to write to the scratchpad array proceeds to reduce the
3732*13fbcb42Sjoerg /// scratchpad array. One or more workers in the last team use the helper
3733*13fbcb42Sjoerg /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
3734*13fbcb42Sjoerg /// the k'th worker reduces every k'th element.
3735*13fbcb42Sjoerg ///
3736*13fbcb42Sjoerg /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
3737*13fbcb42Sjoerg /// reduce across workers and compute a globally reduced value.
3738*13fbcb42Sjoerg ///
/// Emit code for an OpenMP reduction on the GPU. A "simple" reduction is
/// delegated to the generic CGOpenMPRuntime implementation; parallel and
/// teams reductions call the NVPTX runtime entry points together with the
/// compiler-generated helper functions described in the design notes above.
void CGOpenMPRuntimeGPU::emitReduction(
    CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
    ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
    ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
  if (!CGF.HaveInsertPoint())
    return;

  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
#ifndef NDEBUG
  // Only consulted by the asserts below, so keep it out of release builds.
  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
#endif

  if (Options.SimpleReduction) {
    // No cross-thread aggregation required; use the host-style lowering.
    assert(!TeamsReduction && !ParallelReduction &&
           "Invalid reduction selection in emitReduction.");
    CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
                                   ReductionOps, Options);
    return;
  }

  assert((TeamsReduction || ParallelReduction) &&
         "Invalid reduction selection in emitReduction.");

  // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
  // RedList, shuffle_reduce_func, interwarp_copy_func);
  // or
  // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
  llvm::Value *ThreadId = getThreadID(CGF, Loc);

  llvm::Value *Res;
  ASTContext &C = CGM.getContext();
  // 1. Build a list of reduction variables.
  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
  // Each VLA-typed variable takes an extra slot for its dynamic size.
  auto Size = RHSExprs.size();
  for (const Expr *E : Privates) {
    if (E->getType()->isVariablyModifiedType())
      // Reserve place for array size.
      ++Size;
  }
  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
  QualType ReductionArrayTy =
      C.getConstantArrayType(C.VoidPtrTy, ArraySize, nullptr, ArrayType::Normal,
                             /*IndexTypeQuals=*/0);
  Address ReductionList =
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
  auto IPriv = Privates.begin();
  unsigned Idx = 0;
  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
    CGF.Builder.CreateStore(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
            CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
        Elem);
    if ((*IPriv)->getType()->isVariablyModifiedType()) {
      // Store array size. The size is passed through the void* slot via
      // an inttoptr cast.
      ++Idx;
      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
      llvm::Value *Size = CGF.Builder.CreateIntCast(
          CGF.getVLASize(
              CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
              .NumElts,
          CGF.SizeTy, /*isSigned=*/false);
      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
                              Elem);
    }
  }

  // 2. Emit the data-agnostic helper functions handed to the runtime:
  // the combined reduction function plus the intra-warp shuffle-reduce
  // and inter-warp copy helpers.
  llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      ReductionList.getPointer(), CGF.VoidPtrTy);
  llvm::Function *ReductionFn = emitReductionFunction(
      Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
      LHSExprs, RHSExprs, ReductionOps);
  llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
  llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
      CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
  llvm::Value *InterWarpCopyFn =
      emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);

  if (ParallelReduction) {
    // 3a. Parallel reduction: aggregate within a single team.
    llvm::Value *Args[] = {RTLoc,
                           ThreadId,
                           CGF.Builder.getInt32(RHSExprs.size()),
                           ReductionArrayTySize,
                           RL,
                           ShuffleAndReduceFn,
                           InterWarpCopyFn};

    Res = CGF.EmitRuntimeCall(
        OMPBuilder.getOrCreateRuntimeFunction(
            CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
        Args);
  } else {
    // 3b. Teams reduction: additionally globalize the reduction variables
    // into a per-team record and emit the four buffer copy/reduce helpers.
    assert(TeamsReduction && "expected teams reduction.");
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
    llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
    int Cnt = 0;
    for (const Expr *DRE : Privates) {
      PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
      ++Cnt;
    }
    const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
        CGM.getContext(), PrivatesReductions, llvm::None, VarFieldMap,
        C.getLangOpts().OpenMPCUDAReductionBufNum);
    TeamsReductions.push_back(TeamReductionRec);
    if (!KernelTeamsReductionPtr) {
      // Lazily create the module-level pointer through which the device
      // runtime publishes the global teams-reduction buffer.
      KernelTeamsReductionPtr = new llvm::GlobalVariable(
          CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
          llvm::GlobalValue::InternalLinkage, nullptr,
          "_openmp_teams_reductions_buffer_$_$ptr");
    }
    llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
        Address(KernelTeamsReductionPtr, CGM.getPointerAlign()),
        /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
    llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
    llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
        ReductionFn);
    llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
    llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
        ReductionFn);

    llvm::Value *Args[] = {
        RTLoc,
        ThreadId,
        GlobalBufferPtr,
        CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
        RL,
        ShuffleAndReduceFn,
        InterWarpCopyFn,
        GlobalToBufferCpyFn,
        GlobalToBufferRedFn,
        BufferToGlobalCpyFn,
        BufferToGlobalRedFn};

    Res = CGF.EmitRuntimeCall(
        OMPBuilder.getOrCreateRuntimeFunction(
            CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
        Args);
  }

  // 5. Build if (res == 1)
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
  llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
      Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
  CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);

  // 6. Build then branch: where we have reduced values in the master
  // thread in each team.
  // __kmpc_end_reduce{_nowait}(<gtid>);
  // break;
  CGF.EmitBlock(ThenBB);

  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
  // The region body applies each reduction combiner (LHS = LHS op RHS)
  // to fold the locally reduced values into the original variables.
  auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
                    this](CodeGenFunction &CGF, PrePostActionTy &Action) {
    auto IPriv = Privates.begin();
    auto ILHS = LHSExprs.begin();
    auto IRHS = RHSExprs.begin();
    for (const Expr *E : ReductionOps) {
      emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
                                  cast<DeclRefExpr>(*IRHS));
      ++IPriv;
      ++ILHS;
      ++IRHS;
    }
  };
  llvm::Value *EndArgs[] = {ThreadId};
  RegionCodeGenTy RCG(CodeGen);
  // The post-action emits the __kmpc_nvptx_end_reduce_nowait call after
  // the combiners.
  NVPTXActionTy Action(
      nullptr, llvm::None,
      OMPBuilder.getOrCreateRuntimeFunction(
          CGM.getModule(), OMPRTL___kmpc_nvptx_end_reduce_nowait),
      EndArgs);
  RCG.setAction(Action);
  RCG(CGF);
  // There is no need to emit line number for unconditional branch.
  (void)ApplyDebugLocation::CreateEmpty(CGF);
  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
}
3923*13fbcb42Sjoerg
3924*13fbcb42Sjoerg const VarDecl *
translateParameter(const FieldDecl * FD,const VarDecl * NativeParam) const3925*13fbcb42Sjoerg CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
3926*13fbcb42Sjoerg const VarDecl *NativeParam) const {
3927*13fbcb42Sjoerg if (!NativeParam->getType()->isReferenceType())
3928*13fbcb42Sjoerg return NativeParam;
3929*13fbcb42Sjoerg QualType ArgType = NativeParam->getType();
3930*13fbcb42Sjoerg QualifierCollector QC;
3931*13fbcb42Sjoerg const Type *NonQualTy = QC.strip(ArgType);
3932*13fbcb42Sjoerg QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3933*13fbcb42Sjoerg if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
3934*13fbcb42Sjoerg if (Attr->getCaptureKind() == OMPC_map) {
3935*13fbcb42Sjoerg PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
3936*13fbcb42Sjoerg LangAS::opencl_global);
3937*13fbcb42Sjoerg } else if (Attr->getCaptureKind() == OMPC_firstprivate &&
3938*13fbcb42Sjoerg PointeeTy.isConstant(CGM.getContext())) {
3939*13fbcb42Sjoerg PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
3940*13fbcb42Sjoerg LangAS::opencl_generic);
3941*13fbcb42Sjoerg }
3942*13fbcb42Sjoerg }
3943*13fbcb42Sjoerg ArgType = CGM.getContext().getPointerType(PointeeTy);
3944*13fbcb42Sjoerg QC.addRestrict();
3945*13fbcb42Sjoerg enum { NVPTX_local_addr = 5 };
3946*13fbcb42Sjoerg QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
3947*13fbcb42Sjoerg ArgType = QC.apply(CGM.getContext(), ArgType);
3948*13fbcb42Sjoerg if (isa<ImplicitParamDecl>(NativeParam))
3949*13fbcb42Sjoerg return ImplicitParamDecl::Create(
3950*13fbcb42Sjoerg CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
3951*13fbcb42Sjoerg NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
3952*13fbcb42Sjoerg return ParmVarDecl::Create(
3953*13fbcb42Sjoerg CGM.getContext(),
3954*13fbcb42Sjoerg const_cast<DeclContext *>(NativeParam->getDeclContext()),
3955*13fbcb42Sjoerg NativeParam->getBeginLoc(), NativeParam->getLocation(),
3956*13fbcb42Sjoerg NativeParam->getIdentifier(), ArgType,
3957*13fbcb42Sjoerg /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
3958*13fbcb42Sjoerg }
3959*13fbcb42Sjoerg
// Recover the address of the native (reference-typed) parameter from its
// translated target-side twin: load the translated pointer, cast it back
// into the native pointee's address space, and return a fresh local
// temporary holding that pointer.
Address
CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
                                        const VarDecl *NativeParam,
                                        const VarDecl *TargetParam) const {
  assert(NativeParam != TargetParam &&
         NativeParam->getType()->isReferenceType() &&
         "Native arg must not be the same as target arg.");
  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
  QualType NativeParamType = NativeParam->getType();
  QualifierCollector QC;
  const Type *NonQualTy = QC.strip(NativeParamType);
  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
  unsigned NativePointeeAddrSpace =
      CGF.getContext().getTargetAddressSpace(NativePointeeTy);
  QualType TargetTy = TargetParam->getType();
  // Load the pointer value stored in the target parameter's slot.
  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
      LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
  // First cast to generic.
  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
                      /*AddrSpace=*/0));
  // Cast from generic to native address space.
  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
                      NativePointeeAddrSpace));
  // Stash the recovered pointer in a temporary typed like the native
  // parameter and hand that temporary back to the caller.
  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
                        NativeParamType);
  return NativeParamAddr;
}
3990*13fbcb42Sjoerg
emitOutlinedFunctionCall(CodeGenFunction & CGF,SourceLocation Loc,llvm::FunctionCallee OutlinedFn,ArrayRef<llvm::Value * > Args) const3991*13fbcb42Sjoerg void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
3992*13fbcb42Sjoerg CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
3993*13fbcb42Sjoerg ArrayRef<llvm::Value *> Args) const {
3994*13fbcb42Sjoerg SmallVector<llvm::Value *, 4> TargetArgs;
3995*13fbcb42Sjoerg TargetArgs.reserve(Args.size());
3996*13fbcb42Sjoerg auto *FnType = OutlinedFn.getFunctionType();
3997*13fbcb42Sjoerg for (unsigned I = 0, E = Args.size(); I < E; ++I) {
3998*13fbcb42Sjoerg if (FnType->isVarArg() && FnType->getNumParams() <= I) {
3999*13fbcb42Sjoerg TargetArgs.append(std::next(Args.begin(), I), Args.end());
4000*13fbcb42Sjoerg break;
4001*13fbcb42Sjoerg }
4002*13fbcb42Sjoerg llvm::Type *TargetType = FnType->getParamType(I);
4003*13fbcb42Sjoerg llvm::Value *NativeArg = Args[I];
4004*13fbcb42Sjoerg if (!TargetType->isPointerTy()) {
4005*13fbcb42Sjoerg TargetArgs.emplace_back(NativeArg);
4006*13fbcb42Sjoerg continue;
4007*13fbcb42Sjoerg }
4008*13fbcb42Sjoerg llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4009*13fbcb42Sjoerg NativeArg,
4010*13fbcb42Sjoerg NativeArg->getType()->getPointerElementType()->getPointerTo());
4011*13fbcb42Sjoerg TargetArgs.emplace_back(
4012*13fbcb42Sjoerg CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
4013*13fbcb42Sjoerg }
4014*13fbcb42Sjoerg CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
4015*13fbcb42Sjoerg }
4016*13fbcb42Sjoerg
/// Emit function which wraps the outline parallel region
/// and controls the arguments which are passed to this function.
/// The wrapper ensures that the outlined function is called
/// with the correct arguments when data is shared.
llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
    llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
  ASTContext &Ctx = CGM.getContext();
  const auto &CS = *D.getCapturedStmt(OMPD_parallel);

  // Create a function that takes as argument the source thread.
  // Wrapper signature: void(<uint16 parallel level>, <uint32 thread id>).
  FunctionArgList WrapperArgs;
  QualType Int16QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
  QualType Int32QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
                                     /*Id=*/nullptr, Int16QTy,
                                     ImplicitParamDecl::Other);
  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
                               /*Id=*/nullptr, Int32QTy,
                               ImplicitParamDecl::Other);
  WrapperArgs.emplace_back(&ParallelLevelArg);
  WrapperArgs.emplace_back(&WrapperArg);

  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);

  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());

  // Ensure we do not inline the function. This is trivially true for the ones
  // passed to __kmpc_fork_call but the ones called in serialized regions
  // could be inlined. This is not a perfect but it is closer to the invariant
  // we want, namely, every data environment starts with a new function.
  // TODO: We should pass the if condition to the runtime function and do the
  // handling there. Much cleaner code.
  Fn->addFnAttr(llvm::Attribute::NoInline);

  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
  Fn->setDoesNotRecurse();

  // Start emitting the wrapper body.
  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
                    D.getBeginLoc(), D.getBeginLoc());

  const auto *RD = CS.getCapturedRecordDecl();
  auto CurField = RD->field_begin();

  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
                                                      /*Name=*/".zero.addr");
  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
  // Get the array of arguments.
  SmallVector<llvm::Value *, 8> Args;

  // The outlined function receives the address of the wrapper's thread-id
  // argument plus a zero-initialized i32 slot.
  Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
  Args.emplace_back(ZeroAddr.getPointer());

  CGBuilderTy &Bld = CGF.Builder;
  auto CI = CS.capture_begin();

  // Use global memory for data sharing.
  // Handle passing of global args to workers.
  Address GlobalArgs =
      CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
                      DataSharingArgs);

  // Retrieve the shared variables from the list of references returned
  // by the runtime. Pass the variables to the outlined function.
  Address SharedArgListAddress = Address::invalid();
  if (CS.capture_size() > 0 ||
      isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
    SharedArgListAddress = CGF.EmitLoadOfPointer(
        GlobalArgs, CGF.getContext()
                        .getPointerType(CGF.getContext().getPointerType(
                            CGF.getContext().VoidPtrTy))
                        .castAs<PointerType>());
  }
  unsigned Idx = 0;
  // Loop-bound-sharing directives pass the lower and upper bounds as the
  // first two entries of the shared-argument list.
  if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
    Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
    Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
        Src, CGF.SizeTy->getPointerTo());
    llvm::Value *LB = CGF.EmitLoadOfScalar(
        TypedAddress,
        /*Volatile=*/false,
        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
        cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
    Args.emplace_back(LB);
    ++Idx;
    Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
    TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
        Src, CGF.SizeTy->getPointerTo());
    llvm::Value *UB = CGF.EmitLoadOfScalar(
        TypedAddress,
        /*Volatile=*/false,
        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
        cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
    Args.emplace_back(UB);
    ++Idx;
  }
  // Forward each captured entity from the shared-argument list.
  if (CS.capture_size() > 0) {
    ASTContext &CGFContext = CGF.getContext();
    for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
      QualType ElemTy = CurField->getType();
      Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
      Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
          Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
      llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
                                              /*Volatile=*/false,
                                              CGFContext.getPointerType(ElemTy),
                                              CI->getLocation());
      // By-copy captures of non-pointer values are re-cast to uintptr for
      // the outlined function's parameter convention.
      if (CI->capturesVariableByCopy() &&
          !CI->getCapturedVar()->getType()->isAnyPointerType()) {
        Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
                              CI->getLocation());
      }
      Args.emplace_back(Arg);
    }
  }

  emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
  CGF.FinishFunction();
  return Fn;
}
4147*13fbcb42Sjoerg
// Register locals of \p D that escape their declaration context so they can
// be globalized, and (unless delayed) emit the globalization prolog plus a
// cleanup that emits the matching epilog on function exit.
void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
                                            const Decl *D) {
  // Only the generic data-sharing mode requires globalization.
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
    return;

  assert(D && "Expected function or captured|block decl.");
  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
         "Function is registered already.");
  assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
         "Team is set but not processed.");
  const Stmt *Body = nullptr;
  bool NeedToDelayGlobalization = false;
  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
    Body = FD->getBody();
  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
    Body = BD->getBody();
  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
    Body = CD->getBody();
    // For OpenMP captured regions, emission of the prolog is delayed; in
    // SPMD mode it is skipped altogether.
    NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
    if (NeedToDelayGlobalization &&
        getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
      return;
  }
  if (!Body)
    return;
  // Collect the variables that escape this context and must be globalized.
  CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
  VarChecker.Visit(Body);
  const RecordDecl *GlobalizedVarsRecord =
      VarChecker.getGlobalizedRecord(IsInTTDRegion);
  TeamAndReductions.first = nullptr;
  TeamAndReductions.second.clear();
  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
      VarChecker.getEscapedVariableLengthDecls();
  // Nothing escaped: no globalization bookkeeping needed.
  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
    return;
  // Record the globalization data for the current function.
  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
  I->getSecond().MappedParams =
      std::make_unique<CodeGenFunction::OMPMapVars>();
  I->getSecond().GlobalRecord = GlobalizedVarsRecord;
  I->getSecond().EscapedParameters.insert(
      VarChecker.getEscapedParameters().begin(),
      VarChecker.getEscapedParameters().end());
  I->getSecond().EscapedVariableLengthDecls.append(
      EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
    assert(VD->isCanonicalDecl() && "Expected canonical declaration");
    const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
    Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion)));
  }
  // Additionally record a secondary globalized record computed as if we
  // were inside a target/teams/distribute region.
  if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
    CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
    VarChecker.Visit(Body);
    I->getSecond().SecondaryGlobalRecord =
        VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true);
    I->getSecond().SecondaryLocalVarData.emplace();
    DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
    for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
      assert(VD->isCanonicalDecl() && "Expected canonical declaration");
      const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
      Data.insert(
          std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true)));
    }
  }
  if (!NeedToDelayGlobalization) {
    // Emit the prolog now and schedule the epilog as a cleanup that runs on
    // both normal and exceptional exits.
    emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
    struct GlobalizationScope final : EHScopeStack::Cleanup {
      GlobalizationScope() = default;

      void Emit(CodeGenFunction &CGF, Flags flags) override {
        static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
            .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
      }
    };
    CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
  }
}
4225*13fbcb42Sjoerg
// Return the address to use for local variable \p VD, or an invalid Address
// when the default (stack) emission should be used. Honors an explicit
// OpenMP allocator attribute first, then the globalization records built in
// emitFunctionProlog.
Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
                                                      const VarDecl *VD) {
  // A variable with an 'allocate' attribute may need to live in a specific
  // device memory space, emitted as an internal module-level global.
  if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
    const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
    auto AS = LangAS::Default;
    switch (A->getAllocatorType()) {
    // Use the default allocator here as by default local vars are
    // threadlocal.
    case OMPAllocateDeclAttr::OMPNullMemAlloc:
    case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
    case OMPAllocateDeclAttr::OMPThreadMemAlloc:
    case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
    case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
      // Follow the user decision - use default allocation.
      return Address::invalid();
    case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
      // TODO: implement support for user-defined allocators.
      return Address::invalid();
    case OMPAllocateDeclAttr::OMPConstMemAlloc:
      AS = LangAS::cuda_constant;
      break;
    case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
      AS = LangAS::cuda_shared;
      break;
    case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
    case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
      break;
    }
    // Emit a zero-initialized internal global in the chosen address space,
    // then cast its address back into the variable's declared address space.
    llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
    auto *GV = new llvm::GlobalVariable(
        CGM.getModule(), VarTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage, llvm::Constant::getNullValue(VarTy),
        VD->getName(),
        /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(AS));
    CharUnits Align = CGM.getContext().getDeclAlign(VD);
    GV->setAlignment(Align.getAsAlign());
    return Address(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
            GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
                    VD->getType().getAddressSpace()))),
        Align);
  }

  // Globalized addresses are only tracked in generic data-sharing mode.
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
    return Address::invalid();

  VD = VD->getCanonicalDecl();
  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return Address::invalid();
  // Direct hit: the variable itself was globalized.
  auto VDI = I->getSecond().LocalVarData.find(VD);
  if (VDI != I->getSecond().LocalVarData.end())
    return VDI->second.PrivateAddr;
  // Otherwise follow OMPReferencedVarAttr links: a variable this one
  // references may have been globalized instead.
  if (VD->hasAttrs()) {
    for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
         E(VD->attr_end());
         IT != E; ++IT) {
      auto VDI = I->getSecond().LocalVarData.find(
          cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
              ->getCanonicalDecl());
      if (VDI != I->getSecond().LocalVarData.end())
        return VDI->second.PrivateAddr;
    }
  }

  return Address::invalid();
}
4294*13fbcb42Sjoerg
// Drop the globalization bookkeeping recorded for this function (see
// emitFunctionProlog), then run the base-class per-function cleanup.
void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
  FunctionGlobalizedDecls.erase(CGF.CurFn);
  CGOpenMPRuntime::functionFinished(CGF);
}
4299*13fbcb42Sjoerg
getDefaultDistScheduleAndChunk(CodeGenFunction & CGF,const OMPLoopDirective & S,OpenMPDistScheduleClauseKind & ScheduleKind,llvm::Value * & Chunk) const4300*13fbcb42Sjoerg void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
4301*13fbcb42Sjoerg CodeGenFunction &CGF, const OMPLoopDirective &S,
4302*13fbcb42Sjoerg OpenMPDistScheduleClauseKind &ScheduleKind,
4303*13fbcb42Sjoerg llvm::Value *&Chunk) const {
4304*13fbcb42Sjoerg auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
4305*13fbcb42Sjoerg if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
4306*13fbcb42Sjoerg ScheduleKind = OMPC_DIST_SCHEDULE_static;
4307*13fbcb42Sjoerg Chunk = CGF.EmitScalarConversion(
4308*13fbcb42Sjoerg RT.getGPUNumThreads(CGF),
4309*13fbcb42Sjoerg CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
4310*13fbcb42Sjoerg S.getIterationVariable()->getType(), S.getBeginLoc());
4311*13fbcb42Sjoerg return;
4312*13fbcb42Sjoerg }
4313*13fbcb42Sjoerg CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
4314*13fbcb42Sjoerg CGF, S, ScheduleKind, Chunk);
4315*13fbcb42Sjoerg }
4316*13fbcb42Sjoerg
getDefaultScheduleAndChunk(CodeGenFunction & CGF,const OMPLoopDirective & S,OpenMPScheduleClauseKind & ScheduleKind,const Expr * & ChunkExpr) const4317*13fbcb42Sjoerg void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
4318*13fbcb42Sjoerg CodeGenFunction &CGF, const OMPLoopDirective &S,
4319*13fbcb42Sjoerg OpenMPScheduleClauseKind &ScheduleKind,
4320*13fbcb42Sjoerg const Expr *&ChunkExpr) const {
4321*13fbcb42Sjoerg ScheduleKind = OMPC_SCHEDULE_static;
4322*13fbcb42Sjoerg // Chunk size is 1 in this case.
4323*13fbcb42Sjoerg llvm::APInt ChunkSize(32, 1);
4324*13fbcb42Sjoerg ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
4325*13fbcb42Sjoerg CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
4326*13fbcb42Sjoerg SourceLocation());
4327*13fbcb42Sjoerg }
4328*13fbcb42Sjoerg
// For a target-based directive, refresh the fields of captured lambda
// objects: store the current 'this' pointer and the current addresses of
// by-reference captures into the lambda's capture fields.
void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
    CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
  assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
         " Expected target-based directive.");
  const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
  for (const CapturedStmt::Capture &C : CS->captures()) {
    // Capture variables captured by reference in lambdas for target-based
    // directives.
    if (!C.capturesVariable())
      continue;
    const VarDecl *VD = C.getCapturedVar();
    // Only lambda-typed captures are of interest here.
    const auto *RD = VD->getType()
                         .getCanonicalType()
                         .getNonReferenceType()
                         ->getAsCXXRecordDecl();
    if (!RD || !RD->isLambda())
      continue;
    // Form an lvalue for the lambda object itself, loading through the
    // reference if the capture is reference-typed.
    Address VDAddr = CGF.GetAddrOfLocalVar(VD);
    LValue VDLVal;
    if (VD->getType().getCanonicalType()->isReferenceType())
      VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
    else
      VDLVal = CGF.MakeAddrLValue(
          VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
    llvm::DenseMap<const VarDecl *, FieldDecl *> Captures;
    FieldDecl *ThisCapture = nullptr;
    RD->getCaptureFields(Captures, ThisCapture);
    // If the lambda captured 'this' and the region has one, refresh it.
    if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
      LValue ThisLVal =
          CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
      llvm::Value *CXXThis = CGF.LoadCXXThis();
      CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
    }
    // Refresh each by-reference capture with the variable's current address.
    for (const LambdaCapture &LC : RD->captures()) {
      if (LC.getCaptureKind() != LCK_ByRef)
        continue;
      const VarDecl *VD = LC.getCapturedVar();
      if (!CS->capturesVariable(VD))
        continue;
      auto It = Captures.find(VD);
      assert(It != Captures.end() && "Found lambda capture without field.");
      LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
      Address VDAddr = CGF.GetAddrOfLocalVar(VD);
      if (VD->getType().getCanonicalType()->isReferenceType())
        VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
                                               VD->getType().getCanonicalType())
                     .getAddress(CGF);
      CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
    }
  }
}
4380*13fbcb42Sjoerg
// Firstprivate variables default to the CUDA constant address space on the
// device.
unsigned CGOpenMPRuntimeGPU::getDefaultFirstprivateAddressSpace() const {
  return CGM.getContext().getTargetAddressSpace(LangAS::cuda_constant);
}
4384*13fbcb42Sjoerg
// Map the OpenMP allocator specified on global variable \p VD to a language
// address space. Returns true and sets \p AS when the variable carries an
// allocate attribute; returns false when it does not.
bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
                                                          LangAS &AS) {
  if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
    return false;
  const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
  switch(A->getAllocatorType()) {
  case OMPAllocateDeclAttr::OMPNullMemAlloc:
  case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
  // Not supported, fallback to the default mem space.
  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
  case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
  case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
  case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
  case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
    AS = LangAS::Default;
    return true;
  case OMPAllocateDeclAttr::OMPConstMemAlloc:
    AS = LangAS::cuda_constant;
    return true;
  case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
    AS = LangAS::cuda_shared;
    return true;
  case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
    llvm_unreachable("Expected predefined allocator for the variables with the "
                     "static storage.");
  }
  return false;
}
4413*13fbcb42Sjoerg
4414*13fbcb42Sjoerg // Get current CudaArch and ignore any unknown values
getCudaArch(CodeGenModule & CGM)4415*13fbcb42Sjoerg static CudaArch getCudaArch(CodeGenModule &CGM) {
4416*13fbcb42Sjoerg if (!CGM.getTarget().hasFeature("ptx"))
4417*13fbcb42Sjoerg return CudaArch::UNKNOWN;
4418*13fbcb42Sjoerg for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
4419*13fbcb42Sjoerg if (Feature.getValue()) {
4420*13fbcb42Sjoerg CudaArch Arch = StringToCudaArch(Feature.getKey());
4421*13fbcb42Sjoerg if (Arch != CudaArch::UNKNOWN)
4422*13fbcb42Sjoerg return Arch;
4423*13fbcb42Sjoerg }
4424*13fbcb42Sjoerg }
4425*13fbcb42Sjoerg return CudaArch::UNKNOWN;
4426*13fbcb42Sjoerg }
4427*13fbcb42Sjoerg
4428*13fbcb42Sjoerg /// Check to see if target architecture supports unified addressing which is
4429*13fbcb42Sjoerg /// a restriction for OpenMP requires clause "unified_shared_memory".
processRequiresDirective(const OMPRequiresDecl * D)4430*13fbcb42Sjoerg void CGOpenMPRuntimeGPU::processRequiresDirective(
4431*13fbcb42Sjoerg const OMPRequiresDecl *D) {
4432*13fbcb42Sjoerg for (const OMPClause *Clause : D->clauselists()) {
4433*13fbcb42Sjoerg if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
4434*13fbcb42Sjoerg CudaArch Arch = getCudaArch(CGM);
4435*13fbcb42Sjoerg switch (Arch) {
4436*13fbcb42Sjoerg case CudaArch::SM_20:
4437*13fbcb42Sjoerg case CudaArch::SM_21:
4438*13fbcb42Sjoerg case CudaArch::SM_30:
4439*13fbcb42Sjoerg case CudaArch::SM_32:
4440*13fbcb42Sjoerg case CudaArch::SM_35:
4441*13fbcb42Sjoerg case CudaArch::SM_37:
4442*13fbcb42Sjoerg case CudaArch::SM_50:
4443*13fbcb42Sjoerg case CudaArch::SM_52:
4444*13fbcb42Sjoerg case CudaArch::SM_53: {
4445*13fbcb42Sjoerg SmallString<256> Buffer;
4446*13fbcb42Sjoerg llvm::raw_svector_ostream Out(Buffer);
4447*13fbcb42Sjoerg Out << "Target architecture " << CudaArchToString(Arch)
4448*13fbcb42Sjoerg << " does not support unified addressing";
4449*13fbcb42Sjoerg CGM.Error(Clause->getBeginLoc(), Out.str());
4450*13fbcb42Sjoerg return;
4451*13fbcb42Sjoerg }
4452*13fbcb42Sjoerg case CudaArch::SM_60:
4453*13fbcb42Sjoerg case CudaArch::SM_61:
4454*13fbcb42Sjoerg case CudaArch::SM_62:
4455*13fbcb42Sjoerg case CudaArch::SM_70:
4456*13fbcb42Sjoerg case CudaArch::SM_72:
4457*13fbcb42Sjoerg case CudaArch::SM_75:
4458*13fbcb42Sjoerg case CudaArch::SM_80:
4459*13fbcb42Sjoerg case CudaArch::SM_86:
4460*13fbcb42Sjoerg case CudaArch::GFX600:
4461*13fbcb42Sjoerg case CudaArch::GFX601:
4462*13fbcb42Sjoerg case CudaArch::GFX602:
4463*13fbcb42Sjoerg case CudaArch::GFX700:
4464*13fbcb42Sjoerg case CudaArch::GFX701:
4465*13fbcb42Sjoerg case CudaArch::GFX702:
4466*13fbcb42Sjoerg case CudaArch::GFX703:
4467*13fbcb42Sjoerg case CudaArch::GFX704:
4468*13fbcb42Sjoerg case CudaArch::GFX705:
4469*13fbcb42Sjoerg case CudaArch::GFX801:
4470*13fbcb42Sjoerg case CudaArch::GFX802:
4471*13fbcb42Sjoerg case CudaArch::GFX803:
4472*13fbcb42Sjoerg case CudaArch::GFX805:
4473*13fbcb42Sjoerg case CudaArch::GFX810:
4474*13fbcb42Sjoerg case CudaArch::GFX900:
4475*13fbcb42Sjoerg case CudaArch::GFX902:
4476*13fbcb42Sjoerg case CudaArch::GFX904:
4477*13fbcb42Sjoerg case CudaArch::GFX906:
4478*13fbcb42Sjoerg case CudaArch::GFX908:
4479*13fbcb42Sjoerg case CudaArch::GFX909:
4480*13fbcb42Sjoerg case CudaArch::GFX90a:
4481*13fbcb42Sjoerg case CudaArch::GFX90c:
4482*13fbcb42Sjoerg case CudaArch::GFX1010:
4483*13fbcb42Sjoerg case CudaArch::GFX1011:
4484*13fbcb42Sjoerg case CudaArch::GFX1012:
4485*13fbcb42Sjoerg case CudaArch::GFX1030:
4486*13fbcb42Sjoerg case CudaArch::GFX1031:
4487*13fbcb42Sjoerg case CudaArch::GFX1032:
4488*13fbcb42Sjoerg case CudaArch::GFX1033:
4489*13fbcb42Sjoerg case CudaArch::GFX1034:
4490*13fbcb42Sjoerg case CudaArch::UNUSED:
4491*13fbcb42Sjoerg case CudaArch::UNKNOWN:
4492*13fbcb42Sjoerg break;
4493*13fbcb42Sjoerg case CudaArch::LAST:
4494*13fbcb42Sjoerg llvm_unreachable("Unexpected Cuda arch.");
4495*13fbcb42Sjoerg }
4496*13fbcb42Sjoerg }
4497*13fbcb42Sjoerg }
4498*13fbcb42Sjoerg CGOpenMPRuntime::processRequiresDirective(D);
4499*13fbcb42Sjoerg }
4500*13fbcb42Sjoerg
4501*13fbcb42Sjoerg /// Get number of SMs and number of blocks per SM.
getSMsBlocksPerSM(CodeGenModule & CGM)4502*13fbcb42Sjoerg static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
4503*13fbcb42Sjoerg std::pair<unsigned, unsigned> Data;
4504*13fbcb42Sjoerg if (CGM.getLangOpts().OpenMPCUDANumSMs)
4505*13fbcb42Sjoerg Data.first = CGM.getLangOpts().OpenMPCUDANumSMs;
4506*13fbcb42Sjoerg if (CGM.getLangOpts().OpenMPCUDABlocksPerSM)
4507*13fbcb42Sjoerg Data.second = CGM.getLangOpts().OpenMPCUDABlocksPerSM;
4508*13fbcb42Sjoerg if (Data.first && Data.second)
4509*13fbcb42Sjoerg return Data;
4510*13fbcb42Sjoerg switch (getCudaArch(CGM)) {
4511*13fbcb42Sjoerg case CudaArch::SM_20:
4512*13fbcb42Sjoerg case CudaArch::SM_21:
4513*13fbcb42Sjoerg case CudaArch::SM_30:
4514*13fbcb42Sjoerg case CudaArch::SM_32:
4515*13fbcb42Sjoerg case CudaArch::SM_35:
4516*13fbcb42Sjoerg case CudaArch::SM_37:
4517*13fbcb42Sjoerg case CudaArch::SM_50:
4518*13fbcb42Sjoerg case CudaArch::SM_52:
4519*13fbcb42Sjoerg case CudaArch::SM_53:
4520*13fbcb42Sjoerg return {16, 16};
4521*13fbcb42Sjoerg case CudaArch::SM_60:
4522*13fbcb42Sjoerg case CudaArch::SM_61:
4523*13fbcb42Sjoerg case CudaArch::SM_62:
4524*13fbcb42Sjoerg return {56, 32};
4525*13fbcb42Sjoerg case CudaArch::SM_70:
4526*13fbcb42Sjoerg case CudaArch::SM_72:
4527*13fbcb42Sjoerg case CudaArch::SM_75:
4528*13fbcb42Sjoerg case CudaArch::SM_80:
4529*13fbcb42Sjoerg case CudaArch::SM_86:
4530*13fbcb42Sjoerg return {84, 32};
4531*13fbcb42Sjoerg case CudaArch::GFX600:
4532*13fbcb42Sjoerg case CudaArch::GFX601:
4533*13fbcb42Sjoerg case CudaArch::GFX602:
4534*13fbcb42Sjoerg case CudaArch::GFX700:
4535*13fbcb42Sjoerg case CudaArch::GFX701:
4536*13fbcb42Sjoerg case CudaArch::GFX702:
4537*13fbcb42Sjoerg case CudaArch::GFX703:
4538*13fbcb42Sjoerg case CudaArch::GFX704:
4539*13fbcb42Sjoerg case CudaArch::GFX705:
4540*13fbcb42Sjoerg case CudaArch::GFX801:
4541*13fbcb42Sjoerg case CudaArch::GFX802:
4542*13fbcb42Sjoerg case CudaArch::GFX803:
4543*13fbcb42Sjoerg case CudaArch::GFX805:
4544*13fbcb42Sjoerg case CudaArch::GFX810:
4545*13fbcb42Sjoerg case CudaArch::GFX900:
4546*13fbcb42Sjoerg case CudaArch::GFX902:
4547*13fbcb42Sjoerg case CudaArch::GFX904:
4548*13fbcb42Sjoerg case CudaArch::GFX906:
4549*13fbcb42Sjoerg case CudaArch::GFX908:
4550*13fbcb42Sjoerg case CudaArch::GFX909:
4551*13fbcb42Sjoerg case CudaArch::GFX90a:
4552*13fbcb42Sjoerg case CudaArch::GFX90c:
4553*13fbcb42Sjoerg case CudaArch::GFX1010:
4554*13fbcb42Sjoerg case CudaArch::GFX1011:
4555*13fbcb42Sjoerg case CudaArch::GFX1012:
4556*13fbcb42Sjoerg case CudaArch::GFX1030:
4557*13fbcb42Sjoerg case CudaArch::GFX1031:
4558*13fbcb42Sjoerg case CudaArch::GFX1032:
4559*13fbcb42Sjoerg case CudaArch::GFX1033:
4560*13fbcb42Sjoerg case CudaArch::GFX1034:
4561*13fbcb42Sjoerg case CudaArch::UNUSED:
4562*13fbcb42Sjoerg case CudaArch::UNKNOWN:
4563*13fbcb42Sjoerg break;
4564*13fbcb42Sjoerg case CudaArch::LAST:
4565*13fbcb42Sjoerg llvm_unreachable("Unexpected Cuda arch.");
4566*13fbcb42Sjoerg }
4567*13fbcb42Sjoerg llvm_unreachable("Unexpected NVPTX target without ptx feature.");
4568*13fbcb42Sjoerg }
4569*13fbcb42Sjoerg
clear()4570*13fbcb42Sjoerg void CGOpenMPRuntimeGPU::clear() {
// Finalize module codegen: materialize the storage that backs the collected
// "globalized" record sets and the teams-reduction records, then delegate to
// the base runtime's clear().
4571*13fbcb42Sjoerg if (!GlobalizedRecords.empty() &&
4572*13fbcb42Sjoerg !CGM.getLangOpts().OpenMPCUDATargetParallel) {
4573*13fbcb42Sjoerg ASTContext &C = CGM.getContext();
// Record sets small enough for shared memory are tracked separately from
// those that must go to global memory.
4574*13fbcb42Sjoerg llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> GlobalRecs;
4575*13fbcb42Sjoerg llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> SharedRecs;
// Two implicit unions, one per memory kind; a union is used so every record
// set overlays the same storage.
4576*13fbcb42Sjoerg RecordDecl *StaticRD = C.buildImplicitRecord(
4577*13fbcb42Sjoerg "_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
4578*13fbcb42Sjoerg StaticRD->startDefinition();
4579*13fbcb42Sjoerg RecordDecl *SharedStaticRD = C.buildImplicitRecord(
4580*13fbcb42Sjoerg "_shared_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
4581*13fbcb42Sjoerg SharedStaticRD->startDefinition();
4582*13fbcb42Sjoerg for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) {
4583*13fbcb42Sjoerg if (Records.Records.empty())
4584*13fbcb42Sjoerg continue;
// Total size of this record set: each record is placed at its natural
// alignment and the running offset is padded accordingly; the final size is
// rounded up to the maximum alignment seen.
4585*13fbcb42Sjoerg unsigned Size = 0;
4586*13fbcb42Sjoerg unsigned RecAlignment = 0;
4587*13fbcb42Sjoerg for (const RecordDecl *RD : Records.Records) {
4588*13fbcb42Sjoerg QualType RDTy = C.getRecordType(RD);
4589*13fbcb42Sjoerg unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity();
4590*13fbcb42Sjoerg RecAlignment = std::max(RecAlignment, Alignment);
4591*13fbcb42Sjoerg unsigned RecSize = C.getTypeSizeInChars(RDTy).getQuantity();
4592*13fbcb42Sjoerg Size =
4593*13fbcb42Sjoerg llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment);
4594*13fbcb42Sjoerg }
4595*13fbcb42Sjoerg Size = llvm::alignTo(Size, RecAlignment);
// Model the set as a char[Size] union field; sets that fit within
// SharedMemorySize are placed in the shared-memory union.
4596*13fbcb42Sjoerg llvm::APInt ArySize(/*numBits=*/64, Size);
4597*13fbcb42Sjoerg QualType SubTy = C.getConstantArrayType(
4598*13fbcb42Sjoerg C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0);
4599*13fbcb42Sjoerg const bool UseSharedMemory = Size <= SharedMemorySize;
4600*13fbcb42Sjoerg auto *Field =
4601*13fbcb42Sjoerg FieldDecl::Create(C, UseSharedMemory ? SharedStaticRD : StaticRD,
4602*13fbcb42Sjoerg SourceLocation(), SourceLocation(), nullptr, SubTy,
4603*13fbcb42Sjoerg C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
4604*13fbcb42Sjoerg /*BW=*/nullptr, /*Mutable=*/false,
4605*13fbcb42Sjoerg /*InitStyle=*/ICIS_NoInit);
4606*13fbcb42Sjoerg Field->setAccess(AS_public);
4607*13fbcb42Sjoerg if (UseSharedMemory) {
4608*13fbcb42Sjoerg SharedStaticRD->addDecl(Field);
4609*13fbcb42Sjoerg SharedRecs.push_back(&Records);
4610*13fbcb42Sjoerg } else {
4611*13fbcb42Sjoerg StaticRD->addDecl(Field);
4612*13fbcb42Sjoerg GlobalRecs.push_back(&Records);
4613*13fbcb42Sjoerg }
// Patch the previously-emitted size and use-shared-memory globals with the
// now-final values for this record set.
4614*13fbcb42Sjoerg Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size));
4615*13fbcb42Sjoerg Records.UseSharedMemory->setInitializer(
4616*13fbcb42Sjoerg llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
4617*13fbcb42Sjoerg }
4618*13fbcb42Sjoerg // Allocate SharedMemorySize buffer for the shared memory.
4619*13fbcb42Sjoerg // FIXME: nvlink does not handle weak linkage correctly (object with the
4620*13fbcb42Sjoerg // different size are reported as erroneous).
4621*13fbcb42Sjoerg // Restore this code as soon as nvlink is fixed.
// Pad the shared union up to the full SharedMemorySize with an extra
// char-array member.
4622*13fbcb42Sjoerg if (!SharedStaticRD->field_empty()) {
4623*13fbcb42Sjoerg llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize);
4624*13fbcb42Sjoerg QualType SubTy = C.getConstantArrayType(
4625*13fbcb42Sjoerg C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0);
4626*13fbcb42Sjoerg auto *Field = FieldDecl::Create(
4627*13fbcb42Sjoerg C, SharedStaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy,
4628*13fbcb42Sjoerg C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
4629*13fbcb42Sjoerg /*BW=*/nullptr, /*Mutable=*/false,
4630*13fbcb42Sjoerg /*InitStyle=*/ICIS_NoInit);
4631*13fbcb42Sjoerg Field->setAccess(AS_public);
4632*13fbcb42Sjoerg SharedStaticRD->addDecl(Field);
4633*13fbcb42Sjoerg }
4634*13fbcb42Sjoerg SharedStaticRD->completeDefinition();
4635*13fbcb42Sjoerg if (!SharedStaticRD->field_empty()) {
// Emit one weak global in the CUDA shared address space and redirect every
// shared-record buffer placeholder to it, erasing the placeholders.
4636*13fbcb42Sjoerg QualType StaticTy = C.getRecordType(SharedStaticRD);
4637*13fbcb42Sjoerg llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
4638*13fbcb42Sjoerg auto *GV = new llvm::GlobalVariable(
4639*13fbcb42Sjoerg CGM.getModule(), LLVMStaticTy,
4640*13fbcb42Sjoerg /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
4641*13fbcb42Sjoerg llvm::UndefValue::get(LLVMStaticTy),
4642*13fbcb42Sjoerg "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
4643*13fbcb42Sjoerg llvm::GlobalValue::NotThreadLocal,
4644*13fbcb42Sjoerg C.getTargetAddressSpace(LangAS::cuda_shared));
4645*13fbcb42Sjoerg auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
4646*13fbcb42Sjoerg GV, CGM.VoidPtrTy);
4647*13fbcb42Sjoerg for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
4648*13fbcb42Sjoerg Rec->Buffer->replaceAllUsesWith(Replacement);
4649*13fbcb42Sjoerg Rec->Buffer->eraseFromParent();
4650*13fbcb42Sjoerg }
4651*13fbcb42Sjoerg }
4652*13fbcb42Sjoerg StaticRD->completeDefinition();
4653*13fbcb42Sjoerg if (!StaticRD->field_empty()) {
// Global-memory storage: a [NumSMs][BlocksPerSM] array of the union
// (dimensions from getSMsBlocksPerSM), then redirect and erase the
// global-record buffer placeholders.
4654*13fbcb42Sjoerg QualType StaticTy = C.getRecordType(StaticRD);
4655*13fbcb42Sjoerg std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM);
4656*13fbcb42Sjoerg llvm::APInt Size1(32, SMsBlockPerSM.second);
4657*13fbcb42Sjoerg QualType Arr1Ty =
4658*13fbcb42Sjoerg C.getConstantArrayType(StaticTy, Size1, nullptr, ArrayType::Normal,
4659*13fbcb42Sjoerg /*IndexTypeQuals=*/0);
4660*13fbcb42Sjoerg llvm::APInt Size2(32, SMsBlockPerSM.first);
4661*13fbcb42Sjoerg QualType Arr2Ty =
4662*13fbcb42Sjoerg C.getConstantArrayType(Arr1Ty, Size2, nullptr, ArrayType::Normal,
4663*13fbcb42Sjoerg /*IndexTypeQuals=*/0);
4664*13fbcb42Sjoerg llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
4665*13fbcb42Sjoerg // FIXME: nvlink does not handle weak linkage correctly (object with the
4666*13fbcb42Sjoerg // different size are reported as erroneous).
4667*13fbcb42Sjoerg // Restore CommonLinkage as soon as nvlink is fixed.
4668*13fbcb42Sjoerg auto *GV = new llvm::GlobalVariable(
4669*13fbcb42Sjoerg CGM.getModule(), LLVMArr2Ty,
4670*13fbcb42Sjoerg /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
4671*13fbcb42Sjoerg llvm::Constant::getNullValue(LLVMArr2Ty),
4672*13fbcb42Sjoerg "_openmp_static_glob_rd_$_");
4673*13fbcb42Sjoerg auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
4674*13fbcb42Sjoerg GV, CGM.VoidPtrTy);
4675*13fbcb42Sjoerg for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
4676*13fbcb42Sjoerg Rec->Buffer->replaceAllUsesWith(Replacement);
4677*13fbcb42Sjoerg Rec->Buffer->eraseFromParent();
4678*13fbcb42Sjoerg }
4679*13fbcb42Sjoerg }
4680*13fbcb42Sjoerg }
// Teams reductions: build a union of all reduction record types and emit one
// internal-linkage buffer sized to hold any of them; the kernel-visible
// pointer global is initialized to point at that buffer.
4681*13fbcb42Sjoerg if (!TeamsReductions.empty()) {
4682*13fbcb42Sjoerg ASTContext &C = CGM.getContext();
4683*13fbcb42Sjoerg RecordDecl *StaticRD = C.buildImplicitRecord(
4684*13fbcb42Sjoerg "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
4685*13fbcb42Sjoerg StaticRD->startDefinition();
4686*13fbcb42Sjoerg for (const RecordDecl *TeamReductionRec : TeamsReductions) {
4687*13fbcb42Sjoerg QualType RecTy = C.getRecordType(TeamReductionRec);
4688*13fbcb42Sjoerg auto *Field = FieldDecl::Create(
4689*13fbcb42Sjoerg C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
4690*13fbcb42Sjoerg C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
4691*13fbcb42Sjoerg /*BW=*/nullptr, /*Mutable=*/false,
4692*13fbcb42Sjoerg /*InitStyle=*/ICIS_NoInit);
4693*13fbcb42Sjoerg Field->setAccess(AS_public);
4694*13fbcb42Sjoerg StaticRD->addDecl(Field);
4695*13fbcb42Sjoerg }
4696*13fbcb42Sjoerg StaticRD->completeDefinition();
4697*13fbcb42Sjoerg QualType StaticTy = C.getRecordType(StaticRD);
4698*13fbcb42Sjoerg llvm::Type *LLVMReductionsBufferTy =
4699*13fbcb42Sjoerg CGM.getTypes().ConvertTypeForMem(StaticTy);
4700*13fbcb42Sjoerg // FIXME: nvlink does not handle weak linkage correctly (object with the
4701*13fbcb42Sjoerg // different size are reported as erroneous).
4702*13fbcb42Sjoerg // Restore CommonLinkage as soon as nvlink is fixed.
4703*13fbcb42Sjoerg auto *GV = new llvm::GlobalVariable(
4704*13fbcb42Sjoerg CGM.getModule(), LLVMReductionsBufferTy,
4705*13fbcb42Sjoerg /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
4706*13fbcb42Sjoerg llvm::Constant::getNullValue(LLVMReductionsBufferTy),
4707*13fbcb42Sjoerg "_openmp_teams_reductions_buffer_$_");
4708*13fbcb42Sjoerg KernelTeamsReductionPtr->setInitializer(
4709*13fbcb42Sjoerg llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
4710*13fbcb42Sjoerg CGM.VoidPtrTy));
4711*13fbcb42Sjoerg }
4712*13fbcb42Sjoerg CGOpenMPRuntime::clear();
4713*13fbcb42Sjoerg }
4714