//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a generalized class for OpenMP runtime code generation
// specialized by GPU targets NVPTX and AMDGCN.
//
//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeGPU.h"
#include "CGOpenMPRuntimeNVPTX.h"
#include "CodeGenFunction.h"
#include "clang/AST/Attr.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/StmtVisitor.h"
#include "clang/Basic/Cuda.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/IR/IntrinsicsNVPTX.h"

using namespace clang;
using namespace CodeGen;
using namespace llvm::omp;

namespace {
/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
class NVPTXActionTy final : public PrePostActionTy {
  llvm::FunctionCallee EnterCallee = nullptr;
  ArrayRef<llvm::Value *> EnterArgs;
  llvm::FunctionCallee ExitCallee = nullptr;
  ArrayRef<llvm::Value *> ExitArgs;
  bool Conditional = false;
  llvm::BasicBlock *ContBlock = nullptr;

public:
  NVPTXActionTy(llvm::FunctionCallee EnterCallee,
                ArrayRef<llvm::Value *> EnterArgs,
                llvm::FunctionCallee ExitCallee,
                ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
        ExitArgs(ExitArgs), Conditional(Conditional) {}
  void Enter(CodeGenFunction &CGF) override {
    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
    if (Conditional) {
      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
      ContBlock = CGF.createBasicBlock("omp_if.end");
      // Generate the branch (If-stmt)
      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
      CGF.EmitBlock(ThenBlock);
    }
  }
  void Done(CodeGenFunction &CGF) {
    // Emit the rest of blocks/branches
    CGF.EmitBranch(ContBlock);
    CGF.EmitBlock(ContBlock, true);
  }
  void Exit(CodeGenFunction &CGF) override {
    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
  }
};
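// Illustrative use (a sketch with hypothetical callees, not names from this
// file): with Conditional == true,
//   NVPTXActionTy Action(EnterFn, Args, ExitFn, Args, /*Conditional=*/true);
// Enter() emits the EnterFn(Args) call and branches on its non-null result
// into omp_if.then, Done() closes the branch at omp_if.end, and Exit()
// unconditionally emits the ExitFn(Args) call when the region is left.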

/// A class to track the execution mode when codegening directives within
/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
/// to the target region and used by containing directives such as 'parallel'
/// to emit optimized code.
class ExecutionRuntimeModesRAII {
private:
  CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
      CGOpenMPRuntimeGPU::EM_Unknown;
  CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
  bool SavedRuntimeMode = false;
  bool *RuntimeMode = nullptr;

public:
  /// Constructor for Non-SPMD mode.
  ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode)
      : ExecMode(ExecMode) {
    SavedExecMode = ExecMode;
    ExecMode = CGOpenMPRuntimeGPU::EM_NonSPMD;
  }
  /// Constructor for SPMD mode.
  ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
                            bool &RuntimeMode, bool FullRuntimeMode)
      : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) {
    SavedExecMode = ExecMode;
    SavedRuntimeMode = RuntimeMode;
    ExecMode = CGOpenMPRuntimeGPU::EM_SPMD;
    RuntimeMode = FullRuntimeMode;
  }
  ~ExecutionRuntimeModesRAII() {
    ExecMode = SavedExecMode;
    if (RuntimeMode)
      *RuntimeMode = SavedRuntimeMode;
  }
};
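// Typical use (a minimal sketch; the flag names are illustrative, not from
// this file): on entry to an SPMD target region the caller constructs
//   ExecutionRuntimeModesRAII ModeRAII(CurMode, CurRuntimeMode,
//                                      /*FullRuntimeMode=*/false);
// and both flags are restored automatically when ModeRAII goes out of scope.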

/// GPU Configuration: This information can be derived from CUDA registers;
/// however, providing compile time constants helps generate more efficient
/// code. For all practical purposes this is fine because the configuration
/// is the same for all known NVPTX architectures.
enum MachineConfiguration : unsigned {
  /// See "llvm/Frontend/OpenMP/OMPGridValues.h" for various related target
  /// specific Grid Values like GV_Warp_Size, GV_Warp_Size_Log2,
  /// and GV_Warp_Size_Log2_Mask.

  /// Global memory alignment for performance.
  GlobalMemoryAlignment = 128,

  /// Maximal size of the shared memory buffer.
  SharedMemorySize = 128,
};

static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
  RefExpr = RefExpr->IgnoreParens();
  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
    const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
      Base = TempASE->getBase()->IgnoreParenImpCasts();
    RefExpr = Base;
  } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
    const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
      Base = TempOASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
      Base = TempASE->getBase()->IgnoreParenImpCasts();
    RefExpr = Base;
  }
  RefExpr = RefExpr->IgnoreParenImpCasts();
  if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
    return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
  const auto *ME = cast<MemberExpr>(RefExpr);
  return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
}

static RecordDecl *buildRecordForGlobalizedVars(
    ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
    ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields, int BufSize) {
  using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
    return nullptr;
  SmallVector<VarsDataTy, 4> GlobalizedVars;
  for (const ValueDecl *D : EscapedDecls)
    GlobalizedVars.emplace_back(
        CharUnits::fromQuantity(std::max(
            C.getDeclAlign(D).getQuantity(),
            static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
        D);
  for (const ValueDecl *D : EscapedDeclsForTeams)
    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
  llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
    return L.first > R.first;
  });

  // Build struct _globalized_locals_ty {
  //         /*  globalized vars  */[WarpSize] align (max(decl_align,
  //         GlobalMemoryAlignment))
  //         /*  globalized vars  */ for EscapedDeclsForTeams
  //       };
  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
  GlobalizedRD->startDefinition();
  llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
      EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
  for (const auto &Pair : GlobalizedVars) {
    const ValueDecl *VD = Pair.second;
    QualType Type = VD->getType();
    if (Type->isLValueReferenceType())
      Type = C.getPointerType(Type.getNonReferenceType());
    else
      Type = Type.getNonReferenceType();
    SourceLocation Loc = VD->getLocation();
    FieldDecl *Field;
    if (SingleEscaped.count(VD)) {
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      if (VD->hasAttrs()) {
        for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
             E(VD->getAttrs().end());
             I != E; ++I)
          Field->addAttr(*I);
      }
    } else {
      llvm::APInt ArraySize(32, BufSize);
      Type = C.getConstantArrayType(Type, ArraySize, nullptr, ArrayType::Normal,
                                    0);
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
                                     static_cast<CharUnits::QuantityType>(
                                         GlobalMemoryAlignment)));
      Field->addAttr(AlignedAttr::CreateImplicit(
          C, /*IsAlignmentExpr=*/true,
          IntegerLiteral::Create(C, Align,
                                 C.getIntTypeForBitwidth(32, /*Signed=*/0),
                                 SourceLocation()),
          {}, AttributeCommonInfo::AS_GNU, AlignedAttr::GNU_aligned));
    }
    GlobalizedRD->addDecl(Field);
    MappedDeclsFields.try_emplace(VD, Field);
  }
  GlobalizedRD->completeDefinition();
  return GlobalizedRD;
}
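// Illustrative result (a sketch, not emitted verbatim): for one escaped
// 'int x' in a parallel region with BufSize == 32 (the warp size), the
// record built above corresponds roughly to
//   struct _globalized_locals_ty {
//     int x[32] __attribute__((aligned(128)));
//   };
// while a teams-level escaped declaration (in EscapedDeclsForTeams) keeps
// its scalar type and its natural alignment.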

/// Get the list of variables that can escape their declaration context.
class CheckVarsEscapingDeclContext final
    : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
  CodeGenFunction &CGF;
  llvm::SetVector<const ValueDecl *> EscapedDecls;
  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
  RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  bool AllEscaped = false;
  bool IsForCombinedParallelRegion = false;

  void markAsEscaped(const ValueDecl *VD) {
    // Do not globalize declare target variables.
    if (!isa<VarDecl>(VD) ||
        OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
      return;
    VD = cast<ValueDecl>(VD->getCanonicalDecl());
    // Use user-specified allocation.
    if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
      return;
    // Variables captured by value must be globalized.
    if (auto *CSI = CGF.CapturedStmtInfo) {
      if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
        // Check if we need to capture the variable that was already captured
        // by value in the outer region.
        if (!IsForCombinedParallelRegion) {
          if (!FD->hasAttrs())
            return;
          const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
          if (!Attr)
            return;
          if (((Attr->getCaptureKind() != OMPC_map) &&
               !isOpenMPPrivate(Attr->getCaptureKind())) ||
              ((Attr->getCaptureKind() == OMPC_map) &&
               !FD->getType()->isAnyPointerType()))
            return;
        }
        if (!FD->getType()->isReferenceType()) {
          assert(!VD->getType()->isVariablyModifiedType() &&
                 "Parameter captured by value with variably modified type");
          EscapedParameters.insert(VD);
        } else if (!IsForCombinedParallelRegion) {
          return;
        }
      }
    }
    if ((!CGF.CapturedStmtInfo ||
         (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
        VD->getType()->isReferenceType())
      // Do not globalize variables with reference type.
      return;
    if (VD->getType()->isVariablyModifiedType())
      EscapedVariableLengthDecls.insert(VD);
    else
      EscapedDecls.insert(VD);
  }

  void VisitValueDecl(const ValueDecl *VD) {
    if (VD->getType()->isLValueReferenceType())
      markAsEscaped(VD);
    if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
      if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = VD->getType()->isLValueReferenceType();
        Visit(VarD->getInit());
        AllEscaped = SavedAllEscaped;
      }
    }
  }
  void VisitOpenMPCapturedStmt(const CapturedStmt *S,
                               ArrayRef<OMPClause *> Clauses,
                               bool IsCombinedParallelRegion) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
        if (IsCombinedParallelRegion) {
          // Check if the variable is privatized in the combined construct and
          // those private copies must be shared in the inner parallel
          // directive.
          IsForCombinedParallelRegion = false;
          for (const OMPClause *C : Clauses) {
            if (!isOpenMPPrivate(C->getClauseKind()) ||
                C->getClauseKind() == OMPC_reduction ||
                C->getClauseKind() == OMPC_linear ||
                C->getClauseKind() == OMPC_private)
              continue;
            ArrayRef<const Expr *> Vars;
            if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
              Vars = PC->getVarRefs();
            else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
              Vars = PC->getVarRefs();
            else
              llvm_unreachable("Unexpected clause.");
            for (const auto *E : Vars) {
              const Decl *D =
                  cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
              if (D == VD->getCanonicalDecl()) {
                IsForCombinedParallelRegion = true;
                break;
              }
            }
            if (IsForCombinedParallelRegion)
              break;
          }
        }
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
        IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
      }
    }
  }

  void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
    assert(!GlobalizedRD &&
           "Record for globalized variables is built already.");
    ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
    unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
    if (IsInTTDRegion)
      EscapedDeclsForTeams = EscapedDecls.getArrayRef();
    else
      EscapedDeclsForParallel = EscapedDecls.getArrayRef();
    GlobalizedRD = ::buildRecordForGlobalizedVars(
        CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
        MappedDeclsFields, WarpSize);
  }

public:
  CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
                               ArrayRef<const ValueDecl *> TeamsReductions)
      : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
  }
  virtual ~CheckVarsEscapingDeclContext() = default;
  void VisitDeclStmt(const DeclStmt *S) {
    if (!S)
      return;
    for (const Decl *D : S->decls())
      if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
        VisitValueDecl(VD);
  }
  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
    if (!D)
      return;
    if (!D->hasAssociatedStmt())
      return;
    if (const auto *S =
            dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
      // Do not analyze directives that do not actually require capturing,
      // like `omp for` or `omp simd` directives.
      llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
      getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
      if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
        VisitStmt(S->getCapturedStmt());
        return;
      }
      VisitOpenMPCapturedStmt(
          S, D->clauses(),
          CaptureRegions.back() == OMPD_parallel &&
              isOpenMPDistributeDirective(D->getDirectiveKind()));
    }
  }
  void VisitCapturedStmt(const CapturedStmt *S) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
      }
    }
  }
  void VisitLambdaExpr(const LambdaExpr *E) {
    if (!E)
      return;
    for (const LambdaCapture &C : E->captures()) {
      if (C.capturesVariable()) {
        if (C.getCaptureKind() == LCK_ByRef) {
          const ValueDecl *VD = C.getCapturedVar();
          markAsEscaped(VD);
          if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
            VisitValueDecl(VD);
        }
      }
    }
  }
  void VisitBlockExpr(const BlockExpr *E) {
    if (!E)
      return;
    for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
      if (C.isByRef()) {
        const VarDecl *VD = C.getVariable();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
          VisitValueDecl(VD);
      }
    }
  }
  void VisitCallExpr(const CallExpr *E) {
    if (!E)
      return;
    for (const Expr *Arg : E->arguments()) {
      if (!Arg)
        continue;
      if (Arg->isLValue()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = true;
        Visit(Arg);
        AllEscaped = SavedAllEscaped;
      } else {
        Visit(Arg);
      }
    }
    Visit(E->getCallee());
  }
  void VisitDeclRefExpr(const DeclRefExpr *E) {
    if (!E)
      return;
    const ValueDecl *VD = E->getDecl();
    if (AllEscaped)
      markAsEscaped(VD);
    if (isa<OMPCapturedExprDecl>(VD))
      VisitValueDecl(VD);
    else if (const auto *VarD = dyn_cast<VarDecl>(VD))
      if (VarD->isInitCapture())
        VisitValueDecl(VD);
  }
  void VisitUnaryOperator(const UnaryOperator *E) {
    if (!E)
      return;
    if (E->getOpcode() == UO_AddrOf) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
    if (!E)
      return;
    if (E->getCastKind() == CK_ArrayToPointerDecay) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitExpr(const Expr *E) {
    if (!E)
      return;
    bool SavedAllEscaped = AllEscaped;
    if (!E->isLValue())
      AllEscaped = false;
    for (const Stmt *Child : E->children())
      if (Child)
        Visit(Child);
    AllEscaped = SavedAllEscaped;
  }
  void VisitStmt(const Stmt *S) {
    if (!S)
      return;
    for (const Stmt *Child : S->children())
      if (Child)
        Visit(Child);
  }

  /// Returns the record that handles all the escaped local variables and is
  /// used instead of their original storage.
  const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
    if (!GlobalizedRD)
      buildRecordForGlobalizedVars(IsInTTDRegion);
    return GlobalizedRD;
  }

  /// Returns the field in the globalized record for the escaped variable.
  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
    assert(GlobalizedRD &&
           "Record for globalized variables must be generated already.");
    auto I = MappedDeclsFields.find(VD);
    if (I == MappedDeclsFields.end())
      return nullptr;
    return I->getSecond();
  }

  /// Returns the list of the escaped local variables/parameters.
  ArrayRef<const ValueDecl *> getEscapedDecls() const {
    return EscapedDecls.getArrayRef();
  }

  /// Checks if the escaped local variable is actually a parameter passed by
  /// value.
  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
    return EscapedParameters;
  }

  /// Returns the list of the escaped variables with the variably modified
  /// types.
  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
    return EscapedVariableLengthDecls.getArrayRef();
  }
};
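// Illustrative example (a sketch of what the visitor flags): given
//   int X;
//   #pragma omp parallel
//   {
//     int *P = &X;
//   }
// the address-of expression inside the captured region sets AllEscaped, so
// 'X' lands in EscapedDecls and is later globalized instead of being kept
// in thread-private stack storage.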
} // anonymous namespace

/// Get the id of the warp in the block.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  unsigned LaneIDBits =
      CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size_Log2);
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
}
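// E.g., with GV_Warp_Size_Log2 == 5 (warp size 32), thread id 70 maps to
// warp id 70 >> 5 == 2.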

/// Get the id of the current lane in the Warp.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  unsigned LaneIDMask = CGF.getContext().getTargetInfo().getGridValue(
      llvm::omp::GV_Warp_Size_Log2_Mask);
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
                       "nvptx_lane_id");
}
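// E.g., with GV_Warp_Size_Log2_Mask == 31, thread id 70 maps to lane id
// 70 & 31 == 6.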

/// Get the value of the thread_limit clause in the teams directive.
/// For the 'generic' execution mode, the runtime encodes thread_limit in
/// the launch parameters, always starting thread_limit+warpSize threads per
/// CTA. The threads in the last warp are reserved for master execution.
/// For the 'spmd' execution mode, all threads in a CTA are part of the team.
static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
                                   bool IsInSPMDExecutionMode = false) {
  CGBuilderTy &Bld = CGF.Builder;
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  llvm::Value *ThreadLimit = nullptr;
  if (IsInSPMDExecutionMode)
    ThreadLimit = RT.getGPUNumThreads(CGF);
  else {
    llvm::Value *GPUNumThreads = RT.getGPUNumThreads(CGF);
    llvm::Value *GPUWarpSize = RT.getGPUWarpSize(CGF);
    ThreadLimit = Bld.CreateNUWSub(GPUNumThreads, GPUWarpSize, "thread_limit");
  }
  assert(ThreadLimit != nullptr && "Expected non-null ThreadLimit");
  return ThreadLimit;
}
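// E.g., in generic mode a CTA launched with 128 threads and warp size 32
// yields thread_limit == 128 - 32 == 96, reserving the last warp for the
// master; in SPMD mode the same CTA yields thread_limit == 128.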

/// Get the thread id of the OMP master thread.
/// The master thread id is the first thread (lane) of the last warp in the
/// GPU block.  Warp size is assumed to be some power of 2.
/// Thread id is 0 indexed.
/// E.g: If NumThreads is 33, master id is 32.
///      If NumThreads is 64, master id is 32.
///      If NumThreads is 1024, master id is 992.
static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  llvm::Value *NumThreads = RT.getGPUNumThreads(CGF);
  // We assume that the warp size is a power of 2.
  llvm::Value *Mask = Bld.CreateNUWSub(RT.getGPUWarpSize(CGF), Bld.getInt32(1));

  llvm::Value *NumThreadsSubOne = Bld.CreateNUWSub(NumThreads, Bld.getInt32(1));
  return Bld.CreateAnd(NumThreadsSubOne, Bld.CreateNot(Mask), "master_tid");
}
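// The computation above is equivalent to
//   master_tid = (NumThreads - 1) & ~(WarpSize - 1);
// i.e., (NumThreads - 1) rounded down to the nearest multiple of WarpSize,
// which matches the worked examples in the doc comment (33 -> 32, 64 -> 32,
// 1024 -> 992 for WarpSize == 32).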

CGOpenMPRuntimeGPU::WorkerFunctionState::WorkerFunctionState(
    CodeGenModule &CGM, SourceLocation Loc)
    : WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()),
      Loc(Loc) {
  createWorkerFunction(CGM);
}

void CGOpenMPRuntimeGPU::WorkerFunctionState::createWorkerFunction(
    CodeGenModule &CGM) {
  // Create a worker function with no arguments.

  WorkerFn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      /*placeholder=*/"_worker", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, CGFI);
  WorkerFn->setDoesNotRecurse();
}

CGOpenMPRuntimeGPU::ExecutionMode
CGOpenMPRuntimeGPU::getExecutionMode() const {
  return CurrentExecutionMode;
}

static CGOpenMPRuntimeGPU::DataSharingMode
getDataSharingMode(CodeGenModule &CGM) {
  return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeGPU::CUDA
                                          : CGOpenMPRuntimeGPU::Generic;
}

/// Check for an inner (nested) SPMD construct, if any.
static bool hasNestedSPMDDirective(ASTContext &Ctx,
                                   const OMPExecutableDirective &D) {
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);

  if (const auto *NestedDir =
          dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      if (isOpenMPParallelDirective(DKind))
        return true;
      if (DKind == OMPD_teams) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPParallelDirective(DKind))
            return true;
        }
      }
      return false;
    case OMPD_target_teams:
      return isOpenMPParallelDirective(DKind);
    case OMPD_target_simd:
    case OMPD_target_parallel:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_master:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_allocate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_depobj:
    case OMPD_scan:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_variant:
    case OMPD_begin_declare_variant:
    case OMPD_end_declare_variant:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_declare_mapper:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_master_taskloop:
    case OMPD_master_taskloop_simd:
    case OMPD_parallel_master_taskloop:
    case OMPD_parallel_master_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
    default:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}
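// E.g. (illustrative): the following region is detected as SPMD because the
// target directive's single child is a parallel directive:
//   #pragma omp target
//   #pragma omp parallel for
//   for (int I = 0; I < N; ++I) ...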

static bool supportsSPMDExecutionMode(ASTContext &Ctx,
                                      const OMPExecutableDirective &D) {
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
    return hasNestedSPMDDirective(Ctx, D);
  case OMPD_target_parallel:
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
  case OMPD_target_simd:
  case OMPD_target_teams_distribute_simd:
    return true;
  case OMPD_target_teams_distribute:
    return false;
  case OMPD_parallel:
  case OMPD_for:
  case OMPD_parallel_for:
  case OMPD_parallel_master:
  case OMPD_parallel_sections:
  case OMPD_for_simd:
  case OMPD_parallel_for_simd:
  case OMPD_cancel:
  case OMPD_cancellation_point:
  case OMPD_ordered:
  case OMPD_threadprivate:
  case OMPD_allocate:
  case OMPD_task:
  case OMPD_simd:
  case OMPD_sections:
  case OMPD_section:
  case OMPD_single:
  case OMPD_master:
  case OMPD_critical:
  case OMPD_taskyield:
  case OMPD_barrier:
  case OMPD_taskwait:
  case OMPD_taskgroup:
  case OMPD_atomic:
  case OMPD_flush:
  case OMPD_depobj:
  case OMPD_scan:
  case OMPD_teams:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_variant:
  case OMPD_begin_declare_variant:
  case OMPD_end_declare_variant:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_declare_mapper:
  case OMPD_taskloop:
  case OMPD_taskloop_simd:
  case OMPD_master_taskloop:
  case OMPD_master_taskloop_simd:
  case OMPD_parallel_master_taskloop:
  case OMPD_parallel_master_taskloop_simd:
  case OMPD_requires:
  case OMPD_unknown:
  default:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}

/// Check if the directive is a loop-based directive without an 'ordered'
/// clause and with either no schedule clause or static scheduling.
static bool hasStaticScheduling(const OMPExecutableDirective &D) {
  assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
         isOpenMPLoopDirective(D.getDirectiveKind()) &&
         "Expected loop-based directive.");
  return !D.hasClausesOfKind<OMPOrderedClause>() &&
         (!D.hasClausesOfKind<OMPScheduleClause>() ||
          llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
                       [](const OMPScheduleClause *C) {
                         return C->getScheduleKind() == OMPC_SCHEDULE_static;
                       }));
}
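// Illustrative examples of the predicate above:
//   #pragma omp parallel for                   -> true  (no schedule clause)
//   #pragma omp parallel for schedule(static)  -> true
//   #pragma omp parallel for schedule(dynamic) -> false
//   #pragma omp parallel for ordered           -> false ('ordered' clause)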

/// Check for an inner (nested) lightweight runtime construct, if any.
static bool hasNestedLightweightDirective(ASTContext &Ctx,
                                          const OMPExecutableDirective &D) {
  assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);

  if (const auto *NestedDir =
          dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      if (isOpenMPParallelDirective(DKind) &&
          isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
          hasStaticScheduling(*NestedDir))
        return true;
      if (DKind == OMPD_teams_distribute_simd || DKind == OMPD_simd)
        return true;
      if (DKind == OMPD_parallel) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
        }
      } else if (DKind == OMPD_teams) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPParallelDirective(DKind) &&
              isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
          if (DKind == OMPD_parallel) {
            Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
                /*IgnoreCaptured=*/true);
            if (!Body)
              return false;
            ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
            if (const auto *NND =
                    dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
              DKind = NND->getDirectiveKind();
              if (isOpenMPWorksharingDirective(DKind) &&
                  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
                return true;
            }
          }
        }
      }
      return false;
    case OMPD_target_teams:
      if (isOpenMPParallelDirective(DKind) &&
          isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
          hasStaticScheduling(*NestedDir))
        return true;
      if (DKind == OMPD_distribute_simd || DKind == OMPD_simd)
        return true;
      if (DKind == OMPD_parallel) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
        }
      }
      return false;
    case OMPD_target_parallel:
      if (DKind == OMPD_simd)
        return true;
      return isOpenMPWorksharingDirective(DKind) &&
             isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
    case OMPD_target_teams_distribute:
    case OMPD_target_simd:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_master:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_allocate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_depobj:
    case OMPD_scan:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_variant:
    case OMPD_begin_declare_variant:
    case OMPD_end_declare_variant:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_declare_mapper:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_master_taskloop:
    case OMPD_master_taskloop_simd:
    case OMPD_parallel_master_taskloop:
    case OMPD_parallel_master_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
    default:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}
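// E.g. (illustrative): '#pragma omp target parallel' whose single child is
// '#pragma omp for schedule(static)' qualifies for the lightweight runtime,
// while the same nest with schedule(dynamic) on the inner loop does not.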
981*13fbcb42Sjoerg 
982*13fbcb42Sjoerg /// Checks if the construct supports lightweight runtime. It must be SPMD
983*13fbcb42Sjoerg /// construct + inner loop-based construct with static scheduling.
static bool supportsLightweightRuntime(ASTContext &Ctx,
                                       const OMPExecutableDirective &D) {
  if (!supportsSPMDExecutionMode(Ctx, D))
    return false;
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
  case OMPD_target_parallel:
    return hasNestedLightweightDirective(Ctx, D);
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
    // (Last|First)-privates must be shared in parallel region.
    return hasStaticScheduling(D);
  case OMPD_target_simd:
  case OMPD_target_teams_distribute_simd:
    return true;
  case OMPD_target_teams_distribute:
    return false;
  case OMPD_parallel:
  case OMPD_for:
  case OMPD_parallel_for:
  case OMPD_parallel_master:
  case OMPD_parallel_sections:
  case OMPD_for_simd:
  case OMPD_parallel_for_simd:
  case OMPD_cancel:
  case OMPD_cancellation_point:
  case OMPD_ordered:
  case OMPD_threadprivate:
  case OMPD_allocate:
  case OMPD_task:
  case OMPD_simd:
  case OMPD_sections:
  case OMPD_section:
  case OMPD_single:
  case OMPD_master:
  case OMPD_critical:
  case OMPD_taskyield:
  case OMPD_barrier:
  case OMPD_taskwait:
  case OMPD_taskgroup:
  case OMPD_atomic:
  case OMPD_flush:
  case OMPD_depobj:
  case OMPD_scan:
  case OMPD_teams:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_variant:
  case OMPD_begin_declare_variant:
  case OMPD_end_declare_variant:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_declare_mapper:
  case OMPD_taskloop:
  case OMPD_taskloop_simd:
  case OMPD_master_taskloop:
  case OMPD_master_taskloop_simd:
  case OMPD_parallel_master_taskloop:
  case OMPD_parallel_master_taskloop_simd:
  case OMPD_requires:
  case OMPD_unknown:
  default:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}

void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
                                             StringRef ParentName,
                                             llvm::Function *&OutlinedFn,
                                             llvm::Constant *&OutlinedFnID,
                                             bool IsOffloadEntry,
                                             const RegionCodeGenTy &CodeGen) {
  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode);
  EntryFunctionState EST;
  WorkerFunctionState WST(CGM, D.getBeginLoc());
  Work.clear();
  WrapperFunctionsMap.clear();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeGPU::EntryFunctionState &EST;
    CGOpenMPRuntimeGPU::WorkerFunctionState &WST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
                         CGOpenMPRuntimeGPU::WorkerFunctionState &WST)
        : EST(EST), WST(WST) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      RT.emitNonSPMDEntryHeader(CGF, EST, WST);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitNonSPMDEntryFooter(CGF, EST);
    }
  } Action(EST, WST);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  // Reserve a slot for the globalized memory.
  GlobalizedRecords.emplace_back();
  if (!KernelStaticGlobalized) {
    KernelStaticGlobalized = new llvm::GlobalVariable(
        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage,
        llvm::UndefValue::get(CGM.VoidPtrTy),
        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
        llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
  }
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;

  // Now change the name of the worker function to correspond to this target
  // region's entry function.
  WST.WorkerFn->setName(Twine(OutlinedFn->getName(), "_worker"));

  // Create the worker function.
  emitWorkerFunction(WST);
}
// Set up NVPTX threads for the master-worker OpenMP scheme.
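// In rough pseudo-code (an illustrative note, not upstream documentation):
//   if (tid < thread_limit)      -> enter the worker loop;          // .worker
//   else if (tid == master_tid)  -> init runtime, run the region;   // .master
//   else                         -> exit immediately.               // .exit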
void CGOpenMPRuntimeGPU::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST,
                                                  WorkerFunctionState &WST) {
  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  llvm::Value *GPUThreadID = RT.getGPUThreadID(CGF);
  llvm::Value *ThreadLimit = getThreadLimit(CGF);
  llvm::Value *IsWorker = Bld.CreateICmpULT(GPUThreadID, ThreadLimit);
  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);

  CGF.EmitBlock(WorkerBB);
  emitCall(CGF, WST.Loc, WST.WorkerFn);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(MasterCheckBB);
  GPUThreadID = RT.getGPUThreadID(CGF);
  llvm::Value *MasterThreadID = getMasterThreadID(CGF);
  llvm::Value *IsMaster = Bld.CreateICmpEQ(GPUThreadID, MasterThreadID);
  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);

  CGF.EmitBlock(MasterBB);
  IsInTargetMasterThreadRegion = true;
  // SEQUENTIAL (MASTER) REGION START
  // First action in sequential region:
  // Initialize the state of the OpenMP runtime library on the GPU.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {getThreadLimit(CGF),
                         Bld.getInt16(/*RequiresOMPRuntime=*/1)};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_kernel_init),
                      Args);

  // For data sharing, we need to initialize the stack.
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
      CGM.getModule(), OMPRTL___kmpc_data_sharing_init_stack));

  emitGenericVarsProlog(CGF, WST.Loc);
}

void CGOpenMPRuntimeGPU::emitNonSPMDEntryFooter(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST) {
  IsInTargetMasterThreadRegion = false;
  if (!CGF.HaveInsertPoint())
    return;

  emitGenericVarsEpilog(CGF);

  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
  CGF.EmitBranch(TerminateBB);

  CGF.EmitBlock(TerminateBB);
  // Signal termination condition.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_kernel_deinit),
                      Args);
  // Barrier to terminate worker threads.
  syncCTAThreads(CGF);
  // Master thread jumps to exit point.
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
                                          StringRef ParentName,
                                          llvm::Function *&OutlinedFn,
                                          llvm::Constant *&OutlinedFnID,
                                          bool IsOffloadEntry,
                                          const RegionCodeGenTy &CodeGen) {
  ExecutionRuntimeModesRAII ModeRAII(
      CurrentExecutionMode, RequiresFullRuntime,
      CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
          !supportsLightweightRuntime(CGM.getContext(), D));
  EntryFunctionState EST;

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeGPU &RT;
    CGOpenMPRuntimeGPU::EntryFunctionState &EST;
    const OMPExecutableDirective &D;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
                         CGOpenMPRuntimeGPU::EntryFunctionState &EST,
                         const OMPExecutableDirective &D)
        : RT(RT), EST(EST), D(D) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitSPMDEntryHeader(CGF, EST, D);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitSPMDEntryFooter(CGF, EST);
    }
  } Action(*this, EST, D);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  // Reserve a slot for the globalized memory.
  GlobalizedRecords.emplace_back();
  if (!KernelStaticGlobalized) {
    KernelStaticGlobalized = new llvm::GlobalVariable(
        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage,
        llvm::UndefValue::get(CGM.VoidPtrTy),
        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
        llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
  }
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;
}

void CGOpenMPRuntimeGPU::emitSPMDEntryHeader(
    CodeGenFunction &CGF, EntryFunctionState &EST,
    const OMPExecutableDirective &D) {
  CGBuilderTy &Bld = CGF.Builder;

  // Set up BBs in the entry function.
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true),
                         /*RequiresOMPRuntime=*/
                         Bld.getInt16(RequiresFullRuntime ? 1 : 0)};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_spmd_kernel_init),
                      Args);

  if (RequiresFullRuntime) {
    // For data sharing, we need to initialize the stack.
    CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
        CGM.getModule(), OMPRTL___kmpc_data_sharing_init_stack_spmd));
  }

  CGF.EmitBranch(ExecuteBB);

  CGF.EmitBlock(ExecuteBB);

  IsInTargetMasterThreadRegion = true;
}

void CGOpenMPRuntimeGPU::emitSPMDEntryFooter(CodeGenFunction &CGF,
                                               EntryFunctionState &EST) {
  IsInTargetMasterThreadRegion = false;
  if (!CGF.HaveInsertPoint())
    return;

  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
  CGF.EmitBranch(OMPDeInitBB);

  CGF.EmitBlock(OMPDeInitBB);
  // Deinitialize the OMP state in the runtime; called by all active threads.
  llvm::Value *Args[] = {/*RequiresOMPRuntime=*/
                         CGF.Builder.getInt16(RequiresFullRuntime ? 1 : 0)};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_spmd_kernel_deinit_v2),
                      Args);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

// Create a unique global variable to indicate the execution mode of this
// target region. The execution mode is either 'generic' or 'spmd', depending
// on the target directive. This variable is picked up by the offload library
// to set up the device appropriately before kernel launch. If the execution
// mode is 'generic', the runtime reserves one warp for the master; otherwise,
// all warps participate in parallel work.
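// For a kernel entry named Name, this emits roughly (a sketch):
//   @<Name>_exec_mode = weak constant i8 <0 for SPMD, 1 for generic>
// mirroring the 'Mode ? 0 : 1' initializer below.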
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
                                     bool Mode) {
  auto *GVMode =
      new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
                               llvm::GlobalValue::WeakAnyLinkage,
                               llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
                               Twine(Name, "_exec_mode"));
  CGM.addCompilerUsedGlobal(GVMode);
}

void CGOpenMPRuntimeGPU::emitWorkerFunction(WorkerFunctionState &WST) {
  ASTContext &Ctx = CGM.getContext();

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, WST.CGFI, {},
                    WST.Loc, WST.Loc);
  emitWorkerLoop(CGF, WST);
  CGF.FinishFunction();
}

void CGOpenMPRuntimeGPU::emitWorkerLoop(CodeGenFunction &CGF,
                                        WorkerFunctionState &WST) {
  //
  // The workers enter this loop and wait for parallel work from the master.
  // When the master encounters a parallel region, it sets up the work and
  // variable arguments and wakes up the workers. The workers first check
  // whether they are required for the parallel region, i.e., whether they
  // fall within the number of requested parallel threads. The activated
  // workers load the variable arguments and execute the parallel work.
  //
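  // A rough sketch of the loop this function emits (illustrative pseudo-code,
  // not the actual IR):
  //
  //   for (;;) {
  //     syncCTAThreads();                        // .await.work
  //     bool IsActive = __kmpc_kernel_parallel(&WorkFn);
  //     if (!WorkFn)                             // master signaled termination
  //       break;                                 // .exit
  //     if (IsActive) {
  //       WorkFn(/*ParallelLevel=*/0, ThreadID); // .execute.parallel
  //       __kmpc_kernel_end_parallel();          // .terminate.parallel
  //     }
  //     syncCTAThreads();                        // .barrier.parallel
  //   }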

  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");

  CGF.EmitBranch(AwaitBB);

  // Workers wait for work from master.
  CGF.EmitBlock(AwaitBB);
  // Wait for parallel work
  syncCTAThreads(CGF);

  Address WorkFn =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
  Address ExecStatus =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));

  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {WorkFn.getPointer()};
  llvm::Value *Ret =
      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                              CGM.getModule(), OMPRTL___kmpc_kernel_parallel),
                          Args);
  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);

  // On termination condition (workid == 0), exit loop.
  llvm::Value *WorkID = Bld.CreateLoad(WorkFn);
  llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate");
  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);

  // Activate requested workers.
  CGF.EmitBlock(SelectWorkersBB);
  llvm::Value *IsActive =
      Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);

  // Signal start of parallel region.
  CGF.EmitBlock(ExecuteBB);
  // Skip initialization.
  setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);

  // Process work items: outlined parallel functions.
  for (llvm::Function *W : Work) {
    // Try to match this outlined function.
    llvm::Value *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);

    llvm::Value *WorkFnMatch =
        Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");

    llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
    llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
    Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);

    // Execute this outlined function.
    CGF.EmitBlock(ExecuteFNBB);

    // Insert call to work function via shared wrapper. The shared
    // wrapper takes two arguments:
    //   - the parallelism level;
    //   - the thread ID;
    emitCall(CGF, WST.Loc, W,
             {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});

    // Go to end of parallel region.
    CGF.EmitBranch(TerminateBB);

    CGF.EmitBlock(CheckNextBB);
  }
  // Default case: call to outlined function through pointer if the target
  // region makes a declare target call that may contain an orphaned parallel
  // directive.
  auto *ParallelFnTy =
      llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty},
                              /*isVarArg=*/false);
  llvm::Value *WorkFnCast =
      Bld.CreateBitCast(WorkID, ParallelFnTy->getPointerTo());
  // Insert call to work function via shared wrapper. The shared
  // wrapper takes two arguments:
  //   - the parallelism level;
  //   - the thread ID;
  emitCall(CGF, WST.Loc, {ParallelFnTy, WorkFnCast},
           {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
  // Go to end of parallel region.
  CGF.EmitBranch(TerminateBB);

  // Signal end of parallel region.
  CGF.EmitBlock(TerminateBB);
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_kernel_end_parallel),
                      llvm::None);
  CGF.EmitBranch(BarrierBB);

  // All active and inactive workers wait at a barrier after parallel region.
  CGF.EmitBlock(BarrierBB);
  // Barrier after parallel region.
  syncCTAThreads(CGF);
  CGF.EmitBranch(AwaitBB);

  // Exit target region.
  CGF.EmitBlock(ExitBB);
  // Skip initialization.
  clearLocThreadIdInsertPt(CGF);
}

void CGOpenMPRuntimeGPU::createOffloadEntry(llvm::Constant *ID,
                                              llvm::Constant *Addr,
                                              uint64_t Size, int32_t,
                                              llvm::GlobalValue::LinkageTypes) {
  // TODO: Add support for global variables on the device after declare target
  // support.
  if (!isa<llvm::Function>(Addr))
    return;
  llvm::Module &M = CGM.getModule();
  llvm::LLVMContext &Ctx = CGM.getLLVMContext();

  // Get "nvvm.annotations" metadata node
  llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");

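  // The operand added below marks Addr as a kernel entry point; the result
  // looks roughly like (a sketch, pointer types elided):
  //   !nvvm.annotations = !{!0}
  //   !0 = !{@kernel_fn, !"kernel", i32 1}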
  llvm::Metadata *MDVals[] = {
      llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"),
      llvm::ConstantAsMetadata::get(
          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
  // Append metadata to nvvm.annotations
  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
}

void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
    const OMPExecutableDirective &D, StringRef ParentName,
    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
  if (!IsOffloadEntry) // Nothing to do.
    return;

  assert(!ParentName.empty() && "Invalid target region parent name!");

  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
  if (Mode)
    emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                   CodeGen);
  else
    emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                      CodeGen);

  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
}

namespace {
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
/// Enum for accessing the reserved_2 field of the ident_t struct.
enum ModeFlagsTy : unsigned {
  /// Bit set to 1 when in SPMD mode.
  KMP_IDENT_SPMD_MODE = 0x01,
  /// Bit set to 1 when a simplified runtime is used.
  KMP_IDENT_SIMPLE_RT_MODE = 0x02,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/KMP_IDENT_SIMPLE_RT_MODE)
};
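// For example (a sketch of the encodings produced below): SPMD mode with the
// lightweight runtime sets both bits (0x03), SPMD mode with the full runtime
// sets only KMP_IDENT_SPMD_MODE (0x01), and generic (non-SPMD) mode with the
// full runtime sets neither bit (0x00).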

/// Special Undefined mode: the combination of non-SPMD mode and the simple
/// runtime.
static const ModeFlagsTy UndefinedMode =
    (~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE;
} // anonymous namespace

unsigned CGOpenMPRuntimeGPU::getDefaultLocationReserved2Flags() const {
  switch (getExecutionMode()) {
  case EM_SPMD:
    if (requiresFullRuntime())
      return KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE);
    return KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE;
  case EM_NonSPMD:
    assert(requiresFullRuntime() && "Expected full runtime.");
    return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE);
  case EM_Unknown:
    return UndefinedMode;
  }
  llvm_unreachable("Unknown flags are requested.");
}

CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
    : CGOpenMPRuntime(CGM, "_", "$") {
  if (!CGM.getLangOpts().OpenMPIsDevice)
    llvm_unreachable("OpenMP NVPTX can only handle device code.");
}

void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
                                              ProcBindKind ProcBind,
                                              SourceLocation Loc) {
  // Do nothing in case of SPMD mode and L0 parallel.
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
    return;

  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
}

void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
                                                llvm::Value *NumThreads,
                                                SourceLocation Loc) {
  // Do nothing in case of SPMD mode and L0 parallel.
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
    return;

  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
}

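// Note (an editorial note, not upstream documentation): num_teams and
// thread_limit for a target region are applied on the host side when the
// kernel is launched, so the device-side override below intentionally emits
// nothing.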
void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
                                              const Expr *NumTeams,
                                              const Expr *ThreadLimit,
                                              SourceLocation Loc) {}

llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    bool &IsInParallelRegion;
    bool PrevIsInParallelRegion;

  public:
    NVPTXPrePostActionTy(bool &IsInParallelRegion)
        : IsInParallelRegion(IsInParallelRegion) {}
    void Enter(CodeGenFunction &CGF) override {
      PrevIsInParallelRegion = IsInParallelRegion;
      IsInParallelRegion = true;
    }
    void Exit(CodeGenFunction &CGF) override {
      IsInParallelRegion = PrevIsInParallelRegion;
    }
  } Action(IsInParallelRegion);
  CodeGen.setAction(Action);
  bool PrevIsInTTDRegion = IsInTTDRegion;
  IsInTTDRegion = false;
  bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
  IsInTargetMasterThreadRegion = false;
  auto *OutlinedFun =
      cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
          D, ThreadIDVar, InnermostKind, CodeGen));
  IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
  IsInTTDRegion = PrevIsInTTDRegion;
  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD &&
      !IsInParallelRegion) {
    llvm::Function *WrapperFun =
        createParallelDataSharingWrapper(OutlinedFun, D);
    WrapperFunctionsMap[OutlinedFun] = WrapperFun;
  }

  return OutlinedFun;
}

/// Get the list of lastprivate variables from the 'teams distribute ...' or
/// 'teams { distribute ... }' directives.
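/// For example (illustrative):
///   #pragma omp target teams distribute lastprivate(x)
/// or a 'distribute' construct nested directly inside a 'teams' region.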
static void
getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
                             llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
         "expected teams directive.");
  const OMPExecutableDirective *Dir = &D;
  if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
    if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
            Ctx,
            D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
                /*IgnoreCaptured=*/true))) {
      Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
      if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
        Dir = nullptr;
    }
  }
  if (!Dir)
    return;
  for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
    for (const Expr *E : C->getVarRefs())
      Vars.push_back(getPrivateItem(E));
  }
}

/// Get the list of reduction variables from the 'teams ...' directives.
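/// For example (illustrative):
///   #pragma omp target teams reduction(+ : sum)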
static void
getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
                      llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
         "expected teams directive.");
  for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
    for (const Expr *E : C->privates())
      Vars.push_back(getPrivateItem(E));
  }
}

llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  SourceLocation Loc = D.getBeginLoc();

  const RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  unsigned WarpSize = CGM.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
  // Globalize team reduction variables unconditionally in all modes.
  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
    getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
    if (!LastPrivatesReductions.empty()) {
      GlobalizedRD = ::buildRecordForGlobalizedVars(
          CGM.getContext(), llvm::None, LastPrivatesReductions,
          MappedDeclsFields, WarpSize);
    }
  } else if (!LastPrivatesReductions.empty()) {
    assert(!TeamAndReductions.first &&
           "Previous team declaration is not expected.");
    TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
    std::swap(TeamAndReductions.second, LastPrivatesReductions);
  }

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    SourceLocation &Loc;
    const RecordDecl *GlobalizedRD;
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields;

  public:
    NVPTXPrePostActionTy(
        SourceLocation &Loc, const RecordDecl *GlobalizedRD,
        llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
            &MappedDeclsFields)
        : Loc(Loc), GlobalizedRD(GlobalizedRD),
          MappedDeclsFields(MappedDeclsFields) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &Rt =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      if (GlobalizedRD) {
        auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
        I->getSecond().GlobalRecord = GlobalizedRD;
        I->getSecond().MappedParams =
            std::make_unique<CodeGenFunction::OMPMapVars>();
        DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
        for (const auto &Pair : MappedDeclsFields) {
          assert(Pair.getFirst()->isCanonicalDecl() &&
                 "Expected canonical declaration");
          Data.insert(std::make_pair(Pair.getFirst(),
                                     MappedVarData(Pair.getSecond(),
                                                   /*IsOnePerTeam=*/true)));
        }
      }
      Rt.emitGenericVarsProlog(CGF, Loc);
    }
    void Exit(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericVarsEpilog(CGF);
    }
  } Action(Loc, GlobalizedRD, MappedDeclsFields);
  CodeGen.setAction(Action);
  llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
      D, ThreadIDVar, InnermostKind, CodeGen);

  return OutlinedFun;
}

void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
                                                 SourceLocation Loc,
                                                 bool WithSPMDCheck) {
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
      getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    return;

  CGBuilderTy &Bld = CGF.Builder;

  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return;
  if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
    QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
    QualType SecGlobalRecTy;

    // Recover pointer to this function's global record. The runtime will
    // handle the specifics of the allocation of the memory.
    // Use actual memory size of the record including the padding
    // for alignment purposes.
    unsigned Alignment =
        CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
    unsigned GlobalRecordSize =
        CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity();
    GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
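    // For example, a 20-byte record with 8-byte alignment is padded to 24
    // bytes here (an illustrative note).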
1733*13fbcb42Sjoerg 
1734*13fbcb42Sjoerg     llvm::PointerType *GlobalRecPtrTy =
1735*13fbcb42Sjoerg         CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo();
1736*13fbcb42Sjoerg     llvm::Value *GlobalRecCastAddr;
1737*13fbcb42Sjoerg     llvm::Value *IsTTD = nullptr;
1738*13fbcb42Sjoerg     if (!IsInTTDRegion &&
1739*13fbcb42Sjoerg         (WithSPMDCheck ||
1740*13fbcb42Sjoerg          getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
1741*13fbcb42Sjoerg       llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
1742*13fbcb42Sjoerg       llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd");
1743*13fbcb42Sjoerg       llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
1744*13fbcb42Sjoerg       if (I->getSecond().SecondaryGlobalRecord.hasValue()) {
1745*13fbcb42Sjoerg         llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1746*13fbcb42Sjoerg         llvm::Value *ThreadID = getThreadID(CGF, Loc);
1747*13fbcb42Sjoerg         llvm::Value *PL = CGF.EmitRuntimeCall(
1748*13fbcb42Sjoerg             OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
1749*13fbcb42Sjoerg                                                   OMPRTL___kmpc_parallel_level),
1750*13fbcb42Sjoerg             {RTLoc, ThreadID});
1751*13fbcb42Sjoerg         IsTTD = Bld.CreateIsNull(PL);
1752*13fbcb42Sjoerg       }
1753*13fbcb42Sjoerg       llvm::Value *IsSPMD = Bld.CreateIsNotNull(
1754*13fbcb42Sjoerg           CGF.EmitNounwindRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1755*13fbcb42Sjoerg               CGM.getModule(), OMPRTL___kmpc_is_spmd_exec_mode)));
1756*13fbcb42Sjoerg       Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB);
1757*13fbcb42Sjoerg       // There is no need to emit line number for unconditional branch.
1758*13fbcb42Sjoerg       (void)ApplyDebugLocation::CreateEmpty(CGF);
1759*13fbcb42Sjoerg       CGF.EmitBlock(SPMDBB);
1760*13fbcb42Sjoerg       Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy),
1761*13fbcb42Sjoerg                                CharUnits::fromQuantity(Alignment));
1762*13fbcb42Sjoerg       CGF.EmitBranch(ExitBB);
1763*13fbcb42Sjoerg       // There is no need to emit line number for unconditional branch.
1764*13fbcb42Sjoerg       (void)ApplyDebugLocation::CreateEmpty(CGF);
1765*13fbcb42Sjoerg       CGF.EmitBlock(NonSPMDBB);
1766*13fbcb42Sjoerg       llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize);
1767*13fbcb42Sjoerg       if (const RecordDecl *SecGlobalizedVarsRecord =
1768*13fbcb42Sjoerg               I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) {
1769*13fbcb42Sjoerg         SecGlobalRecTy =
1770*13fbcb42Sjoerg             CGM.getContext().getRecordType(SecGlobalizedVarsRecord);
1771*13fbcb42Sjoerg 
1772*13fbcb42Sjoerg         // Recover pointer to this function's global record. The runtime will
1773*13fbcb42Sjoerg         // handle the specifics of the allocation of the memory.
1774*13fbcb42Sjoerg         // Use actual memory size of the record including the padding
1775*13fbcb42Sjoerg         // for alignment purposes.
1776*13fbcb42Sjoerg         unsigned Alignment =
1777*13fbcb42Sjoerg             CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity();
1778*13fbcb42Sjoerg         unsigned GlobalRecordSize =
1779*13fbcb42Sjoerg             CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity();
1780*13fbcb42Sjoerg         GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
1781*13fbcb42Sjoerg         Size = Bld.CreateSelect(
1782*13fbcb42Sjoerg             IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size);
1783*13fbcb42Sjoerg       }
1784*13fbcb42Sjoerg       // TODO: allow the usage of shared memory to be controlled by
1785*13fbcb42Sjoerg       // the user, for now, default to global.
1786*13fbcb42Sjoerg       llvm::Value *GlobalRecordSizeArg[] = {
1787*13fbcb42Sjoerg           Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
1788*13fbcb42Sjoerg       llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
1789*13fbcb42Sjoerg           OMPBuilder.getOrCreateRuntimeFunction(
1790*13fbcb42Sjoerg               CGM.getModule(), OMPRTL___kmpc_data_sharing_coalesced_push_stack),
1791*13fbcb42Sjoerg           GlobalRecordSizeArg);
1792*13fbcb42Sjoerg       GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1793*13fbcb42Sjoerg           GlobalRecValue, GlobalRecPtrTy);
1794*13fbcb42Sjoerg       CGF.EmitBlock(ExitBB);
1795*13fbcb42Sjoerg       auto *Phi = Bld.CreatePHI(GlobalRecPtrTy,
1796*13fbcb42Sjoerg                                 /*NumReservedValues=*/2, "_select_stack");
1797*13fbcb42Sjoerg       Phi->addIncoming(RecPtr.getPointer(), SPMDBB);
1798*13fbcb42Sjoerg       Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB);
1799*13fbcb42Sjoerg       GlobalRecCastAddr = Phi;
1800*13fbcb42Sjoerg       I->getSecond().GlobalRecordAddr = Phi;
1801*13fbcb42Sjoerg       I->getSecond().IsInSPMDModeFlag = IsSPMD;
1802*13fbcb42Sjoerg     } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) {
1803*13fbcb42Sjoerg       assert(GlobalizedRecords.back().Records.size() < 2 &&
1804*13fbcb42Sjoerg              "Expected less than 2 globalized records: one for target and one "
1805*13fbcb42Sjoerg              "for teams.");
1806*13fbcb42Sjoerg       unsigned Offset = 0;
1807*13fbcb42Sjoerg       for (const RecordDecl *RD : GlobalizedRecords.back().Records) {
1808*13fbcb42Sjoerg         QualType RDTy = CGM.getContext().getRecordType(RD);
1809*13fbcb42Sjoerg         unsigned Alignment =
1810*13fbcb42Sjoerg             CGM.getContext().getTypeAlignInChars(RDTy).getQuantity();
1811*13fbcb42Sjoerg         unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity();
1812*13fbcb42Sjoerg         Offset =
1813*13fbcb42Sjoerg             llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment);
1814*13fbcb42Sjoerg       }
1815*13fbcb42Sjoerg       unsigned Alignment =
1816*13fbcb42Sjoerg           CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
1817*13fbcb42Sjoerg       Offset = llvm::alignTo(Offset, Alignment);
1818*13fbcb42Sjoerg       GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord);
1819*13fbcb42Sjoerg       ++GlobalizedRecords.back().RegionCounter;
1820*13fbcb42Sjoerg       if (GlobalizedRecords.back().Records.size() == 1) {
1821*13fbcb42Sjoerg         assert(KernelStaticGlobalized &&
1822*13fbcb42Sjoerg                "Kernel static pointer must be initialized already.");
1823*13fbcb42Sjoerg         auto *UseSharedMemory = new llvm::GlobalVariable(
1824*13fbcb42Sjoerg             CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
1825*13fbcb42Sjoerg             llvm::GlobalValue::InternalLinkage, nullptr,
1826*13fbcb42Sjoerg             "_openmp_static_kernel$is_shared");
1827*13fbcb42Sjoerg         UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
1828*13fbcb42Sjoerg         QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
1829*13fbcb42Sjoerg             /*DestWidth=*/16, /*Signed=*/0);
1830*13fbcb42Sjoerg         llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
1831*13fbcb42Sjoerg             Address(UseSharedMemory,
1832*13fbcb42Sjoerg                     CGM.getContext().getTypeAlignInChars(Int16Ty)),
1833*13fbcb42Sjoerg             /*Volatile=*/false, Int16Ty, Loc);
1834*13fbcb42Sjoerg         auto *StaticGlobalized = new llvm::GlobalVariable(
1835*13fbcb42Sjoerg             CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false,
1836*13fbcb42Sjoerg             llvm::GlobalValue::CommonLinkage, nullptr);
1837*13fbcb42Sjoerg         auto *RecSize = new llvm::GlobalVariable(
1838*13fbcb42Sjoerg             CGM.getModule(), CGM.SizeTy, /*isConstant=*/true,
1839*13fbcb42Sjoerg             llvm::GlobalValue::InternalLinkage, nullptr,
1840*13fbcb42Sjoerg             "_openmp_static_kernel$size");
1841*13fbcb42Sjoerg         RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
1842*13fbcb42Sjoerg         llvm::Value *Ld = CGF.EmitLoadOfScalar(
1843*13fbcb42Sjoerg             Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false,
1844*13fbcb42Sjoerg             CGM.getContext().getSizeType(), Loc);
1845*13fbcb42Sjoerg         llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1846*13fbcb42Sjoerg             KernelStaticGlobalized, CGM.VoidPtrPtrTy);
1847*13fbcb42Sjoerg         llvm::Value *GlobalRecordSizeArg[] = {
1848*13fbcb42Sjoerg             llvm::ConstantInt::get(
1849*13fbcb42Sjoerg                 CGM.Int16Ty,
1850*13fbcb42Sjoerg                 getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 1 : 0),
1851*13fbcb42Sjoerg             StaticGlobalized, Ld, IsInSharedMemory, ResAddr};
1852*13fbcb42Sjoerg         CGF.EmitRuntimeCall(
1853*13fbcb42Sjoerg             OMPBuilder.getOrCreateRuntimeFunction(
1854*13fbcb42Sjoerg                 CGM.getModule(), OMPRTL___kmpc_get_team_static_memory),
1855*13fbcb42Sjoerg             GlobalRecordSizeArg);
1856*13fbcb42Sjoerg         GlobalizedRecords.back().Buffer = StaticGlobalized;
1857*13fbcb42Sjoerg         GlobalizedRecords.back().RecSize = RecSize;
1858*13fbcb42Sjoerg         GlobalizedRecords.back().UseSharedMemory = UseSharedMemory;
1859*13fbcb42Sjoerg         GlobalizedRecords.back().Loc = Loc;
1860*13fbcb42Sjoerg       }
1861*13fbcb42Sjoerg       assert(KernelStaticGlobalized && "Global address must be set already.");
1862*13fbcb42Sjoerg       Address FrameAddr = CGF.EmitLoadOfPointer(
1863*13fbcb42Sjoerg           Address(KernelStaticGlobalized, CGM.getPointerAlign()),
1864*13fbcb42Sjoerg           CGM.getContext()
1865*13fbcb42Sjoerg               .getPointerType(CGM.getContext().VoidPtrTy)
1866*13fbcb42Sjoerg               .castAs<PointerType>());
1867*13fbcb42Sjoerg       llvm::Value *GlobalRecValue =
1868*13fbcb42Sjoerg           Bld.CreateConstInBoundsGEP(FrameAddr, Offset).getPointer();
1869*13fbcb42Sjoerg       I->getSecond().GlobalRecordAddr = GlobalRecValue;
1870*13fbcb42Sjoerg       I->getSecond().IsInSPMDModeFlag = nullptr;
1871*13fbcb42Sjoerg       GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1872*13fbcb42Sjoerg           GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo());
1873*13fbcb42Sjoerg     } else {
1874*13fbcb42Sjoerg       // TODO: allow the usage of shared memory to be controlled by
1875*13fbcb42Sjoerg       // the user, for now, default to global.
1876*13fbcb42Sjoerg       bool UseSharedMemory =
1877*13fbcb42Sjoerg           IsInTTDRegion && GlobalRecordSize <= SharedMemorySize;
1878*13fbcb42Sjoerg       llvm::Value *GlobalRecordSizeArg[] = {
1879*13fbcb42Sjoerg           llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
1880*13fbcb42Sjoerg           CGF.Builder.getInt16(UseSharedMemory ? 1 : 0)};
1881*13fbcb42Sjoerg       llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
1882*13fbcb42Sjoerg           OMPBuilder.getOrCreateRuntimeFunction(
1883*13fbcb42Sjoerg               CGM.getModule(),
1884*13fbcb42Sjoerg               IsInTTDRegion ? OMPRTL___kmpc_data_sharing_push_stack
1885*13fbcb42Sjoerg                             : OMPRTL___kmpc_data_sharing_coalesced_push_stack),
1886*13fbcb42Sjoerg           GlobalRecordSizeArg);
1887*13fbcb42Sjoerg       GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1888*13fbcb42Sjoerg           GlobalRecValue, GlobalRecPtrTy);
1889*13fbcb42Sjoerg       I->getSecond().GlobalRecordAddr = GlobalRecValue;
1890*13fbcb42Sjoerg       I->getSecond().IsInSPMDModeFlag = nullptr;
1891*13fbcb42Sjoerg     }
1892*13fbcb42Sjoerg     LValue Base =
1893*13fbcb42Sjoerg         CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy);
1894*13fbcb42Sjoerg 
1895*13fbcb42Sjoerg     // Emit the "global alloca", which is a GEP into the global declaration
1896*13fbcb42Sjoerg     // record, using the pointer returned by the runtime.
1897*13fbcb42Sjoerg     LValue SecBase;
1898*13fbcb42Sjoerg     decltype(I->getSecond().LocalVarData)::const_iterator SecIt;
1899*13fbcb42Sjoerg     if (IsTTD) {
1900*13fbcb42Sjoerg       SecIt = I->getSecond().SecondaryLocalVarData->begin();
1901*13fbcb42Sjoerg       llvm::PointerType *SecGlobalRecPtrTy =
1902*13fbcb42Sjoerg           CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo();
1903*13fbcb42Sjoerg       SecBase = CGF.MakeNaturalAlignPointeeAddrLValue(
1904*13fbcb42Sjoerg           Bld.CreatePointerBitCastOrAddrSpaceCast(
1905*13fbcb42Sjoerg               I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy),
1906*13fbcb42Sjoerg           SecGlobalRecTy);
1907*13fbcb42Sjoerg     }
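    // Walk the globalized locals and redirect each one's private address to
    // its field in the globalized record (or, for lane-private data, to the
    // slot for the current lane).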
1908*13fbcb42Sjoerg     for (auto &Rec : I->getSecond().LocalVarData) {
1909*13fbcb42Sjoerg       bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1910*13fbcb42Sjoerg       llvm::Value *ParValue;
1911*13fbcb42Sjoerg       if (EscapedParam) {
1912*13fbcb42Sjoerg         const auto *VD = cast<VarDecl>(Rec.first);
1913*13fbcb42Sjoerg         LValue ParLVal =
1914*13fbcb42Sjoerg             CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
1915*13fbcb42Sjoerg         ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
1916*13fbcb42Sjoerg       }
1917*13fbcb42Sjoerg       LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD);
1918*13fbcb42Sjoerg       // Emit VarAddr based on the lane-id if required.
1919*13fbcb42Sjoerg       QualType VarTy;
1920*13fbcb42Sjoerg       if (Rec.second.IsOnePerTeam) {
1921*13fbcb42Sjoerg         VarTy = Rec.second.FD->getType();
1922*13fbcb42Sjoerg       } else {
1923*13fbcb42Sjoerg         Address Addr = VarAddr.getAddress(CGF);
1924*13fbcb42Sjoerg         llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP(
1925*13fbcb42Sjoerg             Addr.getElementType(), Addr.getPointer(),
1926*13fbcb42Sjoerg             {Bld.getInt32(0), getNVPTXLaneID(CGF)});
1927*13fbcb42Sjoerg         VarTy =
1928*13fbcb42Sjoerg             Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType();
1929*13fbcb42Sjoerg         VarAddr = CGF.MakeAddrLValue(
1930*13fbcb42Sjoerg             Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy,
1931*13fbcb42Sjoerg             AlignmentSource::Decl);
1932*13fbcb42Sjoerg       }
1933*13fbcb42Sjoerg       Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
1934*13fbcb42Sjoerg       if (!IsInTTDRegion &&
1935*13fbcb42Sjoerg           (WithSPMDCheck ||
1936*13fbcb42Sjoerg            getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
1937*13fbcb42Sjoerg         assert(I->getSecond().IsInSPMDModeFlag &&
1938*13fbcb42Sjoerg                "Expected unknown execution mode or required SPMD check.");
1939*13fbcb42Sjoerg         if (IsTTD) {
1940*13fbcb42Sjoerg           assert(SecIt->second.IsOnePerTeam &&
1941*13fbcb42Sjoerg                  "Secondary glob data must be one per team.");
1942*13fbcb42Sjoerg           LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD);
1943*13fbcb42Sjoerg           VarAddr.setAddress(
1944*13fbcb42Sjoerg               Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(CGF),
1945*13fbcb42Sjoerg                                        VarAddr.getPointer(CGF)),
1946*13fbcb42Sjoerg                       VarAddr.getAlignment()));
1947*13fbcb42Sjoerg           Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
1948*13fbcb42Sjoerg         }
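        // The execution mode is only known at runtime here, so keep both a
        // fresh local alloca and the globalized slot, and select between them
        // with the SPMD-mode flag.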
1949*13fbcb42Sjoerg         Address GlobalPtr = Rec.second.PrivateAddr;
1950*13fbcb42Sjoerg         Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName());
1951*13fbcb42Sjoerg         Rec.second.PrivateAddr = Address(
1952*13fbcb42Sjoerg             Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag,
1953*13fbcb42Sjoerg                              LocalAddr.getPointer(), GlobalPtr.getPointer()),
1954*13fbcb42Sjoerg             LocalAddr.getAlignment());
1955*13fbcb42Sjoerg       }
1956*13fbcb42Sjoerg       if (EscapedParam) {
1957*13fbcb42Sjoerg         const auto *VD = cast<VarDecl>(Rec.first);
1958*13fbcb42Sjoerg         CGF.EmitStoreOfScalar(ParValue, VarAddr);
1959*13fbcb42Sjoerg         I->getSecond().MappedParams->setVarAddr(CGF, VD,
1960*13fbcb42Sjoerg                                                 VarAddr.getAddress(CGF));
1961*13fbcb42Sjoerg       }
1962*13fbcb42Sjoerg       if (IsTTD)
1963*13fbcb42Sjoerg         ++SecIt;
1964*13fbcb42Sjoerg     }
1965*13fbcb42Sjoerg   }
1966*13fbcb42Sjoerg   for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
1967*13fbcb42Sjoerg     // Allocate globalized storage for this escaped variable-length decl; the
1968*13fbcb42Sjoerg     // runtime handles the specifics of the memory allocation.
1969*13fbcb42Sjoerg     // Use the actual memory size of the data, including the padding
1970*13fbcb42Sjoerg     // needed for alignment.
1971*13fbcb42Sjoerg     CGBuilderTy &Bld = CGF.Builder;
1972*13fbcb42Sjoerg     llvm::Value *Size = CGF.getTypeSize(VD->getType());
1973*13fbcb42Sjoerg     CharUnits Align = CGM.getContext().getDeclAlign(VD);
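    // Round the allocation size up to a multiple of the declaration's
    // alignment: Size = (Size + Align - 1) / Align * Align.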
1974*13fbcb42Sjoerg     Size = Bld.CreateNUWAdd(
1975*13fbcb42Sjoerg         Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
1976*13fbcb42Sjoerg     llvm::Value *AlignVal =
1977*13fbcb42Sjoerg         llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
1978*13fbcb42Sjoerg     Size = Bld.CreateUDiv(Size, AlignVal);
1979*13fbcb42Sjoerg     Size = Bld.CreateNUWMul(Size, AlignVal);
1980*13fbcb42Sjoerg     // TODO: allow the use of shared memory to be controlled by
1981*13fbcb42Sjoerg     // the user; for now, default to global memory.
1982*13fbcb42Sjoerg     llvm::Value *GlobalRecordSizeArg[] = {
1983*13fbcb42Sjoerg         Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
1984*13fbcb42Sjoerg     llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
1985*13fbcb42Sjoerg         OMPBuilder.getOrCreateRuntimeFunction(
1986*13fbcb42Sjoerg             CGM.getModule(), OMPRTL___kmpc_data_sharing_coalesced_push_stack),
1987*13fbcb42Sjoerg         GlobalRecordSizeArg);
1988*13fbcb42Sjoerg     llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1989*13fbcb42Sjoerg         GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo());
1990*13fbcb42Sjoerg     LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(),
1991*13fbcb42Sjoerg                                      CGM.getContext().getDeclAlign(VD),
1992*13fbcb42Sjoerg                                      AlignmentSource::Decl);
1993*13fbcb42Sjoerg     I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
1994*13fbcb42Sjoerg                                             Base.getAddress(CGF));
1995*13fbcb42Sjoerg     I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue);
1996*13fbcb42Sjoerg   }
1997*13fbcb42Sjoerg   I->getSecond().MappedParams->apply(CGF);
1998*13fbcb42Sjoerg }
1999*13fbcb42Sjoerg 
2000*13fbcb42Sjoerg void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
2001*13fbcb42Sjoerg                                                  bool WithSPMDCheck) {
2002*13fbcb42Sjoerg   if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
2003*13fbcb42Sjoerg       getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
2004*13fbcb42Sjoerg     return;
2005*13fbcb42Sjoerg 
2006*13fbcb42Sjoerg   const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
2007*13fbcb42Sjoerg   if (I != FunctionGlobalizedDecls.end()) {
2008*13fbcb42Sjoerg     I->getSecond().MappedParams->restore(CGF);
2009*13fbcb42Sjoerg     if (!CGF.HaveInsertPoint())
2010*13fbcb42Sjoerg       return;
2011*13fbcb42Sjoerg     for (llvm::Value *Addr :
2012*13fbcb42Sjoerg          llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
2013*13fbcb42Sjoerg       CGF.EmitRuntimeCall(
2014*13fbcb42Sjoerg           OMPBuilder.getOrCreateRuntimeFunction(
2015*13fbcb42Sjoerg               CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
2016*13fbcb42Sjoerg           Addr);
2017*13fbcb42Sjoerg     }
2018*13fbcb42Sjoerg     if (I->getSecond().GlobalRecordAddr) {
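      // Release the globalized record: depending on how the prolog allocated
      // it, either pop the data-sharing stack (guarded by the SPMD-mode flag
      // when the mode was unknown at compile time) or restore the team static
      // memory.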
2019*13fbcb42Sjoerg       if (!IsInTTDRegion &&
2020*13fbcb42Sjoerg           (WithSPMDCheck ||
2021*13fbcb42Sjoerg            getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
2022*13fbcb42Sjoerg         CGBuilderTy &Bld = CGF.Builder;
2023*13fbcb42Sjoerg         llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2024*13fbcb42Sjoerg         llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
2025*13fbcb42Sjoerg         Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB);
2026*13fbcb42Sjoerg         // There is no need to emit line number for unconditional branch.
2027*13fbcb42Sjoerg         (void)ApplyDebugLocation::CreateEmpty(CGF);
2028*13fbcb42Sjoerg         CGF.EmitBlock(NonSPMDBB);
2029*13fbcb42Sjoerg         CGF.EmitRuntimeCall(
2030*13fbcb42Sjoerg             OMPBuilder.getOrCreateRuntimeFunction(
2031*13fbcb42Sjoerg                 CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
2032*13fbcb42Sjoerg             CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr));
2033*13fbcb42Sjoerg         CGF.EmitBlock(ExitBB);
2034*13fbcb42Sjoerg       } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) {
2035*13fbcb42Sjoerg         assert(GlobalizedRecords.back().RegionCounter > 0 &&
2036*13fbcb42Sjoerg                "region counter must be > 0.");
2037*13fbcb42Sjoerg         --GlobalizedRecords.back().RegionCounter;
2038*13fbcb42Sjoerg         // Emit the restore function only in the target region.
2039*13fbcb42Sjoerg         if (GlobalizedRecords.back().RegionCounter == 0) {
2040*13fbcb42Sjoerg           QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
2041*13fbcb42Sjoerg               /*DestWidth=*/16, /*Signed=*/0);
2042*13fbcb42Sjoerg           llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
2043*13fbcb42Sjoerg               Address(GlobalizedRecords.back().UseSharedMemory,
2044*13fbcb42Sjoerg                       CGM.getContext().getTypeAlignInChars(Int16Ty)),
2045*13fbcb42Sjoerg               /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc);
2046*13fbcb42Sjoerg           llvm::Value *Args[] = {
2047*13fbcb42Sjoerg               llvm::ConstantInt::get(
2048*13fbcb42Sjoerg                   CGM.Int16Ty,
2049*13fbcb42Sjoerg                   getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 1 : 0),
2050*13fbcb42Sjoerg               IsInSharedMemory};
2051*13fbcb42Sjoerg           CGF.EmitRuntimeCall(
2052*13fbcb42Sjoerg               OMPBuilder.getOrCreateRuntimeFunction(
2053*13fbcb42Sjoerg                   CGM.getModule(), OMPRTL___kmpc_restore_team_static_memory),
2054*13fbcb42Sjoerg               Args);
2055*13fbcb42Sjoerg         }
2056*13fbcb42Sjoerg       } else {
2057*13fbcb42Sjoerg         CGF.EmitRuntimeCall(
2058*13fbcb42Sjoerg             OMPBuilder.getOrCreateRuntimeFunction(
2059*13fbcb42Sjoerg                 CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
2060*13fbcb42Sjoerg             I->getSecond().GlobalRecordAddr);
2061*13fbcb42Sjoerg       }
2062*13fbcb42Sjoerg     }
2063*13fbcb42Sjoerg   }
2064*13fbcb42Sjoerg }
2065*13fbcb42Sjoerg 
2066*13fbcb42Sjoerg void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
2067*13fbcb42Sjoerg                                          const OMPExecutableDirective &D,
2068*13fbcb42Sjoerg                                          SourceLocation Loc,
2069*13fbcb42Sjoerg                                          llvm::Function *OutlinedFn,
2070*13fbcb42Sjoerg                                          ArrayRef<llvm::Value *> CapturedVars) {
2071*13fbcb42Sjoerg   if (!CGF.HaveInsertPoint())
2072*13fbcb42Sjoerg     return;
2073*13fbcb42Sjoerg 
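  // The outlined teams function takes the global thread ID plus a pointer to
  // a zeroed bound thread ID; materialize the zero in a temporary alloca.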
2074*13fbcb42Sjoerg   Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
2075*13fbcb42Sjoerg                                                       /*Name=*/".zero.addr");
2076*13fbcb42Sjoerg   CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2077*13fbcb42Sjoerg   llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2078*13fbcb42Sjoerg   OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
2079*13fbcb42Sjoerg   OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2080*13fbcb42Sjoerg   OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2081*13fbcb42Sjoerg   emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
2082*13fbcb42Sjoerg }
2083*13fbcb42Sjoerg 
2084*13fbcb42Sjoerg void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
2085*13fbcb42Sjoerg                                           SourceLocation Loc,
2086*13fbcb42Sjoerg                                           llvm::Function *OutlinedFn,
2087*13fbcb42Sjoerg                                           ArrayRef<llvm::Value *> CapturedVars,
2088*13fbcb42Sjoerg                                           const Expr *IfCond) {
2089*13fbcb42Sjoerg   if (!CGF.HaveInsertPoint())
2090*13fbcb42Sjoerg     return;
2091*13fbcb42Sjoerg 
2092*13fbcb42Sjoerg   auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars,
2093*13fbcb42Sjoerg                         IfCond](CodeGenFunction &CGF, PrePostActionTy &Action) {
2094*13fbcb42Sjoerg     CGBuilderTy &Bld = CGF.Builder;
2095*13fbcb42Sjoerg     llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
2096*13fbcb42Sjoerg     llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
2097*13fbcb42Sjoerg     if (WFn) {
2098*13fbcb42Sjoerg       ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
2099*13fbcb42Sjoerg       // Remember for post-processing in worker loop.
2100*13fbcb42Sjoerg       Work.emplace_back(WFn);
2101*13fbcb42Sjoerg     }
2102*13fbcb42Sjoerg     llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);
2103*13fbcb42Sjoerg 
2104*13fbcb42Sjoerg     // Create a private scope that will globalize the arguments
2105*13fbcb42Sjoerg     // passed from the outside of the target region.
2106*13fbcb42Sjoerg     // TODO: Is that needed?
2107*13fbcb42Sjoerg     CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
2108*13fbcb42Sjoerg 
2109*13fbcb42Sjoerg     Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
2110*13fbcb42Sjoerg         llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
2111*13fbcb42Sjoerg         "captured_vars_addrs");
2112*13fbcb42Sjoerg     // There's something to share.
2113*13fbcb42Sjoerg     if (!CapturedVars.empty()) {
2114*13fbcb42Sjoerg       // Prepare for parallel region. Indicate the outlined function.
2115*13fbcb42Sjoerg       ASTContext &Ctx = CGF.getContext();
2116*13fbcb42Sjoerg       unsigned Idx = 0;
2117*13fbcb42Sjoerg       for (llvm::Value *V : CapturedVars) {
2118*13fbcb42Sjoerg         Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
2119*13fbcb42Sjoerg         llvm::Value *PtrV;
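        // Store each captured value as a void*: integers travel by value via
        // inttoptr, everything else is pointer-cast.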
2120*13fbcb42Sjoerg         if (V->getType()->isIntegerTy())
2121*13fbcb42Sjoerg           PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
2122*13fbcb42Sjoerg         else
2123*13fbcb42Sjoerg           PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
2124*13fbcb42Sjoerg         CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
2125*13fbcb42Sjoerg                               Ctx.getPointerType(Ctx.VoidPtrTy));
2126*13fbcb42Sjoerg         ++Idx;
2127*13fbcb42Sjoerg       }
2128*13fbcb42Sjoerg     }
2129*13fbcb42Sjoerg 
2130*13fbcb42Sjoerg     llvm::Value *IfCondVal = nullptr;
2131*13fbcb42Sjoerg     if (IfCond)
2132*13fbcb42Sjoerg       IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
2133*13fbcb42Sjoerg                                     /* isSigned */ false);
2134*13fbcb42Sjoerg     else
2135*13fbcb42Sjoerg       IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);
2136*13fbcb42Sjoerg 
2137*13fbcb42Sjoerg     assert(IfCondVal && "Expected a value");
2138*13fbcb42Sjoerg     llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
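    // Argument layout of __kmpc_parallel_51: location, global thread ID,
    // if-clause value, num_threads (-1 when not specified), proc_bind (-1
    // when not specified), outlined function, wrapper function ID, the
    // captured-variable array, and its element count.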
2139*13fbcb42Sjoerg     llvm::Value *Args[] = {
2140*13fbcb42Sjoerg         RTLoc,
2141*13fbcb42Sjoerg         getThreadID(CGF, Loc),
2142*13fbcb42Sjoerg         IfCondVal,
2143*13fbcb42Sjoerg         llvm::ConstantInt::get(CGF.Int32Ty, -1),
2144*13fbcb42Sjoerg         llvm::ConstantInt::get(CGF.Int32Ty, -1),
2145*13fbcb42Sjoerg         FnPtr,
2146*13fbcb42Sjoerg         ID,
2147*13fbcb42Sjoerg         Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(),
2148*13fbcb42Sjoerg                                    CGF.VoidPtrPtrTy),
2149*13fbcb42Sjoerg         llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
2150*13fbcb42Sjoerg     CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
2151*13fbcb42Sjoerg                             CGM.getModule(), OMPRTL___kmpc_parallel_51),
2152*13fbcb42Sjoerg                         Args);
2153*13fbcb42Sjoerg   };
2154*13fbcb42Sjoerg 
2155*13fbcb42Sjoerg   RegionCodeGenTy RCG(ParallelGen);
2156*13fbcb42Sjoerg   RCG(CGF);
2157*13fbcb42Sjoerg }
2158*13fbcb42Sjoerg 
2159*13fbcb42Sjoerg void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
2160*13fbcb42Sjoerg   // Always emit simple barriers!
2161*13fbcb42Sjoerg   if (!CGF.HaveInsertPoint())
2162*13fbcb42Sjoerg     return;
2163*13fbcb42Sjoerg   // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
2164*13fbcb42Sjoerg   // The callee does not use its parameters, so we can just emit default values.
2165*13fbcb42Sjoerg   llvm::Value *Args[] = {
2166*13fbcb42Sjoerg       llvm::ConstantPointerNull::get(
2167*13fbcb42Sjoerg           cast<llvm::PointerType>(getIdentTyPointerTy())),
2168*13fbcb42Sjoerg       llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
2169*13fbcb42Sjoerg   CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
2170*13fbcb42Sjoerg                           CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
2171*13fbcb42Sjoerg                       Args);
2172*13fbcb42Sjoerg }
2173*13fbcb42Sjoerg 
2174*13fbcb42Sjoerg void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
2175*13fbcb42Sjoerg                                            SourceLocation Loc,
2176*13fbcb42Sjoerg                                            OpenMPDirectiveKind Kind, bool,
2177*13fbcb42Sjoerg                                            bool) {
2178*13fbcb42Sjoerg   // Always emit simple barriers!
2179*13fbcb42Sjoerg   if (!CGF.HaveInsertPoint())
2180*13fbcb42Sjoerg     return;
2181*13fbcb42Sjoerg   // Build call __kmpc_barrier(loc, thread_id);
2182*13fbcb42Sjoerg   unsigned Flags = getDefaultFlagsForBarriers(Kind);
2183*13fbcb42Sjoerg   llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
2184*13fbcb42Sjoerg                          getThreadID(CGF, Loc)};
2185*13fbcb42Sjoerg 
2186*13fbcb42Sjoerg   CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
2187*13fbcb42Sjoerg                           CGM.getModule(), OMPRTL___kmpc_barrier),
2188*13fbcb42Sjoerg                       Args);
2189*13fbcb42Sjoerg }
2190*13fbcb42Sjoerg 
2191*13fbcb42Sjoerg void CGOpenMPRuntimeGPU::emitCriticalRegion(
2192*13fbcb42Sjoerg     CodeGenFunction &CGF, StringRef CriticalName,
2193*13fbcb42Sjoerg     const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
2194*13fbcb42Sjoerg     const Expr *Hint) {
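  // Sketch of the emitted structure: each thread in the team takes one turn
  // through the loop, and only the thread whose ID equals the counter runs
  // the body, so the region executes one thread at a time:
  //   for (counter = 0; counter < team_width; ++counter) {
  //     if (thread_id == counter)
  //       <critical region body>
  //     syncwarp(mask);
  //   }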
2195*13fbcb42Sjoerg   llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
2196*13fbcb42Sjoerg   llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
2197*13fbcb42Sjoerg   llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
2198*13fbcb42Sjoerg   llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
2199*13fbcb42Sjoerg   llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
2200*13fbcb42Sjoerg 
2201*13fbcb42Sjoerg   auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
2202*13fbcb42Sjoerg 
2203*13fbcb42Sjoerg   // Get the mask of active threads in the warp.
2204*13fbcb42Sjoerg   llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
2205*13fbcb42Sjoerg       CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
2206*13fbcb42Sjoerg   // Fetch team-local id of the thread.
2207*13fbcb42Sjoerg   llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
2208*13fbcb42Sjoerg 
2209*13fbcb42Sjoerg   // Get the width of the team.
2210*13fbcb42Sjoerg   llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
2211*13fbcb42Sjoerg 
2212*13fbcb42Sjoerg   // Initialize the counter variable for the loop.
2213*13fbcb42Sjoerg   QualType Int32Ty =
2214*13fbcb42Sjoerg       CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
2215*13fbcb42Sjoerg   Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
2216*13fbcb42Sjoerg   LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
2217*13fbcb42Sjoerg   CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
2218*13fbcb42Sjoerg                         /*isInit=*/true);
2219*13fbcb42Sjoerg 
2220*13fbcb42Sjoerg   // Block checks if loop counter exceeds upper bound.
2221*13fbcb42Sjoerg   CGF.EmitBlock(LoopBB);
2222*13fbcb42Sjoerg   llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2223*13fbcb42Sjoerg   llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
2224*13fbcb42Sjoerg   CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
2225*13fbcb42Sjoerg 
2226*13fbcb42Sjoerg   // Block tests which single thread should execute the region, and which
2227*13fbcb42Sjoerg   // threads should go straight to the synchronisation point.
2228*13fbcb42Sjoerg   CGF.EmitBlock(TestBB);
2229*13fbcb42Sjoerg   CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2230*13fbcb42Sjoerg   llvm::Value *CmpThreadToCounter =
2231*13fbcb42Sjoerg       CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
2232*13fbcb42Sjoerg   CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
2233*13fbcb42Sjoerg 
2234*13fbcb42Sjoerg   // Block emits the body of the critical region.
2235*13fbcb42Sjoerg   CGF.EmitBlock(BodyBB);
2236*13fbcb42Sjoerg 
2237*13fbcb42Sjoerg   // Output the critical statement.
2238*13fbcb42Sjoerg   CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
2239*13fbcb42Sjoerg                                       Hint);
2240*13fbcb42Sjoerg 
2241*13fbcb42Sjoerg   // After the body surrounded by the critical region, the single executing
2242*13fbcb42Sjoerg   // thread will jump to the synchronisation point.
2243*13fbcb42Sjoerg   // Block waits for all threads in the current team to finish, then
2244*13fbcb42Sjoerg   // increments the counter variable and returns to the loop.
2245*13fbcb42Sjoerg   CGF.EmitBlock(SyncBB);
2246*13fbcb42Sjoerg   // Reconverge active threads in the warp.
2247*13fbcb42Sjoerg   (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
2248*13fbcb42Sjoerg                                 CGM.getModule(), OMPRTL___kmpc_syncwarp),
2249*13fbcb42Sjoerg                             Mask);
2250*13fbcb42Sjoerg 
2251*13fbcb42Sjoerg   llvm::Value *IncCounterVal =
2252*13fbcb42Sjoerg       CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
2253*13fbcb42Sjoerg   CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
2254*13fbcb42Sjoerg   CGF.EmitBranch(LoopBB);
2255*13fbcb42Sjoerg 
2256*13fbcb42Sjoerg   // Block that is reached when all threads in the team complete the region.
2257*13fbcb42Sjoerg   CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2258*13fbcb42Sjoerg }
2259*13fbcb42Sjoerg 
2260*13fbcb42Sjoerg /// Cast value to the specified type.
2261*13fbcb42Sjoerg static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
2262*13fbcb42Sjoerg                                     QualType ValTy, QualType CastTy,
2263*13fbcb42Sjoerg                                     SourceLocation Loc) {
2264*13fbcb42Sjoerg   assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
2265*13fbcb42Sjoerg          "Cast type must be sized.");
2266*13fbcb42Sjoerg   assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
2267*13fbcb42Sjoerg          "Val type must be sized.");
2268*13fbcb42Sjoerg   llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
2269*13fbcb42Sjoerg   if (ValTy == CastTy)
2270*13fbcb42Sjoerg     return Val;
2271*13fbcb42Sjoerg   if (CGF.getContext().getTypeSizeInChars(ValTy) ==
2272*13fbcb42Sjoerg       CGF.getContext().getTypeSizeInChars(CastTy))
2273*13fbcb42Sjoerg     return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
2274*13fbcb42Sjoerg   if (CastTy->isIntegerType() && ValTy->isIntegerType())
2275*13fbcb42Sjoerg     return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
2276*13fbcb42Sjoerg                                      CastTy->hasSignedIntegerRepresentation());
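  // Otherwise bounce through a temporary: store the value with its own type
  // and reload it with the cast type.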
2277*13fbcb42Sjoerg   Address CastItem = CGF.CreateMemTemp(CastTy);
2278*13fbcb42Sjoerg   Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2279*13fbcb42Sjoerg       CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
2280*13fbcb42Sjoerg   CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
2281*13fbcb42Sjoerg                         LValueBaseInfo(AlignmentSource::Type),
2282*13fbcb42Sjoerg                         TBAAAccessInfo());
2283*13fbcb42Sjoerg   return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
2284*13fbcb42Sjoerg                               LValueBaseInfo(AlignmentSource::Type),
2285*13fbcb42Sjoerg                               TBAAAccessInfo());
2286*13fbcb42Sjoerg }
2287*13fbcb42Sjoerg 
2288*13fbcb42Sjoerg /// This function creates calls to one of two shuffle functions to copy
2289*13fbcb42Sjoerg /// variables between lanes in a warp.
2290*13fbcb42Sjoerg static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
2291*13fbcb42Sjoerg                                                  llvm::Value *Elem,
2292*13fbcb42Sjoerg                                                  QualType ElemType,
2293*13fbcb42Sjoerg                                                  llvm::Value *Offset,
2294*13fbcb42Sjoerg                                                  SourceLocation Loc) {
2295*13fbcb42Sjoerg   CodeGenModule &CGM = CGF.CGM;
2296*13fbcb42Sjoerg   CGBuilderTy &Bld = CGF.Builder;
2297*13fbcb42Sjoerg   CGOpenMPRuntimeGPU &RT =
2298*13fbcb42Sjoerg       *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
2299*13fbcb42Sjoerg   llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();
2300*13fbcb42Sjoerg 
2301*13fbcb42Sjoerg   CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2302*13fbcb42Sjoerg   assert(Size.getQuantity() <= 8 &&
2303*13fbcb42Sjoerg          "Unsupported bitwidth in shuffle instruction.");
2304*13fbcb42Sjoerg 
2305*13fbcb42Sjoerg   RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
2306*13fbcb42Sjoerg                                   ? OMPRTL___kmpc_shuffle_int32
2307*13fbcb42Sjoerg                                   : OMPRTL___kmpc_shuffle_int64;
2308*13fbcb42Sjoerg 
2309*13fbcb42Sjoerg   // Cast all types to 32- or 64-bit values before calling shuffle routines.
2310*13fbcb42Sjoerg   QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
2311*13fbcb42Sjoerg       Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
2312*13fbcb42Sjoerg   llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
2313*13fbcb42Sjoerg   llvm::Value *WarpSize =
2314*13fbcb42Sjoerg       Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
2315*13fbcb42Sjoerg 
2316*13fbcb42Sjoerg   llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
2317*13fbcb42Sjoerg       OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
2318*13fbcb42Sjoerg       {ElemCast, Offset, WarpSize});
2319*13fbcb42Sjoerg 
2320*13fbcb42Sjoerg   return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
2321*13fbcb42Sjoerg }
2322*13fbcb42Sjoerg 
2323*13fbcb42Sjoerg static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
2324*13fbcb42Sjoerg                             Address DestAddr, QualType ElemType,
2325*13fbcb42Sjoerg                             llvm::Value *Offset, SourceLocation Loc) {
2326*13fbcb42Sjoerg   CGBuilderTy &Bld = CGF.Builder;
2327*13fbcb42Sjoerg 
2328*13fbcb42Sjoerg   CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2329*13fbcb42Sjoerg   // Create the loop over the big sized data.
2330*13fbcb42Sjoerg   // ptr = (void*)Elem;
2331*13fbcb42Sjoerg   // ptrEnd = (void*) Elem + 1;
2332*13fbcb42Sjoerg   // Step = 8;
2333*13fbcb42Sjoerg   // while (ptr + Step < ptrEnd)
2334*13fbcb42Sjoerg   //   shuffle((int64_t)*ptr);
2335*13fbcb42Sjoerg   // Step = 4;
2336*13fbcb42Sjoerg   // while (ptr + Step < ptrEnd)
2337*13fbcb42Sjoerg   //   shuffle((int32_t)*ptr);
2338*13fbcb42Sjoerg   // ...
2339*13fbcb42Sjoerg   Address ElemPtr = DestAddr;
2340*13fbcb42Sjoerg   Address Ptr = SrcAddr;
2341*13fbcb42Sjoerg   Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
2342*13fbcb42Sjoerg       Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy);
2343*13fbcb42Sjoerg   for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
2344*13fbcb42Sjoerg     if (Size < CharUnits::fromQuantity(IntSize))
2345*13fbcb42Sjoerg       continue;
2346*13fbcb42Sjoerg     QualType IntType = CGF.getContext().getIntTypeForBitwidth(
2347*13fbcb42Sjoerg         CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
2348*13fbcb42Sjoerg         /*Signed=*/1);
2349*13fbcb42Sjoerg     llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
2350*13fbcb42Sjoerg     Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo());
2351*13fbcb42Sjoerg     ElemPtr =
2352*13fbcb42Sjoerg         Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo());
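    // If more than one chunk of this width remains, emit a loop that shuffles
    // and stores chunk by chunk; otherwise emit a single shuffle and advance
    // the source and destination pointers by one element.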
2353*13fbcb42Sjoerg     if (Size.getQuantity() / IntSize > 1) {
2354*13fbcb42Sjoerg       llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
2355*13fbcb42Sjoerg       llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
2356*13fbcb42Sjoerg       llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
2357*13fbcb42Sjoerg       llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
2358*13fbcb42Sjoerg       CGF.EmitBlock(PreCondBB);
2359*13fbcb42Sjoerg       llvm::PHINode *PhiSrc =
2360*13fbcb42Sjoerg           Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
2361*13fbcb42Sjoerg       PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
2362*13fbcb42Sjoerg       llvm::PHINode *PhiDest =
2363*13fbcb42Sjoerg           Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
2364*13fbcb42Sjoerg       PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
2365*13fbcb42Sjoerg       Ptr = Address(PhiSrc, Ptr.getAlignment());
2366*13fbcb42Sjoerg       ElemPtr = Address(PhiDest, ElemPtr.getAlignment());
2367*13fbcb42Sjoerg       llvm::Value *PtrDiff = Bld.CreatePtrDiff(
2368*13fbcb42Sjoerg           PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast(
2369*13fbcb42Sjoerg                                    Ptr.getPointer(), CGF.VoidPtrTy));
2370*13fbcb42Sjoerg       Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
2371*13fbcb42Sjoerg                        ThenBB, ExitBB);
2372*13fbcb42Sjoerg       CGF.EmitBlock(ThenBB);
2373*13fbcb42Sjoerg       llvm::Value *Res = createRuntimeShuffleFunction(
2374*13fbcb42Sjoerg           CGF,
2375*13fbcb42Sjoerg           CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
2376*13fbcb42Sjoerg                                LValueBaseInfo(AlignmentSource::Type),
2377*13fbcb42Sjoerg                                TBAAAccessInfo()),
2378*13fbcb42Sjoerg           IntType, Offset, Loc);
2379*13fbcb42Sjoerg       CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
2380*13fbcb42Sjoerg                             LValueBaseInfo(AlignmentSource::Type),
2381*13fbcb42Sjoerg                             TBAAAccessInfo());
2382*13fbcb42Sjoerg       Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
2383*13fbcb42Sjoerg       Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
2384*13fbcb42Sjoerg       PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
2385*13fbcb42Sjoerg       PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
2386*13fbcb42Sjoerg       CGF.EmitBranch(PreCondBB);
2387*13fbcb42Sjoerg       CGF.EmitBlock(ExitBB);
2388*13fbcb42Sjoerg     } else {
2389*13fbcb42Sjoerg       llvm::Value *Res = createRuntimeShuffleFunction(
2390*13fbcb42Sjoerg           CGF,
2391*13fbcb42Sjoerg           CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
2392*13fbcb42Sjoerg                                LValueBaseInfo(AlignmentSource::Type),
2393*13fbcb42Sjoerg                                TBAAAccessInfo()),
2394*13fbcb42Sjoerg           IntType, Offset, Loc);
2395*13fbcb42Sjoerg       CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
2396*13fbcb42Sjoerg                             LValueBaseInfo(AlignmentSource::Type),
2397*13fbcb42Sjoerg                             TBAAAccessInfo());
2398*13fbcb42Sjoerg       Ptr = Bld.CreateConstGEP(Ptr, 1);
2399*13fbcb42Sjoerg       ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
2400*13fbcb42Sjoerg     }
2401*13fbcb42Sjoerg     Size = Size % IntSize;
2402*13fbcb42Sjoerg   }
2403*13fbcb42Sjoerg }
2404*13fbcb42Sjoerg 
2405*13fbcb42Sjoerg namespace {
2406*13fbcb42Sjoerg enum CopyAction : unsigned {
2407*13fbcb42Sjoerg   // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
2408*13fbcb42Sjoerg   // the warp using shuffle instructions.
2409*13fbcb42Sjoerg   RemoteLaneToThread,
2410*13fbcb42Sjoerg   // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
2411*13fbcb42Sjoerg   ThreadCopy,
2412*13fbcb42Sjoerg   // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
2413*13fbcb42Sjoerg   ThreadToScratchpad,
2414*13fbcb42Sjoerg   // ScratchpadToThread: Copy from a scratchpad array in global memory
2415*13fbcb42Sjoerg   // containing team-reduced data to a thread's stack.
2416*13fbcb42Sjoerg   ScratchpadToThread,
2417*13fbcb42Sjoerg };
2418*13fbcb42Sjoerg } // namespace
2419*13fbcb42Sjoerg 
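/// Options for emitReductionListCopy: the lane offset for shuffle-based
/// copies, and the element index and row width for scratchpad-based copies.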
2420*13fbcb42Sjoerg struct CopyOptionsTy {
2421*13fbcb42Sjoerg   llvm::Value *RemoteLaneOffset;
2422*13fbcb42Sjoerg   llvm::Value *ScratchpadIndex;
2423*13fbcb42Sjoerg   llvm::Value *ScratchpadWidth;
2424*13fbcb42Sjoerg };
2425*13fbcb42Sjoerg 
2426*13fbcb42Sjoerg /// Emit instructions to copy a Reduce list, which contains partially
2427*13fbcb42Sjoerg /// aggregated values, in the specified direction.
2428*13fbcb42Sjoerg static void emitReductionListCopy(
2429*13fbcb42Sjoerg     CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
2430*13fbcb42Sjoerg     ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
2431*13fbcb42Sjoerg     CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
2432*13fbcb42Sjoerg 
2433*13fbcb42Sjoerg   CodeGenModule &CGM = CGF.CGM;
2434*13fbcb42Sjoerg   ASTContext &C = CGM.getContext();
2435*13fbcb42Sjoerg   CGBuilderTy &Bld = CGF.Builder;
2436*13fbcb42Sjoerg 
2437*13fbcb42Sjoerg   llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2438*13fbcb42Sjoerg   llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
2439*13fbcb42Sjoerg   llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
2440*13fbcb42Sjoerg 
2441*13fbcb42Sjoerg   // Iterate, element by element, through the source Reduce list and
2442*13fbcb42Sjoerg   // make a copy.
2443*13fbcb42Sjoerg   unsigned Idx = 0;
2444*13fbcb42Sjoerg   unsigned Size = Privates.size();
2445*13fbcb42Sjoerg   for (const Expr *Private : Privates) {
2446*13fbcb42Sjoerg     Address SrcElementAddr = Address::invalid();
2447*13fbcb42Sjoerg     Address DestElementAddr = Address::invalid();
2448*13fbcb42Sjoerg     Address DestElementPtrAddr = Address::invalid();
2449*13fbcb42Sjoerg     // Should we shuffle in an element from a remote lane?
2450*13fbcb42Sjoerg     bool ShuffleInElement = false;
2451*13fbcb42Sjoerg     // Set to true to update the pointer in the dest Reduce list to a
2452*13fbcb42Sjoerg     // newly created element.
2453*13fbcb42Sjoerg     bool UpdateDestListPtr = false;
2454*13fbcb42Sjoerg     // Increment the src or dest pointer to the scratchpad, for each
2455*13fbcb42Sjoerg     // new element.
2456*13fbcb42Sjoerg     bool IncrScratchpadSrc = false;
2457*13fbcb42Sjoerg     bool IncrScratchpadDest = false;
2458*13fbcb42Sjoerg 
2459*13fbcb42Sjoerg     switch (Action) {
2460*13fbcb42Sjoerg     case RemoteLaneToThread: {
2461*13fbcb42Sjoerg       // Step 1.1: Get the address for the src element in the Reduce list.
2462*13fbcb42Sjoerg       Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
2463*13fbcb42Sjoerg       SrcElementAddr = CGF.EmitLoadOfPointer(
2464*13fbcb42Sjoerg           SrcElementPtrAddr,
2465*13fbcb42Sjoerg           C.getPointerType(Private->getType())->castAs<PointerType>());
2466*13fbcb42Sjoerg 
2467*13fbcb42Sjoerg       // Step 1.2: Create a temporary to store the element in the destination
2468*13fbcb42Sjoerg       // Reduce list.
2469*13fbcb42Sjoerg       DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
2470*13fbcb42Sjoerg       DestElementAddr =
2471*13fbcb42Sjoerg           CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
2472*13fbcb42Sjoerg       ShuffleInElement = true;
2473*13fbcb42Sjoerg       UpdateDestListPtr = true;
2474*13fbcb42Sjoerg       break;
2475*13fbcb42Sjoerg     }
2476*13fbcb42Sjoerg     case ThreadCopy: {
2477*13fbcb42Sjoerg       // Step 1.1: Get the address for the src element in the Reduce list.
2478*13fbcb42Sjoerg       Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
2479*13fbcb42Sjoerg       SrcElementAddr = CGF.EmitLoadOfPointer(
2480*13fbcb42Sjoerg           SrcElementPtrAddr,
2481*13fbcb42Sjoerg           C.getPointerType(Private->getType())->castAs<PointerType>());
2482*13fbcb42Sjoerg 
2483*13fbcb42Sjoerg       // Step 1.2: Get the address for dest element.  The destination
2484*13fbcb42Sjoerg       // element has already been created on the thread's stack.
2485*13fbcb42Sjoerg       DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
2486*13fbcb42Sjoerg       DestElementAddr = CGF.EmitLoadOfPointer(
2487*13fbcb42Sjoerg           DestElementPtrAddr,
2488*13fbcb42Sjoerg           C.getPointerType(Private->getType())->castAs<PointerType>());
2489*13fbcb42Sjoerg       break;
2490*13fbcb42Sjoerg     }
2491*13fbcb42Sjoerg     case ThreadToScratchpad: {
2492*13fbcb42Sjoerg       // Step 1.1: Get the address for the src element in the Reduce list.
2493*13fbcb42Sjoerg       Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
2494*13fbcb42Sjoerg       SrcElementAddr = CGF.EmitLoadOfPointer(
2495*13fbcb42Sjoerg           SrcElementPtrAddr,
2496*13fbcb42Sjoerg           C.getPointerType(Private->getType())->castAs<PointerType>());
2497*13fbcb42Sjoerg 
2498*13fbcb42Sjoerg       // Step 1.2: Get the address for dest element:
2499*13fbcb42Sjoerg       // address = base + index * ElementSizeInChars.
2500*13fbcb42Sjoerg       llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
2501*13fbcb42Sjoerg       llvm::Value *CurrentOffset =
2502*13fbcb42Sjoerg           Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
2503*13fbcb42Sjoerg       llvm::Value *ScratchPadElemAbsolutePtrVal =
2504*13fbcb42Sjoerg           Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
2505*13fbcb42Sjoerg       ScratchPadElemAbsolutePtrVal =
2506*13fbcb42Sjoerg           Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
2507*13fbcb42Sjoerg       DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
2508*13fbcb42Sjoerg                                 C.getTypeAlignInChars(Private->getType()));
2509*13fbcb42Sjoerg       IncrScratchpadDest = true;
2510*13fbcb42Sjoerg       break;
2511*13fbcb42Sjoerg     }
2512*13fbcb42Sjoerg     case ScratchpadToThread: {
2513*13fbcb42Sjoerg       // Step 1.1: Get the address for the src element in the scratchpad.
2514*13fbcb42Sjoerg       // address = base + index * ElementSizeInChars.
2515*13fbcb42Sjoerg       llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
2516*13fbcb42Sjoerg       llvm::Value *CurrentOffset =
2517*13fbcb42Sjoerg           Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
2518*13fbcb42Sjoerg       llvm::Value *ScratchPadElemAbsolutePtrVal =
2519*13fbcb42Sjoerg           Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
2520*13fbcb42Sjoerg       ScratchPadElemAbsolutePtrVal =
2521*13fbcb42Sjoerg           Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
2522*13fbcb42Sjoerg       SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
2523*13fbcb42Sjoerg                                C.getTypeAlignInChars(Private->getType()));
2524*13fbcb42Sjoerg       IncrScratchpadSrc = true;
2525*13fbcb42Sjoerg 
2526*13fbcb42Sjoerg       // Step 1.2: Create a temporary to store the element in the destination
2527*13fbcb42Sjoerg       // Reduce list.
2528*13fbcb42Sjoerg       DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
2529*13fbcb42Sjoerg       DestElementAddr =
2530*13fbcb42Sjoerg           CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
2531*13fbcb42Sjoerg       UpdateDestListPtr = true;
2532*13fbcb42Sjoerg       break;
2533*13fbcb42Sjoerg     }
2534*13fbcb42Sjoerg     }
2535*13fbcb42Sjoerg 
2536*13fbcb42Sjoerg     // Regardless of the src and dest of the copy, we emit the load of the
2537*13fbcb42Sjoerg     // src element, as this is required in all copy directions.
2538*13fbcb42Sjoerg     SrcElementAddr = Bld.CreateElementBitCast(
2539*13fbcb42Sjoerg         SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
2540*13fbcb42Sjoerg     DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
2541*13fbcb42Sjoerg                                                SrcElementAddr.getElementType());
2542*13fbcb42Sjoerg 
2543*13fbcb42Sjoerg     // Now that all active lanes have read the element in the
2544*13fbcb42Sjoerg     // Reduce list, shuffle over the value from the remote lane.
2545*13fbcb42Sjoerg     if (ShuffleInElement) {
2546*13fbcb42Sjoerg       shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
2547*13fbcb42Sjoerg                       RemoteLaneOffset, Private->getExprLoc());
2548*13fbcb42Sjoerg     } else {
2549*13fbcb42Sjoerg       switch (CGF.getEvaluationKind(Private->getType())) {
2550*13fbcb42Sjoerg       case TEK_Scalar: {
2551*13fbcb42Sjoerg         llvm::Value *Elem = CGF.EmitLoadOfScalar(
2552*13fbcb42Sjoerg             SrcElementAddr, /*Volatile=*/false, Private->getType(),
2553*13fbcb42Sjoerg             Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
2554*13fbcb42Sjoerg             TBAAAccessInfo());
2555*13fbcb42Sjoerg         // Store the source element value to the dest element address.
2556*13fbcb42Sjoerg         CGF.EmitStoreOfScalar(
2557*13fbcb42Sjoerg             Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
2558*13fbcb42Sjoerg             LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
2559*13fbcb42Sjoerg         break;
2560*13fbcb42Sjoerg       }
2561*13fbcb42Sjoerg       case TEK_Complex: {
2562*13fbcb42Sjoerg         CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
2563*13fbcb42Sjoerg             CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
2564*13fbcb42Sjoerg             Private->getExprLoc());
2565*13fbcb42Sjoerg         CGF.EmitStoreOfComplex(
2566*13fbcb42Sjoerg             Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
2567*13fbcb42Sjoerg             /*isInit=*/false);
2568*13fbcb42Sjoerg         break;
2569*13fbcb42Sjoerg       }
2570*13fbcb42Sjoerg       case TEK_Aggregate:
2571*13fbcb42Sjoerg         CGF.EmitAggregateCopy(
2572*13fbcb42Sjoerg             CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
2573*13fbcb42Sjoerg             CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
2574*13fbcb42Sjoerg             Private->getType(), AggValueSlot::DoesNotOverlap);
2575*13fbcb42Sjoerg         break;
2576*13fbcb42Sjoerg       }
2577*13fbcb42Sjoerg     }
2578*13fbcb42Sjoerg 
2579*13fbcb42Sjoerg     // Step 3.1: Modify reference in dest Reduce list as needed.
2580*13fbcb42Sjoerg     // Modifying the reference in Reduce list to point to the newly
2581*13fbcb42Sjoerg     // created element.  The element is live in the current function
2582*13fbcb42Sjoerg     // scope and that of functions it invokes (i.e., reduce_function).
2583*13fbcb42Sjoerg     // RemoteReduceData[i] = (void*)&RemoteElem
2584*13fbcb42Sjoerg     if (UpdateDestListPtr) {
2585*13fbcb42Sjoerg       CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
2586*13fbcb42Sjoerg                                 DestElementAddr.getPointer(), CGF.VoidPtrTy),
2587*13fbcb42Sjoerg                             DestElementPtrAddr, /*Volatile=*/false,
2588*13fbcb42Sjoerg                             C.VoidPtrTy);
2589*13fbcb42Sjoerg     }
2590*13fbcb42Sjoerg 
2591*13fbcb42Sjoerg     // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
2592*13fbcb42Sjoerg     // address of the next element in scratchpad memory, unless we're currently
2593*13fbcb42Sjoerg     // processing the last one.  Memory alignment is also taken care of here.
2594*13fbcb42Sjoerg     if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
2595*13fbcb42Sjoerg       llvm::Value *ScratchpadBasePtr =
2596*13fbcb42Sjoerg           IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
2597*13fbcb42Sjoerg       llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
2598*13fbcb42Sjoerg       ScratchpadBasePtr = Bld.CreateNUWAdd(
2599*13fbcb42Sjoerg           ScratchpadBasePtr,
2600*13fbcb42Sjoerg           Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
2601*13fbcb42Sjoerg 
2602*13fbcb42Sjoerg       // Take care of global memory alignment for performance
2603*13fbcb42Sjoerg       ScratchpadBasePtr = Bld.CreateNUWSub(
2604*13fbcb42Sjoerg           ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
2605*13fbcb42Sjoerg       ScratchpadBasePtr = Bld.CreateUDiv(
2606*13fbcb42Sjoerg           ScratchpadBasePtr,
2607*13fbcb42Sjoerg           llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
2608*13fbcb42Sjoerg       ScratchpadBasePtr = Bld.CreateNUWAdd(
2609*13fbcb42Sjoerg           ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
2610*13fbcb42Sjoerg       ScratchpadBasePtr = Bld.CreateNUWMul(
2611*13fbcb42Sjoerg           ScratchpadBasePtr,
2612*13fbcb42Sjoerg           llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
2613*13fbcb42Sjoerg 
2614*13fbcb42Sjoerg       if (IncrScratchpadDest)
2615*13fbcb42Sjoerg         DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
2616*13fbcb42Sjoerg       else /* IncrScratchpadSrc = true */
2617*13fbcb42Sjoerg         SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
2618*13fbcb42Sjoerg     }
2619*13fbcb42Sjoerg 
2620*13fbcb42Sjoerg     ++Idx;
2621*13fbcb42Sjoerg   }
2622*13fbcb42Sjoerg }
2623*13fbcb42Sjoerg 
2624*13fbcb42Sjoerg /// This function emits a helper that gathers Reduce lists from the first
2625*13fbcb42Sjoerg /// lane of every active warp to lanes in the first warp.
2626*13fbcb42Sjoerg ///
2627*13fbcb42Sjoerg /// void inter_warp_copy_func(void* reduce_data, num_warps)
2628*13fbcb42Sjoerg ///   shared smem[warp_size];
2629*13fbcb42Sjoerg ///   For all data entries D in reduce_data:
2630*13fbcb42Sjoerg ///     sync
2631*13fbcb42Sjoerg ///     If (I am the first lane in each warp)
2632*13fbcb42Sjoerg ///       Copy my local D to smem[warp_id]
2633*13fbcb42Sjoerg ///     sync
2634*13fbcb42Sjoerg ///     if (I am the first warp)
2635*13fbcb42Sjoerg ///       Copy smem[thread_id] to my local D
2636*13fbcb42Sjoerg static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
2637*13fbcb42Sjoerg                                               ArrayRef<const Expr *> Privates,
2638*13fbcb42Sjoerg                                               QualType ReductionArrayTy,
2639*13fbcb42Sjoerg                                               SourceLocation Loc) {
2640*13fbcb42Sjoerg   ASTContext &C = CGM.getContext();
2641*13fbcb42Sjoerg   llvm::Module &M = CGM.getModule();
2642*13fbcb42Sjoerg 
2643*13fbcb42Sjoerg   // ReduceList: thread local Reduce list.
2644*13fbcb42Sjoerg   // At the stage of the computation when this function is called, partially
2645*13fbcb42Sjoerg   // aggregated values reside in the first lane of every active warp.
2646*13fbcb42Sjoerg   ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2647*13fbcb42Sjoerg                                   C.VoidPtrTy, ImplicitParamDecl::Other);
2648*13fbcb42Sjoerg   // NumWarps: number of warps active in the parallel region.  This could
2649*13fbcb42Sjoerg   // be smaller than 32 (max warps in a CTA) for partial block reduction.
2650*13fbcb42Sjoerg   ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2651*13fbcb42Sjoerg                                 C.getIntTypeForBitwidth(32, /* Signed */ true),
2652*13fbcb42Sjoerg                                 ImplicitParamDecl::Other);
2653*13fbcb42Sjoerg   FunctionArgList Args;
2654*13fbcb42Sjoerg   Args.push_back(&ReduceListArg);
2655*13fbcb42Sjoerg   Args.push_back(&NumWarpsArg);
2656*13fbcb42Sjoerg 
2657*13fbcb42Sjoerg   const CGFunctionInfo &CGFI =
2658*13fbcb42Sjoerg       CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2659*13fbcb42Sjoerg   auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
2660*13fbcb42Sjoerg                                     llvm::GlobalValue::InternalLinkage,
2661*13fbcb42Sjoerg                                     "_omp_reduction_inter_warp_copy_func", &M);
2662*13fbcb42Sjoerg   CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2663*13fbcb42Sjoerg   Fn->setDoesNotRecurse();
2664*13fbcb42Sjoerg   CodeGenFunction CGF(CGM);
2665*13fbcb42Sjoerg   CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2666*13fbcb42Sjoerg 
2667*13fbcb42Sjoerg   CGBuilderTy &Bld = CGF.Builder;
2668*13fbcb42Sjoerg 
2669*13fbcb42Sjoerg   // This array is used as a medium to transfer, one reduce element at a time,
2670*13fbcb42Sjoerg   // the data from the first lane of every warp to lanes in the first warp
2671*13fbcb42Sjoerg   // in order to perform the final step of a reduction in a parallel region
2672*13fbcb42Sjoerg   // (reduction across warps).  The array is placed in NVPTX __shared__ memory
2673*13fbcb42Sjoerg   // for reduced latency, as well as to have a distinct copy for concurrently
2674*13fbcb42Sjoerg   // executing target regions.  The array is declared with weak linkage so
2675*13fbcb42Sjoerg   // that a single copy is shared across compilation units.
2676*13fbcb42Sjoerg   StringRef TransferMediumName =
2677*13fbcb42Sjoerg       "__openmp_nvptx_data_transfer_temporary_storage";
2678*13fbcb42Sjoerg   llvm::GlobalVariable *TransferMedium =
2679*13fbcb42Sjoerg       M.getGlobalVariable(TransferMediumName);
2680*13fbcb42Sjoerg   unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
2681*13fbcb42Sjoerg   if (!TransferMedium) {
2682*13fbcb42Sjoerg     auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
2683*13fbcb42Sjoerg     unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
2684*13fbcb42Sjoerg     TransferMedium = new llvm::GlobalVariable(
2685*13fbcb42Sjoerg         M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
2686*13fbcb42Sjoerg         llvm::UndefValue::get(Ty), TransferMediumName,
2687*13fbcb42Sjoerg         /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
2688*13fbcb42Sjoerg         SharedAddressSpace);
2689*13fbcb42Sjoerg     CGM.addCompilerUsedGlobal(TransferMedium);
2690*13fbcb42Sjoerg   }
2691*13fbcb42Sjoerg 
2692*13fbcb42Sjoerg   auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
2693*13fbcb42Sjoerg   // Get the CUDA thread id of the current OpenMP thread on the GPU.
2694*13fbcb42Sjoerg   llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
2695*13fbcb42Sjoerg   // nvptx_lane_id = nvptx_id % warpsize
2696*13fbcb42Sjoerg   llvm::Value *LaneID = getNVPTXLaneID(CGF);
2697*13fbcb42Sjoerg   // nvptx_warp_id = nvptx_id / warpsize
2698*13fbcb42Sjoerg   llvm::Value *WarpID = getNVPTXWarpID(CGF);
2699*13fbcb42Sjoerg 
2700*13fbcb42Sjoerg   Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2701*13fbcb42Sjoerg   Address LocalReduceList(
2702*13fbcb42Sjoerg       Bld.CreatePointerBitCastOrAddrSpaceCast(
2703*13fbcb42Sjoerg           CGF.EmitLoadOfScalar(
2704*13fbcb42Sjoerg               AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
2705*13fbcb42Sjoerg               LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
2706*13fbcb42Sjoerg           CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2707*13fbcb42Sjoerg       CGF.getPointerAlign());
2708*13fbcb42Sjoerg 
2709*13fbcb42Sjoerg   unsigned Idx = 0;
2710*13fbcb42Sjoerg   for (const Expr *Private : Privates) {
2711*13fbcb42Sjoerg     //
2712*13fbcb42Sjoerg     // Warp master copies reduce element to transfer medium in __shared__
2713*13fbcb42Sjoerg     // memory.
2714*13fbcb42Sjoerg     //
2715*13fbcb42Sjoerg     unsigned RealTySize =
2716*13fbcb42Sjoerg         C.getTypeSizeInChars(Private->getType())
2717*13fbcb42Sjoerg             .alignTo(C.getTypeAlignInChars(Private->getType()))
2718*13fbcb42Sjoerg             .getQuantity();
2719*13fbcb42Sjoerg     for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
2720*13fbcb42Sjoerg       unsigned NumIters = RealTySize / TySize;
2721*13fbcb42Sjoerg       if (NumIters == 0)
2722*13fbcb42Sjoerg         continue;
2723*13fbcb42Sjoerg       QualType CType = C.getIntTypeForBitwidth(
2724*13fbcb42Sjoerg           C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
2725*13fbcb42Sjoerg       llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
2726*13fbcb42Sjoerg       CharUnits Align = CharUnits::fromQuantity(TySize);
2727*13fbcb42Sjoerg       llvm::Value *Cnt = nullptr;
2728*13fbcb42Sjoerg       Address CntAddr = Address::invalid();
2729*13fbcb42Sjoerg       llvm::BasicBlock *PrecondBB = nullptr;
2730*13fbcb42Sjoerg       llvm::BasicBlock *ExitBB = nullptr;
2731*13fbcb42Sjoerg       if (NumIters > 1) {
2732*13fbcb42Sjoerg         CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
2733*13fbcb42Sjoerg         CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
2734*13fbcb42Sjoerg                               /*Volatile=*/false, C.IntTy);
2735*13fbcb42Sjoerg         PrecondBB = CGF.createBasicBlock("precond");
2736*13fbcb42Sjoerg         ExitBB = CGF.createBasicBlock("exit");
2737*13fbcb42Sjoerg         llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
2738*13fbcb42Sjoerg         // There is no need to emit line number for unconditional branch.
2739*13fbcb42Sjoerg         (void)ApplyDebugLocation::CreateEmpty(CGF);
2740*13fbcb42Sjoerg         CGF.EmitBlock(PrecondBB);
2741*13fbcb42Sjoerg         Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
2742*13fbcb42Sjoerg         llvm::Value *Cmp =
2743*13fbcb42Sjoerg             Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
2744*13fbcb42Sjoerg         Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
2745*13fbcb42Sjoerg         CGF.EmitBlock(BodyBB);
2746*13fbcb42Sjoerg       }
2747*13fbcb42Sjoerg       // kmpc_barrier: wait until the transfer medium is free for reuse.
2748*13fbcb42Sjoerg       CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
2749*13fbcb42Sjoerg                                              /*EmitChecks=*/false,
2750*13fbcb42Sjoerg                                              /*ForceSimpleCall=*/true);
2751*13fbcb42Sjoerg       llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2752*13fbcb42Sjoerg       llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2753*13fbcb42Sjoerg       llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2754*13fbcb42Sjoerg 
2755*13fbcb42Sjoerg       // if (lane_id == 0)
2756*13fbcb42Sjoerg       llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
2757*13fbcb42Sjoerg       Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2758*13fbcb42Sjoerg       CGF.EmitBlock(ThenBB);
2759*13fbcb42Sjoerg 
2760*13fbcb42Sjoerg       // Reduce element = LocalReduceList[i]
2761*13fbcb42Sjoerg       Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2762*13fbcb42Sjoerg       llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2763*13fbcb42Sjoerg           ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2764*13fbcb42Sjoerg       // elemptr = ((CopyType*)(elemptrptr)) + I
2765*13fbcb42Sjoerg       Address ElemPtr = Address(ElemPtrPtr, Align);
2766*13fbcb42Sjoerg       ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
2767*13fbcb42Sjoerg       if (NumIters > 1) {
2768*13fbcb42Sjoerg         ElemPtr = Address(Bld.CreateGEP(ElemPtr.getPointer(), Cnt),
2769*13fbcb42Sjoerg                           ElemPtr.getAlignment());
2770*13fbcb42Sjoerg       }
2771*13fbcb42Sjoerg 
2772*13fbcb42Sjoerg       // Get pointer to location in transfer medium.
2773*13fbcb42Sjoerg       // MediumPtr = &medium[warp_id]
2774*13fbcb42Sjoerg       llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
2775*13fbcb42Sjoerg           TransferMedium->getValueType(), TransferMedium,
2776*13fbcb42Sjoerg           {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
2777*13fbcb42Sjoerg       Address MediumPtr(MediumPtrVal, Align);
2778*13fbcb42Sjoerg       // Casting to actual data type.
2779*13fbcb42Sjoerg       // MediumPtr = (CopyType*)MediumPtrAddr;
2780*13fbcb42Sjoerg       MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType);
2781*13fbcb42Sjoerg 
2782*13fbcb42Sjoerg       // elem = *elemptr
2783*13fbcb42Sjoerg       //*MediumPtr = elem
2784*13fbcb42Sjoerg       llvm::Value *Elem = CGF.EmitLoadOfScalar(
2785*13fbcb42Sjoerg           ElemPtr, /*Volatile=*/false, CType, Loc,
2786*13fbcb42Sjoerg           LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
2787*13fbcb42Sjoerg       // Store the source element value to the dest element address.
2788*13fbcb42Sjoerg       CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
2789*13fbcb42Sjoerg                             LValueBaseInfo(AlignmentSource::Type),
2790*13fbcb42Sjoerg                             TBAAAccessInfo());
2791*13fbcb42Sjoerg 
2792*13fbcb42Sjoerg       Bld.CreateBr(MergeBB);
2793*13fbcb42Sjoerg 
2794*13fbcb42Sjoerg       CGF.EmitBlock(ElseBB);
2795*13fbcb42Sjoerg       Bld.CreateBr(MergeBB);
2796*13fbcb42Sjoerg 
2797*13fbcb42Sjoerg       CGF.EmitBlock(MergeBB);
2798*13fbcb42Sjoerg 
2799*13fbcb42Sjoerg       // kmpc_barrier: make the warp masters' stores visible to warp 0.
2800*13fbcb42Sjoerg       CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
2801*13fbcb42Sjoerg                                              /*EmitChecks=*/false,
2802*13fbcb42Sjoerg                                              /*ForceSimpleCall=*/true);
2803*13fbcb42Sjoerg 
2804*13fbcb42Sjoerg       //
2805*13fbcb42Sjoerg       // Warp 0 copies reduce element from transfer medium.
2806*13fbcb42Sjoerg       //
2807*13fbcb42Sjoerg       llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
2808*13fbcb42Sjoerg       llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
2809*13fbcb42Sjoerg       llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
2810*13fbcb42Sjoerg 
2811*13fbcb42Sjoerg       Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
2812*13fbcb42Sjoerg       llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
2813*13fbcb42Sjoerg           AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
2814*13fbcb42Sjoerg 
2815*13fbcb42Sjoerg       // Up to WarpSize threads in warp 0 are active.
2816*13fbcb42Sjoerg       llvm::Value *IsActiveThread =
2817*13fbcb42Sjoerg           Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
2818*13fbcb42Sjoerg       Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2819*13fbcb42Sjoerg 
2820*13fbcb42Sjoerg       CGF.EmitBlock(W0ThenBB);
2821*13fbcb42Sjoerg 
2822*13fbcb42Sjoerg       // SrcMediumPtr = &medium[tid]
2823*13fbcb42Sjoerg       llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
2824*13fbcb42Sjoerg           TransferMedium->getValueType(), TransferMedium,
2825*13fbcb42Sjoerg           {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
2826*13fbcb42Sjoerg       Address SrcMediumPtr(SrcMediumPtrVal, Align);
2827*13fbcb42Sjoerg       // SrcMediumVal = *SrcMediumPtr;
2828*13fbcb42Sjoerg       SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType);
2829*13fbcb42Sjoerg 
2830*13fbcb42Sjoerg       // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2831*13fbcb42Sjoerg       Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2832*13fbcb42Sjoerg       llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
2833*13fbcb42Sjoerg           TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
2834*13fbcb42Sjoerg       Address TargetElemPtr = Address(TargetElemPtrVal, Align);
2835*13fbcb42Sjoerg       TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
2836*13fbcb42Sjoerg       if (NumIters > 1) {
2837*13fbcb42Sjoerg         TargetElemPtr = Address(Bld.CreateGEP(TargetElemPtr.getPointer(), Cnt),
2838*13fbcb42Sjoerg                                 TargetElemPtr.getAlignment());
2839*13fbcb42Sjoerg       }
2840*13fbcb42Sjoerg 
2841*13fbcb42Sjoerg       // *TargetElemPtr = SrcMediumVal;
2842*13fbcb42Sjoerg       llvm::Value *SrcMediumValue =
2843*13fbcb42Sjoerg           CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
2844*13fbcb42Sjoerg       CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
2845*13fbcb42Sjoerg                             CType);
2846*13fbcb42Sjoerg       Bld.CreateBr(W0MergeBB);
2847*13fbcb42Sjoerg 
2848*13fbcb42Sjoerg       CGF.EmitBlock(W0ElseBB);
2849*13fbcb42Sjoerg       Bld.CreateBr(W0MergeBB);
2850*13fbcb42Sjoerg 
2851*13fbcb42Sjoerg       CGF.EmitBlock(W0MergeBB);
2852*13fbcb42Sjoerg 
2853*13fbcb42Sjoerg       if (NumIters > 1) {
2854*13fbcb42Sjoerg         Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
2855*13fbcb42Sjoerg         CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
2856*13fbcb42Sjoerg         CGF.EmitBranch(PrecondBB);
2857*13fbcb42Sjoerg         (void)ApplyDebugLocation::CreateEmpty(CGF);
2858*13fbcb42Sjoerg         CGF.EmitBlock(ExitBB);
2859*13fbcb42Sjoerg       }
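      // Any remainder (necessarily smaller than TySize) is copied by the
      // next, smaller chunk size.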
2860*13fbcb42Sjoerg       RealTySize %= TySize;
2861*13fbcb42Sjoerg     }
2862*13fbcb42Sjoerg     ++Idx;
2863*13fbcb42Sjoerg   }
2864*13fbcb42Sjoerg 
2865*13fbcb42Sjoerg   CGF.FinishFunction();
2866*13fbcb42Sjoerg   return Fn;
2867*13fbcb42Sjoerg }
2868*13fbcb42Sjoerg 
2869*13fbcb42Sjoerg /// Emit a helper that reduces data across two OpenMP threads (lanes)
2870*13fbcb42Sjoerg /// in the same warp.  It uses shuffle instructions to copy over data from
2871*13fbcb42Sjoerg /// a remote lane's stack.  The reduction algorithm performed is specified
2872*13fbcb42Sjoerg /// by the fourth parameter.
2873*13fbcb42Sjoerg ///
2874*13fbcb42Sjoerg /// Algorithm Versions.
2875*13fbcb42Sjoerg /// Full Warp Reduce (argument value 0):
2876*13fbcb42Sjoerg ///   This algorithm assumes that all 32 lanes are active and gathers
2877*13fbcb42Sjoerg ///   data from these 32 lanes, producing a single resultant value.
2878*13fbcb42Sjoerg /// Contiguous Partial Warp Reduce (argument value 1):
2879*13fbcb42Sjoerg ///   This algorithm assumes that only a *contiguous* subset of lanes
2880*13fbcb42Sjoerg ///   are active.  This happens for the last warp in a parallel region
2881*13fbcb42Sjoerg ///   when the user-specified num_threads is not an integer multiple of
2882*13fbcb42Sjoerg ///   32.  This contiguous subset always starts with the zeroth lane.
2883*13fbcb42Sjoerg /// Partial Warp Reduce (argument value 2):
2884*13fbcb42Sjoerg ///   This algorithm gathers data from any number of lanes at any position.
2885*13fbcb42Sjoerg /// All reduced values are stored in the lowest possible lane.  The set
2886*13fbcb42Sjoerg /// of problems every algorithm addresses is a superset of those
2887*13fbcb42Sjoerg /// addressable by algorithms with a lower version number.  Overhead
2888*13fbcb42Sjoerg /// increases as algorithm version increases.
2889*13fbcb42Sjoerg ///
2890*13fbcb42Sjoerg /// Terminology
2891*13fbcb42Sjoerg /// Reduce element:
2892*13fbcb42Sjoerg ///   Reduce element refers to the individual data field with primitive
2893*13fbcb42Sjoerg ///   data types to be combined and reduced across threads.
2894*13fbcb42Sjoerg /// Reduce list:
2895*13fbcb42Sjoerg ///   Reduce list refers to a collection of local, thread-private
2896*13fbcb42Sjoerg ///   reduce elements.
2897*13fbcb42Sjoerg /// Remote Reduce list:
2898*13fbcb42Sjoerg ///   Remote Reduce list refers to a collection of remote (relative to
2899*13fbcb42Sjoerg ///   the current thread) reduce elements.
2900*13fbcb42Sjoerg ///
2901*13fbcb42Sjoerg /// We distinguish between three states of threads that are important to
2902*13fbcb42Sjoerg /// the implementation of this function.
2903*13fbcb42Sjoerg /// Alive threads:
2904*13fbcb42Sjoerg ///   Threads in a warp executing the SIMT instruction, as distinguished from
2905*13fbcb42Sjoerg ///   threads that are inactive due to divergent control flow.
2906*13fbcb42Sjoerg /// Active threads:
2907*13fbcb42Sjoerg ///   The minimal set of threads that has to be alive upon entry to this
2908*13fbcb42Sjoerg ///   function.  The computation is correct iff active threads are alive.
2909*13fbcb42Sjoerg ///   Some threads are alive but they are not active because they do not
2910*13fbcb42Sjoerg ///   contribute to the computation in any useful manner.  Turning them off
2911*13fbcb42Sjoerg ///   may introduce control flow overheads without any tangible benefits.
2912*13fbcb42Sjoerg /// Effective threads:
2913*13fbcb42Sjoerg ///   In order to comply with the argument requirements of the shuffle
2914*13fbcb42Sjoerg ///   function, we must keep all lanes holding data alive.  But at most
2915*13fbcb42Sjoerg ///   half of them perform value aggregation; we refer to this half of
2916*13fbcb42Sjoerg ///   threads as effective.  The other half simply hands off its
2917*13fbcb42Sjoerg ///   data.
2918*13fbcb42Sjoerg ///
2919*13fbcb42Sjoerg /// Procedure
2920*13fbcb42Sjoerg /// Value shuffle:
2921*13fbcb42Sjoerg ///   In this step active threads transfer data from higher lane positions
2922*13fbcb42Sjoerg ///   in the warp to lower lane positions, creating Remote Reduce list.
2923*13fbcb42Sjoerg /// Value aggregation:
2924*13fbcb42Sjoerg ///   In this step, effective threads combine their thread local Reduce list
2925*13fbcb42Sjoerg ///   with Remote Reduce list and store the result in the thread local
2926*13fbcb42Sjoerg ///   Reduce list.
2927*13fbcb42Sjoerg /// Value copy:
2928*13fbcb42Sjoerg ///   In this step, we deal with the assumption made by algorithm 2
2929*13fbcb42Sjoerg ///   (i.e. contiguity assumption).  When we have an odd number of lanes
2930*13fbcb42Sjoerg ///   active, say 2k+1, only k threads will be effective and therefore k
2931*13fbcb42Sjoerg ///   new values will be produced.  However, the Reduce list owned by the
2932*13fbcb42Sjoerg ///   (2k+1)th thread is ignored in the value aggregation.  Therefore
2933*13fbcb42Sjoerg ///   we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
2934*13fbcb42Sjoerg ///   that the contiguity assumption still holds.
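///
/// As an informal illustration (a sketch, not the emitted IR; all names
/// purely illustrative), the function behaves per element as:
///
///   shuffle_and_reduce(reduce_list, lane_id, offset, algo_ver) {
///     remote_reduce_list = shuffle_down(reduce_list, offset); // value shuffle
///     if (algo_ver == 0 ||
///         (algo_ver == 1 && lane_id < offset) ||
///         (algo_ver == 2 && lane_id % 2 == 0 && offset > 0))
///       reduce_list = reduce_list REDUCE_OP remote_reduce_list; // aggregation
///     if (algo_ver == 1 && lane_id >= offset)
///       reduce_list = remote_reduce_list;                       // value copy
///   }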
2935*13fbcb42Sjoerg static llvm::Function *emitShuffleAndReduceFunction(
2936*13fbcb42Sjoerg     CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2937*13fbcb42Sjoerg     QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
2938*13fbcb42Sjoerg   ASTContext &C = CGM.getContext();
2939*13fbcb42Sjoerg 
2940*13fbcb42Sjoerg   // Thread local Reduce list used to host the values of data to be reduced.
2941*13fbcb42Sjoerg   ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2942*13fbcb42Sjoerg                                   C.VoidPtrTy, ImplicitParamDecl::Other);
2943*13fbcb42Sjoerg   // Current lane id; could be logical.
2944*13fbcb42Sjoerg   ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
2945*13fbcb42Sjoerg                               ImplicitParamDecl::Other);
2946*13fbcb42Sjoerg   // Offset of the remote source lane relative to the current lane.
2947*13fbcb42Sjoerg   ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2948*13fbcb42Sjoerg                                         C.ShortTy, ImplicitParamDecl::Other);
2949*13fbcb42Sjoerg   // Algorithm version.  This is expected to be known at compile time.
2950*13fbcb42Sjoerg   ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2951*13fbcb42Sjoerg                                C.ShortTy, ImplicitParamDecl::Other);
2952*13fbcb42Sjoerg   FunctionArgList Args;
2953*13fbcb42Sjoerg   Args.push_back(&ReduceListArg);
2954*13fbcb42Sjoerg   Args.push_back(&LaneIDArg);
2955*13fbcb42Sjoerg   Args.push_back(&RemoteLaneOffsetArg);
2956*13fbcb42Sjoerg   Args.push_back(&AlgoVerArg);
2957*13fbcb42Sjoerg 
2958*13fbcb42Sjoerg   const CGFunctionInfo &CGFI =
2959*13fbcb42Sjoerg       CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2960*13fbcb42Sjoerg   auto *Fn = llvm::Function::Create(
2961*13fbcb42Sjoerg       CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2962*13fbcb42Sjoerg       "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
2963*13fbcb42Sjoerg   CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2964*13fbcb42Sjoerg   Fn->setDoesNotRecurse();
2965*13fbcb42Sjoerg 
2966*13fbcb42Sjoerg   CodeGenFunction CGF(CGM);
2967*13fbcb42Sjoerg   CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2968*13fbcb42Sjoerg 
2969*13fbcb42Sjoerg   CGBuilderTy &Bld = CGF.Builder;
2970*13fbcb42Sjoerg 
2971*13fbcb42Sjoerg   Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2972*13fbcb42Sjoerg   Address LocalReduceList(
2973*13fbcb42Sjoerg       Bld.CreatePointerBitCastOrAddrSpaceCast(
2974*13fbcb42Sjoerg           CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2975*13fbcb42Sjoerg                                C.VoidPtrTy, SourceLocation()),
2976*13fbcb42Sjoerg           CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2977*13fbcb42Sjoerg       CGF.getPointerAlign());
2978*13fbcb42Sjoerg 
2979*13fbcb42Sjoerg   Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
2980*13fbcb42Sjoerg   llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
2981*13fbcb42Sjoerg       AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2982*13fbcb42Sjoerg 
2983*13fbcb42Sjoerg   Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
2984*13fbcb42Sjoerg   llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
2985*13fbcb42Sjoerg       AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2986*13fbcb42Sjoerg 
2987*13fbcb42Sjoerg   Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
2988*13fbcb42Sjoerg   llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
2989*13fbcb42Sjoerg       AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2990*13fbcb42Sjoerg 
2991*13fbcb42Sjoerg   // Create a local thread-private variable to host the Reduce list
2992*13fbcb42Sjoerg   // from a remote lane.
2993*13fbcb42Sjoerg   Address RemoteReduceList =
2994*13fbcb42Sjoerg       CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
2995*13fbcb42Sjoerg 
2996*13fbcb42Sjoerg   // This loop iterates through the list of reduce elements and copies,
2997*13fbcb42Sjoerg   // element by element, from a remote lane in the warp to RemoteReduceList,
2998*13fbcb42Sjoerg   // hosted on the thread's stack.
2999*13fbcb42Sjoerg   emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
3000*13fbcb42Sjoerg                         LocalReduceList, RemoteReduceList,
3001*13fbcb42Sjoerg                         {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
3002*13fbcb42Sjoerg                          /*ScratchpadIndex=*/nullptr,
3003*13fbcb42Sjoerg                          /*ScratchpadWidth=*/nullptr});
3004*13fbcb42Sjoerg 
3005*13fbcb42Sjoerg   // The actions to be performed on the Remote Reduce list are dependent
3006*13fbcb42Sjoerg   // on the algorithm version.
3007*13fbcb42Sjoerg   //
3008*13fbcb42Sjoerg   //  if ((AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) ||
3009*13fbcb42Sjoerg   //      (AlgoVer==2 && LaneId % 2 == 0 && Offset > 0)):
3010*13fbcb42Sjoerg   //    do the reduction value aggregation
3011*13fbcb42Sjoerg   //
3012*13fbcb42Sjoerg   //  The thread local variable Reduce list is mutated in place to host the
3013*13fbcb42Sjoerg   //  reduced data, which is the aggregated value produced from local and
3014*13fbcb42Sjoerg   //  remote lanes.
3015*13fbcb42Sjoerg   //
3016*13fbcb42Sjoerg   //  Note that AlgoVer is expected to be a constant integer known at compile
3017*13fbcb42Sjoerg   //  time.
3018*13fbcb42Sjoerg   //  When AlgoVer==0, the first conjunction evaluates to true, making
3019*13fbcb42Sjoerg   //    the entire predicate true at compile time.
3020*13fbcb42Sjoerg   //  When AlgoVer==1, only the second part of the second conjunction has
3021*13fbcb42Sjoerg   //    to be evaluated at runtime.  The other conjunctions evaluate to
3022*13fbcb42Sjoerg   //    false at compile time.
3023*13fbcb42Sjoerg   //  When AlgoVer==2, only the second part of the third conjunction has
3024*13fbcb42Sjoerg   //    to be evaluated at runtime.  The other conjunctions evaluate to
3025*13fbcb42Sjoerg   //    false at compile time.
3026*13fbcb42Sjoerg   llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
3027*13fbcb42Sjoerg 
3028*13fbcb42Sjoerg   llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3029*13fbcb42Sjoerg   llvm::Value *CondAlgo1 = Bld.CreateAnd(
3030*13fbcb42Sjoerg       Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
3031*13fbcb42Sjoerg 
3032*13fbcb42Sjoerg   llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
3033*13fbcb42Sjoerg   llvm::Value *CondAlgo2 = Bld.CreateAnd(
3034*13fbcb42Sjoerg       Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
3035*13fbcb42Sjoerg   CondAlgo2 = Bld.CreateAnd(
3036*13fbcb42Sjoerg       CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
3037*13fbcb42Sjoerg 
3038*13fbcb42Sjoerg   llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
3039*13fbcb42Sjoerg   CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
3040*13fbcb42Sjoerg 
3041*13fbcb42Sjoerg   llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3042*13fbcb42Sjoerg   llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3043*13fbcb42Sjoerg   llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3044*13fbcb42Sjoerg   Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
3045*13fbcb42Sjoerg 
3046*13fbcb42Sjoerg   CGF.EmitBlock(ThenBB);
3047*13fbcb42Sjoerg   // reduce_function(LocalReduceList, RemoteReduceList)
3048*13fbcb42Sjoerg   llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3049*13fbcb42Sjoerg       LocalReduceList.getPointer(), CGF.VoidPtrTy);
3050*13fbcb42Sjoerg   llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3051*13fbcb42Sjoerg       RemoteReduceList.getPointer(), CGF.VoidPtrTy);
3052*13fbcb42Sjoerg   CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3053*13fbcb42Sjoerg       CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
3054*13fbcb42Sjoerg   Bld.CreateBr(MergeBB);
3055*13fbcb42Sjoerg 
3056*13fbcb42Sjoerg   CGF.EmitBlock(ElseBB);
3057*13fbcb42Sjoerg   Bld.CreateBr(MergeBB);
3058*13fbcb42Sjoerg 
3059*13fbcb42Sjoerg   CGF.EmitBlock(MergeBB);
3060*13fbcb42Sjoerg 
3061*13fbcb42Sjoerg   // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3062*13fbcb42Sjoerg   // Reduce list.
3063*13fbcb42Sjoerg   Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3064*13fbcb42Sjoerg   llvm::Value *CondCopy = Bld.CreateAnd(
3065*13fbcb42Sjoerg       Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
3066*13fbcb42Sjoerg 
3067*13fbcb42Sjoerg   llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
3068*13fbcb42Sjoerg   llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
3069*13fbcb42Sjoerg   llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
3070*13fbcb42Sjoerg   Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3071*13fbcb42Sjoerg 
3072*13fbcb42Sjoerg   CGF.EmitBlock(CpyThenBB);
3073*13fbcb42Sjoerg   emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
3074*13fbcb42Sjoerg                         RemoteReduceList, LocalReduceList);
3075*13fbcb42Sjoerg   Bld.CreateBr(CpyMergeBB);
3076*13fbcb42Sjoerg 
3077*13fbcb42Sjoerg   CGF.EmitBlock(CpyElseBB);
3078*13fbcb42Sjoerg   Bld.CreateBr(CpyMergeBB);
3079*13fbcb42Sjoerg 
3080*13fbcb42Sjoerg   CGF.EmitBlock(CpyMergeBB);
3081*13fbcb42Sjoerg 
3082*13fbcb42Sjoerg   CGF.FinishFunction();
3083*13fbcb42Sjoerg   return Fn;
3084*13fbcb42Sjoerg }
3085*13fbcb42Sjoerg 
3086*13fbcb42Sjoerg /// This function emits a helper that copies all the reduction variables from
3087*13fbcb42Sjoerg /// the team's local reduce list into the provided global reduction buffer.
3088*13fbcb42Sjoerg ///
3089*13fbcb42Sjoerg /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
3090*13fbcb42Sjoerg ///   For all data entries D in reduce_data:
3091*13fbcb42Sjoerg ///     Copy local D to buffer.D[Idx]
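///
/// For two hypothetical reduction variables foo and bar this amounts to:
///
///   buffer.foo[Idx] = local_foo;
///   buffer.bar[Idx] = local_bar;
///
/// where local_foo and local_bar stand for the thread-private copies
/// referenced by reduce_data.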
3092*13fbcb42Sjoerg static llvm::Value *emitListToGlobalCopyFunction(
3093*13fbcb42Sjoerg     CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3094*13fbcb42Sjoerg     QualType ReductionArrayTy, SourceLocation Loc,
3095*13fbcb42Sjoerg     const RecordDecl *TeamReductionRec,
3096*13fbcb42Sjoerg     const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3097*13fbcb42Sjoerg         &VarFieldMap) {
3098*13fbcb42Sjoerg   ASTContext &C = CGM.getContext();
3099*13fbcb42Sjoerg 
3100*13fbcb42Sjoerg   // Buffer: global reduction buffer.
3101*13fbcb42Sjoerg   ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3102*13fbcb42Sjoerg                               C.VoidPtrTy, ImplicitParamDecl::Other);
3103*13fbcb42Sjoerg   // Idx: index of the buffer.
3104*13fbcb42Sjoerg   ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3105*13fbcb42Sjoerg                            ImplicitParamDecl::Other);
3106*13fbcb42Sjoerg   // ReduceList: thread local Reduce list.
3107*13fbcb42Sjoerg   ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3108*13fbcb42Sjoerg                                   C.VoidPtrTy, ImplicitParamDecl::Other);
3109*13fbcb42Sjoerg   FunctionArgList Args;
3110*13fbcb42Sjoerg   Args.push_back(&BufferArg);
3111*13fbcb42Sjoerg   Args.push_back(&IdxArg);
3112*13fbcb42Sjoerg   Args.push_back(&ReduceListArg);
3113*13fbcb42Sjoerg 
3114*13fbcb42Sjoerg   const CGFunctionInfo &CGFI =
3115*13fbcb42Sjoerg       CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3116*13fbcb42Sjoerg   auto *Fn = llvm::Function::Create(
3117*13fbcb42Sjoerg       CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3118*13fbcb42Sjoerg       "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
3119*13fbcb42Sjoerg   CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3120*13fbcb42Sjoerg   Fn->setDoesNotRecurse();
3121*13fbcb42Sjoerg   CodeGenFunction CGF(CGM);
3122*13fbcb42Sjoerg   CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3123*13fbcb42Sjoerg 
3124*13fbcb42Sjoerg   CGBuilderTy &Bld = CGF.Builder;
3125*13fbcb42Sjoerg 
3126*13fbcb42Sjoerg   Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3127*13fbcb42Sjoerg   Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3128*13fbcb42Sjoerg   Address LocalReduceList(
3129*13fbcb42Sjoerg       Bld.CreatePointerBitCastOrAddrSpaceCast(
3130*13fbcb42Sjoerg           CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3131*13fbcb42Sjoerg                                C.VoidPtrTy, Loc),
3132*13fbcb42Sjoerg           CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3133*13fbcb42Sjoerg       CGF.getPointerAlign());
3134*13fbcb42Sjoerg   QualType StaticTy = C.getRecordType(TeamReductionRec);
3135*13fbcb42Sjoerg   llvm::Type *LLVMReductionsBufferTy =
3136*13fbcb42Sjoerg       CGM.getTypes().ConvertTypeForMem(StaticTy);
3137*13fbcb42Sjoerg   llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3138*13fbcb42Sjoerg       CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3139*13fbcb42Sjoerg       LLVMReductionsBufferTy->getPointerTo());
3140*13fbcb42Sjoerg   llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3141*13fbcb42Sjoerg                          CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3142*13fbcb42Sjoerg                                               /*Volatile=*/false, C.IntTy,
3143*13fbcb42Sjoerg                                               Loc)};
3144*13fbcb42Sjoerg   unsigned Idx = 0;
3145*13fbcb42Sjoerg   for (const Expr *Private : Privates) {
3146*13fbcb42Sjoerg     // Reduce element = LocalReduceList[i]
3147*13fbcb42Sjoerg     Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3148*13fbcb42Sjoerg     llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3149*13fbcb42Sjoerg         ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3150*13fbcb42Sjoerg     // Cast elemptrptr to a pointer to the element's actual type.
3151*13fbcb42Sjoerg     ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3152*13fbcb42Sjoerg         ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
3153*13fbcb42Sjoerg     Address ElemPtr =
3154*13fbcb42Sjoerg         Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
3155*13fbcb42Sjoerg     const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
3156*13fbcb42Sjoerg     // Global = Buffer.VD[Idx];
3157*13fbcb42Sjoerg     const FieldDecl *FD = VarFieldMap.lookup(VD);
3158*13fbcb42Sjoerg     LValue GlobLVal = CGF.EmitLValueForField(
3159*13fbcb42Sjoerg         CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3160*13fbcb42Sjoerg     Address GlobAddr = GlobLVal.getAddress(CGF);
3161*13fbcb42Sjoerg     llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
3162*13fbcb42Sjoerg         GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
3163*13fbcb42Sjoerg     GlobLVal.setAddress(Address(BufferPtr, GlobAddr.getAlignment()));
3164*13fbcb42Sjoerg     switch (CGF.getEvaluationKind(Private->getType())) {
3165*13fbcb42Sjoerg     case TEK_Scalar: {
3166*13fbcb42Sjoerg       llvm::Value *V = CGF.EmitLoadOfScalar(
3167*13fbcb42Sjoerg           ElemPtr, /*Volatile=*/false, Private->getType(), Loc,
3168*13fbcb42Sjoerg           LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
3169*13fbcb42Sjoerg       CGF.EmitStoreOfScalar(V, GlobLVal);
3170*13fbcb42Sjoerg       break;
3171*13fbcb42Sjoerg     }
3172*13fbcb42Sjoerg     case TEK_Complex: {
3173*13fbcb42Sjoerg       CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
3174*13fbcb42Sjoerg           CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
3175*13fbcb42Sjoerg       CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
3176*13fbcb42Sjoerg       break;
3177*13fbcb42Sjoerg     }
3178*13fbcb42Sjoerg     case TEK_Aggregate:
3179*13fbcb42Sjoerg       CGF.EmitAggregateCopy(GlobLVal,
3180*13fbcb42Sjoerg                             CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3181*13fbcb42Sjoerg                             Private->getType(), AggValueSlot::DoesNotOverlap);
3182*13fbcb42Sjoerg       break;
3183*13fbcb42Sjoerg     }
3184*13fbcb42Sjoerg     ++Idx;
3185*13fbcb42Sjoerg   }
3186*13fbcb42Sjoerg 
3187*13fbcb42Sjoerg   CGF.FinishFunction();
3188*13fbcb42Sjoerg   return Fn;
3189*13fbcb42Sjoerg }
3190*13fbcb42Sjoerg 
3191*13fbcb42Sjoerg /// This function emits a helper that reduces all the reduction variables from
3192*13fbcb42Sjoerg /// the team's local reduce list into the provided global reduction buffer.
3193*13fbcb42Sjoerg ///
3194*13fbcb42Sjoerg /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
3195*13fbcb42Sjoerg ///  void *GlobPtrs[];
3196*13fbcb42Sjoerg ///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
3197*13fbcb42Sjoerg ///  ...
3198*13fbcb42Sjoerg ///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
3199*13fbcb42Sjoerg ///  reduce_function(GlobPtrs, reduce_data);
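///
/// For two hypothetical reduction variables foo and bar this amounts to:
///
///  GlobPtrs[0] = (void*)&buffer.foo[Idx];
///  GlobPtrs[1] = (void*)&buffer.bar[Idx];
///  reduce_function(GlobPtrs, reduce_data); // the result lands in the buffer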
3200*13fbcb42Sjoerg static llvm::Value *emitListToGlobalReduceFunction(
3201*13fbcb42Sjoerg     CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3202*13fbcb42Sjoerg     QualType ReductionArrayTy, SourceLocation Loc,
3203*13fbcb42Sjoerg     const RecordDecl *TeamReductionRec,
3204*13fbcb42Sjoerg     const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3205*13fbcb42Sjoerg         &VarFieldMap,
3206*13fbcb42Sjoerg     llvm::Function *ReduceFn) {
3207*13fbcb42Sjoerg   ASTContext &C = CGM.getContext();
3208*13fbcb42Sjoerg 
3209*13fbcb42Sjoerg   // Buffer: global reduction buffer.
3210*13fbcb42Sjoerg   ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3211*13fbcb42Sjoerg                               C.VoidPtrTy, ImplicitParamDecl::Other);
3212*13fbcb42Sjoerg   // Idx: index of the buffer.
3213*13fbcb42Sjoerg   ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3214*13fbcb42Sjoerg                            ImplicitParamDecl::Other);
3215*13fbcb42Sjoerg   // ReduceList: thread local Reduce list.
3216*13fbcb42Sjoerg   ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3217*13fbcb42Sjoerg                                   C.VoidPtrTy, ImplicitParamDecl::Other);
3218*13fbcb42Sjoerg   FunctionArgList Args;
3219*13fbcb42Sjoerg   Args.push_back(&BufferArg);
3220*13fbcb42Sjoerg   Args.push_back(&IdxArg);
3221*13fbcb42Sjoerg   Args.push_back(&ReduceListArg);
3222*13fbcb42Sjoerg 
3223*13fbcb42Sjoerg   const CGFunctionInfo &CGFI =
3224*13fbcb42Sjoerg       CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3225*13fbcb42Sjoerg   auto *Fn = llvm::Function::Create(
3226*13fbcb42Sjoerg       CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3227*13fbcb42Sjoerg       "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
3228*13fbcb42Sjoerg   CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3229*13fbcb42Sjoerg   Fn->setDoesNotRecurse();
3230*13fbcb42Sjoerg   CodeGenFunction CGF(CGM);
3231*13fbcb42Sjoerg   CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3232*13fbcb42Sjoerg 
3233*13fbcb42Sjoerg   CGBuilderTy &Bld = CGF.Builder;
3234*13fbcb42Sjoerg 
3235*13fbcb42Sjoerg   Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3236*13fbcb42Sjoerg   QualType StaticTy = C.getRecordType(TeamReductionRec);
3237*13fbcb42Sjoerg   llvm::Type *LLVMReductionsBufferTy =
3238*13fbcb42Sjoerg       CGM.getTypes().ConvertTypeForMem(StaticTy);
3239*13fbcb42Sjoerg   llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3240*13fbcb42Sjoerg       CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3241*13fbcb42Sjoerg       LLVMReductionsBufferTy->getPointerTo());
3242*13fbcb42Sjoerg 
3243*13fbcb42Sjoerg   // 1. Build a list of reduction variables.
3244*13fbcb42Sjoerg   // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3245*13fbcb42Sjoerg   Address ReductionList =
3246*13fbcb42Sjoerg       CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3247*13fbcb42Sjoerg   auto IPriv = Privates.begin();
3248*13fbcb42Sjoerg   llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3249*13fbcb42Sjoerg                          CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3250*13fbcb42Sjoerg                                               /*Volatile=*/false, C.IntTy,
3251*13fbcb42Sjoerg                                               Loc)};
3252*13fbcb42Sjoerg   unsigned Idx = 0;
3253*13fbcb42Sjoerg   for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
3254*13fbcb42Sjoerg     Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3255*13fbcb42Sjoerg     // Global = Buffer.VD[Idx];
3256*13fbcb42Sjoerg     const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
3257*13fbcb42Sjoerg     const FieldDecl *FD = VarFieldMap.lookup(VD);
3258*13fbcb42Sjoerg     LValue GlobLVal = CGF.EmitLValueForField(
3259*13fbcb42Sjoerg         CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3260*13fbcb42Sjoerg     Address GlobAddr = GlobLVal.getAddress(CGF);
3261*13fbcb42Sjoerg     llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
3262*13fbcb42Sjoerg         GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
3263*13fbcb42Sjoerg     llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
3264*13fbcb42Sjoerg     CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
3265*13fbcb42Sjoerg     if ((*IPriv)->getType()->isVariablyModifiedType()) {
3266*13fbcb42Sjoerg       // Store array size.
3267*13fbcb42Sjoerg       ++Idx;
3268*13fbcb42Sjoerg       Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3269*13fbcb42Sjoerg       llvm::Value *Size = CGF.Builder.CreateIntCast(
3270*13fbcb42Sjoerg           CGF.getVLASize(
3271*13fbcb42Sjoerg                  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3272*13fbcb42Sjoerg               .NumElts,
3273*13fbcb42Sjoerg           CGF.SizeTy, /*isSigned=*/false);
3274*13fbcb42Sjoerg       CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3275*13fbcb42Sjoerg                               Elem);
3276*13fbcb42Sjoerg     }
3277*13fbcb42Sjoerg   }
3278*13fbcb42Sjoerg 
3279*13fbcb42Sjoerg   // Call reduce_function(GlobalReduceList, ReduceList)
3280*13fbcb42Sjoerg   llvm::Value *GlobalReduceList =
3281*13fbcb42Sjoerg       CGF.EmitCastToVoidPtr(ReductionList.getPointer());
3282*13fbcb42Sjoerg   Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3283*13fbcb42Sjoerg   llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
3284*13fbcb42Sjoerg       AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
3285*13fbcb42Sjoerg   CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3286*13fbcb42Sjoerg       CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
3287*13fbcb42Sjoerg   CGF.FinishFunction();
3288*13fbcb42Sjoerg   return Fn;
3289*13fbcb42Sjoerg }
3290*13fbcb42Sjoerg 
3291*13fbcb42Sjoerg /// This function emits a helper that copies all the reduction variables from
3292*13fbcb42Sjoerg /// the provided global buffer back into the team's local reduce list.
3293*13fbcb42Sjoerg ///
3294*13fbcb42Sjoerg /// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
3295*13fbcb42Sjoerg ///   For all data entries D in reduce_data:
3296*13fbcb42Sjoerg ///     Copy buffer.D[Idx] to local D;
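///
/// A rough sketch for two hypothetical reduction variables foo and bar:
///
///   local_foo = buffer.foo[Idx];
///   local_bar = buffer.bar[Idx];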
3297*13fbcb42Sjoerg static llvm::Value *emitGlobalToListCopyFunction(
3298*13fbcb42Sjoerg     CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3299*13fbcb42Sjoerg     QualType ReductionArrayTy, SourceLocation Loc,
3300*13fbcb42Sjoerg     const RecordDecl *TeamReductionRec,
3301*13fbcb42Sjoerg     const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3302*13fbcb42Sjoerg         &VarFieldMap) {
3303*13fbcb42Sjoerg   ASTContext &C = CGM.getContext();
3304*13fbcb42Sjoerg 
3305*13fbcb42Sjoerg   // Buffer: global reduction buffer.
3306*13fbcb42Sjoerg   ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3307*13fbcb42Sjoerg                               C.VoidPtrTy, ImplicitParamDecl::Other);
3308*13fbcb42Sjoerg   // Idx: index of the buffer.
3309*13fbcb42Sjoerg   ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3310*13fbcb42Sjoerg                            ImplicitParamDecl::Other);
3311*13fbcb42Sjoerg   // ReduceList: thread local Reduce list.
3312*13fbcb42Sjoerg   ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3313*13fbcb42Sjoerg                                   C.VoidPtrTy, ImplicitParamDecl::Other);
3314*13fbcb42Sjoerg   FunctionArgList Args;
3315*13fbcb42Sjoerg   Args.push_back(&BufferArg);
3316*13fbcb42Sjoerg   Args.push_back(&IdxArg);
3317*13fbcb42Sjoerg   Args.push_back(&ReduceListArg);
3318*13fbcb42Sjoerg 
3319*13fbcb42Sjoerg   const CGFunctionInfo &CGFI =
3320*13fbcb42Sjoerg       CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3321*13fbcb42Sjoerg   auto *Fn = llvm::Function::Create(
3322*13fbcb42Sjoerg       CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3323*13fbcb42Sjoerg       "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
3324*13fbcb42Sjoerg   CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3325*13fbcb42Sjoerg   Fn->setDoesNotRecurse();
3326*13fbcb42Sjoerg   CodeGenFunction CGF(CGM);
3327*13fbcb42Sjoerg   CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3328*13fbcb42Sjoerg 
3329*13fbcb42Sjoerg   CGBuilderTy &Bld = CGF.Builder;
3330*13fbcb42Sjoerg 
3331*13fbcb42Sjoerg   Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3332*13fbcb42Sjoerg   Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3333*13fbcb42Sjoerg   Address LocalReduceList(
3334*13fbcb42Sjoerg       Bld.CreatePointerBitCastOrAddrSpaceCast(
3335*13fbcb42Sjoerg           CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3336*13fbcb42Sjoerg                                C.VoidPtrTy, Loc),
3337*13fbcb42Sjoerg           CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3338*13fbcb42Sjoerg       CGF.getPointerAlign());
3339*13fbcb42Sjoerg   QualType StaticTy = C.getRecordType(TeamReductionRec);
3340*13fbcb42Sjoerg   llvm::Type *LLVMReductionsBufferTy =
3341*13fbcb42Sjoerg       CGM.getTypes().ConvertTypeForMem(StaticTy);
3342*13fbcb42Sjoerg   llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3343*13fbcb42Sjoerg       CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3344*13fbcb42Sjoerg       LLVMReductionsBufferTy->getPointerTo());
3345*13fbcb42Sjoerg 
3346*13fbcb42Sjoerg   llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3347*13fbcb42Sjoerg                          CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3348*13fbcb42Sjoerg                                               /*Volatile=*/false, C.IntTy,
3349*13fbcb42Sjoerg                                               Loc)};
3350*13fbcb42Sjoerg   unsigned Idx = 0;
3351*13fbcb42Sjoerg   for (const Expr *Private : Privates) {
3352*13fbcb42Sjoerg     // Reduce element = LocalReduceList[i]
3353*13fbcb42Sjoerg     Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3354*13fbcb42Sjoerg     llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3355*13fbcb42Sjoerg         ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3356*13fbcb42Sjoerg     // Cast elemptrptr to a pointer to the element's actual type.
3357*13fbcb42Sjoerg     ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3358*13fbcb42Sjoerg         ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
3359*13fbcb42Sjoerg     Address ElemPtr =
3360*13fbcb42Sjoerg         Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
3361*13fbcb42Sjoerg     const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
3362*13fbcb42Sjoerg     // Global = Buffer.VD[Idx];
3363*13fbcb42Sjoerg     const FieldDecl *FD = VarFieldMap.lookup(VD);
3364*13fbcb42Sjoerg     LValue GlobLVal = CGF.EmitLValueForField(
3365*13fbcb42Sjoerg         CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3366*13fbcb42Sjoerg     Address GlobAddr = GlobLVal.getAddress(CGF);
3367*13fbcb42Sjoerg     llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
3368*13fbcb42Sjoerg         GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
3369*13fbcb42Sjoerg     GlobLVal.setAddress(Address(BufferPtr, GlobAddr.getAlignment()));
3370*13fbcb42Sjoerg     switch (CGF.getEvaluationKind(Private->getType())) {
3371*13fbcb42Sjoerg     case TEK_Scalar: {
3372*13fbcb42Sjoerg       llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
3373*13fbcb42Sjoerg       CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(),
3374*13fbcb42Sjoerg                             LValueBaseInfo(AlignmentSource::Type),
3375*13fbcb42Sjoerg                             TBAAAccessInfo());
3376*13fbcb42Sjoerg       break;
3377*13fbcb42Sjoerg     }
3378*13fbcb42Sjoerg     case TEK_Complex: {
3379*13fbcb42Sjoerg       CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
3380*13fbcb42Sjoerg       CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3381*13fbcb42Sjoerg                              /*isInit=*/false);
3382*13fbcb42Sjoerg       break;
3383*13fbcb42Sjoerg     }
3384*13fbcb42Sjoerg     case TEK_Aggregate:
3385*13fbcb42Sjoerg       CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3386*13fbcb42Sjoerg                             GlobLVal, Private->getType(),
3387*13fbcb42Sjoerg                             AggValueSlot::DoesNotOverlap);
3388*13fbcb42Sjoerg       break;
3389*13fbcb42Sjoerg     }
3390*13fbcb42Sjoerg     ++Idx;
3391*13fbcb42Sjoerg   }
3392*13fbcb42Sjoerg 
3393*13fbcb42Sjoerg   CGF.FinishFunction();
3394*13fbcb42Sjoerg   return Fn;
3395*13fbcb42Sjoerg }
3396*13fbcb42Sjoerg 
3397*13fbcb42Sjoerg /// This function emits a helper that reduces the reduction variables in the
3398*13fbcb42Sjoerg /// provided global buffer into the team's local reduce data.
3399*13fbcb42Sjoerg ///
3400*13fbcb42Sjoerg /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
3401*13fbcb42Sjoerg ///  void *GlobPtrs[];
3402*13fbcb42Sjoerg ///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
3403*13fbcb42Sjoerg ///  ...
3404*13fbcb42Sjoerg ///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
3405*13fbcb42Sjoerg ///  reduce_function(reduce_data, GlobPtrs);
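///
/// Note that reduce_function writes its result into the first operand, so
/// here the global buffer values are folded into reduce_data, the reverse
/// of list_to_global_reduce_func above.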
3406*13fbcb42Sjoerg static llvm::Value *emitGlobalToListReduceFunction(
3407*13fbcb42Sjoerg     CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3408*13fbcb42Sjoerg     QualType ReductionArrayTy, SourceLocation Loc,
3409*13fbcb42Sjoerg     const RecordDecl *TeamReductionRec,
3410*13fbcb42Sjoerg     const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3411*13fbcb42Sjoerg         &VarFieldMap,
3412*13fbcb42Sjoerg     llvm::Function *ReduceFn) {
3413*13fbcb42Sjoerg   ASTContext &C = CGM.getContext();
3414*13fbcb42Sjoerg 
3415*13fbcb42Sjoerg   // Buffer: global reduction buffer.
3416*13fbcb42Sjoerg   ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3417*13fbcb42Sjoerg                               C.VoidPtrTy, ImplicitParamDecl::Other);
3418*13fbcb42Sjoerg   // Idx: index of the buffer.
3419*13fbcb42Sjoerg   ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3420*13fbcb42Sjoerg                            ImplicitParamDecl::Other);
3421*13fbcb42Sjoerg   // ReduceList: thread local Reduce list.
3422*13fbcb42Sjoerg   ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3423*13fbcb42Sjoerg                                   C.VoidPtrTy, ImplicitParamDecl::Other);
3424*13fbcb42Sjoerg   FunctionArgList Args;
3425*13fbcb42Sjoerg   Args.push_back(&BufferArg);
3426*13fbcb42Sjoerg   Args.push_back(&IdxArg);
3427*13fbcb42Sjoerg   Args.push_back(&ReduceListArg);
3428*13fbcb42Sjoerg 
3429*13fbcb42Sjoerg   const CGFunctionInfo &CGFI =
3430*13fbcb42Sjoerg       CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3431*13fbcb42Sjoerg   auto *Fn = llvm::Function::Create(
3432*13fbcb42Sjoerg       CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3433*13fbcb42Sjoerg       "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
3434*13fbcb42Sjoerg   CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3435*13fbcb42Sjoerg   Fn->setDoesNotRecurse();
3436*13fbcb42Sjoerg   CodeGenFunction CGF(CGM);
3437*13fbcb42Sjoerg   CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3438*13fbcb42Sjoerg 
3439*13fbcb42Sjoerg   CGBuilderTy &Bld = CGF.Builder;
3440*13fbcb42Sjoerg 
3441*13fbcb42Sjoerg   Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3442*13fbcb42Sjoerg   QualType StaticTy = C.getRecordType(TeamReductionRec);
3443*13fbcb42Sjoerg   llvm::Type *LLVMReductionsBufferTy =
3444*13fbcb42Sjoerg       CGM.getTypes().ConvertTypeForMem(StaticTy);
3445*13fbcb42Sjoerg   llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3446*13fbcb42Sjoerg       CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3447*13fbcb42Sjoerg       LLVMReductionsBufferTy->getPointerTo());
3448*13fbcb42Sjoerg 
3449*13fbcb42Sjoerg   // 1. Build a list of reduction variables.
3450*13fbcb42Sjoerg   // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3451*13fbcb42Sjoerg   Address ReductionList =
3452*13fbcb42Sjoerg       CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3453*13fbcb42Sjoerg   auto IPriv = Privates.begin();
3454*13fbcb42Sjoerg   llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3455*13fbcb42Sjoerg                          CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3456*13fbcb42Sjoerg                                               /*Volatile=*/false, C.IntTy,
3457*13fbcb42Sjoerg                                               Loc)};
3458*13fbcb42Sjoerg   unsigned Idx = 0;
3459*13fbcb42Sjoerg   for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
3460*13fbcb42Sjoerg     Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3461*13fbcb42Sjoerg     // Global = Buffer.VD[Idx];
3462*13fbcb42Sjoerg     const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
3463*13fbcb42Sjoerg     const FieldDecl *FD = VarFieldMap.lookup(VD);
3464*13fbcb42Sjoerg     LValue GlobLVal = CGF.EmitLValueForField(
3465*13fbcb42Sjoerg         CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3466*13fbcb42Sjoerg     Address GlobAddr = GlobLVal.getAddress(CGF);
3467*13fbcb42Sjoerg     llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
3468*13fbcb42Sjoerg         GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
3469*13fbcb42Sjoerg     llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
3470*13fbcb42Sjoerg     CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
3471*13fbcb42Sjoerg     if ((*IPriv)->getType()->isVariablyModifiedType()) {
3472*13fbcb42Sjoerg       // Store array size.
3473*13fbcb42Sjoerg       ++Idx;
3474*13fbcb42Sjoerg       Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3475*13fbcb42Sjoerg       llvm::Value *Size = CGF.Builder.CreateIntCast(
3476*13fbcb42Sjoerg           CGF.getVLASize(
3477*13fbcb42Sjoerg                  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3478*13fbcb42Sjoerg               .NumElts,
3479*13fbcb42Sjoerg           CGF.SizeTy, /*isSigned=*/false);
3480*13fbcb42Sjoerg       CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3481*13fbcb42Sjoerg                               Elem);
3482*13fbcb42Sjoerg     }
3483*13fbcb42Sjoerg   }
3484*13fbcb42Sjoerg 
3485*13fbcb42Sjoerg   // Call reduce_function(ReduceList, GlobalReduceList)
3486*13fbcb42Sjoerg   llvm::Value *GlobalReduceList =
3487*13fbcb42Sjoerg       CGF.EmitCastToVoidPtr(ReductionList.getPointer());
3488*13fbcb42Sjoerg   Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3489*13fbcb42Sjoerg   llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
3490*13fbcb42Sjoerg       AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
3491*13fbcb42Sjoerg   CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3492*13fbcb42Sjoerg       CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
3493*13fbcb42Sjoerg   CGF.FinishFunction();
3494*13fbcb42Sjoerg   return Fn;
3495*13fbcb42Sjoerg }
3496*13fbcb42Sjoerg 
3497*13fbcb42Sjoerg ///
3498*13fbcb42Sjoerg /// Design of OpenMP reductions on the GPU
3499*13fbcb42Sjoerg ///
3500*13fbcb42Sjoerg /// Consider a typical OpenMP program with one or more reduction
3501*13fbcb42Sjoerg /// clauses:
3502*13fbcb42Sjoerg ///
3503*13fbcb42Sjoerg /// float foo;
3504*13fbcb42Sjoerg /// double bar;
3505*13fbcb42Sjoerg /// #pragma omp target teams distribute parallel for \
3506*13fbcb42Sjoerg ///             reduction(+:foo) reduction(*:bar)
3507*13fbcb42Sjoerg /// for (int i = 0; i < N; i++) {
3508*13fbcb42Sjoerg ///   foo += A[i]; bar *= B[i];
3509*13fbcb42Sjoerg /// }
3510*13fbcb42Sjoerg ///
3511*13fbcb42Sjoerg /// where 'foo' and 'bar' are reduced across all OpenMP threads in
3512*13fbcb42Sjoerg /// all teams.  In our OpenMP implementation on the NVPTX device an
3513*13fbcb42Sjoerg /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
3514*13fbcb42Sjoerg /// within a team are mapped to CUDA threads within a threadblock.
3515*13fbcb42Sjoerg /// Our goal is to efficiently aggregate values across all OpenMP
3516*13fbcb42Sjoerg /// threads such that:
3517*13fbcb42Sjoerg ///
3518*13fbcb42Sjoerg ///   - the compiler and runtime are logically concise, and
3519*13fbcb42Sjoerg ///   - the reduction is performed efficiently in a hierarchical
3520*13fbcb42Sjoerg ///     manner as follows: within OpenMP threads in the same warp,
3521*13fbcb42Sjoerg ///     across warps in a threadblock, and finally across teams on
3522*13fbcb42Sjoerg ///     the NVPTX device.
3523*13fbcb42Sjoerg ///
3524*13fbcb42Sjoerg /// Introduction to Decoupling
3525*13fbcb42Sjoerg ///
3526*13fbcb42Sjoerg /// We would like to decouple the compiler and the runtime so that the
3527*13fbcb42Sjoerg /// latter is ignorant of the reduction variables (number, data types)
3528*13fbcb42Sjoerg /// and the reduction operators.  This allows a simpler interface
3529*13fbcb42Sjoerg /// and implementation while still attaining good performance.
3530*13fbcb42Sjoerg ///
3531*13fbcb42Sjoerg /// Pseudocode for the aforementioned OpenMP program generated by the
3532*13fbcb42Sjoerg /// compiler is as follows:
3533*13fbcb42Sjoerg ///
3534*13fbcb42Sjoerg /// 1. Create private copies of reduction variables on each OpenMP
3535*13fbcb42Sjoerg ///    thread: 'foo_private', 'bar_private'
3536*13fbcb42Sjoerg /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
3537*13fbcb42Sjoerg ///    to it and writes the result in 'foo_private' and 'bar_private'
3538*13fbcb42Sjoerg ///    respectively.
3539*13fbcb42Sjoerg /// 3. Call the OpenMP runtime on the GPU to reduce within a team
3540*13fbcb42Sjoerg ///    and store the result on the team master:
3541*13fbcb42Sjoerg ///
3542*13fbcb42Sjoerg ///     __kmpc_nvptx_parallel_reduce_nowait_v2(...,
3543*13fbcb42Sjoerg ///        reduceData, shuffleReduceFn, interWarpCpyFn)
3544*13fbcb42Sjoerg ///
3545*13fbcb42Sjoerg ///     where:
3546*13fbcb42Sjoerg ///       struct ReduceData {
3547*13fbcb42Sjoerg ///         float *foo;
3548*13fbcb42Sjoerg ///         double *bar;
3549*13fbcb42Sjoerg ///       } reduceData
3550*13fbcb42Sjoerg ///       reduceData.foo = &foo_private
3551*13fbcb42Sjoerg ///       reduceData.bar = &bar_private
3552*13fbcb42Sjoerg ///
3553*13fbcb42Sjoerg ///     'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
3554*13fbcb42Sjoerg ///     auxiliary functions generated by the compiler that operate on
3555*13fbcb42Sjoerg ///     variables of type 'ReduceData'.  They aid the runtime in performing
3556*13fbcb42Sjoerg ///     algorithmic steps in a data-agnostic manner.
3557*13fbcb42Sjoerg ///
3558*13fbcb42Sjoerg ///     'shuffleReduceFn' is a pointer to a function that reduces data
3559*13fbcb42Sjoerg ///     of type 'ReduceData' across two OpenMP threads (lanes) in the
3560*13fbcb42Sjoerg ///     same warp.  It takes the following arguments as input:
3561*13fbcb42Sjoerg ///
3562*13fbcb42Sjoerg ///     a. variable of type 'ReduceData' on the calling lane,
3563*13fbcb42Sjoerg ///     b. its lane_id,
3564*13fbcb42Sjoerg ///     c. an offset relative to the current lane_id to generate a
3565*13fbcb42Sjoerg ///        remote_lane_id.  The remote lane contains the second
3566*13fbcb42Sjoerg ///        variable of type 'ReduceData' that is to be reduced.
3567*13fbcb42Sjoerg ///     d. an algorithm version parameter determining which reduction
3568*13fbcb42Sjoerg ///        algorithm to use.
3569*13fbcb42Sjoerg ///
3570*13fbcb42Sjoerg ///     'shuffleReduceFn' retrieves data from the remote lane using
3571*13fbcb42Sjoerg ///     efficient GPU shuffle intrinsics and reduces, using the
3572*13fbcb42Sjoerg ///     algorithm specified by the 4th parameter, the two operands
3573*13fbcb42Sjoerg ///     element-wise.  The result is written to the first operand.
3574*13fbcb42Sjoerg ///
3575*13fbcb42Sjoerg ///     Different reduction algorithms are implemented in different
3576*13fbcb42Sjoerg ///     runtime functions, all calling 'shuffleReduceFn' to perform
3577*13fbcb42Sjoerg ///     the essential reduction step.  Therefore, based on the 4th
3578*13fbcb42Sjoerg ///     parameter, this function behaves slightly differently to
3579*13fbcb42Sjoerg ///     cooperate with the runtime to ensure correctness under
3580*13fbcb42Sjoerg ///     different circumstances.
3581*13fbcb42Sjoerg ///
3582*13fbcb42Sjoerg ///     'InterWarpCpyFn' is a pointer to a function that transfers
3583*13fbcb42Sjoerg ///     reduced variables across warps.  It tunnels, through CUDA
3584*13fbcb42Sjoerg ///     shared memory, the thread-private data of type 'ReduceData'
3585*13fbcb42Sjoerg ///     from lane 0 of each warp to a lane in the first warp.
3586*13fbcb42Sjoerg /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
3587*13fbcb42Sjoerg ///    The last team writes the global reduced value to memory.
3588*13fbcb42Sjoerg ///
3589*13fbcb42Sjoerg ///     ret = __kmpc_nvptx_teams_reduce_nowait_v2(...,
3590*13fbcb42Sjoerg ///             reduceData, shuffleReduceFn, interWarpCpyFn,
3591*13fbcb42Sjoerg ///             listToGlobalCpyFn, listToGlobalRedFn,
3592*13fbcb42Sjoerg ///             globalToListCpyFn, globalToListRedFn)
3593*13fbcb42Sjoerg ///
3594*13fbcb42Sjoerg ///     'listToGlobalCpyFn' and 'listToGlobalRedFn' are helpers that copy,
3595*13fbcb42Sjoerg ///     respectively reduce, the team master's data into the team's slot
3596*13fbcb42Sjoerg ///     of a scratchpad buffer in global memory.
3597*13fbcb42Sjoerg ///
3598*13fbcb42Sjoerg ///     'globalToListCpyFn' and 'globalToListRedFn' go the other way: they
3599*13fbcb42Sjoerg ///     load data from the scratchpad buffer and copy or reduce it into
3600*13fbcb42Sjoerg ///     the local reduction list.
3601*13fbcb42Sjoerg ///
3602*13fbcb42Sjoerg ///     These compiler-generated functions hide address calculation and alignment information from the runtime.
3603*13fbcb42Sjoerg /// 5. if ret == 1:
3604*13fbcb42Sjoerg ///     The team master of the last team stores the reduced
3605*13fbcb42Sjoerg ///     result to the globals in memory.
3606*13fbcb42Sjoerg ///     foo += reduceData.foo; bar *= reduceData.bar
3607*13fbcb42Sjoerg ///
3608*13fbcb42Sjoerg ///
3609*13fbcb42Sjoerg /// Warp Reduction Algorithms
3610*13fbcb42Sjoerg ///
3611*13fbcb42Sjoerg /// On the warp level, we have three algorithms implemented in the
3612*13fbcb42Sjoerg /// OpenMP runtime depending on the number of active lanes:
3613*13fbcb42Sjoerg ///
3614*13fbcb42Sjoerg /// Full Warp Reduction
3615*13fbcb42Sjoerg ///
3616*13fbcb42Sjoerg /// The reduce algorithm within a warp where all lanes are active
3617*13fbcb42Sjoerg /// is implemented in the runtime as follows:
3618*13fbcb42Sjoerg ///
3619*13fbcb42Sjoerg /// full_warp_reduce(void *reduce_data,
3620*13fbcb42Sjoerg ///                  kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3621*13fbcb42Sjoerg ///   for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
3622*13fbcb42Sjoerg ///     ShuffleReduceFn(reduce_data, 0, offset, 0);
3623*13fbcb42Sjoerg /// }
3624*13fbcb42Sjoerg ///
3625*13fbcb42Sjoerg /// The algorithm completes in log(2, WARPSIZE) steps.
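/// For example, with WARPSIZE = 32 the loop performs five shuffle-reduce
/// steps, using offsets 16, 8, 4, 2 and 1.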
3626*13fbcb42Sjoerg ///
3627*13fbcb42Sjoerg /// 'ShuffleReduceFn' is called here with lane_id set to 0 because the
3628*13fbcb42Sjoerg /// lane_id is not used by this algorithm; we therefore save instructions
3629*13fbcb42Sjoerg /// by not retrieving it from the corresponding special registers.  The
3630*13fbcb42Sjoerg /// 4th parameter, which represents the version of the algorithm being
3631*13fbcb42Sjoerg /// used, is set to 0 to signify full warp reduction.
3632*13fbcb42Sjoerg ///
3633*13fbcb42Sjoerg /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3634*13fbcb42Sjoerg ///
3635*13fbcb42Sjoerg /// #reduce_elem refers to an element in the local lane's data structure
3636*13fbcb42Sjoerg /// #remote_elem is retrieved from a remote lane
3637*13fbcb42Sjoerg /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3638*13fbcb42Sjoerg /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
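///
/// As a rough sketch (hypothetical names, not the code the compiler
/// actually emits), a '+' reduction of the 'foo' element in this mode
/// boils down to a single CUDA warp shuffle per step:
///
///   double remote_foo = __shfl_down_sync(0xffffffffu, foo_private, offset);
///   foo_private = foo_private + remote_foo;  // REDUCE_OP is '+' here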
3639*13fbcb42Sjoerg ///
3640*13fbcb42Sjoerg /// Contiguous Partial Warp Reduction
3641*13fbcb42Sjoerg ///
3642*13fbcb42Sjoerg /// This reduce algorithm is used within a warp where only the first
3643*13fbcb42Sjoerg /// 'n' (n <= WARPSIZE) lanes are active.  It is typically used when the
3644*13fbcb42Sjoerg /// number of OpenMP threads in a parallel region is not a multiple of
3645*13fbcb42Sjoerg /// WARPSIZE.  The algorithm is implemented in the runtime as follows:
3646*13fbcb42Sjoerg ///
3647*13fbcb42Sjoerg /// void
3648*13fbcb42Sjoerg /// contiguous_partial_reduce(void *reduce_data,
3649*13fbcb42Sjoerg ///                           kmp_ShuffleReductFctPtr ShuffleReduceFn,
3650*13fbcb42Sjoerg ///                           int size, int lane_id) {
3651*13fbcb42Sjoerg ///   int curr_size;
3652*13fbcb42Sjoerg ///   int offset;
3653*13fbcb42Sjoerg ///   curr_size = size;
3654*13fbcb42Sjoerg ///   offset = curr_size/2;
3655*13fbcb42Sjoerg ///   while (offset > 0) {
3656*13fbcb42Sjoerg ///     ShuffleReduceFn(reduce_data, lane_id, offset, 1);
3657*13fbcb42Sjoerg ///     curr_size = (curr_size+1)/2;
3658*13fbcb42Sjoerg ///     offset = curr_size/2;
3659*13fbcb42Sjoerg ///   }
3660*13fbcb42Sjoerg /// }
3661*13fbcb42Sjoerg ///
3662*13fbcb42Sjoerg /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3663*13fbcb42Sjoerg ///
3664*13fbcb42Sjoerg /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3665*13fbcb42Sjoerg /// if (lane_id < offset)
3666*13fbcb42Sjoerg ///     reduce_elem = reduce_elem REDUCE_OP remote_elem
3667*13fbcb42Sjoerg /// else
3668*13fbcb42Sjoerg ///     reduce_elem = remote_elem
3669*13fbcb42Sjoerg ///
3670*13fbcb42Sjoerg /// This algorithm assumes that the data to be reduced are located in a
3671*13fbcb42Sjoerg /// contiguous subset of lanes starting from the first.  When there is
3672*13fbcb42Sjoerg /// an odd number of active lanes, the data in the last lane is not
3673*13fbcb42Sjoerg /// aggregated with any other lane's data but is instead copied over.
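///
/// For example, with size = 7 the loop runs with offsets 3, 2 and 1: in
/// the first step lanes 0-2 combine with lanes 3-5 while lane 3 also takes
/// a copy of lane 6's value, and the four surviving partial results (in
/// lanes 0-3) are folded by the remaining two steps, leaving the result in
/// lane 0.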
3674*13fbcb42Sjoerg ///
3675*13fbcb42Sjoerg /// Dispersed Partial Warp Reduction
3676*13fbcb42Sjoerg ///
3677*13fbcb42Sjoerg /// This algorithm is used within a warp when any discontiguous subset of
3678*13fbcb42Sjoerg /// lanes are active.  It is used to implement the reduction operation
3679*13fbcb42Sjoerg /// across lanes in an OpenMP simd region or in a nested parallel region.
3680*13fbcb42Sjoerg ///
3681*13fbcb42Sjoerg /// void
3682*13fbcb42Sjoerg /// dispersed_partial_reduce(void *reduce_data,
3683*13fbcb42Sjoerg ///                          kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3684*13fbcb42Sjoerg ///   int size, remote_id;
3685*13fbcb42Sjoerg ///   int logical_lane_id = number_of_active_lanes_before_me() * 2;
3686*13fbcb42Sjoerg ///   do {
3687*13fbcb42Sjoerg ///       remote_id = next_active_lane_id_right_after_me();
3688*13fbcb42Sjoerg ///       # the above function returns 0 if no active lane
3689*13fbcb42Sjoerg ///       # is present right after the current lane.
3690*13fbcb42Sjoerg ///       size = number_of_active_lanes_in_this_warp();
3691*13fbcb42Sjoerg ///       logical_lane_id /= 2;
3692*13fbcb42Sjoerg ///       ShuffleReduceFn(reduce_data, logical_lane_id,
3693*13fbcb42Sjoerg ///                       remote_id-1-threadIdx.x, 2);
3694*13fbcb42Sjoerg ///   } while (logical_lane_id % 2 == 0 && size > 1);
3695*13fbcb42Sjoerg /// }
3696*13fbcb42Sjoerg ///
3697*13fbcb42Sjoerg /// There is no assumption made about the initial state of the reduction.
3698*13fbcb42Sjoerg /// Any number of lanes (>=1) could be active at any position.  The reduction
3699*13fbcb42Sjoerg /// result is returned in the first active lane.
3700*13fbcb42Sjoerg ///
3701*13fbcb42Sjoerg /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3702*13fbcb42Sjoerg ///
3703*13fbcb42Sjoerg /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3704*13fbcb42Sjoerg /// if (lane_id % 2 == 0 && offset > 0)
3705*13fbcb42Sjoerg ///     reduce_elem = reduce_elem REDUCE_OP remote_elem
3706*13fbcb42Sjoerg /// else
3707*13fbcb42Sjoerg ///     reduce_elem = remote_elem
3708*13fbcb42Sjoerg ///
3709*13fbcb42Sjoerg ///
3710*13fbcb42Sjoerg /// Intra-Team Reduction
3711*13fbcb42Sjoerg ///
3712*13fbcb42Sjoerg /// This function, as implemented in the runtime call
3713*13fbcb42Sjoerg /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
3714*13fbcb42Sjoerg /// threads in a team.  It first reduces within a warp using the
3715*13fbcb42Sjoerg /// aforementioned algorithms.  All such warp-reduced values are then
3716*13fbcb42Sjoerg /// gathered at the first warp.
3717*13fbcb42Sjoerg ///
3718*13fbcb42Sjoerg /// The runtime makes use of the function 'InterWarpCpyFn', which copies
3719*13fbcb42Sjoerg /// data from each of the "warp master" (zeroth lane of each warp, where
3720*13fbcb42Sjoerg /// warp-reduced data is held) to the zeroth warp.  This step reduces (in
3721*13fbcb42Sjoerg /// a mathematical sense) the problem of reduction across warp masters in
3722*13fbcb42Sjoerg /// a block to the problem of warp reduction.
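///
/// A minimal sketch of that transfer, assuming a single double element and
/// a hypothetical fixed-size staging buffer (the generated 'InterWarpCpyFn'
/// handles arbitrary 'ReduceData' layouts):
///
///   __shared__ double staging[MAX_WARPS];
///   if (lane_id == 0)                  // each warp master publishes its value
///     staging[warp_id] = foo_private;
///   __syncthreads();
///   if (warp_id == 0 && lane_id < num_warps)
///     foo_private = staging[lane_id];  // the zeroth warp gathers the partials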
3723*13fbcb42Sjoerg ///
3724*13fbcb42Sjoerg ///
3725*13fbcb42Sjoerg /// Inter-Team Reduction
3726*13fbcb42Sjoerg ///
3727*13fbcb42Sjoerg /// Once a team has reduced its data to a single value, it is stored in
3728*13fbcb42Sjoerg /// a global scratchpad array.  Since each team has a distinct slot, this
3729*13fbcb42Sjoerg /// can be done without locking.
3730*13fbcb42Sjoerg ///
3731*13fbcb42Sjoerg /// The last team to write to the scratchpad array proceeds to reduce the
3732*13fbcb42Sjoerg /// scratchpad array.  One or more workers in the last team use the helper
3733*13fbcb42Sjoerg /// 'globalToListRedFn' to load and reduce values from the array, i.e.,
3734*13fbcb42Sjoerg /// the k'th worker reduces every k'th element.
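/// (With four workers, for instance, worker 1 reduces elements 1, 5, 9,
/// and so on.)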
3735*13fbcb42Sjoerg ///
3736*13fbcb42Sjoerg /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
3737*13fbcb42Sjoerg /// reduce across workers and compute a globally reduced value.
3738*13fbcb42Sjoerg ///
3739*13fbcb42Sjoerg void CGOpenMPRuntimeGPU::emitReduction(
3740*13fbcb42Sjoerg     CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
3741*13fbcb42Sjoerg     ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
3742*13fbcb42Sjoerg     ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
3743*13fbcb42Sjoerg   if (!CGF.HaveInsertPoint())
3744*13fbcb42Sjoerg     return;
3745*13fbcb42Sjoerg 
3746*13fbcb42Sjoerg   bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
3747*13fbcb42Sjoerg #ifndef NDEBUG
3748*13fbcb42Sjoerg   bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
3749*13fbcb42Sjoerg #endif
3750*13fbcb42Sjoerg 
3751*13fbcb42Sjoerg   if (Options.SimpleReduction) {
3752*13fbcb42Sjoerg     assert(!TeamsReduction && !ParallelReduction &&
3753*13fbcb42Sjoerg            "Invalid reduction selection in emitReduction.");
3754*13fbcb42Sjoerg     CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
3755*13fbcb42Sjoerg                                    ReductionOps, Options);
3756*13fbcb42Sjoerg     return;
3757*13fbcb42Sjoerg   }
3758*13fbcb42Sjoerg 
3759*13fbcb42Sjoerg   assert((TeamsReduction || ParallelReduction) &&
3760*13fbcb42Sjoerg          "Invalid reduction selection in emitReduction.");
3761*13fbcb42Sjoerg 
3762*13fbcb42Sjoerg   // Build res = __kmpc_nvptx_parallel_reduce_nowait_v2(<loc>, <gtid>, <n>,
3763*13fbcb42Sjoerg   //   sizeof(RedList), RedList, shuffle_reduce_func, interwarp_copy_func);
3764*13fbcb42Sjoerg   // or, for teams reductions,
3765*13fbcb42Sjoerg   // res = __kmpc_nvptx_teams_reduce_nowait_v2(<loc>, <gtid>, <buffer>, ...);
3766*13fbcb42Sjoerg   llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
3767*13fbcb42Sjoerg   llvm::Value *ThreadId = getThreadID(CGF, Loc);
3768*13fbcb42Sjoerg 
3769*13fbcb42Sjoerg   llvm::Value *Res;
3770*13fbcb42Sjoerg   ASTContext &C = CGM.getContext();
3771*13fbcb42Sjoerg   // 1. Build a list of reduction variables.
3772*13fbcb42Sjoerg   // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3773*13fbcb42Sjoerg   auto Size = RHSExprs.size();
3774*13fbcb42Sjoerg   for (const Expr *E : Privates) {
3775*13fbcb42Sjoerg     if (E->getType()->isVariablyModifiedType())
3776*13fbcb42Sjoerg       // Reserve place for array size.
3777*13fbcb42Sjoerg       ++Size;
3778*13fbcb42Sjoerg   }
3779*13fbcb42Sjoerg   llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
3780*13fbcb42Sjoerg   QualType ReductionArrayTy =
3781*13fbcb42Sjoerg       C.getConstantArrayType(C.VoidPtrTy, ArraySize, nullptr, ArrayType::Normal,
3782*13fbcb42Sjoerg                              /*IndexTypeQuals=*/0);
3783*13fbcb42Sjoerg   Address ReductionList =
3784*13fbcb42Sjoerg       CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3785*13fbcb42Sjoerg   auto IPriv = Privates.begin();
3786*13fbcb42Sjoerg   unsigned Idx = 0;
3787*13fbcb42Sjoerg   for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
3788*13fbcb42Sjoerg     Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3789*13fbcb42Sjoerg     CGF.Builder.CreateStore(
3790*13fbcb42Sjoerg         CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3791*13fbcb42Sjoerg             CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
3792*13fbcb42Sjoerg         Elem);
3793*13fbcb42Sjoerg     if ((*IPriv)->getType()->isVariablyModifiedType()) {
3794*13fbcb42Sjoerg       // Store array size.
3795*13fbcb42Sjoerg       ++Idx;
3796*13fbcb42Sjoerg       Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3797*13fbcb42Sjoerg       llvm::Value *Size = CGF.Builder.CreateIntCast(
3798*13fbcb42Sjoerg           CGF.getVLASize(
3799*13fbcb42Sjoerg                  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3800*13fbcb42Sjoerg               .NumElts,
3801*13fbcb42Sjoerg           CGF.SizeTy, /*isSigned=*/false);
3802*13fbcb42Sjoerg       CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3803*13fbcb42Sjoerg                               Elem);
3804*13fbcb42Sjoerg     }
3805*13fbcb42Sjoerg   }
3806*13fbcb42Sjoerg 
3807*13fbcb42Sjoerg   llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3808*13fbcb42Sjoerg       ReductionList.getPointer(), CGF.VoidPtrTy);
3809*13fbcb42Sjoerg   llvm::Function *ReductionFn = emitReductionFunction(
3810*13fbcb42Sjoerg       Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
3811*13fbcb42Sjoerg       LHSExprs, RHSExprs, ReductionOps);
3812*13fbcb42Sjoerg   llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
3813*13fbcb42Sjoerg   llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
3814*13fbcb42Sjoerg       CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
3815*13fbcb42Sjoerg   llvm::Value *InterWarpCopyFn =
3816*13fbcb42Sjoerg       emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
3817*13fbcb42Sjoerg 
3818*13fbcb42Sjoerg   if (ParallelReduction) {
3819*13fbcb42Sjoerg     llvm::Value *Args[] = {RTLoc,
3820*13fbcb42Sjoerg                            ThreadId,
3821*13fbcb42Sjoerg                            CGF.Builder.getInt32(RHSExprs.size()),
3822*13fbcb42Sjoerg                            ReductionArrayTySize,
3823*13fbcb42Sjoerg                            RL,
3824*13fbcb42Sjoerg                            ShuffleAndReduceFn,
3825*13fbcb42Sjoerg                            InterWarpCopyFn};
3826*13fbcb42Sjoerg 
3827*13fbcb42Sjoerg     Res = CGF.EmitRuntimeCall(
3828*13fbcb42Sjoerg         OMPBuilder.getOrCreateRuntimeFunction(
3829*13fbcb42Sjoerg             CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
3830*13fbcb42Sjoerg         Args);
3831*13fbcb42Sjoerg   } else {
3832*13fbcb42Sjoerg     assert(TeamsReduction && "expected teams reduction.");
3833*13fbcb42Sjoerg     llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
3834*13fbcb42Sjoerg     llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
3835*13fbcb42Sjoerg     int Cnt = 0;
3836*13fbcb42Sjoerg     for (const Expr *DRE : Privates) {
3837*13fbcb42Sjoerg       PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
3838*13fbcb42Sjoerg       ++Cnt;
3839*13fbcb42Sjoerg     }
3840*13fbcb42Sjoerg     const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
3841*13fbcb42Sjoerg         CGM.getContext(), PrivatesReductions, llvm::None, VarFieldMap,
3842*13fbcb42Sjoerg         C.getLangOpts().OpenMPCUDAReductionBufNum);
3843*13fbcb42Sjoerg     TeamsReductions.push_back(TeamReductionRec);
3844*13fbcb42Sjoerg     if (!KernelTeamsReductionPtr) {
3845*13fbcb42Sjoerg       KernelTeamsReductionPtr = new llvm::GlobalVariable(
3846*13fbcb42Sjoerg           CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
3847*13fbcb42Sjoerg           llvm::GlobalValue::InternalLinkage, nullptr,
3848*13fbcb42Sjoerg           "_openmp_teams_reductions_buffer_$_$ptr");
3849*13fbcb42Sjoerg     }
3850*13fbcb42Sjoerg     llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
3851*13fbcb42Sjoerg         Address(KernelTeamsReductionPtr, CGM.getPointerAlign()),
3852*13fbcb42Sjoerg         /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
3853*13fbcb42Sjoerg     llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
3854*13fbcb42Sjoerg         CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
3855*13fbcb42Sjoerg     llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
3856*13fbcb42Sjoerg         CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
3857*13fbcb42Sjoerg         ReductionFn);
3858*13fbcb42Sjoerg     llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
3859*13fbcb42Sjoerg         CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
3860*13fbcb42Sjoerg     llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
3861*13fbcb42Sjoerg         CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
3862*13fbcb42Sjoerg         ReductionFn);
3863*13fbcb42Sjoerg 
3864*13fbcb42Sjoerg     llvm::Value *Args[] = {
3865*13fbcb42Sjoerg         RTLoc,
3866*13fbcb42Sjoerg         ThreadId,
3867*13fbcb42Sjoerg         GlobalBufferPtr,
3868*13fbcb42Sjoerg         CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
3869*13fbcb42Sjoerg         RL,
3870*13fbcb42Sjoerg         ShuffleAndReduceFn,
3871*13fbcb42Sjoerg         InterWarpCopyFn,
3872*13fbcb42Sjoerg         GlobalToBufferCpyFn,
3873*13fbcb42Sjoerg         GlobalToBufferRedFn,
3874*13fbcb42Sjoerg         BufferToGlobalCpyFn,
3875*13fbcb42Sjoerg         BufferToGlobalRedFn};
3876*13fbcb42Sjoerg 
3877*13fbcb42Sjoerg     Res = CGF.EmitRuntimeCall(
3878*13fbcb42Sjoerg         OMPBuilder.getOrCreateRuntimeFunction(
3879*13fbcb42Sjoerg             CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
3880*13fbcb42Sjoerg         Args);
3881*13fbcb42Sjoerg   }
3882*13fbcb42Sjoerg 
3883*13fbcb42Sjoerg   // 5. Build if (res == 1)
3884*13fbcb42Sjoerg   llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
3885*13fbcb42Sjoerg   llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
3886*13fbcb42Sjoerg   llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
3887*13fbcb42Sjoerg       Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
3888*13fbcb42Sjoerg   CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3889*13fbcb42Sjoerg 
3890*13fbcb42Sjoerg   // 6. Build then branch: where we have reduced values in the master
3891*13fbcb42Sjoerg   //    thread in each team.
3892*13fbcb42Sjoerg   //    __kmpc_nvptx_end_reduce_nowait(<gtid>);
3893*13fbcb42Sjoerg   //    break;
3894*13fbcb42Sjoerg   CGF.EmitBlock(ThenBB);
3895*13fbcb42Sjoerg 
3896*13fbcb42Sjoerg   // Add emission of __kmpc_nvptx_end_reduce_nowait(<gtid>);
3897*13fbcb42Sjoerg   auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
3898*13fbcb42Sjoerg                     this](CodeGenFunction &CGF, PrePostActionTy &Action) {
3899*13fbcb42Sjoerg     auto IPriv = Privates.begin();
3900*13fbcb42Sjoerg     auto ILHS = LHSExprs.begin();
3901*13fbcb42Sjoerg     auto IRHS = RHSExprs.begin();
3902*13fbcb42Sjoerg     for (const Expr *E : ReductionOps) {
3903*13fbcb42Sjoerg       emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
3904*13fbcb42Sjoerg                                   cast<DeclRefExpr>(*IRHS));
3905*13fbcb42Sjoerg       ++IPriv;
3906*13fbcb42Sjoerg       ++ILHS;
3907*13fbcb42Sjoerg       ++IRHS;
3908*13fbcb42Sjoerg     }
3909*13fbcb42Sjoerg   };
3910*13fbcb42Sjoerg   llvm::Value *EndArgs[] = {ThreadId};
3911*13fbcb42Sjoerg   RegionCodeGenTy RCG(CodeGen);
3912*13fbcb42Sjoerg   NVPTXActionTy Action(
3913*13fbcb42Sjoerg       nullptr, llvm::None,
3914*13fbcb42Sjoerg       OMPBuilder.getOrCreateRuntimeFunction(
3915*13fbcb42Sjoerg           CGM.getModule(), OMPRTL___kmpc_nvptx_end_reduce_nowait),
3916*13fbcb42Sjoerg       EndArgs);
3917*13fbcb42Sjoerg   RCG.setAction(Action);
3918*13fbcb42Sjoerg   RCG(CGF);
3919*13fbcb42Sjoerg   // There is no need to emit line number for unconditional branch.
3920*13fbcb42Sjoerg   (void)ApplyDebugLocation::CreateEmpty(CGF);
3921*13fbcb42Sjoerg   CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
3922*13fbcb42Sjoerg }
3923*13fbcb42Sjoerg 
3924*13fbcb42Sjoerg const VarDecl *
3925*13fbcb42Sjoerg CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
3926*13fbcb42Sjoerg                                        const VarDecl *NativeParam) const {
3927*13fbcb42Sjoerg   if (!NativeParam->getType()->isReferenceType())
3928*13fbcb42Sjoerg     return NativeParam;
3929*13fbcb42Sjoerg   QualType ArgType = NativeParam->getType();
3930*13fbcb42Sjoerg   QualifierCollector QC;
3931*13fbcb42Sjoerg   const Type *NonQualTy = QC.strip(ArgType);
3932*13fbcb42Sjoerg   QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3933*13fbcb42Sjoerg   if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
3934*13fbcb42Sjoerg     if (Attr->getCaptureKind() == OMPC_map) {
3935*13fbcb42Sjoerg       PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
3936*13fbcb42Sjoerg                                                         LangAS::opencl_global);
3937*13fbcb42Sjoerg     } else if (Attr->getCaptureKind() == OMPC_firstprivate &&
3938*13fbcb42Sjoerg                PointeeTy.isConstant(CGM.getContext())) {
3939*13fbcb42Sjoerg       PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
3940*13fbcb42Sjoerg                                                         LangAS::opencl_generic);
3941*13fbcb42Sjoerg     }
3942*13fbcb42Sjoerg   }
3943*13fbcb42Sjoerg   ArgType = CGM.getContext().getPointerType(PointeeTy);
3944*13fbcb42Sjoerg   QC.addRestrict();
3945*13fbcb42Sjoerg   enum { NVPTX_local_addr = 5 };
3946*13fbcb42Sjoerg   QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
3947*13fbcb42Sjoerg   ArgType = QC.apply(CGM.getContext(), ArgType);
3948*13fbcb42Sjoerg   if (isa<ImplicitParamDecl>(NativeParam))
3949*13fbcb42Sjoerg     return ImplicitParamDecl::Create(
3950*13fbcb42Sjoerg         CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
3951*13fbcb42Sjoerg         NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
3952*13fbcb42Sjoerg   return ParmVarDecl::Create(
3953*13fbcb42Sjoerg       CGM.getContext(),
3954*13fbcb42Sjoerg       const_cast<DeclContext *>(NativeParam->getDeclContext()),
3955*13fbcb42Sjoerg       NativeParam->getBeginLoc(), NativeParam->getLocation(),
3956*13fbcb42Sjoerg       NativeParam->getIdentifier(), ArgType,
3957*13fbcb42Sjoerg       /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
3958*13fbcb42Sjoerg }
3959*13fbcb42Sjoerg 
3960*13fbcb42Sjoerg Address
3961*13fbcb42Sjoerg CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
3962*13fbcb42Sjoerg                                           const VarDecl *NativeParam,
3963*13fbcb42Sjoerg                                           const VarDecl *TargetParam) const {
3964*13fbcb42Sjoerg   assert(NativeParam != TargetParam &&
3965*13fbcb42Sjoerg          NativeParam->getType()->isReferenceType() &&
3966*13fbcb42Sjoerg          "Native arg must not be the same as target arg.");
3967*13fbcb42Sjoerg   Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
3968*13fbcb42Sjoerg   QualType NativeParamType = NativeParam->getType();
3969*13fbcb42Sjoerg   QualifierCollector QC;
3970*13fbcb42Sjoerg   const Type *NonQualTy = QC.strip(NativeParamType);
3971*13fbcb42Sjoerg   QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3972*13fbcb42Sjoerg   unsigned NativePointeeAddrSpace =
3973*13fbcb42Sjoerg       CGF.getContext().getTargetAddressSpace(NativePointeeTy);
3974*13fbcb42Sjoerg   QualType TargetTy = TargetParam->getType();
3975*13fbcb42Sjoerg   llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
3976*13fbcb42Sjoerg       LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
3977*13fbcb42Sjoerg   // First cast to generic.
3978*13fbcb42Sjoerg   TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3979*13fbcb42Sjoerg       TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
3980*13fbcb42Sjoerg                       /*AddrSpace=*/0));
3981*13fbcb42Sjoerg   // Cast from generic to native address space.
3982*13fbcb42Sjoerg   TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3983*13fbcb42Sjoerg       TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
3984*13fbcb42Sjoerg                       NativePointeeAddrSpace));
3985*13fbcb42Sjoerg   Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
3986*13fbcb42Sjoerg   CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
3987*13fbcb42Sjoerg                         NativeParamType);
3988*13fbcb42Sjoerg   return NativeParamAddr;
3989*13fbcb42Sjoerg }
3990*13fbcb42Sjoerg 
3991*13fbcb42Sjoerg void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
3992*13fbcb42Sjoerg     CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
3993*13fbcb42Sjoerg     ArrayRef<llvm::Value *> Args) const {
3994*13fbcb42Sjoerg   SmallVector<llvm::Value *, 4> TargetArgs;
3995*13fbcb42Sjoerg   TargetArgs.reserve(Args.size());
3996*13fbcb42Sjoerg   auto *FnType = OutlinedFn.getFunctionType();
3997*13fbcb42Sjoerg   for (unsigned I = 0, E = Args.size(); I < E; ++I) {
3998*13fbcb42Sjoerg     if (FnType->isVarArg() && FnType->getNumParams() <= I) {
3999*13fbcb42Sjoerg       TargetArgs.append(std::next(Args.begin(), I), Args.end());
4000*13fbcb42Sjoerg       break;
4001*13fbcb42Sjoerg     }
4002*13fbcb42Sjoerg     llvm::Type *TargetType = FnType->getParamType(I);
4003*13fbcb42Sjoerg     llvm::Value *NativeArg = Args[I];
4004*13fbcb42Sjoerg     if (!TargetType->isPointerTy()) {
4005*13fbcb42Sjoerg       TargetArgs.emplace_back(NativeArg);
4006*13fbcb42Sjoerg       continue;
4007*13fbcb42Sjoerg     }
4008*13fbcb42Sjoerg     llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4009*13fbcb42Sjoerg         NativeArg,
4010*13fbcb42Sjoerg         NativeArg->getType()->getPointerElementType()->getPointerTo());
4011*13fbcb42Sjoerg     TargetArgs.emplace_back(
4012*13fbcb42Sjoerg         CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
4013*13fbcb42Sjoerg   }
4014*13fbcb42Sjoerg   CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
4015*13fbcb42Sjoerg }
4016*13fbcb42Sjoerg 
4017*13fbcb42Sjoerg /// Emit function which wraps the outline parallel region
4018*13fbcb42Sjoerg /// and controls the arguments which are passed to this function.
4019*13fbcb42Sjoerg /// The wrapper ensures that the outlined function is called
4020*13fbcb42Sjoerg /// with the correct arguments when data is shared.
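///
/// Conceptually, the generated wrapper behaves roughly like the following
/// sketch (illustrative names only; the actual symbols are derived from
/// the outlined function):
///
///   void outlined_fn_wrapper(uint16_t ParallelLevel, uint32_t TID) {
///     int32_t Zero = 0;
///     void **GlobalArgs;
///     __kmpc_get_shared_variables(&GlobalArgs);
///     // Loop bounds (when shared) and captures are unpacked from
///     // GlobalArgs, then everything is forwarded:
///     outlined_fn(&TID, &Zero, GlobalArgs[0], ..., GlobalArgs[N-1]);
///   }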
4021*13fbcb42Sjoerg llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
4022*13fbcb42Sjoerg     llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
4023*13fbcb42Sjoerg   ASTContext &Ctx = CGM.getContext();
4024*13fbcb42Sjoerg   const auto &CS = *D.getCapturedStmt(OMPD_parallel);
4025*13fbcb42Sjoerg 
4026*13fbcb42Sjoerg   // Create a function that takes as argument the source thread.
4027*13fbcb42Sjoerg   FunctionArgList WrapperArgs;
4028*13fbcb42Sjoerg   QualType Int16QTy =
4029*13fbcb42Sjoerg       Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
4030*13fbcb42Sjoerg   QualType Int32QTy =
4031*13fbcb42Sjoerg       Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
4032*13fbcb42Sjoerg   ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
4033*13fbcb42Sjoerg                                      /*Id=*/nullptr, Int16QTy,
4034*13fbcb42Sjoerg                                      ImplicitParamDecl::Other);
4035*13fbcb42Sjoerg   ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
4036*13fbcb42Sjoerg                                /*Id=*/nullptr, Int32QTy,
4037*13fbcb42Sjoerg                                ImplicitParamDecl::Other);
4038*13fbcb42Sjoerg   WrapperArgs.emplace_back(&ParallelLevelArg);
4039*13fbcb42Sjoerg   WrapperArgs.emplace_back(&WrapperArg);
4040*13fbcb42Sjoerg 
4041*13fbcb42Sjoerg   const CGFunctionInfo &CGFI =
4042*13fbcb42Sjoerg       CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
4043*13fbcb42Sjoerg 
4044*13fbcb42Sjoerg   auto *Fn = llvm::Function::Create(
4045*13fbcb42Sjoerg       CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
4046*13fbcb42Sjoerg       Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
4047*13fbcb42Sjoerg 
4048*13fbcb42Sjoerg   // Ensure we do not inline the function. This is trivially true for the ones
4049*13fbcb42Sjoerg // passed to __kmpc_fork_call but the ones called in serialized regions
4050*13fbcb42Sjoerg // could be inlined. This is not perfect but it is closer to the invariant
4051*13fbcb42Sjoerg   // we want, namely, every data environment starts with a new function.
4052*13fbcb42Sjoerg   // TODO: We should pass the if condition to the runtime function and do the
4053*13fbcb42Sjoerg   //       handling there. Much cleaner code.
4054*13fbcb42Sjoerg   Fn->addFnAttr(llvm::Attribute::NoInline);
4055*13fbcb42Sjoerg 
4056*13fbcb42Sjoerg   CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
4057*13fbcb42Sjoerg   Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
4058*13fbcb42Sjoerg   Fn->setDoesNotRecurse();
4059*13fbcb42Sjoerg 
4060*13fbcb42Sjoerg   CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
4061*13fbcb42Sjoerg   CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
4062*13fbcb42Sjoerg                     D.getBeginLoc(), D.getBeginLoc());
4063*13fbcb42Sjoerg 
4064*13fbcb42Sjoerg   const auto *RD = CS.getCapturedRecordDecl();
4065*13fbcb42Sjoerg   auto CurField = RD->field_begin();
4066*13fbcb42Sjoerg 
4067*13fbcb42Sjoerg   Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
4068*13fbcb42Sjoerg                                                       /*Name=*/".zero.addr");
4069*13fbcb42Sjoerg   CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
4070*13fbcb42Sjoerg   // Get the array of arguments.
4071*13fbcb42Sjoerg   SmallVector<llvm::Value *, 8> Args;
4072*13fbcb42Sjoerg 
4073*13fbcb42Sjoerg   Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
4074*13fbcb42Sjoerg   Args.emplace_back(ZeroAddr.getPointer());
4075*13fbcb42Sjoerg 
4076*13fbcb42Sjoerg   CGBuilderTy &Bld = CGF.Builder;
4077*13fbcb42Sjoerg   auto CI = CS.capture_begin();
4078*13fbcb42Sjoerg 
4079*13fbcb42Sjoerg   // Use global memory for data sharing.
4080*13fbcb42Sjoerg   // Handle passing of global args to workers.
4081*13fbcb42Sjoerg   Address GlobalArgs =
4082*13fbcb42Sjoerg       CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
4083*13fbcb42Sjoerg   llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
4084*13fbcb42Sjoerg   llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
4085*13fbcb42Sjoerg   CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
4086*13fbcb42Sjoerg                           CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
4087*13fbcb42Sjoerg                       DataSharingArgs);
4088*13fbcb42Sjoerg 
4089*13fbcb42Sjoerg   // Retrieve the shared variables from the list of references returned
4090*13fbcb42Sjoerg   // by the runtime. Pass the variables to the outlined function.
4091*13fbcb42Sjoerg   Address SharedArgListAddress = Address::invalid();
4092*13fbcb42Sjoerg   if (CS.capture_size() > 0 ||
4093*13fbcb42Sjoerg       isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
4094*13fbcb42Sjoerg     SharedArgListAddress = CGF.EmitLoadOfPointer(
4095*13fbcb42Sjoerg         GlobalArgs, CGF.getContext()
4096*13fbcb42Sjoerg                         .getPointerType(CGF.getContext().getPointerType(
4097*13fbcb42Sjoerg                             CGF.getContext().VoidPtrTy))
4098*13fbcb42Sjoerg                         .castAs<PointerType>());
4099*13fbcb42Sjoerg   }
4100*13fbcb42Sjoerg   unsigned Idx = 0;
4101*13fbcb42Sjoerg   if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
4102*13fbcb42Sjoerg     Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
4103*13fbcb42Sjoerg     Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4104*13fbcb42Sjoerg         Src, CGF.SizeTy->getPointerTo());
4105*13fbcb42Sjoerg     llvm::Value *LB = CGF.EmitLoadOfScalar(
4106*13fbcb42Sjoerg         TypedAddress,
4107*13fbcb42Sjoerg         /*Volatile=*/false,
4108*13fbcb42Sjoerg         CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
4109*13fbcb42Sjoerg         cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
4110*13fbcb42Sjoerg     Args.emplace_back(LB);
4111*13fbcb42Sjoerg     ++Idx;
4112*13fbcb42Sjoerg     Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
4113*13fbcb42Sjoerg     TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4114*13fbcb42Sjoerg         Src, CGF.SizeTy->getPointerTo());
4115*13fbcb42Sjoerg     llvm::Value *UB = CGF.EmitLoadOfScalar(
4116*13fbcb42Sjoerg         TypedAddress,
4117*13fbcb42Sjoerg         /*Volatile=*/false,
4118*13fbcb42Sjoerg         CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
4119*13fbcb42Sjoerg         cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
4120*13fbcb42Sjoerg     Args.emplace_back(UB);
4121*13fbcb42Sjoerg     ++Idx;
4122*13fbcb42Sjoerg   }
4123*13fbcb42Sjoerg   if (CS.capture_size() > 0) {
4124*13fbcb42Sjoerg     ASTContext &CGFContext = CGF.getContext();
4125*13fbcb42Sjoerg     for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
4126*13fbcb42Sjoerg       QualType ElemTy = CurField->getType();
4127*13fbcb42Sjoerg       Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
4128*13fbcb42Sjoerg       Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4129*13fbcb42Sjoerg           Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
4130*13fbcb42Sjoerg       llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
4131*13fbcb42Sjoerg                                               /*Volatile=*/false,
4132*13fbcb42Sjoerg                                               CGFContext.getPointerType(ElemTy),
4133*13fbcb42Sjoerg                                               CI->getLocation());
4134*13fbcb42Sjoerg       if (CI->capturesVariableByCopy() &&
4135*13fbcb42Sjoerg           !CI->getCapturedVar()->getType()->isAnyPointerType()) {
4136*13fbcb42Sjoerg         Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
4137*13fbcb42Sjoerg                               CI->getLocation());
4138*13fbcb42Sjoerg       }
4139*13fbcb42Sjoerg       Args.emplace_back(Arg);
4140*13fbcb42Sjoerg     }
4141*13fbcb42Sjoerg   }
4142*13fbcb42Sjoerg 
4143*13fbcb42Sjoerg   emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
4144*13fbcb42Sjoerg   CGF.FinishFunction();
4145*13fbcb42Sjoerg   return Fn;
4146*13fbcb42Sjoerg }
4147*13fbcb42Sjoerg 
4148*13fbcb42Sjoerg void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
4149*13fbcb42Sjoerg                                               const Decl *D) {
4150*13fbcb42Sjoerg   if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
4151*13fbcb42Sjoerg     return;
4152*13fbcb42Sjoerg 
4153*13fbcb42Sjoerg   assert(D && "Expected function or captured|block decl.");
4154*13fbcb42Sjoerg   assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
4155*13fbcb42Sjoerg          "Function is registered already.");
4156*13fbcb42Sjoerg   assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
4157*13fbcb42Sjoerg          "Team is set but not processed.");
4158*13fbcb42Sjoerg   const Stmt *Body = nullptr;
4159*13fbcb42Sjoerg   bool NeedToDelayGlobalization = false;
4160*13fbcb42Sjoerg   if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
4161*13fbcb42Sjoerg     Body = FD->getBody();
4162*13fbcb42Sjoerg   } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
4163*13fbcb42Sjoerg     Body = BD->getBody();
4164*13fbcb42Sjoerg   } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
4165*13fbcb42Sjoerg     Body = CD->getBody();
4166*13fbcb42Sjoerg     NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
4167*13fbcb42Sjoerg     if (NeedToDelayGlobalization &&
4168*13fbcb42Sjoerg         getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
4169*13fbcb42Sjoerg       return;
4170*13fbcb42Sjoerg   }
4171*13fbcb42Sjoerg   if (!Body)
4172*13fbcb42Sjoerg     return;
4173*13fbcb42Sjoerg   CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
4174*13fbcb42Sjoerg   VarChecker.Visit(Body);
4175*13fbcb42Sjoerg   const RecordDecl *GlobalizedVarsRecord =
4176*13fbcb42Sjoerg       VarChecker.getGlobalizedRecord(IsInTTDRegion);
4177*13fbcb42Sjoerg   TeamAndReductions.first = nullptr;
4178*13fbcb42Sjoerg   TeamAndReductions.second.clear();
4179*13fbcb42Sjoerg   ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
4180*13fbcb42Sjoerg       VarChecker.getEscapedVariableLengthDecls();
4181*13fbcb42Sjoerg   if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
4182*13fbcb42Sjoerg     return;
4183*13fbcb42Sjoerg   auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
4184*13fbcb42Sjoerg   I->getSecond().MappedParams =
4185*13fbcb42Sjoerg       std::make_unique<CodeGenFunction::OMPMapVars>();
4186*13fbcb42Sjoerg   I->getSecond().GlobalRecord = GlobalizedVarsRecord;
4187*13fbcb42Sjoerg   I->getSecond().EscapedParameters.insert(
4188*13fbcb42Sjoerg       VarChecker.getEscapedParameters().begin(),
4189*13fbcb42Sjoerg       VarChecker.getEscapedParameters().end());
4190*13fbcb42Sjoerg   I->getSecond().EscapedVariableLengthDecls.append(
4191*13fbcb42Sjoerg       EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
4192*13fbcb42Sjoerg   DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
4193*13fbcb42Sjoerg   for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
4194*13fbcb42Sjoerg     assert(VD->isCanonicalDecl() && "Expected canonical declaration");
4195*13fbcb42Sjoerg     const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
4196*13fbcb42Sjoerg     Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion)));
4197*13fbcb42Sjoerg   }
4198*13fbcb42Sjoerg   if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
4199*13fbcb42Sjoerg     CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
4200*13fbcb42Sjoerg     VarChecker.Visit(Body);
4201*13fbcb42Sjoerg     I->getSecond().SecondaryGlobalRecord =
4202*13fbcb42Sjoerg         VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true);
4203*13fbcb42Sjoerg     I->getSecond().SecondaryLocalVarData.emplace();
4204*13fbcb42Sjoerg     DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
4205*13fbcb42Sjoerg     for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
4206*13fbcb42Sjoerg       assert(VD->isCanonicalDecl() && "Expected canonical declaration");
4207*13fbcb42Sjoerg       const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
4208*13fbcb42Sjoerg       Data.insert(
4209*13fbcb42Sjoerg           std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true)));
4210*13fbcb42Sjoerg     }
4211*13fbcb42Sjoerg   }
4212*13fbcb42Sjoerg   if (!NeedToDelayGlobalization) {
4213*13fbcb42Sjoerg     emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
4214*13fbcb42Sjoerg     struct GlobalizationScope final : EHScopeStack::Cleanup {
4215*13fbcb42Sjoerg       GlobalizationScope() = default;
4216*13fbcb42Sjoerg 
4217*13fbcb42Sjoerg       void Emit(CodeGenFunction &CGF, Flags flags) override {
4218*13fbcb42Sjoerg         static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
4219*13fbcb42Sjoerg             .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
4220*13fbcb42Sjoerg       }
4221*13fbcb42Sjoerg     };
4222*13fbcb42Sjoerg     CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
4223*13fbcb42Sjoerg   }
4224*13fbcb42Sjoerg }
4225*13fbcb42Sjoerg 
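// For variables with an 'omp allocate' declaration, the OpenMP allocator is
// mapped onto a GPU memory space below; e.g. (illustrative user code):
//
//   int x;
//   #pragma omp allocate(x) allocator(omp_pteam_mem_alloc)
//
// places 'x' in CUDA shared memory (LangAS::cuda_shared).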
4226*13fbcb42Sjoerg Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
4227*13fbcb42Sjoerg                                                         const VarDecl *VD) {
4228*13fbcb42Sjoerg   if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
4229*13fbcb42Sjoerg     const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
4230*13fbcb42Sjoerg     auto AS = LangAS::Default;
4231*13fbcb42Sjoerg     switch (A->getAllocatorType()) {
4232*13fbcb42Sjoerg       // Use the default allocator here as by default local vars are
4233*13fbcb42Sjoerg       // threadlocal.
4234*13fbcb42Sjoerg     case OMPAllocateDeclAttr::OMPNullMemAlloc:
4235*13fbcb42Sjoerg     case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
4236*13fbcb42Sjoerg     case OMPAllocateDeclAttr::OMPThreadMemAlloc:
4237*13fbcb42Sjoerg     case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
4238*13fbcb42Sjoerg     case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
4239*13fbcb42Sjoerg       // Follow the user decision - use default allocation.
4240*13fbcb42Sjoerg       return Address::invalid();
4241*13fbcb42Sjoerg     case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
4242*13fbcb42Sjoerg       // TODO: implement support for user-defined allocators.
4243*13fbcb42Sjoerg       return Address::invalid();
4244*13fbcb42Sjoerg     case OMPAllocateDeclAttr::OMPConstMemAlloc:
4245*13fbcb42Sjoerg       AS = LangAS::cuda_constant;
4246*13fbcb42Sjoerg       break;
4247*13fbcb42Sjoerg     case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
4248*13fbcb42Sjoerg       AS = LangAS::cuda_shared;
4249*13fbcb42Sjoerg       break;
4250*13fbcb42Sjoerg     case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
4251*13fbcb42Sjoerg     case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
4252*13fbcb42Sjoerg       break;
4253*13fbcb42Sjoerg     }
4254*13fbcb42Sjoerg     llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
4255*13fbcb42Sjoerg     auto *GV = new llvm::GlobalVariable(
4256*13fbcb42Sjoerg         CGM.getModule(), VarTy, /*isConstant=*/false,
4257*13fbcb42Sjoerg         llvm::GlobalValue::InternalLinkage, llvm::Constant::getNullValue(VarTy),
4258*13fbcb42Sjoerg         VD->getName(),
4259*13fbcb42Sjoerg         /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
4260*13fbcb42Sjoerg         CGM.getContext().getTargetAddressSpace(AS));
4261*13fbcb42Sjoerg     CharUnits Align = CGM.getContext().getDeclAlign(VD);
4262*13fbcb42Sjoerg     GV->setAlignment(Align.getAsAlign());
4263*13fbcb42Sjoerg     return Address(
4264*13fbcb42Sjoerg         CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4265*13fbcb42Sjoerg             GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
4266*13fbcb42Sjoerg                     VD->getType().getAddressSpace()))),
4267*13fbcb42Sjoerg         Align);
4268*13fbcb42Sjoerg   }
4269*13fbcb42Sjoerg 
4270*13fbcb42Sjoerg   if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
4271*13fbcb42Sjoerg     return Address::invalid();
4272*13fbcb42Sjoerg 
4273*13fbcb42Sjoerg   VD = VD->getCanonicalDecl();
4274*13fbcb42Sjoerg   auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
4275*13fbcb42Sjoerg   if (I == FunctionGlobalizedDecls.end())
4276*13fbcb42Sjoerg     return Address::invalid();
4277*13fbcb42Sjoerg   auto VDI = I->getSecond().LocalVarData.find(VD);
4278*13fbcb42Sjoerg   if (VDI != I->getSecond().LocalVarData.end())
4279*13fbcb42Sjoerg     return VDI->second.PrivateAddr;
4280*13fbcb42Sjoerg   if (VD->hasAttrs()) {
4281*13fbcb42Sjoerg     for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
4282*13fbcb42Sjoerg          E(VD->attr_end());
4283*13fbcb42Sjoerg          IT != E; ++IT) {
4284*13fbcb42Sjoerg       auto VDI = I->getSecond().LocalVarData.find(
4285*13fbcb42Sjoerg           cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
4286*13fbcb42Sjoerg               ->getCanonicalDecl());
4287*13fbcb42Sjoerg       if (VDI != I->getSecond().LocalVarData.end())
4288*13fbcb42Sjoerg         return VDI->second.PrivateAddr;
4289*13fbcb42Sjoerg     }
4290*13fbcb42Sjoerg   }
4291*13fbcb42Sjoerg 
4292*13fbcb42Sjoerg   return Address::invalid();
4293*13fbcb42Sjoerg }
4294*13fbcb42Sjoerg 
4295*13fbcb42Sjoerg void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
4296*13fbcb42Sjoerg   FunctionGlobalizedDecls.erase(CGF.CurFn);
4297*13fbcb42Sjoerg   CGOpenMPRuntime::functionFinished(CGF);
4298*13fbcb42Sjoerg }
4299*13fbcb42Sjoerg 
4300*13fbcb42Sjoerg void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
4301*13fbcb42Sjoerg     CodeGenFunction &CGF, const OMPLoopDirective &S,
4302*13fbcb42Sjoerg     OpenMPDistScheduleClauseKind &ScheduleKind,
4303*13fbcb42Sjoerg     llvm::Value *&Chunk) const {
4304*13fbcb42Sjoerg   auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
4305*13fbcb42Sjoerg   if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
4306*13fbcb42Sjoerg     ScheduleKind = OMPC_DIST_SCHEDULE_static;
4307*13fbcb42Sjoerg     Chunk = CGF.EmitScalarConversion(
4308*13fbcb42Sjoerg         RT.getGPUNumThreads(CGF),
4309*13fbcb42Sjoerg         CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
4310*13fbcb42Sjoerg         S.getIterationVariable()->getType(), S.getBeginLoc());
4311*13fbcb42Sjoerg     return;
4312*13fbcb42Sjoerg   }
4313*13fbcb42Sjoerg   CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
4314*13fbcb42Sjoerg       CGF, S, ScheduleKind, Chunk);
4315*13fbcb42Sjoerg }
4316*13fbcb42Sjoerg 
4317*13fbcb42Sjoerg void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
4318*13fbcb42Sjoerg     CodeGenFunction &CGF, const OMPLoopDirective &S,
4319*13fbcb42Sjoerg     OpenMPScheduleClauseKind &ScheduleKind,
4320*13fbcb42Sjoerg     const Expr *&ChunkExpr) const {
4321*13fbcb42Sjoerg   ScheduleKind = OMPC_SCHEDULE_static;
4322*13fbcb42Sjoerg   // Chunk size is 1 in this case.
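  // That is, the default behaves like an explicit 'schedule(static, 1)':
  // iterations are dealt out cyclically, one per thread, so consecutive
  // threads process consecutive iterations.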
4323*13fbcb42Sjoerg   llvm::APInt ChunkSize(32, 1);
4324*13fbcb42Sjoerg   ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
4325*13fbcb42Sjoerg       CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
4326*13fbcb42Sjoerg       SourceLocation());
4327*13fbcb42Sjoerg }
4328*13fbcb42Sjoerg 
4329*13fbcb42Sjoerg void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
4330*13fbcb42Sjoerg     CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
4331*13fbcb42Sjoerg   assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
4332*13fbcb42Sjoerg          " Expected target-based directive.");
4333*13fbcb42Sjoerg   const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
4334*13fbcb42Sjoerg   for (const CapturedStmt::Capture &C : CS->captures()) {
4335*13fbcb42Sjoerg     // Capture variables captured by reference in lambdas for target-based
4336*13fbcb42Sjoerg     // directives.
4337*13fbcb42Sjoerg     if (!C.capturesVariable())
4338*13fbcb42Sjoerg       continue;
4339*13fbcb42Sjoerg     const VarDecl *VD = C.getCapturedVar();
4340*13fbcb42Sjoerg     const auto *RD = VD->getType()
4341*13fbcb42Sjoerg                          .getCanonicalType()
4342*13fbcb42Sjoerg                          .getNonReferenceType()
4343*13fbcb42Sjoerg                          ->getAsCXXRecordDecl();
4344*13fbcb42Sjoerg     if (!RD || !RD->isLambda())
4345*13fbcb42Sjoerg       continue;
4346*13fbcb42Sjoerg     Address VDAddr = CGF.GetAddrOfLocalVar(VD);
4347*13fbcb42Sjoerg     LValue VDLVal;
4348*13fbcb42Sjoerg     if (VD->getType().getCanonicalType()->isReferenceType())
4349*13fbcb42Sjoerg       VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
4350*13fbcb42Sjoerg     else
4351*13fbcb42Sjoerg       VDLVal = CGF.MakeAddrLValue(
4352*13fbcb42Sjoerg           VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
4353*13fbcb42Sjoerg     llvm::DenseMap<const VarDecl *, FieldDecl *> Captures;
4354*13fbcb42Sjoerg     FieldDecl *ThisCapture = nullptr;
4355*13fbcb42Sjoerg     RD->getCaptureFields(Captures, ThisCapture);
4356*13fbcb42Sjoerg     if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
4357*13fbcb42Sjoerg       LValue ThisLVal =
4358*13fbcb42Sjoerg           CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
4359*13fbcb42Sjoerg       llvm::Value *CXXThis = CGF.LoadCXXThis();
4360*13fbcb42Sjoerg       CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
4361*13fbcb42Sjoerg     }
4362*13fbcb42Sjoerg     for (const LambdaCapture &LC : RD->captures()) {
4363*13fbcb42Sjoerg       if (LC.getCaptureKind() != LCK_ByRef)
4364*13fbcb42Sjoerg         continue;
4365*13fbcb42Sjoerg       const VarDecl *VD = LC.getCapturedVar();
4366*13fbcb42Sjoerg       if (!CS->capturesVariable(VD))
4367*13fbcb42Sjoerg         continue;
4368*13fbcb42Sjoerg       auto It = Captures.find(VD);
4369*13fbcb42Sjoerg       assert(It != Captures.end() && "Found lambda capture without field.");
4370*13fbcb42Sjoerg       LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
4371*13fbcb42Sjoerg       Address VDAddr = CGF.GetAddrOfLocalVar(VD);
4372*13fbcb42Sjoerg       if (VD->getType().getCanonicalType()->isReferenceType())
4373*13fbcb42Sjoerg         VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
4374*13fbcb42Sjoerg                                                VD->getType().getCanonicalType())
4375*13fbcb42Sjoerg                      .getAddress(CGF);
4376*13fbcb42Sjoerg       CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
4377*13fbcb42Sjoerg     }
4378*13fbcb42Sjoerg   }
4379*13fbcb42Sjoerg }
4380*13fbcb42Sjoerg 
4381*13fbcb42Sjoerg unsigned CGOpenMPRuntimeGPU::getDefaultFirstprivateAddressSpace() const {
4382*13fbcb42Sjoerg   return CGM.getContext().getTargetAddressSpace(LangAS::cuda_constant);
4383*13fbcb42Sjoerg }
4384*13fbcb42Sjoerg 
4385*13fbcb42Sjoerg bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
4386*13fbcb42Sjoerg                                                             LangAS &AS) {
4387*13fbcb42Sjoerg   if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
4388*13fbcb42Sjoerg     return false;
4389*13fbcb42Sjoerg   const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
4390*13fbcb42Sjoerg   switch(A->getAllocatorType()) {
4391*13fbcb42Sjoerg   case OMPAllocateDeclAttr::OMPNullMemAlloc:
4392*13fbcb42Sjoerg   case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
4393*13fbcb42Sjoerg   // Not supported, fall back to the default memory space.
4394*13fbcb42Sjoerg   case OMPAllocateDeclAttr::OMPThreadMemAlloc:
4395*13fbcb42Sjoerg   case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
4396*13fbcb42Sjoerg   case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
4397*13fbcb42Sjoerg   case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
4398*13fbcb42Sjoerg   case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
4399*13fbcb42Sjoerg     AS = LangAS::Default;
4400*13fbcb42Sjoerg     return true;
4401*13fbcb42Sjoerg   case OMPAllocateDeclAttr::OMPConstMemAlloc:
4402*13fbcb42Sjoerg     AS = LangAS::cuda_constant;
4403*13fbcb42Sjoerg     return true;
4404*13fbcb42Sjoerg   case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
4405*13fbcb42Sjoerg     AS = LangAS::cuda_shared;
4406*13fbcb42Sjoerg     return true;
4407*13fbcb42Sjoerg   case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
4408*13fbcb42Sjoerg     llvm_unreachable("Expected predefined allocator for the variables with the "
4409*13fbcb42Sjoerg                      "static storage.");
4410*13fbcb42Sjoerg   }
4411*13fbcb42Sjoerg   return false;
4412*13fbcb42Sjoerg }
4413*13fbcb42Sjoerg 
4414*13fbcb42Sjoerg // Get current CudaArch and ignore any unknown values
4415*13fbcb42Sjoerg static CudaArch getCudaArch(CodeGenModule &CGM) {
4416*13fbcb42Sjoerg   if (!CGM.getTarget().hasFeature("ptx"))
4417*13fbcb42Sjoerg     return CudaArch::UNKNOWN;
4418*13fbcb42Sjoerg   for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
4419*13fbcb42Sjoerg     if (Feature.getValue()) {
4420*13fbcb42Sjoerg       CudaArch Arch = StringToCudaArch(Feature.getKey());
4421*13fbcb42Sjoerg       if (Arch != CudaArch::UNKNOWN)
4422*13fbcb42Sjoerg         return Arch;
4423*13fbcb42Sjoerg     }
4424*13fbcb42Sjoerg   }
4425*13fbcb42Sjoerg   return CudaArch::UNKNOWN;
4426*13fbcb42Sjoerg }
4427*13fbcb42Sjoerg 
4428*13fbcb42Sjoerg /// Check to see if target architecture supports unified addressing which is
4429*13fbcb42Sjoerg /// a restriction for OpenMP requires clause "unified_shared_memory".
4430*13fbcb42Sjoerg void CGOpenMPRuntimeGPU::processRequiresDirective(
4431*13fbcb42Sjoerg     const OMPRequiresDecl *D) {
  for (const OMPClause *Clause : D->clauselists()) {
    if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
      CudaArch Arch = getCudaArch(CGM);
      switch (Arch) {
      case CudaArch::SM_20:
      case CudaArch::SM_21:
      case CudaArch::SM_30:
      case CudaArch::SM_32:
      case CudaArch::SM_35:
      case CudaArch::SM_37:
      case CudaArch::SM_50:
      case CudaArch::SM_52:
      case CudaArch::SM_53: {
        SmallString<256> Buffer;
        llvm::raw_svector_ostream Out(Buffer);
        Out << "Target architecture " << CudaArchToString(Arch)
            << " does not support unified addressing";
        CGM.Error(Clause->getBeginLoc(), Out.str());
        return;
      }
      case CudaArch::SM_60:
      case CudaArch::SM_61:
      case CudaArch::SM_62:
      case CudaArch::SM_70:
      case CudaArch::SM_72:
      case CudaArch::SM_75:
      case CudaArch::SM_80:
      case CudaArch::SM_86:
      case CudaArch::GFX600:
      case CudaArch::GFX601:
      case CudaArch::GFX602:
      case CudaArch::GFX700:
      case CudaArch::GFX701:
      case CudaArch::GFX702:
      case CudaArch::GFX703:
      case CudaArch::GFX704:
      case CudaArch::GFX705:
      case CudaArch::GFX801:
      case CudaArch::GFX802:
      case CudaArch::GFX803:
      case CudaArch::GFX805:
      case CudaArch::GFX810:
      case CudaArch::GFX900:
      case CudaArch::GFX902:
      case CudaArch::GFX904:
      case CudaArch::GFX906:
      case CudaArch::GFX908:
      case CudaArch::GFX909:
      case CudaArch::GFX90a:
      case CudaArch::GFX90c:
      case CudaArch::GFX1010:
      case CudaArch::GFX1011:
      case CudaArch::GFX1012:
      case CudaArch::GFX1030:
      case CudaArch::GFX1031:
      case CudaArch::GFX1032:
      case CudaArch::GFX1033:
      case CudaArch::GFX1034:
      case CudaArch::UNUSED:
      case CudaArch::UNKNOWN:
        break;
      case CudaArch::LAST:
        llvm_unreachable("Unexpected Cuda arch.");
      }
    }
  }
  CGOpenMPRuntime::processRequiresDirective(D);
}
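
// A sketch of the effect: with "-target-cpu sm_35", a translation unit
// containing
//
//   #pragma omp requires unified_shared_memory
//
// is rejected with "Target architecture sm_35 does not support unified
// addressing", while sm_60 and newer (and the AMDGCN cases above) fall
// through to the base class handling.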

/// Get number of SMs and number of blocks per SM.
static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
  std::pair<unsigned, unsigned> Data;
  if (CGM.getLangOpts().OpenMPCUDANumSMs)
    Data.first = CGM.getLangOpts().OpenMPCUDANumSMs;
  if (CGM.getLangOpts().OpenMPCUDABlocksPerSM)
    Data.second = CGM.getLangOpts().OpenMPCUDABlocksPerSM;
  if (Data.first && Data.second)
    return Data;
  switch (getCudaArch(CGM)) {
  case CudaArch::SM_20:
  case CudaArch::SM_21:
  case CudaArch::SM_30:
  case CudaArch::SM_32:
  case CudaArch::SM_35:
  case CudaArch::SM_37:
  case CudaArch::SM_50:
  case CudaArch::SM_52:
  case CudaArch::SM_53:
    return {16, 16};
  case CudaArch::SM_60:
  case CudaArch::SM_61:
  case CudaArch::SM_62:
    return {56, 32};
  case CudaArch::SM_70:
  case CudaArch::SM_72:
  case CudaArch::SM_75:
  case CudaArch::SM_80:
  case CudaArch::SM_86:
    return {84, 32};
  case CudaArch::GFX600:
  case CudaArch::GFX601:
  case CudaArch::GFX602:
  case CudaArch::GFX700:
  case CudaArch::GFX701:
  case CudaArch::GFX702:
  case CudaArch::GFX703:
  case CudaArch::GFX704:
  case CudaArch::GFX705:
  case CudaArch::GFX801:
  case CudaArch::GFX802:
  case CudaArch::GFX803:
  case CudaArch::GFX805:
  case CudaArch::GFX810:
  case CudaArch::GFX900:
  case CudaArch::GFX902:
  case CudaArch::GFX904:
  case CudaArch::GFX906:
  case CudaArch::GFX908:
  case CudaArch::GFX909:
  case CudaArch::GFX90a:
  case CudaArch::GFX90c:
  case CudaArch::GFX1010:
  case CudaArch::GFX1011:
  case CudaArch::GFX1012:
  case CudaArch::GFX1030:
  case CudaArch::GFX1031:
  case CudaArch::GFX1032:
  case CudaArch::GFX1033:
  case CudaArch::GFX1034:
  case CudaArch::UNUSED:
  case CudaArch::UNKNOWN:
    break;
  case CudaArch::LAST:
    llvm_unreachable("Unexpected Cuda arch.");
  }
  llvm_unreachable("Unexpected NVPTX target without ptx feature.");
}
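
// For instance, sm_70 maps to {84, 32}: 84 SMs with up to 32 resident blocks
// each, so the fallback global buffer built in clear() below becomes an
// [SMs][BlocksPerSM] array of the union type. Both values can be overridden
// up front (typically via the cc1 options -fopenmp-cuda-number-of-sm= and
// -fopenmp-cuda-blocks-per-sm=, which feed the LangOpts checked above).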

void CGOpenMPRuntimeGPU::clear() {
  if (!GlobalizedRecords.empty() &&
      !CGM.getLangOpts().OpenMPCUDATargetParallel) {
    ASTContext &C = CGM.getContext();
    llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> GlobalRecs;
    llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> SharedRecs;
    RecordDecl *StaticRD = C.buildImplicitRecord(
        "_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
    StaticRD->startDefinition();
    RecordDecl *SharedStaticRD = C.buildImplicitRecord(
        "_shared_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
    SharedStaticRD->startDefinition();
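    // Lay out each kernel's globalized records back to back, padding every
    // record to its natural alignment. E.g. (a worked sketch) records of
    // size/alignment 12/4 followed by 8/8 give
    //   Size = alignTo(alignTo(12, 8) + 8, 8) = 24 bytes.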
    for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) {
      if (Records.Records.empty())
        continue;
      unsigned Size = 0;
      unsigned RecAlignment = 0;
      for (const RecordDecl *RD : Records.Records) {
        QualType RDTy = C.getRecordType(RD);
        unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity();
        RecAlignment = std::max(RecAlignment, Alignment);
        unsigned RecSize = C.getTypeSizeInChars(RDTy).getQuantity();
        Size =
            llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment);
      }
      Size = llvm::alignTo(Size, RecAlignment);
      llvm::APInt ArySize(/*numBits=*/64, Size);
      QualType SubTy = C.getConstantArrayType(
          C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0);
      const bool UseSharedMemory = Size <= SharedMemorySize;
      auto *Field =
          FieldDecl::Create(C, UseSharedMemory ? SharedStaticRD : StaticRD,
                            SourceLocation(), SourceLocation(), nullptr, SubTy,
                            C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
                            /*BW=*/nullptr, /*Mutable=*/false,
                            /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      if (UseSharedMemory) {
        SharedStaticRD->addDecl(Field);
        SharedRecs.push_back(&Records);
      } else {
        StaticRD->addDecl(Field);
        GlobalRecs.push_back(&Records);
      }
      Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size));
      Records.UseSharedMemory->setInitializer(
          llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
    }
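    // The RecSize / UseSharedMemory globals initialized above are consumed by
    // the per-kernel static-memory prologue (presumably as the size and the
    // shared-vs-global flag passed to __kmpc_get_team_static_memory).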
    // Allocate a SharedMemorySize-byte buffer in shared memory.
    // FIXME: nvlink does not handle weak linkage correctly (objects with
    // different sizes are reported as erroneous).
    // Restore this code as soon as nvlink is fixed.
    if (!SharedStaticRD->field_empty()) {
      llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize);
      QualType SubTy = C.getConstantArrayType(
          C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0);
      auto *Field = FieldDecl::Create(
          C, SharedStaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy,
          C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      SharedStaticRD->addDecl(Field);
    }
    SharedStaticRD->completeDefinition();
    if (!SharedStaticRD->field_empty()) {
      QualType StaticTy = C.getRecordType(SharedStaticRD);
      llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
      auto *GV = new llvm::GlobalVariable(
          CGM.getModule(), LLVMStaticTy,
          /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
          llvm::UndefValue::get(LLVMStaticTy),
          "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
          llvm::GlobalValue::NotThreadLocal,
          C.getTargetAddressSpace(LangAS::cuda_shared));
      auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
          GV, CGM.VoidPtrTy);
      for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
        Rec->Buffer->replaceAllUsesWith(Replacement);
        Rec->Buffer->eraseFromParent();
      }
    }
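    // At this point every per-kernel placeholder buffer (Rec->Buffer) that
    // fit in shared memory has been redirected to the single union-typed
    // shared global, so those kernels' globalized locals alias one
    // worst-case-sized allocation.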
    StaticRD->completeDefinition();
    if (!StaticRD->field_empty()) {
      QualType StaticTy = C.getRecordType(StaticRD);
      std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM);
      llvm::APInt Size1(32, SMsBlockPerSM.second);
      QualType Arr1Ty =
          C.getConstantArrayType(StaticTy, Size1, nullptr, ArrayType::Normal,
                                 /*IndexTypeQuals=*/0);
      llvm::APInt Size2(32, SMsBlockPerSM.first);
      QualType Arr2Ty =
          C.getConstantArrayType(Arr1Ty, Size2, nullptr, ArrayType::Normal,
                                 /*IndexTypeQuals=*/0);
      llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
      // FIXME: nvlink does not handle weak linkage correctly (objects with
      // different sizes are reported as erroneous).
      // Restore CommonLinkage as soon as nvlink is fixed.
      auto *GV = new llvm::GlobalVariable(
          CGM.getModule(), LLVMArr2Ty,
          /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
          llvm::Constant::getNullValue(LLVMArr2Ty),
          "_openmp_static_glob_rd_$_");
      auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
          GV, CGM.VoidPtrTy);
      for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
        Rec->Buffer->replaceAllUsesWith(Replacement);
        Rec->Buffer->eraseFromParent();
      }
    }
  }
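  // All teams reductions in the module share one buffer, typed as a union of
  // the per-reduction record types; KernelTeamsReductionPtr is patched below
  // to point at it once the union's final size is known.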
  if (!TeamsReductions.empty()) {
    ASTContext &C = CGM.getContext();
    RecordDecl *StaticRD = C.buildImplicitRecord(
        "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
    StaticRD->startDefinition();
    for (const RecordDecl *TeamReductionRec : TeamsReductions) {
      QualType RecTy = C.getRecordType(TeamReductionRec);
      auto *Field = FieldDecl::Create(
          C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
          C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      StaticRD->addDecl(Field);
    }
    StaticRD->completeDefinition();
    QualType StaticTy = C.getRecordType(StaticRD);
    llvm::Type *LLVMReductionsBufferTy =
        CGM.getTypes().ConvertTypeForMem(StaticTy);
    // FIXME: nvlink does not handle weak linkage correctly (objects with
    // different sizes are reported as erroneous).
    // Restore CommonLinkage as soon as nvlink is fixed.
    auto *GV = new llvm::GlobalVariable(
        CGM.getModule(), LLVMReductionsBufferTy,
        /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
        llvm::Constant::getNullValue(LLVMReductionsBufferTy),
        "_openmp_teams_reductions_buffer_$_");
    KernelTeamsReductionPtr->setInitializer(
        llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
                                                             CGM.VoidPtrTy));
  }
  CGOpenMPRuntime::clear();
}