1 //===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This provides a generalized class for OpenMP runtime code generation
10 // specialized by GPU targets NVPTX and AMDGCN.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "CGOpenMPRuntimeGPU.h"
15 #include "CGOpenMPRuntimeNVPTX.h"
16 #include "CodeGenFunction.h"
17 #include "clang/AST/Attr.h"
18 #include "clang/AST/DeclOpenMP.h"
19 #include "clang/AST/StmtOpenMP.h"
20 #include "clang/AST/StmtVisitor.h"
21 #include "clang/Basic/Cuda.h"
22 #include "llvm/ADT/SmallPtrSet.h"
23 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
24 #include "llvm/IR/IntrinsicsNVPTX.h"
25 
26 using namespace clang;
27 using namespace CodeGen;
28 using namespace llvm::omp;
29 
30 namespace {
31 /// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
32 class NVPTXActionTy final : public PrePostActionTy {
33   llvm::FunctionCallee EnterCallee = nullptr;
34   ArrayRef<llvm::Value *> EnterArgs;
35   llvm::FunctionCallee ExitCallee = nullptr;
36   ArrayRef<llvm::Value *> ExitArgs;
37   bool Conditional = false;
38   llvm::BasicBlock *ContBlock = nullptr;
39 
40 public:
41   NVPTXActionTy(llvm::FunctionCallee EnterCallee,
42                 ArrayRef<llvm::Value *> EnterArgs,
43                 llvm::FunctionCallee ExitCallee,
44                 ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
45       : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
46         ExitArgs(ExitArgs), Conditional(Conditional) {}
47   void Enter(CodeGenFunction &CGF) override {
48     llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
49     if (Conditional) {
50       llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
51       auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
52       ContBlock = CGF.createBasicBlock("omp_if.end");
53       // Generate the branch (If-stmt)
54       CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
55       CGF.EmitBlock(ThenBlock);
56     }
57   }
58   void Done(CodeGenFunction &CGF) {
59     // Emit the rest of blocks/branches
60     CGF.EmitBranch(ContBlock);
61     CGF.EmitBlock(ContBlock, true);
62   }
63   void Exit(CodeGenFunction &CGF) override {
64     CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
65   }
66 };
67 
68 /// A class to track the execution mode when codegening directives within
69 /// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
70 /// to the target region and used by directives nested inside, such as 'parallel',
71 /// to emit optimized code.
72 class ExecutionRuntimeModesRAII {
73 private:
74   CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
75       CGOpenMPRuntimeGPU::EM_Unknown;
76   CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
77   bool SavedRuntimeMode = false;
78   bool *RuntimeMode = nullptr;
79 
80 public:
81   /// Constructor for Non-SPMD mode.
82   ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode)
83       : ExecMode(ExecMode) {
84     SavedExecMode = ExecMode;
85     ExecMode = CGOpenMPRuntimeGPU::EM_NonSPMD;
86   }
87   /// Constructor for SPMD mode.
88   ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
89                             bool &RuntimeMode, bool FullRuntimeMode)
90       : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) {
91     SavedExecMode = ExecMode;
92     SavedRuntimeMode = RuntimeMode;
93     ExecMode = CGOpenMPRuntimeGPU::EM_SPMD;
94     RuntimeMode = FullRuntimeMode;
95   }
96   ~ExecutionRuntimeModesRAII() {
97     ExecMode = SavedExecMode;
98     if (RuntimeMode)
99       *RuntimeMode = SavedRuntimeMode;
100   }
101 };
102 
103 /// GPU Configuration:  This information can be derived from CUDA registers;
104 /// however, providing compile-time constants helps generate more efficient
105 /// code.  For all practical purposes this is fine because the configuration
106 /// is the same for all known NVPTX architectures.
107 enum MachineConfiguration : unsigned {
108   /// See "llvm/Frontend/OpenMP/OMPGridValues.h" for various related target
109   /// specific Grid Values like GV_Warp_Size, GV_Warp_Size_Log2,
110   /// and GV_Warp_Size_Log2_Mask.
111 
112   /// Global memory alignment for performance.
113   GlobalMemoryAlignment = 128,
114 
115   /// Maximal size of the shared memory buffer.
116   SharedMemorySize = 128,
117 };
118 
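/// Return the canonical declaration referenced by a private item expression,
/// looking through parentheses, array subscripts, and OpenMP array sections;
/// the item is either a DeclRefExpr or a MemberExpr.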
119 static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
120   RefExpr = RefExpr->IgnoreParens();
121   if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
122     const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
123     while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
124       Base = TempASE->getBase()->IgnoreParenImpCasts();
125     RefExpr = Base;
126   } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
127     const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
128     while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
129       Base = TempOASE->getBase()->IgnoreParenImpCasts();
130     while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
131       Base = TempASE->getBase()->IgnoreParenImpCasts();
132     RefExpr = Base;
133   }
134   RefExpr = RefExpr->IgnoreParenImpCasts();
135   if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
136     return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
137   const auto *ME = cast<MemberExpr>(RefExpr);
138   return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
139 }
140 
141 
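/// Build the implicit record type that provides storage for globalized
/// variables: each decl in EscapedDecls becomes an array field of BufSize
/// elements (the warp size at the call site in this file) aligned to at least
/// GlobalMemoryAlignment, while each decl in EscapedDeclsForTeams becomes a
/// single field with its original alignment.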
142 static RecordDecl *buildRecordForGlobalizedVars(
143     ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
144     ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
145     llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
146         &MappedDeclsFields, int BufSize) {
147   using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
148   if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
149     return nullptr;
150   SmallVector<VarsDataTy, 4> GlobalizedVars;
151   for (const ValueDecl *D : EscapedDecls)
152     GlobalizedVars.emplace_back(
153         CharUnits::fromQuantity(std::max(
154             C.getDeclAlign(D).getQuantity(),
155             static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
156         D);
157   for (const ValueDecl *D : EscapedDeclsForTeams)
158     GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
159   llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
160     return L.first > R.first;
161   });
162 
163   // Build struct _globalized_locals_ty {
164   //         /*  globalized vars  */[WarpSize] align (max(decl_align,
165   //         GlobalMemoryAlignment))
166   //         /*  globalized vars  */ for EscapedDeclsForTeams
167   //       };
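  //   E.g., an escaped 'int i' from a parallel region is stored as
  //   'int i[BufSize] __attribute__((aligned(128)))', while a teams-level
  //   escapee keeps its original type.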
168   RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
169   GlobalizedRD->startDefinition();
170   llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
171       EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
172   for (const auto &Pair : GlobalizedVars) {
173     const ValueDecl *VD = Pair.second;
174     QualType Type = VD->getType();
175     if (Type->isLValueReferenceType())
176       Type = C.getPointerType(Type.getNonReferenceType());
177     else
178       Type = Type.getNonReferenceType();
179     SourceLocation Loc = VD->getLocation();
180     FieldDecl *Field;
181     if (SingleEscaped.count(VD)) {
182       Field = FieldDecl::Create(
183           C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
184           C.getTrivialTypeSourceInfo(Type, SourceLocation()),
185           /*BW=*/nullptr, /*Mutable=*/false,
186           /*InitStyle=*/ICIS_NoInit);
187       Field->setAccess(AS_public);
188       if (VD->hasAttrs()) {
189         for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
190              E(VD->getAttrs().end());
191              I != E; ++I)
192           Field->addAttr(*I);
193       }
194     } else {
195       llvm::APInt ArraySize(32, BufSize);
196       Type = C.getConstantArrayType(Type, ArraySize, nullptr, ArrayType::Normal,
197                                     0);
198       Field = FieldDecl::Create(
199           C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
200           C.getTrivialTypeSourceInfo(Type, SourceLocation()),
201           /*BW=*/nullptr, /*Mutable=*/false,
202           /*InitStyle=*/ICIS_NoInit);
203       Field->setAccess(AS_public);
204       llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
205                                      static_cast<CharUnits::QuantityType>(
206                                          GlobalMemoryAlignment)));
207       Field->addAttr(AlignedAttr::CreateImplicit(
208           C, /*IsAlignmentExpr=*/true,
209           IntegerLiteral::Create(C, Align,
210                                  C.getIntTypeForBitwidth(32, /*Signed=*/0),
211                                  SourceLocation()),
212           {}, AttributeCommonInfo::AS_GNU, AlignedAttr::GNU_aligned));
213     }
214     GlobalizedRD->addDecl(Field);
215     MappedDeclsFields.try_emplace(VD, Field);
216   }
217   GlobalizedRD->completeDefinition();
218   return GlobalizedRD;
219 }
220 
221 /// Get the list of variables that can escape their declaration context.
222 class CheckVarsEscapingDeclContext final
223     : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
224   CodeGenFunction &CGF;
225   llvm::SetVector<const ValueDecl *> EscapedDecls;
226   llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
227   llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
228   RecordDecl *GlobalizedRD = nullptr;
229   llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
230   bool AllEscaped = false;
231   bool IsForCombinedParallelRegion = false;
232 
233   void markAsEscaped(const ValueDecl *VD) {
234     // Do not globalize declare target variables.
235     if (!isa<VarDecl>(VD) ||
236         OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
237       return;
238     VD = cast<ValueDecl>(VD->getCanonicalDecl());
239     // Use user-specified allocation.
240     if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
241       return;
242     // Variables captured by value must be globalized.
243     if (auto *CSI = CGF.CapturedStmtInfo) {
244       if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
245         // Check if we need to capture the variable that was already captured by
246         // value in the outer region.
247         if (!IsForCombinedParallelRegion) {
248           if (!FD->hasAttrs())
249             return;
250           const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
251           if (!Attr)
252             return;
253           if (((Attr->getCaptureKind() != OMPC_map) &&
254                !isOpenMPPrivate(Attr->getCaptureKind())) ||
255               ((Attr->getCaptureKind() == OMPC_map) &&
256                !FD->getType()->isAnyPointerType()))
257             return;
258         }
259         if (!FD->getType()->isReferenceType()) {
260           assert(!VD->getType()->isVariablyModifiedType() &&
261                  "Parameter captured by value with variably modified type");
262           EscapedParameters.insert(VD);
263         } else if (!IsForCombinedParallelRegion) {
264           return;
265         }
266       }
267     }
268     if ((!CGF.CapturedStmtInfo ||
269          (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
270         VD->getType()->isReferenceType())
271       // Do not globalize variables with reference type.
272       return;
273     if (VD->getType()->isVariablyModifiedType())
274       EscapedVariableLengthDecls.insert(VD);
275     else
276       EscapedDecls.insert(VD);
277   }
278 
279   void VisitValueDecl(const ValueDecl *VD) {
280     if (VD->getType()->isLValueReferenceType())
281       markAsEscaped(VD);
282     if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
283       if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
284         const bool SavedAllEscaped = AllEscaped;
285         AllEscaped = VD->getType()->isLValueReferenceType();
286         Visit(VarD->getInit());
287         AllEscaped = SavedAllEscaped;
288       }
289     }
290   }
291   void VisitOpenMPCapturedStmt(const CapturedStmt *S,
292                                ArrayRef<OMPClause *> Clauses,
293                                bool IsCombinedParallelRegion) {
294     if (!S)
295       return;
296     for (const CapturedStmt::Capture &C : S->captures()) {
297       if (C.capturesVariable() && !C.capturesVariableByCopy()) {
298         const ValueDecl *VD = C.getCapturedVar();
299         bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
300         if (IsCombinedParallelRegion) {
301           // Check if the variable is privatized in the combined construct and
302           // those private copies must be shared in the inner parallel
303           // directive.
304           IsForCombinedParallelRegion = false;
305           for (const OMPClause *C : Clauses) {
306             if (!isOpenMPPrivate(C->getClauseKind()) ||
307                 C->getClauseKind() == OMPC_reduction ||
308                 C->getClauseKind() == OMPC_linear ||
309                 C->getClauseKind() == OMPC_private)
310               continue;
311             ArrayRef<const Expr *> Vars;
312             if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
313               Vars = PC->getVarRefs();
314             else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
315               Vars = PC->getVarRefs();
316             else
317               llvm_unreachable("Unexpected clause.");
318             for (const auto *E : Vars) {
319               const Decl *D =
320                   cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
321               if (D == VD->getCanonicalDecl()) {
322                 IsForCombinedParallelRegion = true;
323                 break;
324               }
325             }
326             if (IsForCombinedParallelRegion)
327               break;
328           }
329         }
330         markAsEscaped(VD);
331         if (isa<OMPCapturedExprDecl>(VD))
332           VisitValueDecl(VD);
333         IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
334       }
335     }
336   }
337 
338   void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
339     assert(!GlobalizedRD &&
340            "Record for globalized variables is built already.");
341     ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
342     unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
343     if (IsInTTDRegion)
344       EscapedDeclsForTeams = EscapedDecls.getArrayRef();
345     else
346       EscapedDeclsForParallel = EscapedDecls.getArrayRef();
347     GlobalizedRD = ::buildRecordForGlobalizedVars(
348         CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
349         MappedDeclsFields, WarpSize);
350   }
351 
352 public:
353   CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
354                                ArrayRef<const ValueDecl *> TeamsReductions)
355       : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
356   }
357   virtual ~CheckVarsEscapingDeclContext() = default;
358   void VisitDeclStmt(const DeclStmt *S) {
359     if (!S)
360       return;
361     for (const Decl *D : S->decls())
362       if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
363         VisitValueDecl(VD);
364   }
365   void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
366     if (!D)
367       return;
368     if (!D->hasAssociatedStmt())
369       return;
370     if (const auto *S =
371             dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
372       // Do not analyze directives that do not actually require capturing,
373       // like `omp for` or `omp simd` directives.
374       llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
375       getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
376       if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
377         VisitStmt(S->getCapturedStmt());
378         return;
379       }
380       VisitOpenMPCapturedStmt(
381           S, D->clauses(),
382           CaptureRegions.back() == OMPD_parallel &&
383               isOpenMPDistributeDirective(D->getDirectiveKind()));
384     }
385   }
386   void VisitCapturedStmt(const CapturedStmt *S) {
387     if (!S)
388       return;
389     for (const CapturedStmt::Capture &C : S->captures()) {
390       if (C.capturesVariable() && !C.capturesVariableByCopy()) {
391         const ValueDecl *VD = C.getCapturedVar();
392         markAsEscaped(VD);
393         if (isa<OMPCapturedExprDecl>(VD))
394           VisitValueDecl(VD);
395       }
396     }
397   }
398   void VisitLambdaExpr(const LambdaExpr *E) {
399     if (!E)
400       return;
401     for (const LambdaCapture &C : E->captures()) {
402       if (C.capturesVariable()) {
403         if (C.getCaptureKind() == LCK_ByRef) {
404           const ValueDecl *VD = C.getCapturedVar();
405           markAsEscaped(VD);
406           if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
407             VisitValueDecl(VD);
408         }
409       }
410     }
411   }
412   void VisitBlockExpr(const BlockExpr *E) {
413     if (!E)
414       return;
415     for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
416       if (C.isByRef()) {
417         const VarDecl *VD = C.getVariable();
418         markAsEscaped(VD);
419         if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
420           VisitValueDecl(VD);
421       }
422     }
423   }
424   void VisitCallExpr(const CallExpr *E) {
425     if (!E)
426       return;
427     for (const Expr *Arg : E->arguments()) {
428       if (!Arg)
429         continue;
430       if (Arg->isLValue()) {
431         const bool SavedAllEscaped = AllEscaped;
432         AllEscaped = true;
433         Visit(Arg);
434         AllEscaped = SavedAllEscaped;
435       } else {
436         Visit(Arg);
437       }
438     }
439     Visit(E->getCallee());
440   }
441   void VisitDeclRefExpr(const DeclRefExpr *E) {
442     if (!E)
443       return;
444     const ValueDecl *VD = E->getDecl();
445     if (AllEscaped)
446       markAsEscaped(VD);
447     if (isa<OMPCapturedExprDecl>(VD))
448       VisitValueDecl(VD);
449     else if (const auto *VarD = dyn_cast<VarDecl>(VD))
450       if (VarD->isInitCapture())
451         VisitValueDecl(VD);
452   }
453   void VisitUnaryOperator(const UnaryOperator *E) {
454     if (!E)
455       return;
456     if (E->getOpcode() == UO_AddrOf) {
457       const bool SavedAllEscaped = AllEscaped;
458       AllEscaped = true;
459       Visit(E->getSubExpr());
460       AllEscaped = SavedAllEscaped;
461     } else {
462       Visit(E->getSubExpr());
463     }
464   }
465   void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
466     if (!E)
467       return;
468     if (E->getCastKind() == CK_ArrayToPointerDecay) {
469       const bool SavedAllEscaped = AllEscaped;
470       AllEscaped = true;
471       Visit(E->getSubExpr());
472       AllEscaped = SavedAllEscaped;
473     } else {
474       Visit(E->getSubExpr());
475     }
476   }
477   void VisitExpr(const Expr *E) {
478     if (!E)
479       return;
480     bool SavedAllEscaped = AllEscaped;
481     if (!E->isLValue())
482       AllEscaped = false;
483     for (const Stmt *Child : E->children())
484       if (Child)
485         Visit(Child);
486     AllEscaped = SavedAllEscaped;
487   }
488   void VisitStmt(const Stmt *S) {
489     if (!S)
490       return;
491     for (const Stmt *Child : S->children())
492       if (Child)
493         Visit(Child);
494   }
495 
496   /// Returns the record that handles all the escaped local variables and is
497   /// used instead of their original storage.
498   const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
499     if (!GlobalizedRD)
500       buildRecordForGlobalizedVars(IsInTTDRegion);
501     return GlobalizedRD;
502   }
503 
504   /// Returns the field in the globalized record for the escaped variable.
505   const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
506     assert(GlobalizedRD &&
507            "Record for globalized variables must be generated already.");
508     auto I = MappedDeclsFields.find(VD);
509     if (I == MappedDeclsFields.end())
510       return nullptr;
511     return I->getSecond();
512   }
513 
514   /// Returns the list of the escaped local variables/parameters.
515   ArrayRef<const ValueDecl *> getEscapedDecls() const {
516     return EscapedDecls.getArrayRef();
517   }
518 
519   /// Returns the set of escaped local variables that are actually parameters
520   /// passed by value.
521   const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
522     return EscapedParameters;
523   }
524 
525   /// Returns the list of the escaped variables with the variably modified
526   /// types.
527   ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
528     return EscapedVariableLengthDecls.getArrayRef();
529   }
530 };
531 } // anonymous namespace
532 
533 /// Get the id of the warp in the block.
534 /// We assume that the warp size is 32, which is always the case
535 /// on the NVPTX device, to generate more efficient code.
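/// E.g., with a warp size of 32, warp_id = thread_id >> 5.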
536 static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
537   CGBuilderTy &Bld = CGF.Builder;
538   unsigned LaneIDBits =
539       CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size_Log2);
540   auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
541   return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
542 }
543 
544 /// Get the id of the current lane in the Warp.
545 /// We assume that the warp size is 32, which is always the case
546 /// on the NVPTX device, to generate more efficient code.
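/// E.g., with a warp size of 32, lane_id = thread_id & 31.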
547 static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
548   CGBuilderTy &Bld = CGF.Builder;
549   unsigned LaneIDMask = CGF.getContext().getTargetInfo().getGridValue(
550       llvm::omp::GV_Warp_Size_Log2_Mask);
551   auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
552   return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
553                        "nvptx_lane_id");
554 }
555 
556 /// Get the value of the thread_limit clause in the teams directive.
557 /// For the 'generic' execution mode, the runtime encodes thread_limit in
558 /// the launch parameters, always starting thread_limit+warpSize threads per
559 /// CTA. The threads in the last warp are reserved for master execution.
560 /// For the 'spmd' execution mode, all threads in a CTA are part of the team.
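/// E.g., in generic mode with 128 launched threads and a warp size of 32, the
/// returned thread_limit is 96 and threads 96-127 form the reserved last warp.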
561 static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
562                                    bool IsInSPMDExecutionMode = false) {
563   CGBuilderTy &Bld = CGF.Builder;
564   auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
565   llvm::Value *ThreadLimit = nullptr;
566   if (IsInSPMDExecutionMode)
567     ThreadLimit = RT.getGPUNumThreads(CGF);
568   else {
569     llvm::Value *GPUNumThreads = RT.getGPUNumThreads(CGF);
570     llvm::Value *GPUWarpSize = RT.getGPUWarpSize(CGF);
571     ThreadLimit = Bld.CreateNUWSub(GPUNumThreads, GPUWarpSize, "thread_limit");
572   }
573   assert(ThreadLimit != nullptr && "Expected non-null ThreadLimit");
574   return ThreadLimit;
575 }
576 
577 /// Get the thread id of the OMP master thread.
578 /// The master thread id is the first thread (lane) of the last warp in the
579 /// GPU block.  Warp size is assumed to be some power of 2.
580 /// Thread id is 0 indexed.
581 /// E.g: If NumThreads is 33, master id is 32.
582 ///      If NumThreads is 64, master id is 32.
583 ///      If NumThreads is 1024, master id is 992.
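/// Computed as master_tid = (NumThreads - 1) & ~(WarpSize - 1), i.e. the first
/// lane of the last warp.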
584 static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
585   CGBuilderTy &Bld = CGF.Builder;
586   auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
587   llvm::Value *NumThreads = RT.getGPUNumThreads(CGF);
588   // We assume that the warp size is a power of 2.
589   llvm::Value *Mask = Bld.CreateNUWSub(RT.getGPUWarpSize(CGF), Bld.getInt32(1));
590 
591   llvm::Value *NumThreadsSubOne = Bld.CreateNUWSub(NumThreads, Bld.getInt32(1));
592   return Bld.CreateAnd(NumThreadsSubOne, Bld.CreateNot(Mask), "master_tid");
593 }
594 
595 CGOpenMPRuntimeGPU::WorkerFunctionState::WorkerFunctionState(
596     CodeGenModule &CGM, SourceLocation Loc)
597     : WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()),
598       Loc(Loc) {
599   createWorkerFunction(CGM);
600 }
601 
602 void CGOpenMPRuntimeGPU::WorkerFunctionState::createWorkerFunction(
603     CodeGenModule &CGM) {
604   // Create a worker function with no arguments.
605 
606   WorkerFn = llvm::Function::Create(
607       CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
608       /*placeholder=*/"_worker", &CGM.getModule());
609   CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, CGFI);
610   WorkerFn->setDoesNotRecurse();
611 }
612 
613 CGOpenMPRuntimeGPU::ExecutionMode
614 CGOpenMPRuntimeGPU::getExecutionMode() const {
615   return CurrentExecutionMode;
616 }
617 
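// Data-sharing mode: under -fopenmp-cuda-mode (CUDA) escaping locals stay in
// their original storage; in the default Generic mode they may be globalized.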
618 static CGOpenMPRuntimeGPU::DataSharingMode
619 getDataSharingMode(CodeGenModule &CGM) {
620   return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeGPU::CUDA
621                                           : CGOpenMPRuntimeGPU::Generic;
622 }
623 
624 /// Check for inner (nested) SPMD construct, if any
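/// E.g., '#pragma omp target' (optionally via an inner 'teams') whose body is
/// a single nested 'parallel' region is treated as SPMD.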
625 static bool hasNestedSPMDDirective(ASTContext &Ctx,
626                                    const OMPExecutableDirective &D) {
627   const auto *CS = D.getInnermostCapturedStmt();
628   const auto *Body =
629       CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
630   const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
631 
632   if (const auto *NestedDir =
633           dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
634     OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
635     switch (D.getDirectiveKind()) {
636     case OMPD_target:
637       if (isOpenMPParallelDirective(DKind))
638         return true;
639       if (DKind == OMPD_teams) {
640         Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
641             /*IgnoreCaptured=*/true);
642         if (!Body)
643           return false;
644         ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
645         if (const auto *NND =
646                 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
647           DKind = NND->getDirectiveKind();
648           if (isOpenMPParallelDirective(DKind))
649             return true;
650         }
651       }
652       return false;
653     case OMPD_target_teams:
654       return isOpenMPParallelDirective(DKind);
655     case OMPD_target_simd:
656     case OMPD_target_parallel:
657     case OMPD_target_parallel_for:
658     case OMPD_target_parallel_for_simd:
659     case OMPD_target_teams_distribute:
660     case OMPD_target_teams_distribute_simd:
661     case OMPD_target_teams_distribute_parallel_for:
662     case OMPD_target_teams_distribute_parallel_for_simd:
663     case OMPD_parallel:
664     case OMPD_for:
665     case OMPD_parallel_for:
666     case OMPD_parallel_master:
667     case OMPD_parallel_sections:
668     case OMPD_for_simd:
669     case OMPD_parallel_for_simd:
670     case OMPD_cancel:
671     case OMPD_cancellation_point:
672     case OMPD_ordered:
673     case OMPD_threadprivate:
674     case OMPD_allocate:
675     case OMPD_task:
676     case OMPD_simd:
677     case OMPD_sections:
678     case OMPD_section:
679     case OMPD_single:
680     case OMPD_master:
681     case OMPD_critical:
682     case OMPD_taskyield:
683     case OMPD_barrier:
684     case OMPD_taskwait:
685     case OMPD_taskgroup:
686     case OMPD_atomic:
687     case OMPD_flush:
688     case OMPD_depobj:
689     case OMPD_scan:
690     case OMPD_teams:
691     case OMPD_target_data:
692     case OMPD_target_exit_data:
693     case OMPD_target_enter_data:
694     case OMPD_distribute:
695     case OMPD_distribute_simd:
696     case OMPD_distribute_parallel_for:
697     case OMPD_distribute_parallel_for_simd:
698     case OMPD_teams_distribute:
699     case OMPD_teams_distribute_simd:
700     case OMPD_teams_distribute_parallel_for:
701     case OMPD_teams_distribute_parallel_for_simd:
702     case OMPD_target_update:
703     case OMPD_declare_simd:
704     case OMPD_declare_variant:
705     case OMPD_begin_declare_variant:
706     case OMPD_end_declare_variant:
707     case OMPD_declare_target:
708     case OMPD_end_declare_target:
709     case OMPD_declare_reduction:
710     case OMPD_declare_mapper:
711     case OMPD_taskloop:
712     case OMPD_taskloop_simd:
713     case OMPD_master_taskloop:
714     case OMPD_master_taskloop_simd:
715     case OMPD_parallel_master_taskloop:
716     case OMPD_parallel_master_taskloop_simd:
717     case OMPD_requires:
718     case OMPD_unknown:
719     default:
720       llvm_unreachable("Unexpected directive.");
721     }
722   }
723 
724   return false;
725 }
726 
727 static bool supportsSPMDExecutionMode(ASTContext &Ctx,
728                                       const OMPExecutableDirective &D) {
729   OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
730   switch (DirectiveKind) {
731   case OMPD_target:
732   case OMPD_target_teams:
733     return hasNestedSPMDDirective(Ctx, D);
734   case OMPD_target_parallel:
735   case OMPD_target_parallel_for:
736   case OMPD_target_parallel_for_simd:
737   case OMPD_target_teams_distribute_parallel_for:
738   case OMPD_target_teams_distribute_parallel_for_simd:
739   case OMPD_target_simd:
740   case OMPD_target_teams_distribute_simd:
741     return true;
742   case OMPD_target_teams_distribute:
743     return false;
744   case OMPD_parallel:
745   case OMPD_for:
746   case OMPD_parallel_for:
747   case OMPD_parallel_master:
748   case OMPD_parallel_sections:
749   case OMPD_for_simd:
750   case OMPD_parallel_for_simd:
751   case OMPD_cancel:
752   case OMPD_cancellation_point:
753   case OMPD_ordered:
754   case OMPD_threadprivate:
755   case OMPD_allocate:
756   case OMPD_task:
757   case OMPD_simd:
758   case OMPD_sections:
759   case OMPD_section:
760   case OMPD_single:
761   case OMPD_master:
762   case OMPD_critical:
763   case OMPD_taskyield:
764   case OMPD_barrier:
765   case OMPD_taskwait:
766   case OMPD_taskgroup:
767   case OMPD_atomic:
768   case OMPD_flush:
769   case OMPD_depobj:
770   case OMPD_scan:
771   case OMPD_teams:
772   case OMPD_target_data:
773   case OMPD_target_exit_data:
774   case OMPD_target_enter_data:
775   case OMPD_distribute:
776   case OMPD_distribute_simd:
777   case OMPD_distribute_parallel_for:
778   case OMPD_distribute_parallel_for_simd:
779   case OMPD_teams_distribute:
780   case OMPD_teams_distribute_simd:
781   case OMPD_teams_distribute_parallel_for:
782   case OMPD_teams_distribute_parallel_for_simd:
783   case OMPD_target_update:
784   case OMPD_declare_simd:
785   case OMPD_declare_variant:
786   case OMPD_begin_declare_variant:
787   case OMPD_end_declare_variant:
788   case OMPD_declare_target:
789   case OMPD_end_declare_target:
790   case OMPD_declare_reduction:
791   case OMPD_declare_mapper:
792   case OMPD_taskloop:
793   case OMPD_taskloop_simd:
794   case OMPD_master_taskloop:
795   case OMPD_master_taskloop_simd:
796   case OMPD_parallel_master_taskloop:
797   case OMPD_parallel_master_taskloop_simd:
798   case OMPD_requires:
799   case OMPD_unknown:
800   default:
801     break;
802   }
803   llvm_unreachable(
804       "Unknown programming model for OpenMP directive on NVPTX target.");
805 }
806 
807 /// Check if the loop-based directive has static scheduling: no 'ordered'
808 /// clause and either no 'schedule' clause or a static one.
809 static bool hasStaticScheduling(const OMPExecutableDirective &D) {
810   assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
811          isOpenMPLoopDirective(D.getDirectiveKind()) &&
812          "Expected loop-based directive.");
813   return !D.hasClausesOfKind<OMPOrderedClause>() &&
814          (!D.hasClausesOfKind<OMPScheduleClause>() ||
815           llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
816                        [](const OMPScheduleClause *C) {
817                          return C->getScheduleKind() == OMPC_SCHEDULE_static;
818                        }));
819 }
820 
821 /// Check for inner (nested) lightweight runtime construct, if any
822 static bool hasNestedLightweightDirective(ASTContext &Ctx,
823                                           const OMPExecutableDirective &D) {
824   assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
825   const auto *CS = D.getInnermostCapturedStmt();
826   const auto *Body =
827       CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
828   const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
829 
830   if (const auto *NestedDir =
831           dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
832     OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
833     switch (D.getDirectiveKind()) {
834     case OMPD_target:
835       if (isOpenMPParallelDirective(DKind) &&
836           isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
837           hasStaticScheduling(*NestedDir))
838         return true;
839       if (DKind == OMPD_teams_distribute_simd || DKind == OMPD_simd)
840         return true;
841       if (DKind == OMPD_parallel) {
842         Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
843             /*IgnoreCaptured=*/true);
844         if (!Body)
845           return false;
846         ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
847         if (const auto *NND =
848                 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
849           DKind = NND->getDirectiveKind();
850           if (isOpenMPWorksharingDirective(DKind) &&
851               isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
852             return true;
853         }
854       } else if (DKind == OMPD_teams) {
855         Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
856             /*IgnoreCaptured=*/true);
857         if (!Body)
858           return false;
859         ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
860         if (const auto *NND =
861                 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
862           DKind = NND->getDirectiveKind();
863           if (isOpenMPParallelDirective(DKind) &&
864               isOpenMPWorksharingDirective(DKind) &&
865               isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
866             return true;
867           if (DKind == OMPD_parallel) {
868             Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
869                 /*IgnoreCaptured=*/true);
870             if (!Body)
871               return false;
872             ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
873             if (const auto *NND =
874                     dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
875               DKind = NND->getDirectiveKind();
876               if (isOpenMPWorksharingDirective(DKind) &&
877                   isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
878                 return true;
879             }
880           }
881         }
882       }
883       return false;
884     case OMPD_target_teams:
885       if (isOpenMPParallelDirective(DKind) &&
886           isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
887           hasStaticScheduling(*NestedDir))
888         return true;
889       if (DKind == OMPD_distribute_simd || DKind == OMPD_simd)
890         return true;
891       if (DKind == OMPD_parallel) {
892         Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
893             /*IgnoreCaptured=*/true);
894         if (!Body)
895           return false;
896         ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
897         if (const auto *NND =
898                 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
899           DKind = NND->getDirectiveKind();
900           if (isOpenMPWorksharingDirective(DKind) &&
901               isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
902             return true;
903         }
904       }
905       return false;
906     case OMPD_target_parallel:
907       if (DKind == OMPD_simd)
908         return true;
909       return isOpenMPWorksharingDirective(DKind) &&
910              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
911     case OMPD_target_teams_distribute:
912     case OMPD_target_simd:
913     case OMPD_target_parallel_for:
914     case OMPD_target_parallel_for_simd:
915     case OMPD_target_teams_distribute_simd:
916     case OMPD_target_teams_distribute_parallel_for:
917     case OMPD_target_teams_distribute_parallel_for_simd:
918     case OMPD_parallel:
919     case OMPD_for:
920     case OMPD_parallel_for:
921     case OMPD_parallel_master:
922     case OMPD_parallel_sections:
923     case OMPD_for_simd:
924     case OMPD_parallel_for_simd:
925     case OMPD_cancel:
926     case OMPD_cancellation_point:
927     case OMPD_ordered:
928     case OMPD_threadprivate:
929     case OMPD_allocate:
930     case OMPD_task:
931     case OMPD_simd:
932     case OMPD_sections:
933     case OMPD_section:
934     case OMPD_single:
935     case OMPD_master:
936     case OMPD_critical:
937     case OMPD_taskyield:
938     case OMPD_barrier:
939     case OMPD_taskwait:
940     case OMPD_taskgroup:
941     case OMPD_atomic:
942     case OMPD_flush:
943     case OMPD_depobj:
944     case OMPD_scan:
945     case OMPD_teams:
946     case OMPD_target_data:
947     case OMPD_target_exit_data:
948     case OMPD_target_enter_data:
949     case OMPD_distribute:
950     case OMPD_distribute_simd:
951     case OMPD_distribute_parallel_for:
952     case OMPD_distribute_parallel_for_simd:
953     case OMPD_teams_distribute:
954     case OMPD_teams_distribute_simd:
955     case OMPD_teams_distribute_parallel_for:
956     case OMPD_teams_distribute_parallel_for_simd:
957     case OMPD_target_update:
958     case OMPD_declare_simd:
959     case OMPD_declare_variant:
960     case OMPD_begin_declare_variant:
961     case OMPD_end_declare_variant:
962     case OMPD_declare_target:
963     case OMPD_end_declare_target:
964     case OMPD_declare_reduction:
965     case OMPD_declare_mapper:
966     case OMPD_taskloop:
967     case OMPD_taskloop_simd:
968     case OMPD_master_taskloop:
969     case OMPD_master_taskloop_simd:
970     case OMPD_parallel_master_taskloop:
971     case OMPD_parallel_master_taskloop_simd:
972     case OMPD_requires:
973     case OMPD_unknown:
974     default:
975       llvm_unreachable("Unexpected directive.");
976     }
977   }
978 
979   return false;
980 }
981 
982 /// Checks if the construct supports lightweight runtime. It must be SPMD
983 /// construct + inner loop-based construct with static scheduling.
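/// E.g., '#pragma omp target teams distribute parallel for schedule(static)'
/// qualifies, while the same directive with 'schedule(dynamic)' does not.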
984 static bool supportsLightweightRuntime(ASTContext &Ctx,
985                                        const OMPExecutableDirective &D) {
986   if (!supportsSPMDExecutionMode(Ctx, D))
987     return false;
988   OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
989   switch (DirectiveKind) {
990   case OMPD_target:
991   case OMPD_target_teams:
992   case OMPD_target_parallel:
993     return hasNestedLightweightDirective(Ctx, D);
994   case OMPD_target_parallel_for:
995   case OMPD_target_parallel_for_simd:
996   case OMPD_target_teams_distribute_parallel_for:
997   case OMPD_target_teams_distribute_parallel_for_simd:
998     // (Last|First)-privates must be shared in parallel region.
999     return hasStaticScheduling(D);
1000   case OMPD_target_simd:
1001   case OMPD_target_teams_distribute_simd:
1002     return true;
1003   case OMPD_target_teams_distribute:
1004     return false;
1005   case OMPD_parallel:
1006   case OMPD_for:
1007   case OMPD_parallel_for:
1008   case OMPD_parallel_master:
1009   case OMPD_parallel_sections:
1010   case OMPD_for_simd:
1011   case OMPD_parallel_for_simd:
1012   case OMPD_cancel:
1013   case OMPD_cancellation_point:
1014   case OMPD_ordered:
1015   case OMPD_threadprivate:
1016   case OMPD_allocate:
1017   case OMPD_task:
1018   case OMPD_simd:
1019   case OMPD_sections:
1020   case OMPD_section:
1021   case OMPD_single:
1022   case OMPD_master:
1023   case OMPD_critical:
1024   case OMPD_taskyield:
1025   case OMPD_barrier:
1026   case OMPD_taskwait:
1027   case OMPD_taskgroup:
1028   case OMPD_atomic:
1029   case OMPD_flush:
1030   case OMPD_depobj:
1031   case OMPD_scan:
1032   case OMPD_teams:
1033   case OMPD_target_data:
1034   case OMPD_target_exit_data:
1035   case OMPD_target_enter_data:
1036   case OMPD_distribute:
1037   case OMPD_distribute_simd:
1038   case OMPD_distribute_parallel_for:
1039   case OMPD_distribute_parallel_for_simd:
1040   case OMPD_teams_distribute:
1041   case OMPD_teams_distribute_simd:
1042   case OMPD_teams_distribute_parallel_for:
1043   case OMPD_teams_distribute_parallel_for_simd:
1044   case OMPD_target_update:
1045   case OMPD_declare_simd:
1046   case OMPD_declare_variant:
1047   case OMPD_begin_declare_variant:
1048   case OMPD_end_declare_variant:
1049   case OMPD_declare_target:
1050   case OMPD_end_declare_target:
1051   case OMPD_declare_reduction:
1052   case OMPD_declare_mapper:
1053   case OMPD_taskloop:
1054   case OMPD_taskloop_simd:
1055   case OMPD_master_taskloop:
1056   case OMPD_master_taskloop_simd:
1057   case OMPD_parallel_master_taskloop:
1058   case OMPD_parallel_master_taskloop_simd:
1059   case OMPD_requires:
1060   case OMPD_unknown:
1061   default:
1062     break;
1063   }
1064   llvm_unreachable(
1065       "Unknown programming model for OpenMP directive on NVPTX target.");
1066 }
1067 
1068 void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
1069                                              StringRef ParentName,
1070                                              llvm::Function *&OutlinedFn,
1071                                              llvm::Constant *&OutlinedFnID,
1072                                              bool IsOffloadEntry,
1073                                              const RegionCodeGenTy &CodeGen) {
1074   ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode);
1075   EntryFunctionState EST;
1076   WorkerFunctionState WST(CGM, D.getBeginLoc());
1077   Work.clear();
1078   WrapperFunctionsMap.clear();
1079 
1080   // Emit target region as a standalone region.
1081   class NVPTXPrePostActionTy : public PrePostActionTy {
1082     CGOpenMPRuntimeGPU::EntryFunctionState &EST;
1083     CGOpenMPRuntimeGPU::WorkerFunctionState &WST;
1084 
1085   public:
1086     NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
1087                          CGOpenMPRuntimeGPU::WorkerFunctionState &WST)
1088         : EST(EST), WST(WST) {}
1089     void Enter(CodeGenFunction &CGF) override {
1090       auto &RT =
1091           static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1092       RT.emitNonSPMDEntryHeader(CGF, EST, WST);
1093       // Skip target region initialization.
1094       RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1095     }
1096     void Exit(CodeGenFunction &CGF) override {
1097       auto &RT =
1098           static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1099       RT.clearLocThreadIdInsertPt(CGF);
1100       RT.emitNonSPMDEntryFooter(CGF, EST);
1101     }
1102   } Action(EST, WST);
1103   CodeGen.setAction(Action);
1104   IsInTTDRegion = true;
1105   // Reserve space for the globalized memory.
1106   GlobalizedRecords.emplace_back();
1107   if (!KernelStaticGlobalized) {
1108     KernelStaticGlobalized = new llvm::GlobalVariable(
1109         CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
1110         llvm::GlobalValue::InternalLinkage,
1111         llvm::UndefValue::get(CGM.VoidPtrTy),
1112         "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
1113         llvm::GlobalValue::NotThreadLocal,
1114         CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
1115   }
1116   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1117                                    IsOffloadEntry, CodeGen);
1118   IsInTTDRegion = false;
1119 
1120   // Now change the name of the worker function to correspond to this target
1121   // region's entry function.
1122   WST.WorkerFn->setName(Twine(OutlinedFn->getName(), "_worker"));
1123 
1124   // Create the worker function
1125   emitWorkerFunction(WST);
1126 }
1127 
1128 // Set up NVPTX threads for the master-worker OpenMP scheme.
1129 void CGOpenMPRuntimeGPU::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
1130                                                   EntryFunctionState &EST,
1131                                                   WorkerFunctionState &WST) {
1132   CGBuilderTy &Bld = CGF.Builder;
1133 
1134   llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
1135   llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
1136   llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
1137   EST.ExitBB = CGF.createBasicBlock(".exit");
1138 
1139   auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1140   llvm::Value *GPUThreadID = RT.getGPUThreadID(CGF);
1141   llvm::Value *ThreadLimit = getThreadLimit(CGF);
1142   llvm::Value *IsWorker = Bld.CreateICmpULT(GPUThreadID, ThreadLimit);
1143   Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);
1144 
1145   CGF.EmitBlock(WorkerBB);
1146   emitCall(CGF, WST.Loc, WST.WorkerFn);
1147   CGF.EmitBranch(EST.ExitBB);
1148 
1149   CGF.EmitBlock(MasterCheckBB);
1150   GPUThreadID = RT.getGPUThreadID(CGF);
1151   llvm::Value *MasterThreadID = getMasterThreadID(CGF);
1152   llvm::Value *IsMaster = Bld.CreateICmpEQ(GPUThreadID, MasterThreadID);
1153   Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);
1154 
1155   CGF.EmitBlock(MasterBB);
1156   IsInTargetMasterThreadRegion = true;
1157   // SEQUENTIAL (MASTER) REGION START
1158   // First action in sequential region:
1159   // Initialize the state of the OpenMP runtime library on the GPU.
1160   // TODO: Optimize runtime initialization and pass in correct value.
1161   llvm::Value *Args[] = {getThreadLimit(CGF),
1162                          Bld.getInt16(/*RequiresOMPRuntime=*/1)};
1163   CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1164                           CGM.getModule(), OMPRTL___kmpc_kernel_init),
1165                       Args);
1166 
1167   // For data sharing, we need to initialize the stack.
1168   CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1169       CGM.getModule(), OMPRTL___kmpc_data_sharing_init_stack));
1170 
1171   emitGenericVarsProlog(CGF, WST.Loc);
1172 }
1173 
1174 void CGOpenMPRuntimeGPU::emitNonSPMDEntryFooter(CodeGenFunction &CGF,
1175                                                   EntryFunctionState &EST) {
1176   IsInTargetMasterThreadRegion = false;
1177   if (!CGF.HaveInsertPoint())
1178     return;
1179 
1180   emitGenericVarsEpilog(CGF);
1181 
1182   if (!EST.ExitBB)
1183     EST.ExitBB = CGF.createBasicBlock(".exit");
1184 
1185   llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
1186   CGF.EmitBranch(TerminateBB);
1187 
1188   CGF.EmitBlock(TerminateBB);
1189   // Signal termination condition.
1190   // TODO: Optimize runtime initialization and pass in correct value.
1191   llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
1192   CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1193                           CGM.getModule(), OMPRTL___kmpc_kernel_deinit),
1194                       Args);
1195   // Barrier to terminate worker threads.
1196   syncCTAThreads(CGF);
1197   // Master thread jumps to exit point.
1198   CGF.EmitBranch(EST.ExitBB);
1199 
1200   CGF.EmitBlock(EST.ExitBB);
1201   EST.ExitBB = nullptr;
1202 }
1203 
1204 void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
1205                                           StringRef ParentName,
1206                                           llvm::Function *&OutlinedFn,
1207                                           llvm::Constant *&OutlinedFnID,
1208                                           bool IsOffloadEntry,
1209                                           const RegionCodeGenTy &CodeGen) {
1210   ExecutionRuntimeModesRAII ModeRAII(
1211       CurrentExecutionMode, RequiresFullRuntime,
1212       CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
1213           !supportsLightweightRuntime(CGM.getContext(), D));
1214   EntryFunctionState EST;
1215 
1216   // Emit target region as a standalone region.
1217   class NVPTXPrePostActionTy : public PrePostActionTy {
1218     CGOpenMPRuntimeGPU &RT;
1219     CGOpenMPRuntimeGPU::EntryFunctionState &EST;
1220     const OMPExecutableDirective &D;
1221 
1222   public:
1223     NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
1224                          CGOpenMPRuntimeGPU::EntryFunctionState &EST,
1225                          const OMPExecutableDirective &D)
1226         : RT(RT), EST(EST), D(D) {}
1227     void Enter(CodeGenFunction &CGF) override {
1228       RT.emitSPMDEntryHeader(CGF, EST, D);
1229       // Skip target region initialization.
1230       RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1231     }
1232     void Exit(CodeGenFunction &CGF) override {
1233       RT.clearLocThreadIdInsertPt(CGF);
1234       RT.emitSPMDEntryFooter(CGF, EST);
1235     }
1236   } Action(*this, EST, D);
1237   CodeGen.setAction(Action);
1238   IsInTTDRegion = true;
1239   // Reserve space for the globalized memory.
1240   GlobalizedRecords.emplace_back();
1241   if (!KernelStaticGlobalized) {
1242     KernelStaticGlobalized = new llvm::GlobalVariable(
1243         CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
1244         llvm::GlobalValue::InternalLinkage,
1245         llvm::UndefValue::get(CGM.VoidPtrTy),
1246         "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
1247         llvm::GlobalValue::NotThreadLocal,
1248         CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
1249   }
1250   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1251                                    IsOffloadEntry, CodeGen);
1252   IsInTTDRegion = false;
1253 }
1254 
1255 void CGOpenMPRuntimeGPU::emitSPMDEntryHeader(
1256     CodeGenFunction &CGF, EntryFunctionState &EST,
1257     const OMPExecutableDirective &D) {
1258   CGBuilderTy &Bld = CGF.Builder;
1259 
1260   // Setup BBs in entry function.
1261   llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
1262   EST.ExitBB = CGF.createBasicBlock(".exit");
1263 
1264   llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true),
1265                          /*RequiresOMPRuntime=*/
1266                          Bld.getInt16(RequiresFullRuntime ? 1 : 0)};
1267   CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1268                           CGM.getModule(), OMPRTL___kmpc_spmd_kernel_init),
1269                       Args);
1270 
1271   if (RequiresFullRuntime) {
1272     // For data sharing, we need to initialize the stack.
1273     CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1274         CGM.getModule(), OMPRTL___kmpc_data_sharing_init_stack_spmd));
1275   }
1276 
1277   CGF.EmitBranch(ExecuteBB);
1278 
1279   CGF.EmitBlock(ExecuteBB);
1280 
1281   IsInTargetMasterThreadRegion = true;
1282 }
1283 
1284 void CGOpenMPRuntimeGPU::emitSPMDEntryFooter(CodeGenFunction &CGF,
1285                                                EntryFunctionState &EST) {
1286   IsInTargetMasterThreadRegion = false;
1287   if (!CGF.HaveInsertPoint())
1288     return;
1289 
1290   if (!EST.ExitBB)
1291     EST.ExitBB = CGF.createBasicBlock(".exit");
1292 
1293   llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
1294   CGF.EmitBranch(OMPDeInitBB);
1295 
1296   CGF.EmitBlock(OMPDeInitBB);
1297   // DeInitialize the OMP state in the runtime; called by all active threads.
1298   llvm::Value *Args[] = {/*RequiresOMPRuntime=*/
1299                          CGF.Builder.getInt16(RequiresFullRuntime ? 1 : 0)};
1300   CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1301                           CGM.getModule(), OMPRTL___kmpc_spmd_kernel_deinit_v2),
1302                       Args);
1303   CGF.EmitBranch(EST.ExitBB);
1304 
1305   CGF.EmitBlock(EST.ExitBB);
1306   EST.ExitBB = nullptr;
1307 }
1308 
1309 // Create a unique global variable to indicate the execution mode of this target
1310 // region. The execution mode is either 'generic', or 'spmd' depending on the
1311 // target directive. This variable is picked up by the offload library to set up
1312 // the device appropriately before kernel launch. If the execution mode is
1313 // 'generic', the runtime reserves one warp for the master, otherwise, all
1314 // warps participate in parallel work.
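// E.g., for a kernel "foo" this emits an i8 global "foo_exec_mode"; callers
// pass Mode == true for SPMD regions, so the value is 0 for SPMD and 1 for
// generic execution.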
1315 static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
1316                                      bool Mode) {
1317   auto *GVMode =
1318       new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
1319                                llvm::GlobalValue::WeakAnyLinkage,
1320                                llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
1321                                Twine(Name, "_exec_mode"));
1322   CGM.addCompilerUsedGlobal(GVMode);
1323 }
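// For example (the kernel name below is hypothetical), a generic-mode region
// outlined as __omp_offloading_xx_foo yields roughly
//   @__omp_offloading_xx_foo_exec_mode = weak constant i8 1
// (0 for SPMD mode), and the global is added to llvm.compiler.used so it
// survives until the offload library reads it at launch time.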
1324 
1325 void CGOpenMPRuntimeGPU::emitWorkerFunction(WorkerFunctionState &WST) {
1326   ASTContext &Ctx = CGM.getContext();
1327 
1328   CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
1329   CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, WST.CGFI, {},
1330                     WST.Loc, WST.Loc);
1331   emitWorkerLoop(CGF, WST);
1332   CGF.FinishFunction();
1333 }
1334 
1335 void CGOpenMPRuntimeGPU::emitWorkerLoop(CodeGenFunction &CGF,
1336                                         WorkerFunctionState &WST) {
1337   //
1338   // The workers enter this loop and wait for parallel work from the master.
1339   // When the master encounters a parallel region it sets up the work + variable
1340   // arguments, and wakes up the workers.  The workers first check to see if
1341   // they are required for the parallel region, i.e., within the # of requested
1342   // parallel threads.  The activated workers load the variable arguments and
1343   // execute the parallel work.
1344   //
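  // Roughly, the state machine emitted below looks like this (sketch only):
  //   do {
  //     barrier();                                  // wait for the master
  //     active = __kmpc_kernel_parallel(&work_fn);
  //     if (work_fn == null) break;                 // master signalled exit
  //     if (active) {
  //       work_fn(/*ParallelLevel=*/0, thread_id);  // run the outlined region
  //       __kmpc_kernel_end_parallel();
  //     }
  //     barrier();                                  // rejoin before next round
  //   } while (true);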
1345 
1346   CGBuilderTy &Bld = CGF.Builder;
1347 
1348   llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
1349   llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
1350   llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
1351   llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
1352   llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
1353   llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
1354 
1355   CGF.EmitBranch(AwaitBB);
1356 
1357   // Workers wait for work from master.
1358   CGF.EmitBlock(AwaitBB);
1359   // Wait for parallel work
1360   syncCTAThreads(CGF);
1361 
1362   Address WorkFn =
1363       CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
1364   Address ExecStatus =
1365       CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
1366   CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
1367   CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));
1368 
1369   // TODO: Optimize runtime initialization and pass in correct value.
1370   llvm::Value *Args[] = {WorkFn.getPointer()};
1371   llvm::Value *Ret =
1372       CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1373                               CGM.getModule(), OMPRTL___kmpc_kernel_parallel),
1374                           Args);
1375   Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);
1376 
1377   // On termination condition (workid == 0), exit loop.
1378   llvm::Value *WorkID = Bld.CreateLoad(WorkFn);
1379   llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate");
1380   Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);
1381 
1382   // Activate requested workers.
1383   CGF.EmitBlock(SelectWorkersBB);
1384   llvm::Value *IsActive =
1385       Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
1386   Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);
1387 
1388   // Signal start of parallel region.
1389   CGF.EmitBlock(ExecuteBB);
1390   // Skip initialization.
1391   setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1392 
1393   // Process work items: outlined parallel functions.
1394   for (llvm::Function *W : Work) {
1395     // Try to match this outlined function.
1396     llvm::Value *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);
1397 
1398     llvm::Value *WorkFnMatch =
1399         Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");
1400 
1401     llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
1402     llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
1403     Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);
1404 
1405     // Execute this outlined function.
1406     CGF.EmitBlock(ExecuteFNBB);
1407 
1408     // Insert call to work function via shared wrapper. The shared
1409     // wrapper takes two arguments:
1410     //   - the parallelism level;
1411     //   - the thread ID;
1412     emitCall(CGF, WST.Loc, W,
1413              {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
1414 
1415     // Go to end of parallel region.
1416     CGF.EmitBranch(TerminateBB);
1417 
1418     CGF.EmitBlock(CheckNextBB);
1419   }
1420   // Default case: call to outlined function through pointer if the target
1421   // region makes a declare target call that may contain an orphaned parallel
1422   // directive.
1423   auto *ParallelFnTy =
1424       llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty},
1425                               /*isVarArg=*/false);
1426   llvm::Value *WorkFnCast =
1427       Bld.CreateBitCast(WorkID, ParallelFnTy->getPointerTo());
1428   // Insert call to work function via shared wrapper. The shared
1429   // wrapper takes two arguments:
1430   //   - the parallelism level;
1431   //   - the thread ID;
1432   emitCall(CGF, WST.Loc, {ParallelFnTy, WorkFnCast},
1433            {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
1434   // Go to end of parallel region.
1435   CGF.EmitBranch(TerminateBB);
1436 
1437   // Signal end of parallel region.
1438   CGF.EmitBlock(TerminateBB);
1439   CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1440                           CGM.getModule(), OMPRTL___kmpc_kernel_end_parallel),
1441                       llvm::None);
1442   CGF.EmitBranch(BarrierBB);
1443 
1444   // All active and inactive workers wait at a barrier after parallel region.
1445   CGF.EmitBlock(BarrierBB);
1446   // Barrier after parallel region.
1447   syncCTAThreads(CGF);
1448   CGF.EmitBranch(AwaitBB);
1449 
1450   // Exit target region.
1451   CGF.EmitBlock(ExitBB);
1452   // Skip initialization.
1453   clearLocThreadIdInsertPt(CGF);
1454 }
1455 
1456 void CGOpenMPRuntimeGPU::createOffloadEntry(llvm::Constant *ID,
1457                                               llvm::Constant *Addr,
1458                                               uint64_t Size, int32_t,
1459                                               llvm::GlobalValue::LinkageTypes) {
1460   // TODO: Add support for global variables on the device after declare target
1461   // support.
1462   if (!isa<llvm::Function>(Addr))
1463     return;
1464   llvm::Module &M = CGM.getModule();
1465   llvm::LLVMContext &Ctx = CGM.getLLVMContext();
1466 
1467   // Get "nvvm.annotations" metadata node
1468   llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
1469 
1470   llvm::Metadata *MDVals[] = {
1471       llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"),
1472       llvm::ConstantAsMetadata::get(
1473           llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
1474   // Append metadata to nvvm.annotations
1475   MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
1476 }
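// For a kernel function @foo (name hypothetical), the annotation added above
// renders roughly as
//   !nvvm.annotations = !{!0}
//   !0 = !{void ()* @foo, !"kernel", i32 1}
// which is what the NVPTX backend uses to recognize kernel entry points.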
1477 
1478 void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
1479     const OMPExecutableDirective &D, StringRef ParentName,
1480     llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
1481     bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
1482   if (!IsOffloadEntry) // Nothing to do.
1483     return;
1484 
1485   assert(!ParentName.empty() && "Invalid target region parent name!");
1486 
1487   bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
1488   if (Mode)
1489     emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1490                    CodeGen);
1491   else
1492     emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1493                       CodeGen);
1494 
1495   setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
1496 }
1497 
1498 namespace {
1499 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
1500 /// Enum for accessing the reserved_2 field of the ident_t struct.
1501 enum ModeFlagsTy : unsigned {
1502   /// Bit set to 1 when in SPMD mode.
1503   KMP_IDENT_SPMD_MODE = 0x01,
1504   /// Bit set to 1 when a simplified runtime is used.
1505   KMP_IDENT_SIMPLE_RT_MODE = 0x02,
1506   LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/KMP_IDENT_SIMPLE_RT_MODE)
1507 };
1508 
1509 /// Special Undefined mode: the combination of non-SPMD mode and the simple runtime.
1510 static const ModeFlagsTy UndefinedMode =
1511     (~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE;
1512 } // anonymous namespace
1513 
1514 unsigned CGOpenMPRuntimeGPU::getDefaultLocationReserved2Flags() const {
1515   switch (getExecutionMode()) {
1516   case EM_SPMD:
1517     if (requiresFullRuntime())
1518       return KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE);
1519     return KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE;
1520   case EM_NonSPMD:
1521     assert(requiresFullRuntime() && "Expected full runtime.");
1522     return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE);
1523   case EM_Unknown:
1524     return UndefinedMode;
1525   }
1526   llvm_unreachable("Unknown flags are requested.");
1527 }
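// Concretely, with the bit values above this returns 0x01 for SPMD mode with
// the full runtime, 0x03 for SPMD mode with the lightweight runtime, 0x00 for
// non-SPMD mode (which always requires the full runtime), and 0x02
// (UndefinedMode) while the execution mode is still unknown.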
1528 
1529 CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
1530     : CGOpenMPRuntime(CGM, "_", "$") {
1531   if (!CGM.getLangOpts().OpenMPIsDevice)
1532     llvm_unreachable("OpenMP NVPTX can only handle device code.");
1533 }
1534 
1535 void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
1536                                               ProcBindKind ProcBind,
1537                                               SourceLocation Loc) {
1538   // Do nothing in case of SPMD mode and L0 parallel.
1539   if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
1540     return;
1541 
1542   CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
1543 }
1544 
1545 void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
1546                                                 llvm::Value *NumThreads,
1547                                                 SourceLocation Loc) {
1548   // Do nothing in case of SPMD mode and L0 parallel.
1549   if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
1550     return;
1551 
1552   CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
1553 }
1554 
1555 void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
1556                                               const Expr *NumTeams,
1557                                               const Expr *ThreadLimit,
1558                                               SourceLocation Loc) {}
1559 
1560 llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
1561     const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1562     OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1563   // Emit target region as a standalone region.
1564   class NVPTXPrePostActionTy : public PrePostActionTy {
1565     bool &IsInParallelRegion;
1566     bool PrevIsInParallelRegion;
1567 
1568   public:
1569     NVPTXPrePostActionTy(bool &IsInParallelRegion)
1570         : IsInParallelRegion(IsInParallelRegion) {}
1571     void Enter(CodeGenFunction &CGF) override {
1572       PrevIsInParallelRegion = IsInParallelRegion;
1573       IsInParallelRegion = true;
1574     }
1575     void Exit(CodeGenFunction &CGF) override {
1576       IsInParallelRegion = PrevIsInParallelRegion;
1577     }
1578   } Action(IsInParallelRegion);
1579   CodeGen.setAction(Action);
1580   bool PrevIsInTTDRegion = IsInTTDRegion;
1581   IsInTTDRegion = false;
1582   bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
1583   IsInTargetMasterThreadRegion = false;
1584   auto *OutlinedFun =
1585       cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
1586           D, ThreadIDVar, InnermostKind, CodeGen));
1587   IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
1588   IsInTTDRegion = PrevIsInTTDRegion;
1589   if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD &&
1590       !IsInParallelRegion) {
1591     llvm::Function *WrapperFun =
1592         createParallelDataSharingWrapper(OutlinedFun, D);
1593     WrapperFunctionsMap[OutlinedFun] = WrapperFun;
1594   }
1595 
1596   return OutlinedFun;
1597 }
1598 
1599 /// Get list of lastprivate variables from the teams distribute ... or
1600 /// teams {distribute ...} directives.
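/// For instance (illustrative only), for
///   #pragma omp target teams distribute parallel for lastprivate(x)
/// this collects the declaration of 'x' so that it can be globalized.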
1601 static void
1602 getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
1603                              llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
1604   assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
1605          "expected teams directive.");
1606   const OMPExecutableDirective *Dir = &D;
1607   if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
1608     if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
1609             Ctx,
1610             D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
1611                 /*IgnoreCaptured=*/true))) {
1612       Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
1613       if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
1614         Dir = nullptr;
1615     }
1616   }
1617   if (!Dir)
1618     return;
1619   for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
1620     for (const Expr *E : C->getVarRefs())
1621       Vars.push_back(getPrivateItem(E));
1622   }
1623 }
1624 
1625 /// Get list of reduction variables from the teams ... directives.
1626 static void
1627 getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
1628                       llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
1629   assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
1630          "expected teams directive.");
1631   for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
1632     for (const Expr *E : C->privates())
1633       Vars.push_back(getPrivateItem(E));
1634   }
1635 }
1636 
1637 llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
1638     const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1639     OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1640   SourceLocation Loc = D.getBeginLoc();
1641 
1642   const RecordDecl *GlobalizedRD = nullptr;
1643   llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
1644   llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
1645   unsigned WarpSize = CGM.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
1646   // Globalize team reduction variables unconditionally in all modes.
1647   if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1648     getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
1649   if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
1650     getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
1651     if (!LastPrivatesReductions.empty()) {
1652       GlobalizedRD = ::buildRecordForGlobalizedVars(
1653           CGM.getContext(), llvm::None, LastPrivatesReductions,
1654           MappedDeclsFields, WarpSize);
1655     }
1656   } else if (!LastPrivatesReductions.empty()) {
1657     assert(!TeamAndReductions.first &&
1658            "Previous team declaration is not expected.");
1659     TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
1660     std::swap(TeamAndReductions.second, LastPrivatesReductions);
1661   }
1662 
1663   // Emit target region as a standalone region.
1664   class NVPTXPrePostActionTy : public PrePostActionTy {
1665     SourceLocation &Loc;
1666     const RecordDecl *GlobalizedRD;
1667     llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1668         &MappedDeclsFields;
1669 
1670   public:
1671     NVPTXPrePostActionTy(
1672         SourceLocation &Loc, const RecordDecl *GlobalizedRD,
1673         llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1674             &MappedDeclsFields)
1675         : Loc(Loc), GlobalizedRD(GlobalizedRD),
1676           MappedDeclsFields(MappedDeclsFields) {}
1677     void Enter(CodeGenFunction &CGF) override {
1678       auto &Rt =
1679           static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1680       if (GlobalizedRD) {
1681         auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
1682         I->getSecond().GlobalRecord = GlobalizedRD;
1683         I->getSecond().MappedParams =
1684             std::make_unique<CodeGenFunction::OMPMapVars>();
1685         DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
1686         for (const auto &Pair : MappedDeclsFields) {
1687           assert(Pair.getFirst()->isCanonicalDecl() &&
1688                  "Expected canonical declaration");
1689           Data.insert(std::make_pair(Pair.getFirst(),
1690                                      MappedVarData(Pair.getSecond(),
1691                                                    /*IsOnePerTeam=*/true)));
1692         }
1693       }
1694       Rt.emitGenericVarsProlog(CGF, Loc);
1695     }
1696     void Exit(CodeGenFunction &CGF) override {
1697       static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
1698           .emitGenericVarsEpilog(CGF);
1699     }
1700   } Action(Loc, GlobalizedRD, MappedDeclsFields);
1701   CodeGen.setAction(Action);
1702   llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1703       D, ThreadIDVar, InnermostKind, CodeGen);
1704 
1705   return OutlinedFun;
1706 }
1707 
1708 void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
1709                                                  SourceLocation Loc,
1710                                                  bool WithSPMDCheck) {
1711   if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
1712       getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1713     return;
1714 
1715   CGBuilderTy &Bld = CGF.Builder;
1716 
1717   const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1718   if (I == FunctionGlobalizedDecls.end())
1719     return;
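  // In outline, the branches below pick one of three strategies (summary only):
  //  * when an SPMD check is required (or the mode is unknown) outside the
  //    top-level target/teams region, a runtime __kmpc_is_spmd_exec_mode test
  //    chooses between no allocation (SPMD) and
  //    __kmpc_data_sharing_coalesced_push_stack (non-SPMD);
  //  * inside the top-level target/teams region, the record is carved out of
  //    the team's statically reserved buffer via __kmpc_get_team_static_memory;
  //  * otherwise the record is pushed onto the data-sharing stack directly.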
1720   if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
1721     QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
1722     QualType SecGlobalRecTy;
1723 
1724     // Recover pointer to this function's global record. The runtime will
1725     // handle the specifics of the allocation of the memory.
1726     // Use actual memory size of the record including the padding
1727     // for alignment purposes.
1728     unsigned Alignment =
1729         CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
1730     unsigned GlobalRecordSize =
1731         CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity();
1732     GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
1733 
1734     llvm::PointerType *GlobalRecPtrTy =
1735         CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo();
1736     llvm::Value *GlobalRecCastAddr;
1737     llvm::Value *IsTTD = nullptr;
1738     if (!IsInTTDRegion &&
1739         (WithSPMDCheck ||
1740          getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
1741       llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
1742       llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd");
1743       llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
1744       if (I->getSecond().SecondaryGlobalRecord.hasValue()) {
1745         llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1746         llvm::Value *ThreadID = getThreadID(CGF, Loc);
1747         llvm::Value *PL = CGF.EmitRuntimeCall(
1748             OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
1749                                                   OMPRTL___kmpc_parallel_level),
1750             {RTLoc, ThreadID});
1751         IsTTD = Bld.CreateIsNull(PL);
1752       }
1753       llvm::Value *IsSPMD = Bld.CreateIsNotNull(
1754           CGF.EmitNounwindRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1755               CGM.getModule(), OMPRTL___kmpc_is_spmd_exec_mode)));
1756       Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB);
1757       // There is no need to emit line number for unconditional branch.
1758       (void)ApplyDebugLocation::CreateEmpty(CGF);
1759       CGF.EmitBlock(SPMDBB);
1760       Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy),
1761                                CharUnits::fromQuantity(Alignment));
1762       CGF.EmitBranch(ExitBB);
1763       // There is no need to emit line number for unconditional branch.
1764       (void)ApplyDebugLocation::CreateEmpty(CGF);
1765       CGF.EmitBlock(NonSPMDBB);
1766       llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize);
1767       if (const RecordDecl *SecGlobalizedVarsRecord =
1768               I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) {
1769         SecGlobalRecTy =
1770             CGM.getContext().getRecordType(SecGlobalizedVarsRecord);
1771 
1772         // Recover pointer to this function's global record. The runtime will
1773         // handle the specifics of the allocation of the memory.
1774         // Use actual memory size of the record including the padding
1775         // for alignment purposes.
1776         unsigned Alignment =
1777             CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity();
1778         unsigned GlobalRecordSize =
1779             CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity();
1780         GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
1781         Size = Bld.CreateSelect(
1782             IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size);
1783       }
1784       // TODO: allow the usage of shared memory to be controlled by
1785       // the user, for now, default to global.
1786       llvm::Value *GlobalRecordSizeArg[] = {
1787           Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
1788       llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
1789           OMPBuilder.getOrCreateRuntimeFunction(
1790               CGM.getModule(), OMPRTL___kmpc_data_sharing_coalesced_push_stack),
1791           GlobalRecordSizeArg);
1792       GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1793           GlobalRecValue, GlobalRecPtrTy);
1794       CGF.EmitBlock(ExitBB);
1795       auto *Phi = Bld.CreatePHI(GlobalRecPtrTy,
1796                                 /*NumReservedValues=*/2, "_select_stack");
1797       Phi->addIncoming(RecPtr.getPointer(), SPMDBB);
1798       Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB);
1799       GlobalRecCastAddr = Phi;
1800       I->getSecond().GlobalRecordAddr = Phi;
1801       I->getSecond().IsInSPMDModeFlag = IsSPMD;
1802     } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) {
1803       assert(GlobalizedRecords.back().Records.size() < 2 &&
1804              "Expected less than 2 globalized records: one for target and one "
1805              "for teams.");
1806       unsigned Offset = 0;
1807       for (const RecordDecl *RD : GlobalizedRecords.back().Records) {
1808         QualType RDTy = CGM.getContext().getRecordType(RD);
1809         unsigned Alignment =
1810             CGM.getContext().getTypeAlignInChars(RDTy).getQuantity();
1811         unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity();
1812         Offset =
1813             llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment);
1814       }
1815       unsigned Alignment =
1816           CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
1817       Offset = llvm::alignTo(Offset, Alignment);
1818       GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord);
1819       ++GlobalizedRecords.back().RegionCounter;
1820       if (GlobalizedRecords.back().Records.size() == 1) {
1821         assert(KernelStaticGlobalized &&
1822                "Kernel static pointer must be initialized already.");
1823         auto *UseSharedMemory = new llvm::GlobalVariable(
1824             CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
1825             llvm::GlobalValue::InternalLinkage, nullptr,
1826             "_openmp_static_kernel$is_shared");
1827         UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
1828         QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
1829             /*DestWidth=*/16, /*Signed=*/0);
1830         llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
1831             Address(UseSharedMemory,
1832                     CGM.getContext().getTypeAlignInChars(Int16Ty)),
1833             /*Volatile=*/false, Int16Ty, Loc);
1834         auto *StaticGlobalized = new llvm::GlobalVariable(
1835             CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false,
1836             llvm::GlobalValue::CommonLinkage, nullptr);
1837         auto *RecSize = new llvm::GlobalVariable(
1838             CGM.getModule(), CGM.SizeTy, /*isConstant=*/true,
1839             llvm::GlobalValue::InternalLinkage, nullptr,
1840             "_openmp_static_kernel$size");
1841         RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
1842         llvm::Value *Ld = CGF.EmitLoadOfScalar(
1843             Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false,
1844             CGM.getContext().getSizeType(), Loc);
1845         llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1846             KernelStaticGlobalized, CGM.VoidPtrPtrTy);
1847         llvm::Value *GlobalRecordSizeArg[] = {
1848             llvm::ConstantInt::get(
1849                 CGM.Int16Ty,
1850                 getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 1 : 0),
1851             StaticGlobalized, Ld, IsInSharedMemory, ResAddr};
1852         CGF.EmitRuntimeCall(
1853             OMPBuilder.getOrCreateRuntimeFunction(
1854                 CGM.getModule(), OMPRTL___kmpc_get_team_static_memory),
1855             GlobalRecordSizeArg);
1856         GlobalizedRecords.back().Buffer = StaticGlobalized;
1857         GlobalizedRecords.back().RecSize = RecSize;
1858         GlobalizedRecords.back().UseSharedMemory = UseSharedMemory;
1859         GlobalizedRecords.back().Loc = Loc;
1860       }
1861       assert(KernelStaticGlobalized && "Global address must be set already.");
1862       Address FrameAddr = CGF.EmitLoadOfPointer(
1863           Address(KernelStaticGlobalized, CGM.getPointerAlign()),
1864           CGM.getContext()
1865               .getPointerType(CGM.getContext().VoidPtrTy)
1866               .castAs<PointerType>());
1867       llvm::Value *GlobalRecValue =
1868           Bld.CreateConstInBoundsGEP(FrameAddr, Offset).getPointer();
1869       I->getSecond().GlobalRecordAddr = GlobalRecValue;
1870       I->getSecond().IsInSPMDModeFlag = nullptr;
1871       GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1872           GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo());
1873     } else {
1874       // TODO: allow the usage of shared memory to be controlled by
1875       // the user, for now, default to global.
1876       bool UseSharedMemory =
1877           IsInTTDRegion && GlobalRecordSize <= SharedMemorySize;
1878       llvm::Value *GlobalRecordSizeArg[] = {
1879           llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
1880           CGF.Builder.getInt16(UseSharedMemory ? 1 : 0)};
1881       llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
1882           OMPBuilder.getOrCreateRuntimeFunction(
1883               CGM.getModule(),
1884               IsInTTDRegion ? OMPRTL___kmpc_data_sharing_push_stack
1885                             : OMPRTL___kmpc_data_sharing_coalesced_push_stack),
1886           GlobalRecordSizeArg);
1887       GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1888           GlobalRecValue, GlobalRecPtrTy);
1889       I->getSecond().GlobalRecordAddr = GlobalRecValue;
1890       I->getSecond().IsInSPMDModeFlag = nullptr;
1891     }
1892     LValue Base =
1893         CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy);
1894 
1895     // Emit the "global alloca" which is a GEP from the global declaration
1896     // record using the pointer returned by the runtime.
1897     LValue SecBase;
1898     decltype(I->getSecond().LocalVarData)::const_iterator SecIt;
1899     if (IsTTD) {
1900       SecIt = I->getSecond().SecondaryLocalVarData->begin();
1901       llvm::PointerType *SecGlobalRecPtrTy =
1902           CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo();
1903       SecBase = CGF.MakeNaturalAlignPointeeAddrLValue(
1904           Bld.CreatePointerBitCastOrAddrSpaceCast(
1905               I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy),
1906           SecGlobalRecTy);
1907     }
1908     for (auto &Rec : I->getSecond().LocalVarData) {
1909       bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1910       llvm::Value *ParValue;
1911       if (EscapedParam) {
1912         const auto *VD = cast<VarDecl>(Rec.first);
1913         LValue ParLVal =
1914             CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
1915         ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
1916       }
1917       LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD);
1918       // Emit VarAddr based on the lane-id if required.
1919       QualType VarTy;
1920       if (Rec.second.IsOnePerTeam) {
1921         VarTy = Rec.second.FD->getType();
1922       } else {
1923         Address Addr = VarAddr.getAddress(CGF);
1924         llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP(
1925             Addr.getElementType(), Addr.getPointer(),
1926             {Bld.getInt32(0), getNVPTXLaneID(CGF)});
1927         VarTy =
1928             Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType();
1929         VarAddr = CGF.MakeAddrLValue(
1930             Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy,
1931             AlignmentSource::Decl);
1932       }
1933       Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
1934       if (!IsInTTDRegion &&
1935           (WithSPMDCheck ||
1936            getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
1937         assert(I->getSecond().IsInSPMDModeFlag &&
1938                "Expected unknown execution mode or required SPMD check.");
1939         if (IsTTD) {
1940           assert(SecIt->second.IsOnePerTeam &&
1941                  "Secondary glob data must be one per team.");
1942           LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD);
1943           VarAddr.setAddress(
1944               Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(CGF),
1945                                        VarAddr.getPointer(CGF)),
1946                       VarAddr.getAlignment()));
1947           Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
1948         }
1949         Address GlobalPtr = Rec.second.PrivateAddr;
1950         Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName());
1951         Rec.second.PrivateAddr = Address(
1952             Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag,
1953                              LocalAddr.getPointer(), GlobalPtr.getPointer()),
1954             LocalAddr.getAlignment());
1955       }
1956       if (EscapedParam) {
1957         const auto *VD = cast<VarDecl>(Rec.first);
1958         CGF.EmitStoreOfScalar(ParValue, VarAddr);
1959         I->getSecond().MappedParams->setVarAddr(CGF, VD,
1960                                                 VarAddr.getAddress(CGF));
1961       }
1962       if (IsTTD)
1963         ++SecIt;
1964     }
1965   }
1966   for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
1967     // Recover pointer to this function's global record. The runtime will
1968     // handle the specifics of the allocation of the memory.
1969     // Use actual memory size of the record including the padding
1970     // for alignment purposes.
1971     CGBuilderTy &Bld = CGF.Builder;
1972     llvm::Value *Size = CGF.getTypeSize(VD->getType());
1973     CharUnits Align = CGM.getContext().getDeclAlign(VD);
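    // Round the size up to a multiple of the declared alignment, i.e.
    // Size = ((Size + Align - 1) / Align) * Align; e.g. a 10-byte type with
    // 8-byte alignment reserves 16 bytes.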
1974     Size = Bld.CreateNUWAdd(
1975         Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
1976     llvm::Value *AlignVal =
1977         llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
1978     Size = Bld.CreateUDiv(Size, AlignVal);
1979     Size = Bld.CreateNUWMul(Size, AlignVal);
1980     // TODO: allow the usage of shared memory to be controlled by
1981     // the user, for now, default to global.
1982     llvm::Value *GlobalRecordSizeArg[] = {
1983         Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
1984     llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
1985         OMPBuilder.getOrCreateRuntimeFunction(
1986             CGM.getModule(), OMPRTL___kmpc_data_sharing_coalesced_push_stack),
1987         GlobalRecordSizeArg);
1988     llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1989         GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo());
1990     LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(),
1991                                      CGM.getContext().getDeclAlign(VD),
1992                                      AlignmentSource::Decl);
1993     I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
1994                                             Base.getAddress(CGF));
1995     I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue);
1996   }
1997   I->getSecond().MappedParams->apply(CGF);
1998 }
1999 
2000 void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
2001                                                  bool WithSPMDCheck) {
2002   if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
2003       getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
2004     return;
2005 
2006   const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
2007   if (I != FunctionGlobalizedDecls.end()) {
2008     I->getSecond().MappedParams->restore(CGF);
2009     if (!CGF.HaveInsertPoint())
2010       return;
2011     for (llvm::Value *Addr :
2012          llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
2013       CGF.EmitRuntimeCall(
2014           OMPBuilder.getOrCreateRuntimeFunction(
2015               CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
2016           Addr);
2017     }
2018     if (I->getSecond().GlobalRecordAddr) {
2019       if (!IsInTTDRegion &&
2020           (WithSPMDCheck ||
2021            getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
2022         CGBuilderTy &Bld = CGF.Builder;
2023         llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2024         llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
2025         Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB);
2026         // There is no need to emit line number for unconditional branch.
2027         (void)ApplyDebugLocation::CreateEmpty(CGF);
2028         CGF.EmitBlock(NonSPMDBB);
2029         CGF.EmitRuntimeCall(
2030             OMPBuilder.getOrCreateRuntimeFunction(
2031                 CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
2032             CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr));
2033         CGF.EmitBlock(ExitBB);
2034       } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) {
2035         assert(GlobalizedRecords.back().RegionCounter > 0 &&
2036                "region counter must be > 0.");
2037         --GlobalizedRecords.back().RegionCounter;
2038         // Emit the restore function only in the target region.
2039         if (GlobalizedRecords.back().RegionCounter == 0) {
2040           QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
2041               /*DestWidth=*/16, /*Signed=*/0);
2042           llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
2043               Address(GlobalizedRecords.back().UseSharedMemory,
2044                       CGM.getContext().getTypeAlignInChars(Int16Ty)),
2045               /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc);
2046           llvm::Value *Args[] = {
2047               llvm::ConstantInt::get(
2048                   CGM.Int16Ty,
2049                   getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 1 : 0),
2050               IsInSharedMemory};
2051           CGF.EmitRuntimeCall(
2052               OMPBuilder.getOrCreateRuntimeFunction(
2053                   CGM.getModule(), OMPRTL___kmpc_restore_team_static_memory),
2054               Args);
2055         }
2056       } else {
2057         CGF.EmitRuntimeCall(
2058             OMPBuilder.getOrCreateRuntimeFunction(
2059                 CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
2060             I->getSecond().GlobalRecordAddr);
2061       }
2062     }
2063   }
2064 }
2065 
2066 void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
2067                                          const OMPExecutableDirective &D,
2068                                          SourceLocation Loc,
2069                                          llvm::Function *OutlinedFn,
2070                                          ArrayRef<llvm::Value *> CapturedVars) {
2071   if (!CGF.HaveInsertPoint())
2072     return;
2073 
2074   Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
2075                                                       /*Name=*/".zero.addr");
2076   CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2077   llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2078   OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
2079   OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2080   OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2081   emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
2082 }
2083 
2084 void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
2085                                           SourceLocation Loc,
2086                                           llvm::Function *OutlinedFn,
2087                                           ArrayRef<llvm::Value *> CapturedVars,
2088                                           const Expr *IfCond) {
2089   if (!CGF.HaveInsertPoint())
2090     return;
2091 
2092   auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars,
2093                         IfCond](CodeGenFunction &CGF, PrePostActionTy &Action) {
2094     CGBuilderTy &Bld = CGF.Builder;
2095     llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
2096     llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
2097     if (WFn) {
2098       ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
2099       // Remember for post-processing in worker loop.
2100       Work.emplace_back(WFn);
2101     }
2102     llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);
2103 
2104     // Create a private scope that will globalize the arguments
2105     // passed from the outside of the target region.
2106     // TODO: Is that needed?
2107     CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
2108 
2109     Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
2110         llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
2111         "captured_vars_addrs");
2112     // There's something to share.
2113     if (!CapturedVars.empty()) {
2114       // Prepare for parallel region. Indicate the outlined function.
2115       ASTContext &Ctx = CGF.getContext();
2116       unsigned Idx = 0;
2117       for (llvm::Value *V : CapturedVars) {
2118         Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
2119         llvm::Value *PtrV;
2120         if (V->getType()->isIntegerTy())
2121           PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
2122         else
2123           PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
2124         CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
2125                               Ctx.getPointerType(Ctx.VoidPtrTy));
2126         ++Idx;
2127       }
2128     }
2129 
2130     llvm::Value *IfCondVal = nullptr;
2131     if (IfCond)
2132       IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
2133                                     /* isSigned */ false);
2134     else
2135       IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);
2136 
2137     assert(IfCondVal && "Expected a value");
2138     llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
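    // Arguments for __kmpc_parallel_51: the source location, the calling
    // thread's id, the evaluated 'if' condition (1 when the clause is absent),
    // two -1 placeholders (num_threads and proc_bind left unspecified), the
    // outlined function, the wrapper id consumed by the worker loop, and the
    // array of captured-variable addresses together with its element count.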
2139     llvm::Value *Args[] = {
2140         RTLoc,
2141         getThreadID(CGF, Loc),
2142         IfCondVal,
2143         llvm::ConstantInt::get(CGF.Int32Ty, -1),
2144         llvm::ConstantInt::get(CGF.Int32Ty, -1),
2145         FnPtr,
2146         ID,
2147         Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(),
2148                                    CGF.VoidPtrPtrTy),
2149         llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
2150     CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
2151                             CGM.getModule(), OMPRTL___kmpc_parallel_51),
2152                         Args);
2153   };
2154 
2155   RegionCodeGenTy RCG(ParallelGen);
2156   RCG(CGF);
2157 }
2158 
2159 void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
2160   // Always emit simple barriers!
2161   if (!CGF.HaveInsertPoint())
2162     return;
2163   // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
2164   // This function does not use parameters, so we can emit just default values.
2165   llvm::Value *Args[] = {
2166       llvm::ConstantPointerNull::get(
2167           cast<llvm::PointerType>(getIdentTyPointerTy())),
2168       llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
2169   CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
2170                           CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
2171                       Args);
2172 }
2173 
2174 void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
2175                                            SourceLocation Loc,
2176                                            OpenMPDirectiveKind Kind, bool,
2177                                            bool) {
2178   // Always emit simple barriers!
2179   if (!CGF.HaveInsertPoint())
2180     return;
2181   // Build call __kmpc_cancel_barrier(loc, thread_id);
2182   unsigned Flags = getDefaultFlagsForBarriers(Kind);
2183   llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
2184                          getThreadID(CGF, Loc)};
2185 
2186   CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
2187                           CGM.getModule(), OMPRTL___kmpc_barrier),
2188                       Args);
2189 }
2190 
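// On the device a critical region is lowered to a counter loop so the threads
// of a team enter the body one at a time; roughly (sketch only):
//   for (counter = 0; counter < team_width; ++counter) {
//     if (thread_id == counter)
//       <critical body>;
//     __kmpc_syncwarp(mask);  // reconverge before the next iteration
//   }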
2191 void CGOpenMPRuntimeGPU::emitCriticalRegion(
2192     CodeGenFunction &CGF, StringRef CriticalName,
2193     const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
2194     const Expr *Hint) {
2195   llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
2196   llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
2197   llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
2198   llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
2199   llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
2200 
2201   auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
2202 
2203   // Get the mask of active threads in the warp.
2204   llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
2205       CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
2206   // Fetch team-local id of the thread.
2207   llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
2208 
2209   // Get the width of the team.
2210   llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
2211 
2212   // Initialize the counter variable for the loop.
2213   QualType Int32Ty =
2214       CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
2215   Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
2216   LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
2217   CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
2218                         /*isInit=*/true);
2219 
2220   // Block checks if loop counter exceeds upper bound.
2221   CGF.EmitBlock(LoopBB);
2222   llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2223   llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
2224   CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
2225 
2226   // Block tests which single thread should execute the region, and which
2227   // threads should go straight to the synchronisation point.
2228   CGF.EmitBlock(TestBB);
2229   CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2230   llvm::Value *CmpThreadToCounter =
2231       CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
2232   CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
2233 
2234   // Block emits the body of the critical region.
2235   CGF.EmitBlock(BodyBB);
2236 
2237   // Output the critical statement.
2238   CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
2239                                       Hint);
2240 
2241   // After the body surrounded by the critical region, the single executing
2242   // thread will jump to the synchronisation point.
2243   // Block waits for all threads in current team to finish then increments the
2244   // counter variable and returns to the loop.
2245   CGF.EmitBlock(SyncBB);
2246   // Reconverge active threads in the warp.
2247   (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
2248                                 CGM.getModule(), OMPRTL___kmpc_syncwarp),
2249                             Mask);
2250 
2251   llvm::Value *IncCounterVal =
2252       CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
2253   CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
2254   CGF.EmitBranch(LoopBB);
2255 
2256   // Block that is reached when all threads in the team complete the region.
2257   CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2258 }
2259 
2260 /// Cast value to the specified type.
2261 static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
2262                                     QualType ValTy, QualType CastTy,
2263                                     SourceLocation Loc) {
2264   assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
2265          "Cast type must be sized.");
2266   assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
2267          "Val type must be sized.");
2268   llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
2269   if (ValTy == CastTy)
2270     return Val;
2271   if (CGF.getContext().getTypeSizeInChars(ValTy) ==
2272       CGF.getContext().getTypeSizeInChars(CastTy))
2273     return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
2274   if (CastTy->isIntegerType() && ValTy->isIntegerType())
2275     return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
2276                                      CastTy->hasSignedIntegerRepresentation());
2277   Address CastItem = CGF.CreateMemTemp(CastTy);
2278   Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2279       CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
2280   CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
2281                         LValueBaseInfo(AlignmentSource::Type),
2282                         TBAAAccessInfo());
2283   return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
2284                               LValueBaseInfo(AlignmentSource::Type),
2285                               TBAAAccessInfo());
2286 }
2287 
2288 /// This function creates calls to one of two shuffle functions to copy
2289 /// variables between lanes in a warp.
2290 static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
2291                                                  llvm::Value *Elem,
2292                                                  QualType ElemType,
2293                                                  llvm::Value *Offset,
2294                                                  SourceLocation Loc) {
2295   CodeGenModule &CGM = CGF.CGM;
2296   CGBuilderTy &Bld = CGF.Builder;
2297   CGOpenMPRuntimeGPU &RT =
2298       *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
2299   llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();
2300 
2301   CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2302   assert(Size.getQuantity() <= 8 &&
2303          "Unsupported bitwidth in shuffle instruction.");
2304 
2305   RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
2306                                   ? OMPRTL___kmpc_shuffle_int32
2307                                   : OMPRTL___kmpc_shuffle_int64;
2308 
2309   // Cast all types to 32- or 64-bit values before calling shuffle routines.
2310   QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
2311       Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
2312   llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
2313   llvm::Value *WarpSize =
2314       Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
2315 
2316   llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
2317       OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
2318       {ElemCast, Offset, WarpSize});
2319 
2320   return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
2321 }
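// The emitted call has the shape (illustrative, shown for a 32-bit element):
//   %shuffled = call i32 @__kmpc_shuffle_int32(i32 %elem, i16 %offset, i16 %warpsize)
// with the 64-bit variant used for elements wider than four bytes.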
2322 
2323 static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
2324                             Address DestAddr, QualType ElemType,
2325                             llvm::Value *Offset, SourceLocation Loc) {
2326   CGBuilderTy &Bld = CGF.Builder;
2327 
2328   CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2329   // Create the loop over the big sized data.
2330   // ptr = (void*)Elem;
2331   // ptrEnd = (void*) Elem + 1;
2332   // Step = 8;
2333   // while (ptr + Step < ptrEnd)
2334   //   shuffle((int64_t)*ptr);
2335   // Step = 4;
2336   // while (ptr + Step < ptrEnd)
2337   //   shuffle((int32_t)*ptr);
2338   // ...
2339   Address ElemPtr = DestAddr;
2340   Address Ptr = SrcAddr;
2341   Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
2342       Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy);
2343   for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
2344     if (Size < CharUnits::fromQuantity(IntSize))
2345       continue;
2346     QualType IntType = CGF.getContext().getIntTypeForBitwidth(
2347         CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
2348         /*Signed=*/1);
2349     llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
2350     Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo());
2351     ElemPtr =
2352         Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo());
2353     if (Size.getQuantity() / IntSize > 1) {
2354       llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
2355       llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
2356       llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
2357       llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
2358       CGF.EmitBlock(PreCondBB);
2359       llvm::PHINode *PhiSrc =
2360           Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
2361       PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
2362       llvm::PHINode *PhiDest =
2363           Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
2364       PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
2365       Ptr = Address(PhiSrc, Ptr.getAlignment());
2366       ElemPtr = Address(PhiDest, ElemPtr.getAlignment());
2367       llvm::Value *PtrDiff = Bld.CreatePtrDiff(
2368           PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast(
2369                                    Ptr.getPointer(), CGF.VoidPtrTy));
2370       Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
2371                        ThenBB, ExitBB);
2372       CGF.EmitBlock(ThenBB);
2373       llvm::Value *Res = createRuntimeShuffleFunction(
2374           CGF,
2375           CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
2376                                LValueBaseInfo(AlignmentSource::Type),
2377                                TBAAAccessInfo()),
2378           IntType, Offset, Loc);
2379       CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
2380                             LValueBaseInfo(AlignmentSource::Type),
2381                             TBAAAccessInfo());
2382       Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
2383       Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
2384       PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
2385       PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
2386       CGF.EmitBranch(PreCondBB);
2387       CGF.EmitBlock(ExitBB);
2388     } else {
2389       llvm::Value *Res = createRuntimeShuffleFunction(
2390           CGF,
2391           CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
2392                                LValueBaseInfo(AlignmentSource::Type),
2393                                TBAAAccessInfo()),
2394           IntType, Offset, Loc);
2395       CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
2396                             LValueBaseInfo(AlignmentSource::Type),
2397                             TBAAAccessInfo());
2398       Ptr = Bld.CreateConstGEP(Ptr, 1);
2399       ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
2400     }
2401     Size = Size % IntSize;
2402   }
2403 }
2404 
2405 namespace {
2406 enum CopyAction : unsigned {
2407   // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
2408   // the warp using shuffle instructions.
2409   RemoteLaneToThread,
2410   // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
2411   ThreadCopy,
2412   // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
2413   ThreadToScratchpad,
2414   // ScratchpadToThread: Copy from a scratchpad array in global memory
2415   // containing team-reduced data to a thread's stack.
2416   ScratchpadToThread,
2417 };
2418 } // namespace
2419 
2420 struct CopyOptionsTy {
2421   llvm::Value *RemoteLaneOffset;
2422   llvm::Value *ScratchpadIndex;
2423   llvm::Value *ScratchpadWidth;
2424 };
2425 
2426 /// Emit instructions to copy a Reduce list, which contains partially
2427 /// aggregated values, in the specified direction.
2428 static void emitReductionListCopy(
2429     CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
2430     ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
2431     CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
2432 
2433   CodeGenModule &CGM = CGF.CGM;
2434   ASTContext &C = CGM.getContext();
2435   CGBuilderTy &Bld = CGF.Builder;
2436 
2437   llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2438   llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
2439   llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
2440 
2441   // Iterate, element by element, through the source Reduce list and
2442   // make a copy.
2443   unsigned Idx = 0;
2444   unsigned Size = Privates.size();
2445   for (const Expr *Private : Privates) {
2446     Address SrcElementAddr = Address::invalid();
2447     Address DestElementAddr = Address::invalid();
2448     Address DestElementPtrAddr = Address::invalid();
2449     // Should we shuffle in an element from a remote lane?
2450     bool ShuffleInElement = false;
2451     // Set to true to update the pointer in the dest Reduce list to a
2452     // newly created element.
2453     bool UpdateDestListPtr = false;
2454     // Increment the src or dest pointer to the scratchpad, for each
2455     // new element.
2456     bool IncrScratchpadSrc = false;
2457     bool IncrScratchpadDest = false;
2458 
2459     switch (Action) {
2460     case RemoteLaneToThread: {
2461       // Step 1.1: Get the address for the src element in the Reduce list.
2462       Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
2463       SrcElementAddr = CGF.EmitLoadOfPointer(
2464           SrcElementPtrAddr,
2465           C.getPointerType(Private->getType())->castAs<PointerType>());
2466 
2467       // Step 1.2: Create a temporary to store the element in the destination
2468       // Reduce list.
2469       DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
2470       DestElementAddr =
2471           CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
2472       ShuffleInElement = true;
2473       UpdateDestListPtr = true;
2474       break;
2475     }
2476     case ThreadCopy: {
2477       // Step 1.1: Get the address for the src element in the Reduce list.
2478       Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
2479       SrcElementAddr = CGF.EmitLoadOfPointer(
2480           SrcElementPtrAddr,
2481           C.getPointerType(Private->getType())->castAs<PointerType>());
2482 
2483       // Step 1.2: Get the address for dest element.  The destination
2484       // element has already been created on the thread's stack.
2485       DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
2486       DestElementAddr = CGF.EmitLoadOfPointer(
2487           DestElementPtrAddr,
2488           C.getPointerType(Private->getType())->castAs<PointerType>());
2489       break;
2490     }
2491     case ThreadToScratchpad: {
2492       // Step 1.1: Get the address for the src element in the Reduce list.
2493       Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
2494       SrcElementAddr = CGF.EmitLoadOfPointer(
2495           SrcElementPtrAddr,
2496           C.getPointerType(Private->getType())->castAs<PointerType>());
2497 
2498       // Step 1.2: Get the address for dest element:
2499       // address = base + index * ElementSizeInChars.
2500       llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
2501       llvm::Value *CurrentOffset =
2502           Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
2503       llvm::Value *ScratchPadElemAbsolutePtrVal =
2504           Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
2505       ScratchPadElemAbsolutePtrVal =
2506           Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
2507       DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
2508                                 C.getTypeAlignInChars(Private->getType()));
2509       IncrScratchpadDest = true;
2510       break;
2511     }
2512     case ScratchpadToThread: {
2513       // Step 1.1: Get the address for the src element in the scratchpad.
2514       // address = base + index * ElementSizeInChars.
2515       llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
2516       llvm::Value *CurrentOffset =
2517           Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
2518       llvm::Value *ScratchPadElemAbsolutePtrVal =
2519           Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
2520       ScratchPadElemAbsolutePtrVal =
2521           Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
2522       SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
2523                                C.getTypeAlignInChars(Private->getType()));
2524       IncrScratchpadSrc = true;
2525 
2526       // Step 1.2: Create a temporary to store the element in the destination
2527       // Reduce list.
2528       DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
2529       DestElementAddr =
2530           CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
2531       UpdateDestListPtr = true;
2532       break;
2533     }
2534     }
2535 
2536     // Regardless of the source and destination of the copy, the src
2537     // element is loaded, as this is required for all copy directions.
2538     SrcElementAddr = Bld.CreateElementBitCast(
2539         SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
2540     DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
2541                                                SrcElementAddr.getElementType());
2542 
2543     // Now that all active lanes have read the element in the
2544     // Reduce list, shuffle over the value from the remote lane.
2545     if (ShuffleInElement) {
2546       shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
2547                       RemoteLaneOffset, Private->getExprLoc());
2548     } else {
2549       switch (CGF.getEvaluationKind(Private->getType())) {
2550       case TEK_Scalar: {
2551         llvm::Value *Elem = CGF.EmitLoadOfScalar(
2552             SrcElementAddr, /*Volatile=*/false, Private->getType(),
2553             Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
2554             TBAAAccessInfo());
2555         // Store the source element value to the dest element address.
2556         CGF.EmitStoreOfScalar(
2557             Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
2558             LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
2559         break;
2560       }
2561       case TEK_Complex: {
2562         CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
2563             CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
2564             Private->getExprLoc());
2565         CGF.EmitStoreOfComplex(
2566             Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
2567             /*isInit=*/false);
2568         break;
2569       }
2570       case TEK_Aggregate:
2571         CGF.EmitAggregateCopy(
2572             CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
2573             CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
2574             Private->getType(), AggValueSlot::DoesNotOverlap);
2575         break;
2576       }
2577     }
2578 
2579     // Step 3.1: Modify reference in dest Reduce list as needed.
2580     // Modifying the reference in Reduce list to point to the newly
2581     // created element.  The element is live in the current function
2582     // scope and that of functions it invokes (i.e., reduce_function).
2583     // RemoteReduceData[i] = (void*)&RemoteElem
2584     if (UpdateDestListPtr) {
2585       CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
2586                                 DestElementAddr.getPointer(), CGF.VoidPtrTy),
2587                             DestElementPtrAddr, /*Volatile=*/false,
2588                             C.VoidPtrTy);
2589     }
2590 
2591     // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
2592     // address of the next element in scratchpad memory, unless we're currently
2593     // processing the last one.  Memory alignment is also taken care of here.
2594     if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
2595       llvm::Value *ScratchpadBasePtr =
2596           IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
2597       llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
2598       ScratchpadBasePtr = Bld.CreateNUWAdd(
2599           ScratchpadBasePtr,
2600           Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
2601 
2602       // Take care of global memory alignment for performance
2603       ScratchpadBasePtr = Bld.CreateNUWSub(
2604           ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
2605       ScratchpadBasePtr = Bld.CreateUDiv(
2606           ScratchpadBasePtr,
2607           llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
2608       ScratchpadBasePtr = Bld.CreateNUWAdd(
2609           ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
2610       ScratchpadBasePtr = Bld.CreateNUWMul(
2611           ScratchpadBasePtr,
2612           llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
2613 
2614       if (IncrScratchpadDest)
2615         DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
2616       else /* IncrScratchpadSrc = true */
2617         SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
2618     }
2619 
2620     ++Idx;
2621   }
2622 }
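// A minimal sketch of what the copy above amounts to for a two-element Reduce
// list in the RemoteLaneToThread case, assuming both elements are scalars
// (names are illustrative, not emitted identifiers):
//
//   void *remote_list[2];
//   T0 tmp0 = shuffle(*(T0 *)local_list[0], RemoteLaneOffset);
//   T1 tmp1 = shuffle(*(T1 *)local_list[1], RemoteLaneOffset);
//   remote_list[0] = (void *)&tmp0;  // UpdateDestListPtr
//   remote_list[1] = (void *)&tmp1;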
2623 
2624 /// This function emits a helper that gathers Reduce lists from the first
2625 /// lane of every active warp to lanes in the first warp.
2626 ///
2627 /// void inter_warp_copy_func(void* reduce_data, num_warps)
2628 ///   shared smem[warp_size];
2629 ///   For all data entries D in reduce_data:
2630 ///     sync
2631 ///     If (I am the first lane in each warp)
2632 ///       Copy my local D to smem[warp_id]
2633 ///     sync
2634 ///     if (I am the first warp)
2635 ///       Copy smem[thread_id] to my local D
2636 static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
2637                                               ArrayRef<const Expr *> Privates,
2638                                               QualType ReductionArrayTy,
2639                                               SourceLocation Loc) {
2640   ASTContext &C = CGM.getContext();
2641   llvm::Module &M = CGM.getModule();
2642 
2643   // ReduceList: thread local Reduce list.
2644   // At the stage of the computation when this function is called, partially
2645   // aggregated values reside in the first lane of every active warp.
2646   ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2647                                   C.VoidPtrTy, ImplicitParamDecl::Other);
2648   // NumWarps: number of warps active in the parallel region.  This could
2649   // be smaller than 32 (max warps in a CTA) for partial block reduction.
2650   ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2651                                 C.getIntTypeForBitwidth(32, /* Signed */ true),
2652                                 ImplicitParamDecl::Other);
2653   FunctionArgList Args;
2654   Args.push_back(&ReduceListArg);
2655   Args.push_back(&NumWarpsArg);
2656 
2657   const CGFunctionInfo &CGFI =
2658       CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2659   auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
2660                                     llvm::GlobalValue::InternalLinkage,
2661                                     "_omp_reduction_inter_warp_copy_func", &M);
2662   CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2663   Fn->setDoesNotRecurse();
2664   CodeGenFunction CGF(CGM);
2665   CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2666 
2667   CGBuilderTy &Bld = CGF.Builder;
2668 
2669   // This array is used as a medium to transfer, one reduce element at a time,
2670   // the data from the first lane of every warp to lanes in the first warp
2671   // in order to perform the final step of a reduction in a parallel region
2672   // (reduction across warps).  The array is placed in NVPTX __shared__ memory
2673   // for reduced latency, as well as to have a distinct copy for concurrently
2674   // executing target regions.  The array is declared with common linkage so
2675   // as to be shared across compilation units.
2676   StringRef TransferMediumName =
2677       "__openmp_nvptx_data_transfer_temporary_storage";
2678   llvm::GlobalVariable *TransferMedium =
2679       M.getGlobalVariable(TransferMediumName);
2680   unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
2681   if (!TransferMedium) {
2682     auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
2683     unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
2684     TransferMedium = new llvm::GlobalVariable(
2685         M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
2686         llvm::UndefValue::get(Ty), TransferMediumName,
2687         /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
2688         SharedAddressSpace);
2689     CGM.addCompilerUsedGlobal(TransferMedium);
2690   }
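  // In CUDA terms, the global created above corresponds roughly to the
  // following declaration (an illustration, not emitted source):
  //
  //   __shared__ int32_t
  //       __openmp_nvptx_data_transfer_temporary_storage[WARP_SIZE];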
2691 
2692   auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
2693   // Get the CUDA thread id of the current OpenMP thread on the GPU.
2694   llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
2695   // nvptx_lane_id = nvptx_id % warpsize
2696   llvm::Value *LaneID = getNVPTXLaneID(CGF);
2697   // nvptx_warp_id = nvptx_id / warpsize
2698   llvm::Value *WarpID = getNVPTXWarpID(CGF);
2699 
2700   Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2701   Address LocalReduceList(
2702       Bld.CreatePointerBitCastOrAddrSpaceCast(
2703           CGF.EmitLoadOfScalar(
2704               AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
2705               LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
2706           CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2707       CGF.getPointerAlign());
2708 
2709   unsigned Idx = 0;
2710   for (const Expr *Private : Privates) {
2711     //
2712     // Warp master copies reduce element to transfer medium in __shared__
2713     // memory.
2714     //
2715     unsigned RealTySize =
2716         C.getTypeSizeInChars(Private->getType())
2717             .alignTo(C.getTypeAlignInChars(Private->getType()))
2718             .getQuantity();
2719     for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
2720       unsigned NumIters = RealTySize / TySize;
2721       if (NumIters == 0)
2722         continue;
2723       QualType CType = C.getIntTypeForBitwidth(
2724           C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
2725       llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
2726       CharUnits Align = CharUnits::fromQuantity(TySize);
2727       llvm::Value *Cnt = nullptr;
2728       Address CntAddr = Address::invalid();
2729       llvm::BasicBlock *PrecondBB = nullptr;
2730       llvm::BasicBlock *ExitBB = nullptr;
2731       if (NumIters > 1) {
2732         CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
2733         CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
2734                               /*Volatile=*/false, C.IntTy);
2735         PrecondBB = CGF.createBasicBlock("precond");
2736         ExitBB = CGF.createBasicBlock("exit");
2737         llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
2738         // There is no need to emit line number for unconditional branch.
2739         (void)ApplyDebugLocation::CreateEmpty(CGF);
2740         CGF.EmitBlock(PrecondBB);
2741         Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
2742         llvm::Value *Cmp =
2743             Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
2744         Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
2745         CGF.EmitBlock(BodyBB);
2746       }
2747       // kmpc_barrier.
2748       CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
2749                                              /*EmitChecks=*/false,
2750                                              /*ForceSimpleCall=*/true);
2751       llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2752       llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2753       llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2754 
2755       // if (lane_id == 0)
2756       llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
2757       Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2758       CGF.EmitBlock(ThenBB);
2759 
2760       // Reduce element = LocalReduceList[i]
2761       Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2762       llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2763           ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2764       // elemptr = ((CopyType*)(elemptrptr)) + I
2765       Address ElemPtr = Address(ElemPtrPtr, Align);
2766       ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
2767       if (NumIters > 1) {
2768         ElemPtr = Address(Bld.CreateGEP(ElemPtr.getPointer(), Cnt),
2769                           ElemPtr.getAlignment());
2770       }
2771 
2772       // Get pointer to location in transfer medium.
2773       // MediumPtr = &medium[warp_id]
2774       llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
2775           TransferMedium->getValueType(), TransferMedium,
2776           {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
2777       Address MediumPtr(MediumPtrVal, Align);
2778       // Casting to actual data type.
2779       // MediumPtr = (CopyType*)MediumPtrAddr;
2780       MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType);
2781 
2782       // elem = *elemptr
2783       // *MediumPtr = elem
2784       llvm::Value *Elem = CGF.EmitLoadOfScalar(
2785           ElemPtr, /*Volatile=*/false, CType, Loc,
2786           LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
2787       // Store the source element value to the dest element address.
2788       CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
2789                             LValueBaseInfo(AlignmentSource::Type),
2790                             TBAAAccessInfo());
2791 
2792       Bld.CreateBr(MergeBB);
2793 
2794       CGF.EmitBlock(ElseBB);
2795       Bld.CreateBr(MergeBB);
2796 
2797       CGF.EmitBlock(MergeBB);
2798 
2799       // kmpc_barrier.
2800       CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
2801                                              /*EmitChecks=*/false,
2802                                              /*ForceSimpleCall=*/true);
2803 
2804       //
2805       // Warp 0 copies reduce element from transfer medium.
2806       //
2807       llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
2808       llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
2809       llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
2810 
2811       Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
2812       llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
2813           AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
2814 
2815       // Up to 32 threads in warp 0 are active.
2816       llvm::Value *IsActiveThread =
2817           Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
2818       Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2819 
2820       CGF.EmitBlock(W0ThenBB);
2821 
2822       // SrcMediumPtr = &medium[tid]
2823       llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
2824           TransferMedium->getValueType(), TransferMedium,
2825           {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
2826       Address SrcMediumPtr(SrcMediumPtrVal, Align);
2827       // SrcMediumVal = *SrcMediumPtr;
2828       SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType);
2829 
2830       // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2831       Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2832       llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
2833           TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
2834       Address TargetElemPtr = Address(TargetElemPtrVal, Align);
2835       TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
2836       if (NumIters > 1) {
2837         TargetElemPtr = Address(Bld.CreateGEP(TargetElemPtr.getPointer(), Cnt),
2838                                 TargetElemPtr.getAlignment());
2839       }
2840 
2841       // *TargetElemPtr = SrcMediumVal;
2842       llvm::Value *SrcMediumValue =
2843           CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
2844       CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
2845                             CType);
2846       Bld.CreateBr(W0MergeBB);
2847 
2848       CGF.EmitBlock(W0ElseBB);
2849       Bld.CreateBr(W0MergeBB);
2850 
2851       CGF.EmitBlock(W0MergeBB);
2852 
2853       if (NumIters > 1) {
2854         Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
2855         CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
2856         CGF.EmitBranch(PrecondBB);
2857         (void)ApplyDebugLocation::CreateEmpty(CGF);
2858         CGF.EmitBlock(ExitBB);
2859       }
2860       RealTySize %= TySize;
2861     }
2862     ++Idx;
2863   }
2864 
2865   CGF.FinishFunction();
2866   return Fn;
2867 }
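// Rough shape of the helper generated above for a single 8-byte reduce
// element, which the inner loop copies as two 4-byte chunks (a sketch under
// the assumptions of the code above, not the literal output):
//
//   for (cnt = 0; cnt < 2; ++cnt) {
//     barrier();
//     if (lane_id == 0)                    // warp master
//       medium[warp_id] = ((int32_t *)elem)[cnt];
//     barrier();
//     if (thread_id < num_warps)           // lanes of the first warp gather
//       ((int32_t *)elem)[cnt] = medium[thread_id];
//   }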
2868 
2869 /// Emit a helper that reduces data across two OpenMP threads (lanes)
2870 /// in the same warp.  It uses shuffle instructions to copy over data from
2871 /// a remote lane's stack.  The reduction algorithm performed is specified
2872 /// by the fourth parameter.
2873 ///
2874 /// Algorithm Versions.
2875 /// Full Warp Reduce (argument value 0):
2876 ///   This algorithm assumes that all 32 lanes are active and gathers
2877 ///   data from these 32 lanes, producing a single resultant value.
2878 /// Contiguous Partial Warp Reduce (argument value 1):
2879 ///   This algorithm assumes that only a *contiguous* subset of lanes
2880 ///   are active.  This happens for the last warp in a parallel region
2881 ///   when the user specified num_threads is not an integer multiple of
2882 ///   32.  This contiguous subset always starts with the zeroth lane.
2883 /// Partial Warp Reduce (argument value 2):
2884 ///   This algorithm gathers data from any number of lanes at any position.
2885 /// All reduced values are stored in the lowest possible lane.  The set
2886 /// of problems every algorithm addresses is a superset of those
2887 /// addressable by algorithms with a lower version number.  Overhead
2888 /// increases as the algorithm version increases.
2889 ///
2890 /// Terminology
2891 /// Reduce element:
2892 ///   Reduce element refers to the individual data field with primitive
2893 ///   data types to be combined and reduced across threads.
2894 /// Reduce list:
2895 ///   Reduce list refers to a collection of local, thread-private
2896 ///   reduce elements.
2897 /// Remote Reduce list:
2898 ///   Remote Reduce list refers to a collection of remote (relative to
2899 ///   the current thread) reduce elements.
2900 ///
2901 /// We distinguish between three states of threads that are important to
2902 /// the implementation of this function.
2903 /// Alive threads:
2904 ///   Threads in a warp executing the SIMT instruction, as distinguished from
2905 ///   threads that are inactive due to divergent control flow.
2906 /// Active threads:
2907 ///   The minimal set of threads that has to be alive upon entry to this
2908 ///   function.  The computation is correct iff active threads are alive.
2909 ///   Some threads are alive but they are not active because they do not
2910 ///   contribute to the computation in any useful manner.  Turning them off
2911 ///   may introduce control flow overheads without any tangible benefits.
2912 /// Effective threads:
2913 ///   In order to comply with the argument requirements of the shuffle
2914 ///   function, we must keep all lanes holding data alive.  But at most
2915 ///   half of them perform value aggregation; we refer to this half of
2916 ///   threads as effective. The other half is simply handing off their
2917 ///   data.
2918 ///
2919 /// Procedure
2920 /// Value shuffle:
2921 ///   In this step active threads transfer data from higher lane positions
2922 ///   in the warp to lower lane positions, creating Remote Reduce list.
2923 /// Value aggregation:
2924 ///   In this step, effective threads combine their thread local Reduce list
2925 ///   with Remote Reduce list and store the result in the thread local
2926 ///   Reduce list.
2927 /// Value copy:
2928 ///   In this step, we deal with the assumption made by algorithm 2
2929 ///   (i.e. contiguity assumption).  When we have an odd number of lanes
2930 ///   active, say 2k+1, only k threads will be effective and therefore k
2931 ///   new values will be produced.  However, the Reduce list owned by the
2932 ///   (2k+1)th thread is ignored in the value aggregation.  Therefore
2933 ///   we copy the Reduce list from the (2k+1)th lane to the (k+1)th lane so
2934 ///   that the contiguity assumption still holds.
2935 static llvm::Function *emitShuffleAndReduceFunction(
2936     CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2937     QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
2938   ASTContext &C = CGM.getContext();
2939 
2940   // Thread local Reduce list used to host the values of data to be reduced.
2941   ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2942                                   C.VoidPtrTy, ImplicitParamDecl::Other);
2943   // Current lane id; could be logical.
2944   ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
2945                               ImplicitParamDecl::Other);
2946   // Offset of the remote source lane relative to the current lane.
2947   ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2948                                         C.ShortTy, ImplicitParamDecl::Other);
2949   // Algorithm version.  This is expected to be known at compile time.
2950   ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2951                                C.ShortTy, ImplicitParamDecl::Other);
2952   FunctionArgList Args;
2953   Args.push_back(&ReduceListArg);
2954   Args.push_back(&LaneIDArg);
2955   Args.push_back(&RemoteLaneOffsetArg);
2956   Args.push_back(&AlgoVerArg);
2957 
2958   const CGFunctionInfo &CGFI =
2959       CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2960   auto *Fn = llvm::Function::Create(
2961       CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2962       "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
2963   CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2964   Fn->setDoesNotRecurse();
2965 
2966   CodeGenFunction CGF(CGM);
2967   CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2968 
2969   CGBuilderTy &Bld = CGF.Builder;
2970 
2971   Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2972   Address LocalReduceList(
2973       Bld.CreatePointerBitCastOrAddrSpaceCast(
2974           CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2975                                C.VoidPtrTy, SourceLocation()),
2976           CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2977       CGF.getPointerAlign());
2978 
2979   Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
2980   llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
2981       AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2982 
2983   Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
2984   llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
2985       AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2986 
2987   Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
2988   llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
2989       AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2990 
2991   // Create a local thread-private variable to host the Reduce list
2992   // from a remote lane.
2993   Address RemoteReduceList =
2994       CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
2995 
2996   // This loop iterates through the list of reduce elements and copies,
2997   // element by element, from a remote lane in the warp to RemoteReduceList,
2998   // hosted on the thread's stack.
2999   emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
3000                         LocalReduceList, RemoteReduceList,
3001                         {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
3002                          /*ScratchpadIndex=*/nullptr,
3003                          /*ScratchpadWidth=*/nullptr});
3004 
3005   // The actions to be performed on the Remote Reduce list depend on the
3006   // algorithm version.
3007   //
3008   //  if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3009   //  LaneId % 2 == 0 && Offset > 0):
3010   //    do the reduction value aggregation
3011   //
3012   //  The thread local variable Reduce list is mutated in place to host the
3013   //  reduced data, which is the aggregated value produced from local and
3014   //  remote lanes.
3015   //
3016   //  Note that AlgoVer is expected to be a constant integer known at compile
3017   //  time.
3018   //  When AlgoVer==0, the first conjunction evaluates to true, making
3019   //    the entire predicate true at compile time.
3020   //  When AlgoVer==1, only the second part of the second conjunction
3021   //    (LaneId < Offset) needs to be evaluated at runtime.  The other
3022   //    conjunctions evaluate to false at compile time.
3023   //  When AlgoVer==2, only the runtime checks on LaneId and Offset in the
3024   //    third conjunction need to be evaluated.  The other conjunctions
3025   //    evaluate to false at compile time.
3026   llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
3027 
3028   llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3029   llvm::Value *CondAlgo1 = Bld.CreateAnd(
3030       Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
3031 
3032   llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
3033   llvm::Value *CondAlgo2 = Bld.CreateAnd(
3034       Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
3035   CondAlgo2 = Bld.CreateAnd(
3036       CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
3037 
3038   llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
3039   CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
3040 
3041   llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3042   llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3043   llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3044   Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
3045 
3046   CGF.EmitBlock(ThenBB);
3047   // reduce_function(LocalReduceList, RemoteReduceList)
3048   llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3049       LocalReduceList.getPointer(), CGF.VoidPtrTy);
3050   llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3051       RemoteReduceList.getPointer(), CGF.VoidPtrTy);
3052   CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3053       CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
3054   Bld.CreateBr(MergeBB);
3055 
3056   CGF.EmitBlock(ElseBB);
3057   Bld.CreateBr(MergeBB);
3058 
3059   CGF.EmitBlock(MergeBB);
3060 
3061   // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3062   // Reduce list.
3063   Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3064   llvm::Value *CondCopy = Bld.CreateAnd(
3065       Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
3066 
3067   llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
3068   llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
3069   llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
3070   Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3071 
3072   CGF.EmitBlock(CpyThenBB);
3073   emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
3074                         RemoteReduceList, LocalReduceList);
3075   Bld.CreateBr(CpyMergeBB);
3076 
3077   CGF.EmitBlock(CpyElseBB);
3078   Bld.CreateBr(CpyMergeBB);
3079 
3080   CGF.EmitBlock(CpyMergeBB);
3081 
3082   CGF.FinishFunction();
3083   return Fn;
3084 }
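// In C-like pseudocode, the control flow emitted above amounts to (a sketch,
// not the literal IR):
//
//   if (AlgoVer == 0 ||
//       (AlgoVer == 1 && LaneId < RemoteLaneOffset) ||
//       (AlgoVer == 2 && (LaneId & 1) == 0 && RemoteLaneOffset > 0))
//     reduce_function(local_list, remote_list);
//   if (AlgoVer == 1 && LaneId >= RemoteLaneOffset)
//     copy remote_list into local_list;  // restore the contiguity assumption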
3085 
3086 /// This function emits a helper that copies all the reduction variables from
3087 /// the team into the provided global buffer for the reduction variables.
3088 ///
3089 /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
3090 ///   For all data entries D in reduce_data:
3091 ///     Copy local D to buffer.D[Idx]
3092 static llvm::Value *emitListToGlobalCopyFunction(
3093     CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3094     QualType ReductionArrayTy, SourceLocation Loc,
3095     const RecordDecl *TeamReductionRec,
3096     const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3097         &VarFieldMap) {
3098   ASTContext &C = CGM.getContext();
3099 
3100   // Buffer: global reduction buffer.
3101   ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3102                               C.VoidPtrTy, ImplicitParamDecl::Other);
3103   // Idx: index of the buffer.
3104   ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3105                            ImplicitParamDecl::Other);
3106   // ReduceList: thread local Reduce list.
3107   ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3108                                   C.VoidPtrTy, ImplicitParamDecl::Other);
3109   FunctionArgList Args;
3110   Args.push_back(&BufferArg);
3111   Args.push_back(&IdxArg);
3112   Args.push_back(&ReduceListArg);
3113 
3114   const CGFunctionInfo &CGFI =
3115       CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3116   auto *Fn = llvm::Function::Create(
3117       CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3118       "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
3119   CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3120   Fn->setDoesNotRecurse();
3121   CodeGenFunction CGF(CGM);
3122   CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3123 
3124   CGBuilderTy &Bld = CGF.Builder;
3125 
3126   Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3127   Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3128   Address LocalReduceList(
3129       Bld.CreatePointerBitCastOrAddrSpaceCast(
3130           CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3131                                C.VoidPtrTy, Loc),
3132           CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3133       CGF.getPointerAlign());
3134   QualType StaticTy = C.getRecordType(TeamReductionRec);
3135   llvm::Type *LLVMReductionsBufferTy =
3136       CGM.getTypes().ConvertTypeForMem(StaticTy);
3137   llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3138       CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3139       LLVMReductionsBufferTy->getPointerTo());
3140   llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3141                          CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3142                                               /*Volatile=*/false, C.IntTy,
3143                                               Loc)};
3144   unsigned Idx = 0;
3145   for (const Expr *Private : Privates) {
3146     // Reduce element = LocalReduceList[i]
3147     Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3148     llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3149         ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3150     // elemptr = (ElemTy*)(elemptrptr)
3151     ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3152         ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
3153     Address ElemPtr =
3154         Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
3155     const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
3156     // Global = Buffer.VD[Idx];
3157     const FieldDecl *FD = VarFieldMap.lookup(VD);
3158     LValue GlobLVal = CGF.EmitLValueForField(
3159         CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3160     Address GlobAddr = GlobLVal.getAddress(CGF);
3161     llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
3162         GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
3163     GlobLVal.setAddress(Address(BufferPtr, GlobAddr.getAlignment()));
3164     switch (CGF.getEvaluationKind(Private->getType())) {
3165     case TEK_Scalar: {
3166       llvm::Value *V = CGF.EmitLoadOfScalar(
3167           ElemPtr, /*Volatile=*/false, Private->getType(), Loc,
3168           LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
3169       CGF.EmitStoreOfScalar(V, GlobLVal);
3170       break;
3171     }
3172     case TEK_Complex: {
3173       CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
3174           CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
3175       CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
3176       break;
3177     }
3178     case TEK_Aggregate:
3179       CGF.EmitAggregateCopy(GlobLVal,
3180                             CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3181                             Private->getType(), AggValueSlot::DoesNotOverlap);
3182       break;
3183     }
3184     ++Idx;
3185   }
3186 
3187   CGF.FinishFunction();
3188   return Fn;
3189 }
3190 
3191 /// This function emits a helper that reduces all the reduction variables from
3192 /// the team into the provided global buffer for the reduction variables.
3193 ///
3194 /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
3195 ///  void *GlobPtrs[];
3196 ///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
3197 ///  ...
3198 ///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
3199 ///  reduce_function(GlobPtrs, reduce_data);
3200 static llvm::Value *emitListToGlobalReduceFunction(
3201     CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3202     QualType ReductionArrayTy, SourceLocation Loc,
3203     const RecordDecl *TeamReductionRec,
3204     const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3205         &VarFieldMap,
3206     llvm::Function *ReduceFn) {
3207   ASTContext &C = CGM.getContext();
3208 
3209   // Buffer: global reduction buffer.
3210   ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3211                               C.VoidPtrTy, ImplicitParamDecl::Other);
3212   // Idx: index of the buffer.
3213   ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3214                            ImplicitParamDecl::Other);
3215   // ReduceList: thread local Reduce list.
3216   ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3217                                   C.VoidPtrTy, ImplicitParamDecl::Other);
3218   FunctionArgList Args;
3219   Args.push_back(&BufferArg);
3220   Args.push_back(&IdxArg);
3221   Args.push_back(&ReduceListArg);
3222 
3223   const CGFunctionInfo &CGFI =
3224       CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3225   auto *Fn = llvm::Function::Create(
3226       CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3227       "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
3228   CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3229   Fn->setDoesNotRecurse();
3230   CodeGenFunction CGF(CGM);
3231   CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3232 
3233   CGBuilderTy &Bld = CGF.Builder;
3234 
3235   Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3236   QualType StaticTy = C.getRecordType(TeamReductionRec);
3237   llvm::Type *LLVMReductionsBufferTy =
3238       CGM.getTypes().ConvertTypeForMem(StaticTy);
3239   llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3240       CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3241       LLVMReductionsBufferTy->getPointerTo());
3242 
3243   // 1. Build a list of reduction variables.
3244   // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3245   Address ReductionList =
3246       CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3247   auto IPriv = Privates.begin();
3248   llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3249                          CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3250                                               /*Volatile=*/false, C.IntTy,
3251                                               Loc)};
3252   unsigned Idx = 0;
3253   for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
3254     Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3255     // Global = Buffer.VD[Idx];
3256     const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
3257     const FieldDecl *FD = VarFieldMap.lookup(VD);
3258     LValue GlobLVal = CGF.EmitLValueForField(
3259         CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3260     Address GlobAddr = GlobLVal.getAddress(CGF);
3261     llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
3262         GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
3263     llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
3264     CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
3265     if ((*IPriv)->getType()->isVariablyModifiedType()) {
3266       // Store array size.
3267       ++Idx;
3268       Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3269       llvm::Value *Size = CGF.Builder.CreateIntCast(
3270           CGF.getVLASize(
3271                  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3272               .NumElts,
3273           CGF.SizeTy, /*isSigned=*/false);
3274       CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3275                               Elem);
3276     }
3277   }
3278 
3279   // Call reduce_function(GlobalReduceList, ReduceList)
3280   llvm::Value *GlobalReduceList =
3281       CGF.EmitCastToVoidPtr(ReductionList.getPointer());
3282   Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3283   llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
3284       AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
3285   CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3286       CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
3287   CGF.FinishFunction();
3288   return Fn;
3289 }
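// Sketch of the reduce list built above when a variably-modified type is
// present (illustrative layout only): the element pointer into the global
// buffer is followed by an extra slot holding the number of elements.
//
//   void *red_list[] = { (void *)&buffer->D0[Idx],
//                        (void *)num_elements_of_D0,   // VLA size slot
//                        (void *)&buffer->D1[Idx], /* ... */ };
//   reduce_function(red_list, reduce_data);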
3290 
3291 /// This function emits a helper that copies all the reduction variables from
3292 /// the provided global buffer back into the thread-local reduce list.
3293 ///
3294 /// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
3295 ///   For all data entries D in reduce_data:
3296 ///     Copy buffer.D[Idx] to local D;
3297 static llvm::Value *emitGlobalToListCopyFunction(
3298     CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3299     QualType ReductionArrayTy, SourceLocation Loc,
3300     const RecordDecl *TeamReductionRec,
3301     const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3302         &VarFieldMap) {
3303   ASTContext &C = CGM.getContext();
3304 
3305   // Buffer: global reduction buffer.
3306   ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3307                               C.VoidPtrTy, ImplicitParamDecl::Other);
3308   // Idx: index of the buffer.
3309   ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3310                            ImplicitParamDecl::Other);
3311   // ReduceList: thread local Reduce list.
3312   ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3313                                   C.VoidPtrTy, ImplicitParamDecl::Other);
3314   FunctionArgList Args;
3315   Args.push_back(&BufferArg);
3316   Args.push_back(&IdxArg);
3317   Args.push_back(&ReduceListArg);
3318 
3319   const CGFunctionInfo &CGFI =
3320       CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3321   auto *Fn = llvm::Function::Create(
3322       CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3323       "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
3324   CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3325   Fn->setDoesNotRecurse();
3326   CodeGenFunction CGF(CGM);
3327   CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3328 
3329   CGBuilderTy &Bld = CGF.Builder;
3330 
3331   Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3332   Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3333   Address LocalReduceList(
3334       Bld.CreatePointerBitCastOrAddrSpaceCast(
3335           CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3336                                C.VoidPtrTy, Loc),
3337           CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3338       CGF.getPointerAlign());
3339   QualType StaticTy = C.getRecordType(TeamReductionRec);
3340   llvm::Type *LLVMReductionsBufferTy =
3341       CGM.getTypes().ConvertTypeForMem(StaticTy);
3342   llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3343       CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3344       LLVMReductionsBufferTy->getPointerTo());
3345 
3346   llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3347                          CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3348                                               /*Volatile=*/false, C.IntTy,
3349                                               Loc)};
3350   unsigned Idx = 0;
3351   for (const Expr *Private : Privates) {
3352     // Reduce element = LocalReduceList[i]
3353     Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3354     llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3355         ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3356     // elemptr = (ElemTy*)(elemptrptr)
3357     ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3358         ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
3359     Address ElemPtr =
3360         Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
3361     const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
3362     // Global = Buffer.VD[Idx];
3363     const FieldDecl *FD = VarFieldMap.lookup(VD);
3364     LValue GlobLVal = CGF.EmitLValueForField(
3365         CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3366     Address GlobAddr = GlobLVal.getAddress(CGF);
3367     llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
3368         GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
3369     GlobLVal.setAddress(Address(BufferPtr, GlobAddr.getAlignment()));
3370     switch (CGF.getEvaluationKind(Private->getType())) {
3371     case TEK_Scalar: {
3372       llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
3373       CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(),
3374                             LValueBaseInfo(AlignmentSource::Type),
3375                             TBAAAccessInfo());
3376       break;
3377     }
3378     case TEK_Complex: {
3379       CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
3380       CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3381                              /*isInit=*/false);
3382       break;
3383     }
3384     case TEK_Aggregate:
3385       CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3386                             GlobLVal, Private->getType(),
3387                             AggValueSlot::DoesNotOverlap);
3388       break;
3389     }
3390     ++Idx;
3391   }
3392 
3393   CGF.FinishFunction();
3394   return Fn;
3395 }
3396 
3397 /// This function emits a helper that reduces the reduction variables in the
3398 /// provided global buffer into the thread-local reduce list.
3399 ///
3400 /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
3401 ///  void *GlobPtrs[];
3402 ///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
3403 ///  ...
3404 ///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
3405 ///  reduce_function(reduce_data, GlobPtrs);
3406 static llvm::Value *emitGlobalToListReduceFunction(
3407     CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3408     QualType ReductionArrayTy, SourceLocation Loc,
3409     const RecordDecl *TeamReductionRec,
3410     const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3411         &VarFieldMap,
3412     llvm::Function *ReduceFn) {
3413   ASTContext &C = CGM.getContext();
3414 
3415   // Buffer: global reduction buffer.
3416   ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3417                               C.VoidPtrTy, ImplicitParamDecl::Other);
3418   // Idx: index of the buffer.
3419   ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3420                            ImplicitParamDecl::Other);
3421   // ReduceList: thread local Reduce list.
3422   ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3423                                   C.VoidPtrTy, ImplicitParamDecl::Other);
3424   FunctionArgList Args;
3425   Args.push_back(&BufferArg);
3426   Args.push_back(&IdxArg);
3427   Args.push_back(&ReduceListArg);
3428 
3429   const CGFunctionInfo &CGFI =
3430       CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3431   auto *Fn = llvm::Function::Create(
3432       CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3433       "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
3434   CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3435   Fn->setDoesNotRecurse();
3436   CodeGenFunction CGF(CGM);
3437   CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3438 
3439   CGBuilderTy &Bld = CGF.Builder;
3440 
3441   Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3442   QualType StaticTy = C.getRecordType(TeamReductionRec);
3443   llvm::Type *LLVMReductionsBufferTy =
3444       CGM.getTypes().ConvertTypeForMem(StaticTy);
3445   llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3446       CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3447       LLVMReductionsBufferTy->getPointerTo());
3448 
3449   // 1. Build a list of reduction variables.
3450   // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3451   Address ReductionList =
3452       CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3453   auto IPriv = Privates.begin();
3454   llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3455                          CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3456                                               /*Volatile=*/false, C.IntTy,
3457                                               Loc)};
3458   unsigned Idx = 0;
3459   for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
3460     Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3461     // Global = Buffer.VD[Idx];
3462     const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
3463     const FieldDecl *FD = VarFieldMap.lookup(VD);
3464     LValue GlobLVal = CGF.EmitLValueForField(
3465         CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3466     Address GlobAddr = GlobLVal.getAddress(CGF);
3467     llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
3468         GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
3469     llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
3470     CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
3471     if ((*IPriv)->getType()->isVariablyModifiedType()) {
3472       // Store array size.
3473       ++Idx;
3474       Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3475       llvm::Value *Size = CGF.Builder.CreateIntCast(
3476           CGF.getVLASize(
3477                  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3478               .NumElts,
3479           CGF.SizeTy, /*isSigned=*/false);
3480       CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3481                               Elem);
3482     }
3483   }
3484 
3485   // Call reduce_function(ReduceList, GlobalReduceList)
3486   llvm::Value *GlobalReduceList =
3487       CGF.EmitCastToVoidPtr(ReductionList.getPointer());
3488   Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3489   llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
3490       AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
3491   CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3492       CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
3493   CGF.FinishFunction();
3494   return Fn;
3495 }
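// Note how the argument order passed to reduce_function differs between the
// two reduce helpers (a summary of the calls above; assuming, as described for
// shuffleReduceFn below, that the result is written to the first argument):
//
//   list_to_global_reduce_func:  reduce_function(glob_ptrs, reduce_data);
//   global_to_list_reduce_func:  reduce_function(reduce_data, glob_ptrs);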
3496 
3497 ///
3498 /// Design of OpenMP reductions on the GPU
3499 ///
3500 /// Consider a typical OpenMP program with one or more reduction
3501 /// clauses:
3502 ///
3503 /// float foo;
3504 /// double bar;
3505 /// #pragma omp target teams distribute parallel for \
3506 ///             reduction(+:foo) reduction(*:bar)
3507 /// for (int i = 0; i < N; i++) {
3508 ///   foo += A[i]; bar *= B[i];
3509 /// }
3510 ///
3511 /// where 'foo' and 'bar' are reduced across all OpenMP threads in
3512 /// all teams.  In our OpenMP implementation on the NVPTX device an
3513 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
3514 /// within a team are mapped to CUDA threads within a threadblock.
3515 /// Our goal is to efficiently aggregate values across all OpenMP
3516 /// threads such that:
3517 ///
3518 ///   - the compiler and runtime are logically concise, and
3519 ///   - the reduction is performed efficiently in a hierarchical
3520 ///     manner as follows: within OpenMP threads in the same warp,
3521 ///     across warps in a threadblock, and finally across teams on
3522 ///     the NVPTX device.
3523 ///
3524 /// Introduction to Decoupling
3525 ///
3526 /// We would like to decouple the compiler and the runtime so that the
3527 /// latter is ignorant of the reduction variables (number, data types)
3528 /// and the reduction operators.  This allows a simpler interface
3529 /// and implementation while still attaining good performance.
3530 ///
3531 /// Pseudocode for the aforementioned OpenMP program generated by the
3532 /// compiler is as follows:
3533 ///
3534 /// 1. Create private copies of reduction variables on each OpenMP
3535 ///    thread: 'foo_private', 'bar_private'
3536 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
3537 ///    to it and writes the result in 'foo_private' and 'bar_private'
3538 ///    respectively.
3539 /// 3. Call the OpenMP runtime on the GPU to reduce within a team
3540 ///    and store the result on the team master:
3541 ///
3542 ///     __kmpc_nvptx_parallel_reduce_nowait_v2(...,
3543 ///        reduceData, shuffleReduceFn, interWarpCpyFn)
3544 ///
3545 ///     where:
3546 ///       struct ReduceData {
3547 ///         float *foo;
3548 ///         double *bar;
3549 ///       } reduceData
3550 ///       reduceData.foo = &foo_private
3551 ///       reduceData.bar = &bar_private
3552 ///
3553 ///     'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
3554 ///     auxiliary functions generated by the compiler that operate on
3555 ///     variables of type 'ReduceData'.  They help the runtime perform the
3556 ///     algorithmic steps in a data-agnostic manner.
3557 ///
3558 ///     'shuffleReduceFn' is a pointer to a function that reduces data
3559 ///     of type 'ReduceData' across two OpenMP threads (lanes) in the
3560 ///     same warp.  It takes the following arguments as input:
3561 ///
3562 ///     a. variable of type 'ReduceData' on the calling lane,
3563 ///     b. its lane_id,
3564 ///     c. an offset relative to the current lane_id to generate a
3565 ///        remote_lane_id.  The remote lane contains the second
3566 ///        variable of type 'ReduceData' that is to be reduced.
3567 ///     d. an algorithm version parameter determining which reduction
3568 ///        algorithm to use.
3569 ///
3570 ///     'shuffleReduceFn' retrieves data from the remote lane using
3571 ///     efficient GPU shuffle intrinsics and reduces, using the
3572 ///     algorithm specified by the 4th parameter, the two operands
3573 ///     element-wise.  The result is written to the first operand.
3574 ///
3575 ///     Different reduction algorithms are implemented in different
3576 ///     runtime functions, all calling 'shuffleReduceFn' to perform
3577 ///     the essential reduction step.  Therefore, based on the 4th
3578 ///     parameter, this function behaves slightly differently to
3579 ///     cooperate with the runtime to ensure correctness under
3580 ///     different circumstances.
3581 ///
3582 ///     'InterWarpCpyFn' is a pointer to a function that transfers
3583 ///     reduced variables across warps.  It tunnels, through CUDA
3584 ///     shared memory, the thread-private data of type 'ReduceData'
3585 ///     from lane 0 of each warp to a lane in the first warp.
3586 /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
3587 ///    The last team writes the global reduced value to memory.
3588 ///
3589 ///     ret = __kmpc_nvptx_teams_reduce_nowait(...,
3590 ///             reduceData, shuffleReduceFn, interWarpCpyFn,
3591 ///             scratchpadCopyFn, loadAndReduceFn)
3592 ///
3593 ///     'scratchpadCopyFn' is a helper that stores reduced
3594 ///     data from the team master to a scratchpad array in
3595 ///     global memory.
3596 ///
3597 ///     'loadAndReduceFn' is a helper that loads data from
3598 ///     the scratchpad array and reduces it with the input
3599 ///     operand.
3600 ///
3601 ///     These compiler generated functions hide address
3602 ///     calculation and alignment information from the runtime.
3603 /// 5. if ret == 1:
3604 ///     The team master of the last team stores the reduced
3605 ///     result to the globals in memory.
3606 ///     foo += reduceData.foo; bar *= reduceData.bar
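///
/// To make steps 3 and 4 above more concrete, below is a minimal,
/// illustrative sketch (not the exact code clang emits) of a
/// 'shuffleReduceFn' the compiler could generate for the foo/bar example.
/// The helper 'shuffle_down' stands in for the warp shuffle intrinsics or
/// runtime calls actually used; the version-dependent predicate mirrors the
/// three warp algorithms described below.
///
///   void shuffleReduceFn(void *reduce_data, int16_t lane_id, int16_t offset,
///                        int16_t algo_version) {
///     struct ReduceData *local = (struct ReduceData *)reduce_data;
///     // Fetch the remote lane's elements via warp shuffles.
///     float  remote_foo = shuffle_down(*local->foo, offset);
///     double remote_bar = shuffle_down(*local->bar, offset);
///     // Decide whether to reduce or merely copy the remote value, based on
///     // the algorithm version (0: full warp, 1: contiguous, 2: dispersed).
///     bool do_reduce = algo_version == 0 ||
///                      (algo_version == 1 && lane_id < offset) ||
///                      (algo_version == 2 && lane_id % 2 == 0 && offset > 0);
///     if (do_reduce) {
///       *local->foo += remote_foo;
///       *local->bar *= remote_bar;
///     } else {
///       *local->foo = remote_foo;
///       *local->bar = remote_bar;
///     }
///   }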
3607 ///
3608 ///
3609 /// Warp Reduction Algorithms
3610 ///
3611 /// On the warp level, we have three algorithms implemented in the
3612 /// OpenMP runtime depending on the number of active lanes:
3613 ///
3614 /// Full Warp Reduction
3615 ///
3616 /// The reduce algorithm within a warp where all lanes are active
3617 /// is implemented in the runtime as follows:
3618 ///
3619 /// full_warp_reduce(void *reduce_data,
3620 ///                  kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3621 ///   for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
3622 ///     ShuffleReduceFn(reduce_data, 0, offset, 0);
3623 /// }
3624 ///
3625 /// The algorithm completes in log(2, WARPSIZE) steps.
3626 ///
3627 /// 'ShuffleReduceFn' is invoked here with lane_id set to 0 because the
3628 /// lane_id is not otherwise used by this algorithm; this saves the
3629 /// instructions needed to read it from the corresponding special registers.
3630 /// The 4th parameter, which represents the version of the algorithm being
3631 /// used, is set to 0 to signify full warp reduction.
3632 ///
3633 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3634 ///
3635 /// #reduce_elem refers to an element in the local lane's data structure
3636 /// #remote_elem is retrieved from a remote lane
3637 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3638 /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
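///
/// As an illustration, with WARPSIZE == 32 the loop above performs
/// log(2, 32) == 5 shuffle/reduce steps with offsets 16, 8, 4, 2, 1:
///
///   step 1: lane i combines with lane i+16   (meaningful for i = 0..15)
///   step 2: lane i combines with lane i+8    (meaningful for i = 0..7)
///   ...
///   step 5: lane 0 combines with lane 1
///
/// after which lane 0 of the warp holds the reduction of all 32 lanes.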
3639 ///
3640 /// Contiguous Partial Warp Reduction
3641 ///
3642 /// This reduce algorithm is used within a warp where only the first
3643 /// 'n' (n <= WARPSIZE) lanes are active.  It is typically used when the
3644 /// number of OpenMP threads in a parallel region is not a multiple of
3645 /// WARPSIZE.  The algorithm is implemented in the runtime as follows:
3646 ///
3647 /// void
3648 /// contiguous_partial_reduce(void *reduce_data,
3649 ///                           kmp_ShuffleReductFctPtr ShuffleReduceFn,
3650 ///                           int size, int lane_id) {
3651 ///   int curr_size;
3652 ///   int offset;
3653 ///   curr_size = size;
3654 ///   offset = curr_size/2;
3655 ///   while (offset>0) {
3656 ///     ShuffleReduceFn(reduce_data, lane_id, offset, 1);
3657 ///     curr_size = (curr_size+1)/2;
3658 ///     offset = curr_size/2;
3659 ///   }
3660 /// }
3661 ///
3662 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3663 ///
3664 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3665 /// if (lane_id < offset)
3666 ///     reduce_elem = reduce_elem REDUCE_OP remote_elem
3667 /// else
3668 ///     reduce_elem = remote_elem
3669 ///
3670 /// This algorithm assumes that the data to be reduced are located in a
3671 /// contiguous subset of lanes starting from the first.  When there is
3672 /// an odd number of active lanes, the data in the last lane is not
3673 /// aggregated with any other lane's data but is instead copied over.
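///
/// As an illustrative trace, for size == 5 the loop above issues offsets
/// 2, 1, 1 (curr_size shrinks 5 -> 3 -> 2 -> 1):
///
///   offset 2: lane 0 reduces with lane 2, lane 1 reduces with lane 3, and
///             lane 2 receives a copy of lane 4's data
///   offset 1: lane 0 reduces with lane 1, and lane 1 receives a copy of
///             lane 2's data (originally lane 4's)
///   offset 1: lane 0 reduces with lane 1, absorbing the last value
///
/// so the partial-warp result ends up in lane 0.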
3674 ///
3675 /// Dispersed Partial Warp Reduction
3676 ///
3677 /// This algorithm is used within a warp when any discontiguous subset of
3678 /// lanes are active.  It is used to implement the reduction operation
3679 /// across lanes in an OpenMP simd region or in a nested parallel region.
3680 ///
3681 /// void
3682 /// dispersed_partial_reduce(void *reduce_data,
3683 ///                          kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3684 ///   int size, remote_id;
3685 ///   int logical_lane_id = number_of_active_lanes_before_me() * 2;
3686 ///   do {
3687 ///       remote_id = next_active_lane_id_right_after_me();
3688 ///       # the above function returns 0 if no active lane
3689 ///       # is present right after the current lane.
3690 ///       size = number_of_active_lanes_in_this_warp();
3691 ///       logical_lane_id /= 2;
3692 ///       ShuffleReduceFn(reduce_data, logical_lane_id,
3693 ///                       remote_id-1-threadIdx.x, 2);
3694 ///   } while (logical_lane_id % 2 == 0 && size > 1);
3695 /// }
3696 ///
3697 /// There is no assumption made about the initial state of the reduction.
3698 /// Any number of lanes (>=1) could be active at any position.  The reduction
3699 /// result is returned in the first active lane.
3700 ///
3701 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3702 ///
3703 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3704 /// if (lane_id % 2 == 0 && offset > 0)
3705 ///     reduce_elem = reduce_elem REDUCE_OP remote_elem
3706 /// else
3707 ///     reduce_elem = remote_elem
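///
/// As a rough illustration (one possible trace, ignoring the exact shuffle
/// offsets), suppose only lanes 3, 9 and 17 are active, with logical lane
/// ids 0, 2 and 4.  In the first iteration lane 3 reduces with lane 9 while
/// lane 17's data is copied into lane 9; in the second iteration lane 3
/// reduces with lane 9 again, absorbing the value that originated in lane 17.
/// The first active lane (lane 3) thus ends up holding the full result.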
3708 ///
3709 ///
3710 /// Intra-Team Reduction
3711 ///
3712 /// This function, as implemented in the runtime call
3713 /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
3714 /// threads in a team.  It first reduces within a warp using the
3715 /// aforementioned algorithms.  We then proceed to gather all such
3716 /// reduced values at the first warp.
3717 ///
3718 /// The runtime makes use of the function 'InterWarpCpyFn', which copies
3719 /// data from each warp master (the zeroth lane of each warp, where
3720 /// warp-reduced data is held) to the zeroth warp.  This step reduces (in
3721 /// a mathematical sense) the problem of reduction across warp masters in
3722 /// a block to the problem of warp reduction.
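///
/// A simplified, hedged sketch of the intra-team flow (the actual runtime
/// entry point takes additional bookkeeping arguments):
///
///   warp_reduce(reduce_data, shuffleReduceFn);    // one of the 3 algorithms
///   interWarpCpyFn(reduce_data, num_warps);       // warp masters -> warp 0
///   if (warp_id == 0)
///     warp_reduce(reduce_data, shuffleReduceFn);  // reduce the warp masters
///   // Thread 0 of the team now holds the team-reduced value.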
3723 ///
3724 ///
3725 /// Inter-Team Reduction
3726 ///
3727 /// Once a team has reduced its data to a single value, it is stored in
3728 /// a global scratchpad array.  Since each team has a distinct slot, this
3729 /// can be done without locking.
3730 ///
3731 /// The last team to write to the scratchpad array proceeds to reduce the
3732 /// scratchpad array.  One or more workers in the last team use the helper
3733 /// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
3734 /// the k'th worker reduces every k'th element.
3735 ///
3736 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
3737 /// reduce across workers and compute a globally reduced value.
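///
/// A hedged sketch of the inter-team step (simplified; the real entry point
/// also manages the scratchpad indexing and the "last team" bookkeeping):
///
///   scratchpad[team_id] = team_reduced_value;             // scratchpadCopyFn
///   if (atomic_inc(&teams_done) == num_teams - 1) {
///     // Only the last team to arrive executes this block.
///     for (i = thread_id; i < num_teams; i += num_threads)
///       local = local REDUCE_OP scratchpad[i];            // loadAndReduceFn
///     reduce_across_team_threads(&local);                 // intra-team step
///     if (thread_id == 0)
///       ret = 1;   // caller then updates the global variables (step 5)
///   }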
3738 ///
3739 void CGOpenMPRuntimeGPU::emitReduction(
3740     CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
3741     ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
3742     ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
3743   if (!CGF.HaveInsertPoint())
3744     return;
3745 
3746   bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
3747 #ifndef NDEBUG
3748   bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
3749 #endif
3750 
3751   if (Options.SimpleReduction) {
3752     assert(!TeamsReduction && !ParallelReduction &&
3753            "Invalid reduction selection in emitReduction.");
3754     CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
3755                                    ReductionOps, Options);
3756     return;
3757   }
3758 
3759   assert((TeamsReduction || ParallelReduction) &&
3760          "Invalid reduction selection in emitReduction.");
3761 
3762   // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3763   // RedList, shuffle_reduce_func, interwarp_copy_func);
3764   // or
3765   // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3766   llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
3767   llvm::Value *ThreadId = getThreadID(CGF, Loc);
3768 
3769   llvm::Value *Res;
3770   ASTContext &C = CGM.getContext();
3771   // 1. Build a list of reduction variables.
3772   // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3773   auto Size = RHSExprs.size();
3774   for (const Expr *E : Privates) {
3775     if (E->getType()->isVariablyModifiedType())
3776       // Reserve place for array size.
3777       ++Size;
3778   }
3779   llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
3780   QualType ReductionArrayTy =
3781       C.getConstantArrayType(C.VoidPtrTy, ArraySize, nullptr, ArrayType::Normal,
3782                              /*IndexTypeQuals=*/0);
3783   Address ReductionList =
3784       CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3785   auto IPriv = Privates.begin();
3786   unsigned Idx = 0;
3787   for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
3788     Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3789     CGF.Builder.CreateStore(
3790         CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3791             CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
3792         Elem);
3793     if ((*IPriv)->getType()->isVariablyModifiedType()) {
3794       // Store array size.
3795       ++Idx;
3796       Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3797       llvm::Value *Size = CGF.Builder.CreateIntCast(
3798           CGF.getVLASize(
3799                  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3800               .NumElts,
3801           CGF.SizeTy, /*isSigned=*/false);
3802       CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3803                               Elem);
3804     }
3805   }
3806 
3807   llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3808       ReductionList.getPointer(), CGF.VoidPtrTy);
3809   llvm::Function *ReductionFn = emitReductionFunction(
3810       Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
3811       LHSExprs, RHSExprs, ReductionOps);
3812   llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
3813   llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
3814       CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
3815   llvm::Value *InterWarpCopyFn =
3816       emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
3817 
3818   if (ParallelReduction) {
3819     llvm::Value *Args[] = {RTLoc,
3820                            ThreadId,
3821                            CGF.Builder.getInt32(RHSExprs.size()),
3822                            ReductionArrayTySize,
3823                            RL,
3824                            ShuffleAndReduceFn,
3825                            InterWarpCopyFn};
3826 
3827     Res = CGF.EmitRuntimeCall(
3828         OMPBuilder.getOrCreateRuntimeFunction(
3829             CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
3830         Args);
3831   } else {
3832     assert(TeamsReduction && "expected teams reduction.");
3833     llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
3834     llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
3835     int Cnt = 0;
3836     for (const Expr *DRE : Privates) {
3837       PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
3838       ++Cnt;
3839     }
3840     const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
3841         CGM.getContext(), PrivatesReductions, llvm::None, VarFieldMap,
3842         C.getLangOpts().OpenMPCUDAReductionBufNum);
3843     TeamsReductions.push_back(TeamReductionRec);
3844     if (!KernelTeamsReductionPtr) {
3845       KernelTeamsReductionPtr = new llvm::GlobalVariable(
3846           CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
3847           llvm::GlobalValue::InternalLinkage, nullptr,
3848           "_openmp_teams_reductions_buffer_$_$ptr");
3849     }
3850     llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
3851         Address(KernelTeamsReductionPtr, CGM.getPointerAlign()),
3852         /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
3853     llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
3854         CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
3855     llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
3856         CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
3857         ReductionFn);
3858     llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
3859         CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
3860     llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
3861         CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
3862         ReductionFn);
3863 
3864     llvm::Value *Args[] = {
3865         RTLoc,
3866         ThreadId,
3867         GlobalBufferPtr,
3868         CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
3869         RL,
3870         ShuffleAndReduceFn,
3871         InterWarpCopyFn,
3872         GlobalToBufferCpyFn,
3873         GlobalToBufferRedFn,
3874         BufferToGlobalCpyFn,
3875         BufferToGlobalRedFn};
3876 
3877     Res = CGF.EmitRuntimeCall(
3878         OMPBuilder.getOrCreateRuntimeFunction(
3879             CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
3880         Args);
3881   }
3882 
3883   // 5. Build if (res == 1)
3884   llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
3885   llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
3886   llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
3887       Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
3888   CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3889 
3890   // 6. Build then branch: where we have reduced values in the master
3891   //    thread in each team.
3892   //    __kmpc_end_reduce{_nowait}(<gtid>);
3893   //    break;
3894   CGF.EmitBlock(ThenBB);
3895 
3896   // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3897   auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
3898                     this](CodeGenFunction &CGF, PrePostActionTy &Action) {
3899     auto IPriv = Privates.begin();
3900     auto ILHS = LHSExprs.begin();
3901     auto IRHS = RHSExprs.begin();
3902     for (const Expr *E : ReductionOps) {
3903       emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
3904                                   cast<DeclRefExpr>(*IRHS));
3905       ++IPriv;
3906       ++ILHS;
3907       ++IRHS;
3908     }
3909   };
3910   llvm::Value *EndArgs[] = {ThreadId};
3911   RegionCodeGenTy RCG(CodeGen);
3912   NVPTXActionTy Action(
3913       nullptr, llvm::None,
3914       OMPBuilder.getOrCreateRuntimeFunction(
3915           CGM.getModule(), OMPRTL___kmpc_nvptx_end_reduce_nowait),
3916       EndArgs);
3917   RCG.setAction(Action);
3918   RCG(CGF);
3919   // There is no need to emit line number for unconditional branch.
3920   (void)ApplyDebugLocation::CreateEmpty(CGF);
3921   CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
3922 }
3923 
3924 const VarDecl *
3925 CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
3926                                        const VarDecl *NativeParam) const {
3927   if (!NativeParam->getType()->isReferenceType())
3928     return NativeParam;
3929   QualType ArgType = NativeParam->getType();
3930   QualifierCollector QC;
3931   const Type *NonQualTy = QC.strip(ArgType);
3932   QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3933   if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
3934     if (Attr->getCaptureKind() == OMPC_map) {
3935       PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
3936                                                         LangAS::opencl_global);
3937     } else if (Attr->getCaptureKind() == OMPC_firstprivate &&
3938                PointeeTy.isConstant(CGM.getContext())) {
3939       PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
3940                                                         LangAS::opencl_generic);
3941     }
3942   }
3943   ArgType = CGM.getContext().getPointerType(PointeeTy);
3944   QC.addRestrict();
3945   enum { NVPTX_local_addr = 5 };
3946   QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
3947   ArgType = QC.apply(CGM.getContext(), ArgType);
3948   if (isa<ImplicitParamDecl>(NativeParam))
3949     return ImplicitParamDecl::Create(
3950         CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
3951         NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
3952   return ParmVarDecl::Create(
3953       CGM.getContext(),
3954       const_cast<DeclContext *>(NativeParam->getDeclContext()),
3955       NativeParam->getBeginLoc(), NativeParam->getLocation(),
3956       NativeParam->getIdentifier(), ArgType,
3957       /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
3958 }
3959 
3960 Address
3961 CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
3962                                           const VarDecl *NativeParam,
3963                                           const VarDecl *TargetParam) const {
3964   assert(NativeParam != TargetParam &&
3965          NativeParam->getType()->isReferenceType() &&
3966          "Native arg must not be the same as target arg.");
3967   Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
3968   QualType NativeParamType = NativeParam->getType();
3969   QualifierCollector QC;
3970   const Type *NonQualTy = QC.strip(NativeParamType);
3971   QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3972   unsigned NativePointeeAddrSpace =
3973       CGF.getContext().getTargetAddressSpace(NativePointeeTy);
3974   QualType TargetTy = TargetParam->getType();
3975   llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
3976       LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
3977   // First cast to generic.
3978   TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3979       TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
3980                       /*AddrSpace=*/0));
3981   // Cast from generic to native address space.
3982   TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3983       TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
3984                       NativePointeeAddrSpace));
3985   Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
3986   CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
3987                         NativeParamType);
3988   return NativeParamAddr;
3989 }
3990 
3991 void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
3992     CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
3993     ArrayRef<llvm::Value *> Args) const {
3994   SmallVector<llvm::Value *, 4> TargetArgs;
3995   TargetArgs.reserve(Args.size());
3996   auto *FnType = OutlinedFn.getFunctionType();
3997   for (unsigned I = 0, E = Args.size(); I < E; ++I) {
3998     if (FnType->isVarArg() && FnType->getNumParams() <= I) {
3999       TargetArgs.append(std::next(Args.begin(), I), Args.end());
4000       break;
4001     }
4002     llvm::Type *TargetType = FnType->getParamType(I);
4003     llvm::Value *NativeArg = Args[I];
4004     if (!TargetType->isPointerTy()) {
4005       TargetArgs.emplace_back(NativeArg);
4006       continue;
4007     }
4008     llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4009         NativeArg,
4010         NativeArg->getType()->getPointerElementType()->getPointerTo());
4011     TargetArgs.emplace_back(
4012         CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
4013   }
4014   CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
4015 }
4016 
4017 /// Emit a function which wraps the outlined parallel region
4018 /// and controls the arguments which are passed to this function.
4019 /// The wrapper ensures that the outlined function is called
4020 /// with the correct arguments when data is shared.
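///
/// A rough, hedged sketch of the wrapper that gets emitted (pseudo-code; the
/// real argument handling, loop-bound passing and captured-variable unpacking
/// follow the implementation below):
///
///   void outlined_parallel_fn_wrapper(uint16_t parallel_level,
///                                     uint32_t thread_id) {
///     int32_t zero = 0;
///     void **shared_args;
///     // Ask the runtime for the list of variables shared by the caller.
///     __kmpc_get_shared_variables(&shared_args);
///     // Forward the thread id, a zeroed bound-thread id and the unpacked
///     // shared arguments to the real outlined function.
///     outlined_parallel_fn(&thread_id, &zero, /* unpacked shared args */...);
///   }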
4021 llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
4022     llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
4023   ASTContext &Ctx = CGM.getContext();
4024   const auto &CS = *D.getCapturedStmt(OMPD_parallel);
4025 
4026   // Create a function that takes as argument the source thread.
4027   FunctionArgList WrapperArgs;
4028   QualType Int16QTy =
4029       Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
4030   QualType Int32QTy =
4031       Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
4032   ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
4033                                      /*Id=*/nullptr, Int16QTy,
4034                                      ImplicitParamDecl::Other);
4035   ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
4036                                /*Id=*/nullptr, Int32QTy,
4037                                ImplicitParamDecl::Other);
4038   WrapperArgs.emplace_back(&ParallelLevelArg);
4039   WrapperArgs.emplace_back(&WrapperArg);
4040 
4041   const CGFunctionInfo &CGFI =
4042       CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
4043 
4044   auto *Fn = llvm::Function::Create(
4045       CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
4046       Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
4047 
4048   // Ensure we do not inline the function. This is trivially true for the ones
4049   // passed to __kmpc_fork_call, but the ones called in serialized regions
4050   // could be inlined. This is not perfect, but it is closer to the invariant
4051   // we want, namely, every data environment starts with a new function.
4052   // TODO: We should pass the if condition to the runtime function and do the
4053   //       handling there. Much cleaner code.
4054   Fn->addFnAttr(llvm::Attribute::NoInline);
4055 
4056   CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
4057   Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
4058   Fn->setDoesNotRecurse();
4059 
4060   CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
4061   CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
4062                     D.getBeginLoc(), D.getBeginLoc());
4063 
4064   const auto *RD = CS.getCapturedRecordDecl();
4065   auto CurField = RD->field_begin();
4066 
4067   Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
4068                                                       /*Name=*/".zero.addr");
4069   CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
4070   // Get the array of arguments.
4071   SmallVector<llvm::Value *, 8> Args;
4072 
4073   Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
4074   Args.emplace_back(ZeroAddr.getPointer());
4075 
4076   CGBuilderTy &Bld = CGF.Builder;
4077   auto CI = CS.capture_begin();
4078 
4079   // Use global memory for data sharing.
4080   // Handle passing of global args to workers.
4081   Address GlobalArgs =
4082       CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
4083   llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
4084   llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
4085   CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
4086                           CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
4087                       DataSharingArgs);
4088 
4089   // Retrieve the shared variables from the list of references returned
4090   // by the runtime. Pass the variables to the outlined function.
4091   Address SharedArgListAddress = Address::invalid();
4092   if (CS.capture_size() > 0 ||
4093       isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
4094     SharedArgListAddress = CGF.EmitLoadOfPointer(
4095         GlobalArgs, CGF.getContext()
4096                         .getPointerType(CGF.getContext().getPointerType(
4097                             CGF.getContext().VoidPtrTy))
4098                         .castAs<PointerType>());
4099   }
4100   unsigned Idx = 0;
4101   if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
4102     Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
4103     Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4104         Src, CGF.SizeTy->getPointerTo());
4105     llvm::Value *LB = CGF.EmitLoadOfScalar(
4106         TypedAddress,
4107         /*Volatile=*/false,
4108         CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
4109         cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
4110     Args.emplace_back(LB);
4111     ++Idx;
4112     Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
4113     TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4114         Src, CGF.SizeTy->getPointerTo());
4115     llvm::Value *UB = CGF.EmitLoadOfScalar(
4116         TypedAddress,
4117         /*Volatile=*/false,
4118         CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
4119         cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
4120     Args.emplace_back(UB);
4121     ++Idx;
4122   }
4123   if (CS.capture_size() > 0) {
4124     ASTContext &CGFContext = CGF.getContext();
4125     for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
4126       QualType ElemTy = CurField->getType();
4127       Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
4128       Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4129           Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
4130       llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
4131                                               /*Volatile=*/false,
4132                                               CGFContext.getPointerType(ElemTy),
4133                                               CI->getLocation());
4134       if (CI->capturesVariableByCopy() &&
4135           !CI->getCapturedVar()->getType()->isAnyPointerType()) {
4136         Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
4137                               CI->getLocation());
4138       }
4139       Args.emplace_back(Arg);
4140     }
4141   }
4142 
4143   emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
4144   CGF.FinishFunction();
4145   return Fn;
4146 }
4147 
4148 void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
4149                                               const Decl *D) {
4150   if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
4151     return;
4152 
4153   assert(D && "Expected function or captured|block decl.");
4154   assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
4155          "Function is registered already.");
4156   assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
4157          "Team is set but not processed.");
4158   const Stmt *Body = nullptr;
4159   bool NeedToDelayGlobalization = false;
4160   if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
4161     Body = FD->getBody();
4162   } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
4163     Body = BD->getBody();
4164   } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
4165     Body = CD->getBody();
4166     NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
4167     if (NeedToDelayGlobalization &&
4168         getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
4169       return;
4170   }
4171   if (!Body)
4172     return;
4173   CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
4174   VarChecker.Visit(Body);
4175   const RecordDecl *GlobalizedVarsRecord =
4176       VarChecker.getGlobalizedRecord(IsInTTDRegion);
4177   TeamAndReductions.first = nullptr;
4178   TeamAndReductions.second.clear();
4179   ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
4180       VarChecker.getEscapedVariableLengthDecls();
4181   if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
4182     return;
4183   auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
4184   I->getSecond().MappedParams =
4185       std::make_unique<CodeGenFunction::OMPMapVars>();
4186   I->getSecond().GlobalRecord = GlobalizedVarsRecord;
4187   I->getSecond().EscapedParameters.insert(
4188       VarChecker.getEscapedParameters().begin(),
4189       VarChecker.getEscapedParameters().end());
4190   I->getSecond().EscapedVariableLengthDecls.append(
4191       EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
4192   DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
4193   for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
4194     assert(VD->isCanonicalDecl() && "Expected canonical declaration");
4195     const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
4196     Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion)));
4197   }
4198   if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
4199     CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
4200     VarChecker.Visit(Body);
4201     I->getSecond().SecondaryGlobalRecord =
4202         VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true);
4203     I->getSecond().SecondaryLocalVarData.emplace();
4204     DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
4205     for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
4206       assert(VD->isCanonicalDecl() && "Expected canonical declaration");
4207       const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
4208       Data.insert(
4209           std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true)));
4210     }
4211   }
4212   if (!NeedToDelayGlobalization) {
4213     emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
4214     struct GlobalizationScope final : EHScopeStack::Cleanup {
4215       GlobalizationScope() = default;
4216 
4217       void Emit(CodeGenFunction &CGF, Flags flags) override {
4218         static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
4219             .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
4220       }
4221     };
4222     CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
4223   }
4224 }
4225 
4226 Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
4227                                                         const VarDecl *VD) {
4228   if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
4229     const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
4230     auto AS = LangAS::Default;
4231     switch (A->getAllocatorType()) {
4232       // Use the default allocator here as by default local vars are
4233       // threadlocal.
4234     case OMPAllocateDeclAttr::OMPNullMemAlloc:
4235     case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
4236     case OMPAllocateDeclAttr::OMPThreadMemAlloc:
4237     case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
4238     case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
4239       // Follow the user decision - use default allocation.
4240       return Address::invalid();
4241     case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
4242       // TODO: implement support for user-defined allocators.
4243       return Address::invalid();
4244     case OMPAllocateDeclAttr::OMPConstMemAlloc:
4245       AS = LangAS::cuda_constant;
4246       break;
4247     case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
4248       AS = LangAS::cuda_shared;
4249       break;
4250     case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
4251     case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
4252       break;
4253     }
4254     llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
4255     auto *GV = new llvm::GlobalVariable(
4256         CGM.getModule(), VarTy, /*isConstant=*/false,
4257         llvm::GlobalValue::InternalLinkage, llvm::Constant::getNullValue(VarTy),
4258         VD->getName(),
4259         /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
4260         CGM.getContext().getTargetAddressSpace(AS));
4261     CharUnits Align = CGM.getContext().getDeclAlign(VD);
4262     GV->setAlignment(Align.getAsAlign());
4263     return Address(
4264         CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4265             GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
4266                     VD->getType().getAddressSpace()))),
4267         Align);
4268   }
4269 
4270   if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
4271     return Address::invalid();
4272 
4273   VD = VD->getCanonicalDecl();
4274   auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
4275   if (I == FunctionGlobalizedDecls.end())
4276     return Address::invalid();
4277   auto VDI = I->getSecond().LocalVarData.find(VD);
4278   if (VDI != I->getSecond().LocalVarData.end())
4279     return VDI->second.PrivateAddr;
4280   if (VD->hasAttrs()) {
4281     for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
4282          E(VD->attr_end());
4283          IT != E; ++IT) {
4284       auto VDI = I->getSecond().LocalVarData.find(
4285           cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
4286               ->getCanonicalDecl());
4287       if (VDI != I->getSecond().LocalVarData.end())
4288         return VDI->second.PrivateAddr;
4289     }
4290   }
4291 
4292   return Address::invalid();
4293 }
4294 
4295 void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
4296   FunctionGlobalizedDecls.erase(CGF.CurFn);
4297   CGOpenMPRuntime::functionFinished(CGF);
4298 }
4299 
4300 void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
4301     CodeGenFunction &CGF, const OMPLoopDirective &S,
4302     OpenMPDistScheduleClauseKind &ScheduleKind,
4303     llvm::Value *&Chunk) const {
4304   auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
4305   if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
4306     ScheduleKind = OMPC_DIST_SCHEDULE_static;
4307     Chunk = CGF.EmitScalarConversion(
4308         RT.getGPUNumThreads(CGF),
4309         CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
4310         S.getIterationVariable()->getType(), S.getBeginLoc());
4311     return;
4312   }
4313   CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
4314       CGF, S, ScheduleKind, Chunk);
4315 }
4316 
4317 void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
4318     CodeGenFunction &CGF, const OMPLoopDirective &S,
4319     OpenMPScheduleClauseKind &ScheduleKind,
4320     const Expr *&ChunkExpr) const {
4321   ScheduleKind = OMPC_SCHEDULE_static;
4322   // Chunk size is 1 in this case.
4323   llvm::APInt ChunkSize(32, 1);
4324   ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
4325       CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
4326       SourceLocation());
4327 }
4328 
4329 void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
4330     CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
4331   assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
4332          " Expected target-based directive.");
4333   const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
4334   for (const CapturedStmt::Capture &C : CS->captures()) {
4335     // Capture variables captured by reference in lambdas for target-based
4336     // directives.
4337     if (!C.capturesVariable())
4338       continue;
4339     const VarDecl *VD = C.getCapturedVar();
4340     const auto *RD = VD->getType()
4341                          .getCanonicalType()
4342                          .getNonReferenceType()
4343                          ->getAsCXXRecordDecl();
4344     if (!RD || !RD->isLambda())
4345       continue;
4346     Address VDAddr = CGF.GetAddrOfLocalVar(VD);
4347     LValue VDLVal;
4348     if (VD->getType().getCanonicalType()->isReferenceType())
4349       VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
4350     else
4351       VDLVal = CGF.MakeAddrLValue(
4352           VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
4353     llvm::DenseMap<const VarDecl *, FieldDecl *> Captures;
4354     FieldDecl *ThisCapture = nullptr;
4355     RD->getCaptureFields(Captures, ThisCapture);
4356     if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
4357       LValue ThisLVal =
4358           CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
4359       llvm::Value *CXXThis = CGF.LoadCXXThis();
4360       CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
4361     }
4362     for (const LambdaCapture &LC : RD->captures()) {
4363       if (LC.getCaptureKind() != LCK_ByRef)
4364         continue;
4365       const VarDecl *VD = LC.getCapturedVar();
4366       if (!CS->capturesVariable(VD))
4367         continue;
4368       auto It = Captures.find(VD);
4369       assert(It != Captures.end() && "Found lambda capture without field.");
4370       LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
4371       Address VDAddr = CGF.GetAddrOfLocalVar(VD);
4372       if (VD->getType().getCanonicalType()->isReferenceType())
4373         VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
4374                                                VD->getType().getCanonicalType())
4375                      .getAddress(CGF);
4376       CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
4377     }
4378   }
4379 }
4380 
4381 unsigned CGOpenMPRuntimeGPU::getDefaultFirstprivateAddressSpace() const {
4382   return CGM.getContext().getTargetAddressSpace(LangAS::cuda_constant);
4383 }
4384 
4385 bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
4386                                                             LangAS &AS) {
4387   if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
4388     return false;
4389   const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
4390   switch(A->getAllocatorType()) {
4391   case OMPAllocateDeclAttr::OMPNullMemAlloc:
4392   case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
4393   // Not supported, fallback to the default mem space.
4394   case OMPAllocateDeclAttr::OMPThreadMemAlloc:
4395   case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
4396   case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
4397   case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
4398   case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
4399     AS = LangAS::Default;
4400     return true;
4401   case OMPAllocateDeclAttr::OMPConstMemAlloc:
4402     AS = LangAS::cuda_constant;
4403     return true;
4404   case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
4405     AS = LangAS::cuda_shared;
4406     return true;
4407   case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
4408     llvm_unreachable("Expected predefined allocator for the variables with the "
4409                      "static storage.");
4410   }
4411   return false;
4412 }
4413 
4414 // Get current CudaArch and ignore any unknown values
4415 static CudaArch getCudaArch(CodeGenModule &CGM) {
4416   if (!CGM.getTarget().hasFeature("ptx"))
4417     return CudaArch::UNKNOWN;
4418   for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
4419     if (Feature.getValue()) {
4420       CudaArch Arch = StringToCudaArch(Feature.getKey());
4421       if (Arch != CudaArch::UNKNOWN)
4422         return Arch;
4423     }
4424   }
4425   return CudaArch::UNKNOWN;
4426 }
4427 
4428 /// Check whether the target architecture supports unified addressing, which
4429 /// is required by the OpenMP 'requires' clause "unified_shared_memory".
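///
/// For example, a translation unit containing
///
///   #pragma omp requires unified_shared_memory
///
/// is diagnosed here when compiling for an architecture such as sm_35 that
/// cannot provide unified addressing, while sm_60 and newer NVPTX targets
/// (and the listed AMDGCN targets) are accepted.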
4430 void CGOpenMPRuntimeGPU::processRequiresDirective(
4431     const OMPRequiresDecl *D) {
4432   for (const OMPClause *Clause : D->clauselists()) {
4433     if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
4434       CudaArch Arch = getCudaArch(CGM);
4435       switch (Arch) {
4436       case CudaArch::SM_20:
4437       case CudaArch::SM_21:
4438       case CudaArch::SM_30:
4439       case CudaArch::SM_32:
4440       case CudaArch::SM_35:
4441       case CudaArch::SM_37:
4442       case CudaArch::SM_50:
4443       case CudaArch::SM_52:
4444       case CudaArch::SM_53: {
4445         SmallString<256> Buffer;
4446         llvm::raw_svector_ostream Out(Buffer);
4447         Out << "Target architecture " << CudaArchToString(Arch)
4448             << " does not support unified addressing";
4449         CGM.Error(Clause->getBeginLoc(), Out.str());
4450         return;
4451       }
4452       case CudaArch::SM_60:
4453       case CudaArch::SM_61:
4454       case CudaArch::SM_62:
4455       case CudaArch::SM_70:
4456       case CudaArch::SM_72:
4457       case CudaArch::SM_75:
4458       case CudaArch::SM_80:
4459       case CudaArch::SM_86:
4460       case CudaArch::GFX600:
4461       case CudaArch::GFX601:
4462       case CudaArch::GFX602:
4463       case CudaArch::GFX700:
4464       case CudaArch::GFX701:
4465       case CudaArch::GFX702:
4466       case CudaArch::GFX703:
4467       case CudaArch::GFX704:
4468       case CudaArch::GFX705:
4469       case CudaArch::GFX801:
4470       case CudaArch::GFX802:
4471       case CudaArch::GFX803:
4472       case CudaArch::GFX805:
4473       case CudaArch::GFX810:
4474       case CudaArch::GFX900:
4475       case CudaArch::GFX902:
4476       case CudaArch::GFX904:
4477       case CudaArch::GFX906:
4478       case CudaArch::GFX908:
4479       case CudaArch::GFX909:
4480       case CudaArch::GFX90a:
4481       case CudaArch::GFX90c:
4482       case CudaArch::GFX1010:
4483       case CudaArch::GFX1011:
4484       case CudaArch::GFX1012:
4485       case CudaArch::GFX1030:
4486       case CudaArch::GFX1031:
4487       case CudaArch::GFX1032:
4488       case CudaArch::GFX1033:
4489       case CudaArch::GFX1034:
4490       case CudaArch::UNUSED:
4491       case CudaArch::UNKNOWN:
4492         break;
4493       case CudaArch::LAST:
4494         llvm_unreachable("Unexpected Cuda arch.");
4495       }
4496     }
4497   }
4498   CGOpenMPRuntime::processRequiresDirective(D);
4499 }
4500 
4501 /// Get number of SMs and number of blocks per SM.
4502 static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
4503   std::pair<unsigned, unsigned> Data;
4504   if (CGM.getLangOpts().OpenMPCUDANumSMs)
4505     Data.first = CGM.getLangOpts().OpenMPCUDANumSMs;
4506   if (CGM.getLangOpts().OpenMPCUDABlocksPerSM)
4507     Data.second = CGM.getLangOpts().OpenMPCUDABlocksPerSM;
4508   if (Data.first && Data.second)
4509     return Data;
4510   switch (getCudaArch(CGM)) {
4511   case CudaArch::SM_20:
4512   case CudaArch::SM_21:
4513   case CudaArch::SM_30:
4514   case CudaArch::SM_32:
4515   case CudaArch::SM_35:
4516   case CudaArch::SM_37:
4517   case CudaArch::SM_50:
4518   case CudaArch::SM_52:
4519   case CudaArch::SM_53:
4520     return {16, 16};
4521   case CudaArch::SM_60:
4522   case CudaArch::SM_61:
4523   case CudaArch::SM_62:
4524     return {56, 32};
4525   case CudaArch::SM_70:
4526   case CudaArch::SM_72:
4527   case CudaArch::SM_75:
4528   case CudaArch::SM_80:
4529   case CudaArch::SM_86:
4530     return {84, 32};
4531   case CudaArch::GFX600:
4532   case CudaArch::GFX601:
4533   case CudaArch::GFX602:
4534   case CudaArch::GFX700:
4535   case CudaArch::GFX701:
4536   case CudaArch::GFX702:
4537   case CudaArch::GFX703:
4538   case CudaArch::GFX704:
4539   case CudaArch::GFX705:
4540   case CudaArch::GFX801:
4541   case CudaArch::GFX802:
4542   case CudaArch::GFX803:
4543   case CudaArch::GFX805:
4544   case CudaArch::GFX810:
4545   case CudaArch::GFX900:
4546   case CudaArch::GFX902:
4547   case CudaArch::GFX904:
4548   case CudaArch::GFX906:
4549   case CudaArch::GFX908:
4550   case CudaArch::GFX909:
4551   case CudaArch::GFX90a:
4552   case CudaArch::GFX90c:
4553   case CudaArch::GFX1010:
4554   case CudaArch::GFX1011:
4555   case CudaArch::GFX1012:
4556   case CudaArch::GFX1030:
4557   case CudaArch::GFX1031:
4558   case CudaArch::GFX1032:
4559   case CudaArch::GFX1033:
4560   case CudaArch::GFX1034:
4561   case CudaArch::UNUSED:
4562   case CudaArch::UNKNOWN:
4563     break;
4564   case CudaArch::LAST:
4565     llvm_unreachable("Unexpected Cuda arch.");
4566   }
4567   llvm_unreachable("Unexpected NVPTX target without ptx feature.");
4568 }
4569 
4570 void CGOpenMPRuntimeGPU::clear() {
4571   if (!GlobalizedRecords.empty() &&
4572       !CGM.getLangOpts().OpenMPCUDATargetParallel) {
4573     ASTContext &C = CGM.getContext();
4574     llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> GlobalRecs;
4575     llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> SharedRecs;
4576     RecordDecl *StaticRD = C.buildImplicitRecord(
4577         "_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
4578     StaticRD->startDefinition();
4579     RecordDecl *SharedStaticRD = C.buildImplicitRecord(
4580         "_shared_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
4581     SharedStaticRD->startDefinition();
4582     for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) {
4583       if (Records.Records.empty())
4584         continue;
4585       unsigned Size = 0;
4586       unsigned RecAlignment = 0;
4587       for (const RecordDecl *RD : Records.Records) {
4588         QualType RDTy = C.getRecordType(RD);
4589         unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity();
4590         RecAlignment = std::max(RecAlignment, Alignment);
4591         unsigned RecSize = C.getTypeSizeInChars(RDTy).getQuantity();
4592         Size =
4593             llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment);
4594       }
4595       Size = llvm::alignTo(Size, RecAlignment);
4596       llvm::APInt ArySize(/*numBits=*/64, Size);
4597       QualType SubTy = C.getConstantArrayType(
4598           C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0);
4599       const bool UseSharedMemory = Size <= SharedMemorySize;
4600       auto *Field =
4601           FieldDecl::Create(C, UseSharedMemory ? SharedStaticRD : StaticRD,
4602                             SourceLocation(), SourceLocation(), nullptr, SubTy,
4603                             C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
4604                             /*BW=*/nullptr, /*Mutable=*/false,
4605                             /*InitStyle=*/ICIS_NoInit);
4606       Field->setAccess(AS_public);
4607       if (UseSharedMemory) {
4608         SharedStaticRD->addDecl(Field);
4609         SharedRecs.push_back(&Records);
4610       } else {
4611         StaticRD->addDecl(Field);
4612         GlobalRecs.push_back(&Records);
4613       }
4614       Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size));
4615       Records.UseSharedMemory->setInitializer(
4616           llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
4617     }
4618     // Allocate SharedMemorySize buffer for the shared memory.
4619     // FIXME: nvlink does not handle weak linkage correctly (objects with
4620     // different sizes are reported as erroneous).
4621     // Restore this code as soon as nvlink is fixed.
4622     if (!SharedStaticRD->field_empty()) {
4623       llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize);
4624       QualType SubTy = C.getConstantArrayType(
4625           C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0);
4626       auto *Field = FieldDecl::Create(
4627           C, SharedStaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy,
4628           C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
4629           /*BW=*/nullptr, /*Mutable=*/false,
4630           /*InitStyle=*/ICIS_NoInit);
4631       Field->setAccess(AS_public);
4632       SharedStaticRD->addDecl(Field);
4633     }
4634     SharedStaticRD->completeDefinition();
4635     if (!SharedStaticRD->field_empty()) {
4636       QualType StaticTy = C.getRecordType(SharedStaticRD);
4637       llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
4638       auto *GV = new llvm::GlobalVariable(
4639           CGM.getModule(), LLVMStaticTy,
4640           /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
4641           llvm::UndefValue::get(LLVMStaticTy),
4642           "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
4643           llvm::GlobalValue::NotThreadLocal,
4644           C.getTargetAddressSpace(LangAS::cuda_shared));
4645       auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
4646           GV, CGM.VoidPtrTy);
4647       for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
4648         Rec->Buffer->replaceAllUsesWith(Replacement);
4649         Rec->Buffer->eraseFromParent();
4650       }
4651     }
4652     StaticRD->completeDefinition();
4653     if (!StaticRD->field_empty()) {
4654       QualType StaticTy = C.getRecordType(StaticRD);
4655       std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM);
4656       llvm::APInt Size1(32, SMsBlockPerSM.second);
4657       QualType Arr1Ty =
4658           C.getConstantArrayType(StaticTy, Size1, nullptr, ArrayType::Normal,
4659                                  /*IndexTypeQuals=*/0);
4660       llvm::APInt Size2(32, SMsBlockPerSM.first);
4661       QualType Arr2Ty =
4662           C.getConstantArrayType(Arr1Ty, Size2, nullptr, ArrayType::Normal,
4663                                  /*IndexTypeQuals=*/0);
4664       llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
4665       // FIXME: nvlink does not handle weak linkage correctly (objects with
4666       // different sizes are reported as erroneous).
4667       // Restore CommonLinkage as soon as nvlink is fixed.
4668       auto *GV = new llvm::GlobalVariable(
4669           CGM.getModule(), LLVMArr2Ty,
4670           /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
4671           llvm::Constant::getNullValue(LLVMArr2Ty),
4672           "_openmp_static_glob_rd_$_");
4673       auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
4674           GV, CGM.VoidPtrTy);
4675       for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
4676         Rec->Buffer->replaceAllUsesWith(Replacement);
4677         Rec->Buffer->eraseFromParent();
4678       }
4679     }
4680   }
4681   if (!TeamsReductions.empty()) {
4682     ASTContext &C = CGM.getContext();
4683     RecordDecl *StaticRD = C.buildImplicitRecord(
4684         "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
4685     StaticRD->startDefinition();
4686     for (const RecordDecl *TeamReductionRec : TeamsReductions) {
4687       QualType RecTy = C.getRecordType(TeamReductionRec);
4688       auto *Field = FieldDecl::Create(
4689           C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
4690           C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
4691           /*BW=*/nullptr, /*Mutable=*/false,
4692           /*InitStyle=*/ICIS_NoInit);
4693       Field->setAccess(AS_public);
4694       StaticRD->addDecl(Field);
4695     }
4696     StaticRD->completeDefinition();
4697     QualType StaticTy = C.getRecordType(StaticRD);
4698     llvm::Type *LLVMReductionsBufferTy =
4699         CGM.getTypes().ConvertTypeForMem(StaticTy);
4700     // FIXME: nvlink does not handle weak linkage correctly (objects with
4701     // different sizes are reported as erroneous).
4702     // Restore CommonLinkage as soon as nvlink is fixed.
4703     auto *GV = new llvm::GlobalVariable(
4704         CGM.getModule(), LLVMReductionsBufferTy,
4705         /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
4706         llvm::Constant::getNullValue(LLVMReductionsBufferTy),
4707         "_openmp_teams_reductions_buffer_$_");
4708     KernelTeamsReductionPtr->setInitializer(
4709         llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
4710                                                              CGM.VoidPtrTy));
4711   }
4712   CGOpenMPRuntime::clear();
4713 }
4714