1 //===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations ---===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // OpenMP specific optimizations:
10 //
11 // - Deduplication of runtime calls, e.g., omp_get_thread_num.
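// - Merging of compatible, adjacent parallel regions (see mergeParallelRegions,
//   behind -openmp-opt-enable-merging).
// - Deletion of parallel regions whose body only reads memory and is
//   guaranteed to return (see deleteParallelRegions).
// - Hiding the latency of host-to-device memory transfers (work in progress,
//   behind -openmp-hide-memory-transfer-latency).
// - Rewriting of the GPU state machine for non-SPMD target regions (see
//   rewriteDeviceCodeStateMachine).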
12 //
13 //===----------------------------------------------------------------------===//
14
15 #include "llvm/Transforms/IPO/OpenMPOpt.h"
16
17 #include "llvm/ADT/EnumeratedArray.h"
18 #include "llvm/ADT/Statistic.h"
19 #include "llvm/Analysis/CallGraph.h"
20 #include "llvm/Analysis/CallGraphSCCPass.h"
21 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
22 #include "llvm/Analysis/ValueTracking.h"
23 #include "llvm/Frontend/OpenMP/OMPConstants.h"
24 #include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
25 #include "llvm/InitializePasses.h"
26 #include "llvm/Support/CommandLine.h"
27 #include "llvm/Transforms/IPO.h"
28 #include "llvm/Transforms/IPO/Attributor.h"
29 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
30 #include "llvm/Transforms/Utils/CallGraphUpdater.h"
31 #include "llvm/Transforms/Utils/CodeExtractor.h"
32
33 using namespace llvm;
34 using namespace omp;
35
36 #define DEBUG_TYPE "openmp-opt"
37
38 static cl::opt<bool> DisableOpenMPOptimizations(
39 "openmp-opt-disable", cl::ZeroOrMore,
40 cl::desc("Disable OpenMP specific optimizations."), cl::Hidden,
41 cl::init(false));
42
43 static cl::opt<bool> EnableParallelRegionMerging(
44 "openmp-opt-enable-merging", cl::ZeroOrMore,
45 cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden,
46 cl::init(false));
47
48 static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false),
49 cl::Hidden);
50 static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels",
51 cl::init(false), cl::Hidden);
52
53 static cl::opt<bool> HideMemoryTransferLatency(
54 "openmp-hide-memory-transfer-latency",
55 cl::desc("[WIP] Tries to hide the latency of host to device memory"
56 " transfers"),
57 cl::Hidden, cl::init(false));
58
59 STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
60 "Number of OpenMP runtime calls deduplicated");
61 STATISTIC(NumOpenMPParallelRegionsDeleted,
62 "Number of OpenMP parallel regions deleted");
63 STATISTIC(NumOpenMPRuntimeFunctionsIdentified,
64 "Number of OpenMP runtime functions identified");
65 STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified,
66 "Number of OpenMP runtime function uses identified");
67 STATISTIC(NumOpenMPTargetRegionKernels,
68 "Number of OpenMP target region entry points (=kernels) identified");
69 STATISTIC(
70 NumOpenMPParallelRegionsReplacedInGPUStateMachine,
71 "Number of OpenMP parallel regions replaced with ID in GPU state machines");
72 STATISTIC(NumOpenMPParallelRegionsMerged,
73 "Number of OpenMP parallel regions merged");
74
75 #if !defined(NDEBUG)
76 static constexpr auto TAG = "[" DEBUG_TYPE "]";
77 #endif
78
79 namespace {
80
81 struct AAICVTracker;
82
83 /// OpenMP specific information. For now, stores RFIs and ICVs also needed for
84 /// Attributor runs.
85 struct OMPInformationCache : public InformationCache {
86 OMPInformationCache(Module &M, AnalysisGetter &AG,
87 BumpPtrAllocator &Allocator, SetVector<Function *> &CGSCC,
88 SmallPtrSetImpl<Kernel> &Kernels)
89 : InformationCache(M, AG, Allocator, &CGSCC), OMPBuilder(M),
90 Kernels(Kernels) {
91
92 OMPBuilder.initialize();
93 initializeRuntimeFunctions();
94 initializeInternalControlVars();
95 }
96
97 /// Generic information that describes an internal control variable.
98 struct InternalControlVarInfo {
99 /// The kind, as described by InternalControlVar enum.
100 InternalControlVar Kind;
101
102 /// The name of the ICV.
103 StringRef Name;
104
105 /// Environment variable associated with this ICV.
106 StringRef EnvVarName;
107
108 /// Initial value kind.
109 ICVInitValue InitKind;
110
111 /// Initial value.
112 ConstantInt *InitValue;
113
114 /// Setter RTL function associated with this ICV.
115 RuntimeFunction Setter;
116
117 /// Getter RTL function associated with this ICV.
118 RuntimeFunction Getter;
119
120 /// RTL Function corresponding to the override clause of this ICV
121 RuntimeFunction Clause;
122 };
123
124 /// Generic information that describes a runtime function
125 struct RuntimeFunctionInfo {
126
127 /// The kind, as described by the RuntimeFunction enum.
128 RuntimeFunction Kind;
129
130 /// The name of the function.
131 StringRef Name;
132
133 /// Flag to indicate a variadic function.
134 bool IsVarArg;
135
136 /// The return type of the function.
137 Type *ReturnType;
138
139 /// The argument types of the function.
140 SmallVector<Type *, 8> ArgumentTypes;
141
142 /// The declaration if available.
143 Function *Declaration = nullptr;
144
145 /// Uses of this runtime function per function containing the use.
146 using UseVector = SmallVector<Use *, 16>;
147
148 /// Clear UsesMap for runtime function.
149 void clearUsesMap() { UsesMap.clear(); }
150
151 /// Boolean conversion that is true if the runtime function was found.
152 operator bool() const { return Declaration; }
153
154 /// Return the vector of uses in function \p F.
155 UseVector &getOrCreateUseVector(Function *F) {
156 std::shared_ptr<UseVector> &UV = UsesMap[F];
157 if (!UV)
158 UV = std::make_shared<UseVector>();
159 return *UV;
160 }
161
162 /// Return the vector of uses in function \p F or `nullptr` if there are
163 /// none.
164 const UseVector *getUseVector(Function &F) const {
165 auto I = UsesMap.find(&F);
166 if (I != UsesMap.end())
167 return I->second.get();
168 return nullptr;
169 }
170
171 /// Return how many functions contain uses of this runtime function.
172 size_t getNumFunctionsWithUses() const { return UsesMap.size(); }
173
174 /// Return the number of arguments (or the minimal number for variadic
175 /// functions).
176 size_t getNumArgs() const { return ArgumentTypes.size(); }
177
178 /// Run the callback \p CB on each use and forget the use if the result is
179 /// true. The callback will be fed the function in which the use was
180 /// encountered as second argument.
181 void foreachUse(SmallVectorImpl<Function *> &SCC,
182 function_ref<bool(Use &, Function &)> CB) {
183 for (Function *F : SCC)
184 foreachUse(CB, F);
185 }
186
187 /// Run the callback \p CB on each use within the function \p F and forget
188 /// the use if the result is true.
189 void foreachUse(function_ref<bool(Use &, Function &)> CB, Function *F) {
190 SmallVector<unsigned, 8> ToBeDeleted;
191 ToBeDeleted.clear();
192
193 unsigned Idx = 0;
194 UseVector &UV = getOrCreateUseVector(F);
195
196 for (Use *U : UV) {
197 if (CB(*U, *F))
198 ToBeDeleted.push_back(Idx);
199 ++Idx;
200 }
201
202 // Remove the to-be-deleted indices in reverse order, swapping each with the
203 // last element and popping, so the smaller pending indices stay valid.
204 while (!ToBeDeleted.empty()) {
205 unsigned Idx = ToBeDeleted.pop_back_val();
206 UV[Idx] = UV.back();
207 UV.pop_back();
208 }
209 }
210
211 private:
212 /// Map from functions to all uses of this runtime function contained in
213 /// them.
214 DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap;
215 };
216
217 /// An OpenMP-IR-Builder instance
218 OpenMPIRBuilder OMPBuilder;
219
220 /// Map from runtime function kind to the runtime function description.
221 EnumeratedArray<RuntimeFunctionInfo, RuntimeFunction,
222 RuntimeFunction::OMPRTL___last>
223 RFIs;
224
225 /// Map from ICV kind to the ICV description.
226 EnumeratedArray<InternalControlVarInfo, InternalControlVar,
227 InternalControlVar::ICV___last>
228 ICVs;
229
230 /// Helper to initialize all internal control variable information for those
231 /// defined in OMPKinds.def.
232 void initializeInternalControlVars() {
233 #define ICV_RT_SET(_Name, RTL) \
234 { \
235 auto &ICV = ICVs[_Name]; \
236 ICV.Setter = RTL; \
237 }
238 #define ICV_RT_GET(Name, RTL) \
239 { \
240 auto &ICV = ICVs[Name]; \
241 ICV.Getter = RTL; \
242 }
243 #define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init) \
244 { \
245 auto &ICV = ICVs[Enum]; \
246 ICV.Name = _Name; \
247 ICV.Kind = Enum; \
248 ICV.InitKind = Init; \
249 ICV.EnvVarName = _EnvVarName; \
250 switch (ICV.InitKind) { \
251 case ICV_IMPLEMENTATION_DEFINED: \
252 ICV.InitValue = nullptr; \
253 break; \
254 case ICV_ZERO: \
255 ICV.InitValue = ConstantInt::get( \
256 Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0); \
257 break; \
258 case ICV_FALSE: \
259 ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext()); \
260 break; \
261 case ICV_LAST: \
262 break; \
263 } \
264 }
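// Including OMPKinds.def below instantiates the ICV_* macros above once per
// known internal control variable. An entry expands roughly like the
// following (illustrative, not a verbatim copy of OMPKinds.def):
//   ICV_DATA_ENV(ICV_nthreads, "nthreads", "OMP_NUM_THREADS",
//                ICV_IMPLEMENTATION_DEFINED)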
265 #include "llvm/Frontend/OpenMP/OMPKinds.def"
266 }
267
268 /// Returns true if the function declaration \p F matches the runtime
269 /// function types, that is, return type \p RTFRetType, and argument types
270 /// \p RTFArgTypes.
271 static bool declMatchesRTFTypes(Function *F, Type *RTFRetType,
272 SmallVector<Type *, 8> &RTFArgTypes) {
273 // TODO: We should output information to the user (under debug output
274 // and via remarks).
275
276 if (!F)
277 return false;
278 if (F->getReturnType() != RTFRetType)
279 return false;
280 if (F->arg_size() != RTFArgTypes.size())
281 return false;
282
283 auto RTFTyIt = RTFArgTypes.begin();
284 for (Argument &Arg : F->args()) {
285 if (Arg.getType() != *RTFTyIt)
286 return false;
287
288 ++RTFTyIt;
289 }
290
291 return true;
292 }
293
294 // Helper to collect all uses of the declaration in the UsesMap.
295 unsigned collectUses(RuntimeFunctionInfo &RFI, bool CollectStats = true) {
296 unsigned NumUses = 0;
297 if (!RFI.Declaration)
298 return NumUses;
299 OMPBuilder.addAttributes(RFI.Kind, *RFI.Declaration);
300
301 if (CollectStats) {
302 NumOpenMPRuntimeFunctionsIdentified += 1;
303 NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses();
304 }
305
306 // TODO: We directly convert uses into proper calls and unknown uses.
307 for (Use &U : RFI.Declaration->uses()) {
308 if (Instruction *UserI = dyn_cast<Instruction>(U.getUser())) {
309 if (ModuleSlice.count(UserI->getFunction())) {
310 RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U);
311 ++NumUses;
312 }
313 } else {
314 RFI.getOrCreateUseVector(nullptr).push_back(&U);
315 ++NumUses;
316 }
317 }
318 return NumUses;
319 }
320
321 // Helper function to recollect uses of a runtime function.
322 void recollectUsesForFunction(RuntimeFunction RTF) {
323 auto &RFI = RFIs[RTF];
324 RFI.clearUsesMap();
325 collectUses(RFI, /*CollectStats*/ false);
326 }
327
328 // Helper function to recollect uses of all runtime functions.
329 void recollectUses() {
330 for (int Idx = 0; Idx < RFIs.size(); ++Idx)
331 recollectUsesForFunction(static_cast<RuntimeFunction>(Idx));
332 }
333
334 /// Helper to initialize all runtime function information for those defined
335 /// in OpenMPKinds.def.
336 void initializeRuntimeFunctions() {
337 Module &M = *((*ModuleSlice.begin())->getParent());
338
339 // Helper macros for handling __VA_ARGS__ in OMP_RTL
340 #define OMP_TYPE(VarName, ...) \
341 Type *VarName = OMPBuilder.VarName; \
342 (void)VarName;
343
344 #define OMP_ARRAY_TYPE(VarName, ...) \
345 ArrayType *VarName##Ty = OMPBuilder.VarName##Ty; \
346 (void)VarName##Ty; \
347 PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy; \
348 (void)VarName##PtrTy;
349
350 #define OMP_FUNCTION_TYPE(VarName, ...) \
351 FunctionType *VarName = OMPBuilder.VarName; \
352 (void)VarName; \
353 PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
354 (void)VarName##Ptr;
355
356 #define OMP_STRUCT_TYPE(VarName, ...) \
357 StructType *VarName = OMPBuilder.VarName; \
358 (void)VarName; \
359 PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
360 (void)VarName##Ptr;
361
362 #define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...) \
363 { \
364 SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__}); \
365 Function *F = M.getFunction(_Name); \
366 if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) { \
367 auto &RFI = RFIs[_Enum]; \
368 RFI.Kind = _Enum; \
369 RFI.Name = _Name; \
370 RFI.IsVarArg = _IsVarArg; \
371 RFI.ReturnType = OMPBuilder._ReturnType; \
372 RFI.ArgumentTypes = std::move(ArgsTypes); \
373 RFI.Declaration = F; \
374 unsigned NumUses = collectUses(RFI); \
375 (void)NumUses; \
376 LLVM_DEBUG({ \
377 dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not") \
378 << " found\n"; \
379 if (RFI.Declaration) \
380 dbgs() << TAG << "-> got " << NumUses << " uses in " \
381 << RFI.getNumFunctionsWithUses() \
382 << " different functions.\n"; \
383 }); \
384 } \
385 }
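// Including OMPKinds.def below expands OMP_RTL once per known OpenMP runtime
// function, supplying its enum, name, variadic flag, return type, and
// argument types. An entry looks roughly like this (illustrative, not a
// verbatim copy of OMPKinds.def):
//   OMP_RTL(OMPRTL___kmpc_fork_call, "__kmpc_fork_call", true, Void,
//           IdentPtr, Int32, ParallelTaskPtr)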
386 #include "llvm/Frontend/OpenMP/OMPKinds.def"
387
388 // TODO: We should attach the attributes defined in OMPKinds.def.
389 }
390
391 /// Collection of known kernels (\see Kernel) in the module.
392 SmallPtrSetImpl<Kernel> &Kernels;
393 };
394
395 /// Used to map the values physically (in the IR) stored in an offload
396 /// array, to a vector in memory.
397 struct OffloadArray {
398 /// Physical array (in the IR).
399 AllocaInst *Array = nullptr;
400 /// Mapped values.
401 SmallVector<Value *, 8> StoredValues;
402 /// Last stores made in the offload array.
403 SmallVector<StoreInst *, 8> LastAccesses;
404
405 OffloadArray() = default;
406
407 /// Initializes the OffloadArray with the values stored in \p Array before
408 /// instruction \p Before is reached. Returns false if the initialization
409 /// fails.
410 /// This MUST be used immediately after the construction of the object.
411 bool initialize(AllocaInst &Array, Instruction &Before) {
412 if (!Array.getAllocatedType()->isArrayTy())
413 return false;
414
415 if (!getValues(Array, Before))
416 return false;
417
418 this->Array = &Array;
419 return true;
420 }
421
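/// Argument positions of the device id and the offload arrays in a call to
/// __tgt_target_data_begin_mapper; used by getValuesInOffloadArrays and
/// splitTargetDataBeginRTC below.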
422 static const unsigned DeviceIDArgNum = 1;
423 static const unsigned BasePtrsArgNum = 3;
424 static const unsigned PtrsArgNum = 4;
425 static const unsigned SizesArgNum = 5;
426
427 private:
428 /// Traverses the BasicBlock where \p Array is, collecting the stores made to
429 /// \p Array, leaving StoredValues with the values stored before the
430 /// instruction \p Before is reached.
431 bool getValues(AllocaInst &Array, Instruction &Before) {
432 // Initialize container.
433 const uint64_t NumValues = Array.getAllocatedType()->getArrayNumElements();
434 StoredValues.assign(NumValues, nullptr);
435 LastAccesses.assign(NumValues, nullptr);
436
437 // TODO: This assumes the instruction \p Before is in the same
438 // BasicBlock as Array. Make it general, for any control flow graph.
439 BasicBlock *BB = Array.getParent();
440 if (BB != Before.getParent())
441 return false;
442
443 const DataLayout &DL = Array.getModule()->getDataLayout();
444 const unsigned int PointerSize = DL.getPointerSize();
445
446 for (Instruction &I : *BB) {
447 if (&I == &Before)
448 break;
449
450 if (!isa<StoreInst>(&I))
451 continue;
452
453 auto *S = cast<StoreInst>(&I);
454 int64_t Offset = -1;
455 auto *Dst =
456 GetPointerBaseWithConstantOffset(S->getPointerOperand(), Offset, DL);
457 if (Dst == &Array) {
458 int64_t Idx = Offset / PointerSize;
459 StoredValues[Idx] = getUnderlyingObject(S->getValueOperand());
460 LastAccesses[Idx] = S;
461 }
462 }
463
464 return isFilled();
465 }
466
467 /// Returns true if all values in StoredValues and
468 /// LastAccesses are not nullptrs.
469 bool isFilled() {
470 const unsigned NumValues = StoredValues.size();
471 for (unsigned I = 0; I < NumValues; ++I) {
472 if (!StoredValues[I] || !LastAccesses[I])
473 return false;
474 }
475
476 return true;
477 }
478 };
479
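/// Driver for the OpenMP-specific optimizations in this file. It operates on
/// one SCC and the surrounding module slice, using the information gathered
/// in OMPInformationCache.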
480 struct OpenMPOpt {
481
482 using OptimizationRemarkGetter =
483 function_ref<OptimizationRemarkEmitter &(Function *)>;
484
485 OpenMPOpt(SmallVectorImpl<Function *> &SCC, CallGraphUpdater &CGUpdater,
486 OptimizationRemarkGetter OREGetter,
487 OMPInformationCache &OMPInfoCache, Attributor &A)
488 : M(*(*SCC.begin())->getParent()), SCC(SCC), CGUpdater(CGUpdater),
489 OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {}
490
491 /// Check if any remarks are enabled for openmp-opt
492 bool remarksEnabled() {
493 auto &Ctx = M.getContext();
494 return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(DEBUG_TYPE);
495 }
496
497 /// Run all OpenMP optimizations on the underlying SCC/ModuleSlice.
498 bool run() {
499 if (SCC.empty())
500 return false;
501
502 bool Changed = false;
503
504 LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size()
505 << " functions in a slice with "
506 << OMPInfoCache.ModuleSlice.size() << " functions\n");
507
508 if (PrintICVValues)
509 printICVs();
510 if (PrintOpenMPKernels)
511 printKernels();
512
513 Changed |= rewriteDeviceCodeStateMachine();
514
515 Changed |= runAttributor();
516
517 // Recollect uses, in case Attributor deleted any.
518 OMPInfoCache.recollectUses();
519
520 Changed |= deleteParallelRegions();
521 if (HideMemoryTransferLatency)
522 Changed |= hideMemTransfersLatency();
523 if (remarksEnabled())
524 analysisGlobalization();
525 Changed |= deduplicateRuntimeCalls();
526 if (EnableParallelRegionMerging) {
527 if (mergeParallelRegions()) {
528 deduplicateRuntimeCalls();
529 Changed = true;
530 }
531 }
532
533 return Changed;
534 }
535
536 /// Print initial ICV values for testing.
537 /// FIXME: This should be done from the Attributor once it is added.
538 void printICVs() const {
539 InternalControlVar ICVs[] = {ICV_nthreads, ICV_active_levels, ICV_cancel,
540 ICV_proc_bind};
541
542 for (Function *F : OMPInfoCache.ModuleSlice) {
543 for (auto ICV : ICVs) {
544 auto ICVInfo = OMPInfoCache.ICVs[ICV];
545 auto Remark = [&](OptimizationRemark OR) {
546 return OR << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name)
547 << " Value: "
548 << (ICVInfo.InitValue
549 ? ICVInfo.InitValue->getValue().toString(10, true)
550 : "IMPLEMENTATION_DEFINED");
551 };
552
553 emitRemarkOnFunction(F, "OpenMPICVTracker", Remark);
554 }
555 }
556 }
557
558 /// Print OpenMP GPU kernels for testing.
559 void printKernels() const {
560 for (Function *F : SCC) {
561 if (!OMPInfoCache.Kernels.count(F))
562 continue;
563
564 auto Remark = [&](OptimizationRemark OR) {
565 return OR << "OpenMP GPU kernel "
566 << ore::NV("OpenMPGPUKernel", F->getName()) << "\n";
567 };
568
569 emitRemarkOnFunction(F, "OpenMPGPU", Remark);
570 }
571 }
572
573 /// Return the call if \p U is a callee use in a regular call. If \p RFI is
574 /// given it has to be the callee, otherwise nullptr is returned.
575 static CallInst *getCallIfRegularCall(
576 Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
577 CallInst *CI = dyn_cast<CallInst>(U.getUser());
578 if (CI && CI->isCallee(&U) && !CI->hasOperandBundles() &&
579 (!RFI || CI->getCalledFunction() == RFI->Declaration))
580 return CI;
581 return nullptr;
582 }
583
584 /// Return the call if \p V is a regular call. If \p RFI is given it has to
585 /// be the callee, otherwise nullptr is returned.
586 static CallInst *getCallIfRegularCall(
587 Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
588 CallInst *CI = dyn_cast<CallInst>(&V);
589 if (CI && !CI->hasOperandBundles() &&
590 (!RFI || CI->getCalledFunction() == RFI->Declaration))
591 return CI;
592 return nullptr;
593 }
594
595 private:
596 /// Merge parallel regions when it is safe.
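/// Consecutive __kmpc_fork_call sites in a single basic block, separated only
/// by instructions that can safely be executed redundantly in parallel, are
/// outlined into one combined parallel region; sequential code in-between is
/// wrapped in a master construct followed by a barrier (see the Merge and
/// CreateSequentialRegion helpers below).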
597 bool mergeParallelRegions() {
598 const unsigned CallbackCalleeOperand = 2;
599 const unsigned CallbackFirstArgOperand = 3;
600 using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
601
602 // Check if there are any __kmpc_fork_call calls to merge.
603 OMPInformationCache::RuntimeFunctionInfo &RFI =
604 OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
605
606 if (!RFI.Declaration)
607 return false;
608
609 // Unmergable calls that prevent merging a parallel region.
610 OMPInformationCache::RuntimeFunctionInfo UnmergableCallsInfo[] = {
611 OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind],
612 OMPInfoCache.RFIs[OMPRTL___kmpc_push_num_threads],
613 };
614
615 bool Changed = false;
616 LoopInfo *LI = nullptr;
617 DominatorTree *DT = nullptr;
618
619 SmallDenseMap<BasicBlock *, SmallPtrSet<Instruction *, 4>> BB2PRMap;
620
621 BasicBlock *StartBB = nullptr, *EndBB = nullptr;
622 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
623 BasicBlock &ContinuationIP) {
624 BasicBlock *CGStartBB = CodeGenIP.getBlock();
625 BasicBlock *CGEndBB =
626 SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
627 assert(StartBB != nullptr && "StartBB should not be null");
628 CGStartBB->getTerminator()->setSuccessor(0, StartBB);
629 assert(EndBB != nullptr && "EndBB should not be null");
630 EndBB->getTerminator()->setSuccessor(0, CGEndBB);
631 };
632
633 auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &,
634 Value &Inner, Value *&ReplacementValue) -> InsertPointTy {
635 ReplacementValue = &Inner;
636 return CodeGenIP;
637 };
638
639 auto FiniCB = [&](InsertPointTy CodeGenIP) {};
640
641 /// Create a sequential execution region within a merged parallel region,
642 /// encapsulated in a master construct with a barrier for synchronization.
643 auto CreateSequentialRegion = [&](Function *OuterFn,
644 BasicBlock *OuterPredBB,
645 Instruction *SeqStartI,
646 Instruction *SeqEndI) {
647 // Isolate the instructions of the sequential region to a separate
648 // block.
649 BasicBlock *ParentBB = SeqStartI->getParent();
650 BasicBlock *SeqEndBB =
651 SplitBlock(ParentBB, SeqEndI->getNextNode(), DT, LI);
652 BasicBlock *SeqAfterBB =
653 SplitBlock(SeqEndBB, &*SeqEndBB->getFirstInsertionPt(), DT, LI);
654 BasicBlock *SeqStartBB =
655 SplitBlock(ParentBB, SeqStartI, DT, LI, nullptr, "seq.par.merged");
656
657 assert(ParentBB->getUniqueSuccessor() == SeqStartBB &&
658 "Expected a different CFG");
659 const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc();
660 ParentBB->getTerminator()->eraseFromParent();
661
662 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
663 BasicBlock &ContinuationIP) {
664 BasicBlock *CGStartBB = CodeGenIP.getBlock();
665 BasicBlock *CGEndBB =
666 SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
667 assert(SeqStartBB != nullptr && "SeqStartBB should not be null");
668 CGStartBB->getTerminator()->setSuccessor(0, SeqStartBB);
669 assert(SeqEndBB != nullptr && "SeqEndBB should not be null");
670 SeqEndBB->getTerminator()->setSuccessor(0, CGEndBB);
671 };
672 auto FiniCB = [&](InsertPointTy CodeGenIP) {};
673
674 // Find outputs from the sequential region to outside users and
675 // broadcast their values to them.
676 for (Instruction &I : *SeqStartBB) {
677 SmallPtrSet<Instruction *, 4> OutsideUsers;
678 for (User *Usr : I.users()) {
679 Instruction &UsrI = *cast<Instruction>(Usr);
680 // Ignore outputs to LT intrinsics, code extraction for the merged
681 // parallel region will fix them.
682 if (UsrI.isLifetimeStartOrEnd())
683 continue;
684
685 if (UsrI.getParent() != SeqStartBB)
686 OutsideUsers.insert(&UsrI);
687 }
688
689 if (OutsideUsers.empty())
690 continue;
691
692 // Emit an alloca in the outer region to store the broadcasted
693 // value.
694 const DataLayout &DL = M.getDataLayout();
695 AllocaInst *AllocaI = new AllocaInst(
696 I.getType(), DL.getAllocaAddrSpace(), nullptr,
697 I.getName() + ".seq.output.alloc", &OuterFn->front().front());
698
699 // Emit a store instruction in the sequential BB to update the
700 // value.
701 new StoreInst(&I, AllocaI, SeqStartBB->getTerminator());
702
703 // Emit a load instruction and replace the use of the output value
704 // with it.
705 for (Instruction *UsrI : OutsideUsers) {
706 LoadInst *LoadI = new LoadInst(I.getType(), AllocaI,
707 I.getName() + ".seq.output.load", UsrI);
708 UsrI->replaceUsesOfWith(&I, LoadI);
709 }
710 }
711
712 OpenMPIRBuilder::LocationDescription Loc(
713 InsertPointTy(ParentBB, ParentBB->end()), DL);
714 InsertPointTy SeqAfterIP =
715 OMPInfoCache.OMPBuilder.createMaster(Loc, BodyGenCB, FiniCB);
716
717 OMPInfoCache.OMPBuilder.createBarrier(SeqAfterIP, OMPD_parallel);
718
719 BranchInst::Create(SeqAfterBB, SeqAfterIP.getBlock());
720
721 LLVM_DEBUG(dbgs() << TAG << "After sequential inlining " << *OuterFn
722 << "\n");
723 };
724
725 // Helper to merge the __kmpc_fork_call calls in MergableCIs. They are all
726 // contained in BB and only separated by instructions that can be
727 // redundantly executed in parallel. The block BB is split before the first
728 // call (in MergableCIs) and after the last so the entire region we merge
729 // into a single parallel region is contained in a single basic block
730 // without any other instructions. We use the OpenMPIRBuilder to outline
731 // that block and call the resulting function via __kmpc_fork_call.
732 auto Merge = [&](SmallVectorImpl<CallInst *> &MergableCIs, BasicBlock *BB) {
733 // TODO: Change the interface to allow single CIs expanded, e.g, to
734 // include an outer loop.
735 assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs");
736
737 auto Remark = [&](OptimizationRemark OR) {
738 OR << "Parallel region at "
739 << ore::NV("OpenMPParallelMergeFront",
740 MergableCIs.front()->getDebugLoc())
741 << " merged with parallel regions at ";
742 for (auto *CI : llvm::drop_begin(MergableCIs)) {
743 OR << ore::NV("OpenMPParallelMerge", CI->getDebugLoc());
744 if (CI != MergableCIs.back())
745 OR << ", ";
746 }
747 return OR;
748 };
749
750 emitRemark<OptimizationRemark>(MergableCIs.front(),
751 "OpenMPParallelRegionMerging", Remark);
752
753 Function *OriginalFn = BB->getParent();
754 LLVM_DEBUG(dbgs() << TAG << "Merge " << MergableCIs.size()
755 << " parallel regions in " << OriginalFn->getName()
756 << "\n");
757
758 // Isolate the calls to merge in a separate block.
759 EndBB = SplitBlock(BB, MergableCIs.back()->getNextNode(), DT, LI);
760 BasicBlock *AfterBB =
761 SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI);
762 StartBB = SplitBlock(BB, MergableCIs.front(), DT, LI, nullptr,
763 "omp.par.merged");
764
765 assert(BB->getUniqueSuccessor() == StartBB && "Expected a different CFG");
766 const DebugLoc DL = BB->getTerminator()->getDebugLoc();
767 BB->getTerminator()->eraseFromParent();
768
769 // Create sequential regions for sequential instructions that are
770 // in-between mergable parallel regions.
771 for (auto *It = MergableCIs.begin(), *End = MergableCIs.end() - 1;
772 It != End; ++It) {
773 Instruction *ForkCI = *It;
774 Instruction *NextForkCI = *(It + 1);
775
776 // Continue if there are not in-between instructions.
777 if (ForkCI->getNextNode() == NextForkCI)
778 continue;
779
780 CreateSequentialRegion(OriginalFn, BB, ForkCI->getNextNode(),
781 NextForkCI->getPrevNode());
782 }
783
784 OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()),
785 DL);
786 IRBuilder<>::InsertPoint AllocaIP(
787 &OriginalFn->getEntryBlock(),
788 OriginalFn->getEntryBlock().getFirstInsertionPt());
789 // Create the merged parallel region with default proc binding, to
790 // avoid overriding binding settings, and without explicit cancellation.
791 InsertPointTy AfterIP = OMPInfoCache.OMPBuilder.createParallel(
792 Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr,
793 OMP_PROC_BIND_default, /* IsCancellable */ false);
794 BranchInst::Create(AfterBB, AfterIP.getBlock());
795
796 // Perform the actual outlining.
797 OMPInfoCache.OMPBuilder.finalize(/* AllowExtractorSinking */ true);
798
799 Function *OutlinedFn = MergableCIs.front()->getCaller();
800
801 // Replace the __kmpc_fork_call calls with direct calls to the outlined
802 // callbacks.
803 SmallVector<Value *, 8> Args;
804 for (auto *CI : MergableCIs) {
805 Value *Callee =
806 CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts();
807 FunctionType *FT =
808 cast<FunctionType>(Callee->getType()->getPointerElementType());
809 Args.clear();
810 Args.push_back(OutlinedFn->getArg(0));
811 Args.push_back(OutlinedFn->getArg(1));
812 for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands();
813 U < E; ++U)
814 Args.push_back(CI->getArgOperand(U));
815
816 CallInst *NewCI = CallInst::Create(FT, Callee, Args, "", CI);
817 if (CI->getDebugLoc())
818 NewCI->setDebugLoc(CI->getDebugLoc());
819
820 // Forward parameter attributes from the callback to the callee.
821 for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands();
822 U < E; ++U)
823 for (const Attribute &A : CI->getAttributes().getParamAttributes(U))
824 NewCI->addParamAttr(
825 U - (CallbackFirstArgOperand - CallbackCalleeOperand), A);
826
827 // Emit an explicit barrier to replace the implicit fork-join barrier.
828 if (CI != MergableCIs.back()) {
829 // TODO: Remove barrier if the merged parallel region includes the
830 // 'nowait' clause.
831 OMPInfoCache.OMPBuilder.createBarrier(
832 InsertPointTy(NewCI->getParent(),
833 NewCI->getNextNode()->getIterator()),
834 OMPD_parallel);
835 }
836
837 auto Remark = [&](OptimizationRemark OR) {
838 return OR << "Parallel region at "
839 << ore::NV("OpenMPParallelMerge", CI->getDebugLoc())
840 << " merged with "
841 << ore::NV("OpenMPParallelMergeFront",
842 MergableCIs.front()->getDebugLoc());
843 };
844 if (CI != MergableCIs.front())
845 emitRemark<OptimizationRemark>(CI, "OpenMPParallelRegionMerging",
846 Remark);
847
848 CI->eraseFromParent();
849 }
850
851 assert(OutlinedFn != OriginalFn && "Outlining failed");
852 CGUpdater.registerOutlinedFunction(*OriginalFn, *OutlinedFn);
853 CGUpdater.reanalyzeFunction(*OriginalFn);
854
855 NumOpenMPParallelRegionsMerged += MergableCIs.size();
856
857 return true;
858 };
859
860 // Helper function that identifies sequences of
861 // __kmpc_fork_call uses in a basic block.
862 auto DetectPRsCB = [&](Use &U, Function &F) {
863 CallInst *CI = getCallIfRegularCall(U, &RFI);
864 BB2PRMap[CI->getParent()].insert(CI);
865
866 return false;
867 };
868
869 BB2PRMap.clear();
870 RFI.foreachUse(SCC, DetectPRsCB);
871 SmallVector<SmallVector<CallInst *, 4>, 4> MergableCIsVector;
872 // Find mergable parallel regions within a basic block that are
873 // safe to merge, that is any in-between instructions can safely
874 // execute in parallel after merging.
875 // TODO: support merging across basic-blocks.
876 for (auto &It : BB2PRMap) {
877 auto &CIs = It.getSecond();
878 if (CIs.size() < 2)
879 continue;
880
881 BasicBlock *BB = It.getFirst();
882 SmallVector<CallInst *, 4> MergableCIs;
883
884 /// Returns true if the instruction is mergable, false otherwise.
885 /// A terminator instruction is unmergable by definition since merging
886 /// works within a BB. Instructions before the mergable region are
887 /// mergable if they are not calls to OpenMP runtime functions that may
888 /// set different execution parameters for subsequent parallel regions.
889 /// Instructions in-between parallel regions are mergable if they are not
890 /// calls to any non-intrinsic function since that may call a non-mergable
891 /// OpenMP runtime function.
892 auto IsMergable = [&](Instruction &I, bool IsBeforeMergableRegion) {
893 // We do not merge across BBs, hence return false (unmergable) if the
894 // instruction is a terminator.
895 if (I.isTerminator())
896 return false;
897
898 if (!isa<CallInst>(&I))
899 return true;
900
901 CallInst *CI = cast<CallInst>(&I);
902 if (IsBeforeMergableRegion) {
903 Function *CalledFunction = CI->getCalledFunction();
904 if (!CalledFunction)
905 return false;
906 // Return false (unmergable) if the call before the parallel
907 // region calls an explicit affinity (proc_bind) or number of
908 // threads (num_threads) compiler-generated function. Those settings
909 // may be incompatible with following parallel regions.
910 // TODO: ICV tracking to detect compatibility.
911 for (const auto &RFI : UnmergableCallsInfo) {
912 if (CalledFunction == RFI.Declaration)
913 return false;
914 }
915 } else {
916 // Return false (unmergable) if there is a call instruction
917 // in-between parallel regions when it is not an intrinsic. It
918 // may call an unmergable OpenMP runtime function in its callpath.
919 // TODO: Keep track of possible OpenMP calls in the callpath.
920 if (!isa<IntrinsicInst>(CI))
921 return false;
922 }
923
924 return true;
925 };
926 // Find maximal number of parallel region CIs that are safe to merge.
927 for (auto It = BB->begin(), End = BB->end(); It != End;) {
928 Instruction &I = *It;
929 ++It;
930
931 if (CIs.count(&I)) {
932 MergableCIs.push_back(cast<CallInst>(&I));
933 continue;
934 }
935
936 // Continue expanding if the instruction is mergable.
937 if (IsMergable(I, MergableCIs.empty()))
938 continue;
939
940 // Forward the instruction iterator to skip the next parallel region
941 // since there is an unmergable instruction which can affect it.
942 for (; It != End; ++It) {
943 Instruction &SkipI = *It;
944 if (CIs.count(&SkipI)) {
945 LLVM_DEBUG(dbgs() << TAG << "Skip parallel region " << SkipI
946 << " due to " << I << "\n");
947 ++It;
948 break;
949 }
950 }
951
952 // Store mergable regions found.
953 if (MergableCIs.size() > 1) {
954 MergableCIsVector.push_back(MergableCIs);
955 LLVM_DEBUG(dbgs() << TAG << "Found " << MergableCIs.size()
956 << " parallel regions in block " << BB->getName()
957 << " of function " << BB->getParent()->getName()
958 << "\n";);
959 }
960
961 MergableCIs.clear();
962 }
963
964 if (!MergableCIsVector.empty()) {
965 Changed = true;
966
967 for (auto &MergableCIs : MergableCIsVector)
968 Merge(MergableCIs, BB);
969 }
970 }
971
972 if (Changed) {
973 /// Re-collect use for fork calls, emitted barrier calls, and
974 /// any emitted master/end_master calls.
975 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_fork_call);
976 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_barrier);
977 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_master);
978 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_end_master);
979 }
980
981 return Changed;
982 }
983
984 /// Try to delete parallel regions if possible.
985 bool deleteParallelRegions() {
986 const unsigned CallbackCalleeOperand = 2;
987
988 OMPInformationCache::RuntimeFunctionInfo &RFI =
989 OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
990
991 if (!RFI.Declaration)
992 return false;
993
994 bool Changed = false;
995 auto DeleteCallCB = [&](Use &U, Function &) {
996 CallInst *CI = getCallIfRegularCall(U);
997 if (!CI)
998 return false;
999 auto *Fn = dyn_cast<Function>(
1000 CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts());
1001 if (!Fn)
1002 return false;
1003 if (!Fn->onlyReadsMemory())
1004 return false;
1005 if (!Fn->hasFnAttribute(Attribute::WillReturn))
1006 return false;
1007
1008 LLVM_DEBUG(dbgs() << TAG << "Delete read-only parallel region in "
1009 << CI->getCaller()->getName() << "\n");
1010
1011 auto Remark = [&](OptimizationRemark OR) {
1012 return OR << "Parallel region in "
1013 << ore::NV("OpenMPParallelDelete", CI->getCaller()->getName())
1014 << " deleted";
1015 };
1016 emitRemark<OptimizationRemark>(CI, "OpenMPParallelRegionDeletion",
1017 Remark);
1018
1019 CGUpdater.removeCallSite(*CI);
1020 CI->eraseFromParent();
1021 Changed = true;
1022 ++NumOpenMPParallelRegionsDeleted;
1023 return true;
1024 };
1025
1026 RFI.foreachUse(SCC, DeleteCallCB);
1027
1028 return Changed;
1029 }
1030
1031 /// Try to eliminate runtime calls by reusing existing ones.
1032 bool deduplicateRuntimeCalls() {
1033 bool Changed = false;
1034
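// Runtime query calls whose result is not expected to change within a single
// function invocation; later calls can therefore be replaced by an earlier
// (possibly hoisted) one.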
1035 RuntimeFunction DeduplicableRuntimeCallIDs[] = {
1036 OMPRTL_omp_get_num_threads,
1037 OMPRTL_omp_in_parallel,
1038 OMPRTL_omp_get_cancellation,
1039 OMPRTL_omp_get_thread_limit,
1040 OMPRTL_omp_get_supported_active_levels,
1041 OMPRTL_omp_get_level,
1042 OMPRTL_omp_get_ancestor_thread_num,
1043 OMPRTL_omp_get_team_size,
1044 OMPRTL_omp_get_active_level,
1045 OMPRTL_omp_in_final,
1046 OMPRTL_omp_get_proc_bind,
1047 OMPRTL_omp_get_num_places,
1048 OMPRTL_omp_get_num_procs,
1049 OMPRTL_omp_get_place_num,
1050 OMPRTL_omp_get_partition_num_places,
1051 OMPRTL_omp_get_partition_place_nums};
1052
1053 // Global-tid is handled separately.
1054 SmallSetVector<Value *, 16> GTIdArgs;
1055 collectGlobalThreadIdArguments(GTIdArgs);
1056 LLVM_DEBUG(dbgs() << TAG << "Found " << GTIdArgs.size()
1057 << " global thread ID arguments\n");
1058
1059 for (Function *F : SCC) {
1060 for (auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs)
1061 Changed |= deduplicateRuntimeCalls(
1062 *F, OMPInfoCache.RFIs[DeduplicableRuntimeCallID]);
1063
1064 // __kmpc_global_thread_num is special as we can replace it with an
1065 // argument in enough cases to make it worth trying.
1066 Value *GTIdArg = nullptr;
1067 for (Argument &Arg : F->args())
1068 if (GTIdArgs.count(&Arg)) {
1069 GTIdArg = &Arg;
1070 break;
1071 }
1072 Changed |= deduplicateRuntimeCalls(
1073 *F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg);
1074 }
1075
1076 return Changed;
1077 }
1078
1079 /// Tries to hide the latency of runtime calls that involve host to
1080 /// device memory transfers by splitting them into their "issue" and "wait"
1081 /// versions. The "issue" is moved upwards as much as possible. The "wait" is
1082 /// moved downwards as much as possible. The "issue" issues the memory transfer
1083 /// asynchronously, returning a handle. The "wait" waits in the returned
1084 /// handle for the memory transfer to finish.
1085 bool hideMemTransfersLatency() {
1086 auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
1087 bool Changed = false;
1088 auto SplitMemTransfers = [&](Use &U, Function &Decl) {
1089 auto *RTCall = getCallIfRegularCall(U, &RFI);
1090 if (!RTCall)
1091 return false;
1092
1093 OffloadArray OffloadArrays[3];
1094 if (!getValuesInOffloadArrays(*RTCall, OffloadArrays))
1095 return false;
1096
1097 LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays));
1098
1099 // TODO: Check if can be moved upwards.
1100 bool WasSplit = false;
1101 Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
1102 if (WaitMovementPoint)
1103 WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint);
1104
1105 Changed |= WasSplit;
1106 return WasSplit;
1107 };
1108 RFI.foreachUse(SCC, SplitMemTransfers);
1109
1110 return Changed;
1111 }
1112
1113 void analysisGlobalization() {
1114 RuntimeFunction GlobalizationRuntimeIDs[] = {
1115 OMPRTL___kmpc_data_sharing_coalesced_push_stack,
1116 OMPRTL___kmpc_data_sharing_push_stack};
1117
1118 for (const auto GlobalizationCallID : GlobalizationRuntimeIDs) {
1119 auto &RFI = OMPInfoCache.RFIs[GlobalizationCallID];
1120
1121 auto CheckGlobalization = [&](Use &U, Function &Decl) {
1122 if (CallInst *CI = getCallIfRegularCall(U, &RFI)) {
1123 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
1124 return ORA
1125 << "Found thread data sharing on the GPU. "
1126 << "Expect degraded performance due to data globalization.";
1127 };
1128 emitRemark<OptimizationRemarkAnalysis>(CI, "OpenMPGlobalization",
1129 Remark);
1130 }
1131
1132 return false;
1133 };
1134
1135 RFI.foreachUse(SCC, CheckGlobalization);
1136 }
1137 }
1138
1139 /// Maps the values stored in the offload arrays passed as arguments to
1140 /// \p RuntimeCall into the offload arrays in \p OAs.
1141 bool getValuesInOffloadArrays(CallInst &RuntimeCall,
1142 MutableArrayRef<OffloadArray> OAs) {
1143 assert(OAs.size() == 3 && "Need space for three offload arrays!");
1144
1145 // A runtime call that involves memory offloading looks something like:
1146 // call void @__tgt_target_data_begin_mapper(arg0, arg1,
1147 // i8** %offload_baseptrs, i8** %offload_ptrs, i64* %offload_sizes,
1148 // ...)
1149 // So, the idea is to access the allocas that allocate space for these
1150 // offload arrays, offload_baseptrs, offload_ptrs, offload_sizes.
1151 // Therefore:
1152 // i8** %offload_baseptrs.
1153 Value *BasePtrsArg =
1154 RuntimeCall.getArgOperand(OffloadArray::BasePtrsArgNum);
1155 // i8** %offload_ptrs.
1156 Value *PtrsArg = RuntimeCall.getArgOperand(OffloadArray::PtrsArgNum);
1157 // i64* %offload_sizes.
1158 Value *SizesArg = RuntimeCall.getArgOperand(OffloadArray::SizesArgNum);
1159
1160 // Get values stored in **offload_baseptrs.
1161 auto *V = getUnderlyingObject(BasePtrsArg);
1162 if (!isa<AllocaInst>(V))
1163 return false;
1164 auto *BasePtrsArray = cast<AllocaInst>(V);
1165 if (!OAs[0].initialize(*BasePtrsArray, RuntimeCall))
1166 return false;
1167
1168 // Get values stored in **offload_ptrs.
1169 V = getUnderlyingObject(PtrsArg);
1170 if (!isa<AllocaInst>(V))
1171 return false;
1172 auto *PtrsArray = cast<AllocaInst>(V);
1173 if (!OAs[1].initialize(*PtrsArray, RuntimeCall))
1174 return false;
1175
1176 // Get values stored in **offload_sizes.
1177 V = getUnderlyingObject(SizesArg);
1178 // If it's a [constant] global array don't analyze it.
1179 if (isa<GlobalValue>(V))
1180 return isa<Constant>(V);
1181 if (!isa<AllocaInst>(V))
1182 return false;
1183
1184 auto *SizesArray = cast<AllocaInst>(V);
1185 if (!OAs[2].initialize(*SizesArray, RuntimeCall))
1186 return false;
1187
1188 return true;
1189 }
1190
1191 /// Prints the values in the OffloadArrays \p OAs using LLVM_DEBUG.
1192 /// For now this is a way to test that the function getValuesInOffloadArrays
1193 /// is working properly.
1194 /// TODO: Move this to a unittest when unittests are available for OpenMPOpt.
1195 void dumpValuesInOffloadArrays(ArrayRef<OffloadArray> OAs) {
1196 assert(OAs.size() == 3 && "There are three offload arrays to debug!");
1197
1198 LLVM_DEBUG(dbgs() << TAG << " Successfully got offload values:\n");
1199 std::string ValuesStr;
1200 raw_string_ostream Printer(ValuesStr);
1201 std::string Separator = " --- ";
1202
1203 for (auto *BP : OAs[0].StoredValues) {
1204 BP->print(Printer);
1205 Printer << Separator;
1206 }
1207 LLVM_DEBUG(dbgs() << "\t\toffload_baseptrs: " << Printer.str() << "\n");
1208 ValuesStr.clear();
1209
1210 for (auto *P : OAs[1].StoredValues) {
1211 P->print(Printer);
1212 Printer << Separator;
1213 }
1214 LLVM_DEBUG(dbgs() << "\t\toffload_ptrs: " << Printer.str() << "\n");
1215 ValuesStr.clear();
1216
1217 for (auto *S : OAs[2].StoredValues) {
1218 S->print(Printer);
1219 Printer << Separator;
1220 }
1221 LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << Printer.str() << "\n");
1222 }
1223
1224 /// Returns the instruction where the "wait" counterpart \p RuntimeCall can be
1225 /// moved. Returns nullptr if the movement is not possible, or not worth it.
1226 Instruction *canBeMovedDownwards(CallInst &RuntimeCall) {
1227 // FIXME: This traverses only the BasicBlock where RuntimeCall is.
1228 // Make it traverse the CFG.
1229
1230 Instruction *CurrentI = &RuntimeCall;
1231 bool IsWorthIt = false;
1232 while ((CurrentI = CurrentI->getNextNode())) {
1233
1234 // TODO: Once we detect the regions to be offloaded we should use the
1235 // alias analysis manager to check if CurrentI may modify one of
1236 // the offloaded regions.
1237 if (CurrentI->mayHaveSideEffects() || CurrentI->mayReadFromMemory()) {
1238 if (IsWorthIt)
1239 return CurrentI;
1240
1241 return nullptr;
1242 }
1243
1244 // FIXME: For now if we move it over anything without side effect
1245 // is worth it.
1246 IsWorthIt = true;
1247 }
1248
1249 // Return end of BasicBlock.
1250 return RuntimeCall.getParent()->getTerminator();
1251 }
1252
1253 /// Splits \p RuntimeCall into its "issue" and "wait" counterparts.
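/// Illustrative, simplified IR for the transformation (argument lists
/// abbreviated):
///   call void @__tgt_target_data_begin_mapper(...)
/// becomes, with %handle allocated in the caller's entry block,
///   call void @__tgt_target_data_begin_mapper_issue(..., %handle)
///   ;; code without side effects that the wait can be moved past
///   call void @__tgt_target_data_begin_mapper_wait(%device_id, %handle)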
1254 bool splitTargetDataBeginRTC(CallInst &RuntimeCall,
1255 Instruction &WaitMovementPoint) {
1256 // Create stack allocated handle (__tgt_async_info) at the beginning of the
1257 // function. It stores information about the async transfer so we can wait
1258 // on it later.
1259 auto &IRBuilder = OMPInfoCache.OMPBuilder;
1260 auto *F = RuntimeCall.getCaller();
1261 Instruction *FirstInst = &(F->getEntryBlock().front());
1262 AllocaInst *Handle = new AllocaInst(
1263 IRBuilder.AsyncInfo, F->getAddressSpace(), "handle", FirstInst);
1264
1265 // Add "issue" runtime call declaration:
1266 // declare %struct.tgt_async_info @__tgt_target_data_begin_issue(i64, i32,
1267 // i8**, i8**, i64*, i64*)
1268 FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction(
1269 M, OMPRTL___tgt_target_data_begin_mapper_issue);
1270
1271 // Change RuntimeCall call site for its asynchronous version.
1272 SmallVector<Value *, 16> Args;
1273 for (auto &Arg : RuntimeCall.args())
1274 Args.push_back(Arg.get());
1275 Args.push_back(Handle);
1276
1277 CallInst *IssueCallsite =
1278 CallInst::Create(IssueDecl, Args, /*NameStr=*/"", &RuntimeCall);
1279 RuntimeCall.eraseFromParent();
1280
1281 // Add "wait" runtime call declaration:
1282 // declare void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info)
1283 FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction(
1284 M, OMPRTL___tgt_target_data_begin_mapper_wait);
1285
1286 Value *WaitParams[2] = {
1287 IssueCallsite->getArgOperand(
1288 OffloadArray::DeviceIDArgNum), // device_id.
1289 Handle // handle to wait on.
1290 };
1291 CallInst::Create(WaitDecl, WaitParams, /*NameStr=*/"", &WaitMovementPoint);
1292
1293 return true;
1294 }
1295
1296 static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
1297 bool GlobalOnly, bool &SingleChoice) {
1298 if (CurrentIdent == NextIdent)
1299 return CurrentIdent;
1300
1301 // TODO: Figure out how to actually combine multiple debug locations. For
1302 // now we just keep an existing one if there is a single choice.
1303 if (!GlobalOnly || isa<GlobalValue>(NextIdent)) {
1304 SingleChoice = !CurrentIdent;
1305 return NextIdent;
1306 }
1307 return nullptr;
1308 }
1309
1310 /// Return a `struct ident_t*` value that represents the ones used in the
1311 /// calls of \p RFI inside of \p F. If \p GlobalOnly is true, we will not
1312 /// return a local `struct ident_t*`. For now, if we cannot find a suitable
1313 /// return value we create one from scratch. We also do not yet combine
1314 /// information, e.g., the source locations, see combinedIdentStruct.
1315 Value *
1316 getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI,
1317 Function &F, bool GlobalOnly) {
1318 bool SingleChoice = true;
1319 Value *Ident = nullptr;
1320 auto CombineIdentStruct = [&](Use &U, Function &Caller) {
1321 CallInst *CI = getCallIfRegularCall(U, &RFI);
1322 if (!CI || &F != &Caller)
1323 return false;
1324 Ident = combinedIdentStruct(Ident, CI->getArgOperand(0),
1325 /* GlobalOnly */ true, SingleChoice);
1326 return false;
1327 };
1328 RFI.foreachUse(SCC, CombineIdentStruct);
1329
1330 if (!Ident || !SingleChoice) {
1331 // The IRBuilder uses the insertion block to get to the module, this is
1332 // unfortunate but we work around it for now.
1333 if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock())
1334 OMPInfoCache.OMPBuilder.updateToLocation(OpenMPIRBuilder::InsertPointTy(
1335 &F.getEntryBlock(), F.getEntryBlock().begin()));
1336 // Create a fallback location if none was found.
1337 // TODO: Use the debug locations of the calls instead.
1338 Constant *Loc = OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr();
1339 Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc);
1340 }
1341 return Ident;
1342 }
1343
1344 /// Try to eliminate calls of \p RFI in \p F by reusing an existing one or
1345 /// \p ReplVal if given.
1346 bool deduplicateRuntimeCalls(Function &F,
1347 OMPInformationCache::RuntimeFunctionInfo &RFI,
1348 Value *ReplVal = nullptr) {
1349 auto *UV = RFI.getUseVector(F);
1350 if (!UV || UV->size() + (ReplVal != nullptr) < 2)
1351 return false;
1352
1353 LLVM_DEBUG(
1354 dbgs() << TAG << "Deduplicate " << UV->size() << " uses of " << RFI.Name
1355 << (ReplVal ? " with an existing value\n" : "\n") << "\n");
1356
1357 assert((!ReplVal || (isa<Argument>(ReplVal) &&
1358 cast<Argument>(ReplVal)->getParent() == &F)) &&
1359 "Unexpected replacement value!");
1360
1361 // TODO: Use dominance to find a good position instead.
1362 auto CanBeMoved = [this](CallBase &CB) {
1363 unsigned NumArgs = CB.getNumArgOperands();
1364 if (NumArgs == 0)
1365 return true;
1366 if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
1367 return false;
1368 for (unsigned u = 1; u < NumArgs; ++u)
1369 if (isa<Instruction>(CB.getArgOperand(u)))
1370 return false;
1371 return true;
1372 };
1373
1374 if (!ReplVal) {
1375 for (Use *U : *UV)
1376 if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
1377 if (!CanBeMoved(*CI))
1378 continue;
1379
1380 auto Remark = [&](OptimizationRemark OR) {
1381 auto newLoc = &*F.getEntryBlock().getFirstInsertionPt();
1382 return OR << "OpenMP runtime call "
1383 << ore::NV("OpenMPOptRuntime", RFI.Name) << " moved to "
1384 << ore::NV("OpenMPRuntimeMoves", newLoc->getDebugLoc());
1385 };
1386 emitRemark<OptimizationRemark>(CI, "OpenMPRuntimeCodeMotion", Remark);
1387
1388 CI->moveBefore(&*F.getEntryBlock().getFirstInsertionPt());
1389 ReplVal = CI;
1390 break;
1391 }
1392 if (!ReplVal)
1393 return false;
1394 }
1395
1396 // If we use a call as a replacement value we need to make sure the ident is
1397 // valid at the new location. For now we just pick a global one, either
1398 // existing and used by one of the calls, or created from scratch.
1399 if (CallBase *CI = dyn_cast<CallBase>(ReplVal)) {
1400 if (CI->getNumArgOperands() > 0 &&
1401 CI->getArgOperand(0)->getType() == OMPInfoCache.OMPBuilder.IdentPtr) {
1402 Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F,
1403 /* GlobalOnly */ true);
1404 CI->setArgOperand(0, Ident);
1405 }
1406 }
1407
1408 bool Changed = false;
1409 auto ReplaceAndDeleteCB = [&](Use &U, Function &Caller) {
1410 CallInst *CI = getCallIfRegularCall(U, &RFI);
1411 if (!CI || CI == ReplVal || &F != &Caller)
1412 return false;
1413 assert(CI->getCaller() == &F && "Unexpected call!");
1414
1415 auto Remark = [&](OptimizationRemark OR) {
1416 return OR << "OpenMP runtime call "
1417 << ore::NV("OpenMPOptRuntime", RFI.Name) << " deduplicated";
1418 };
1419 emitRemark<OptimizationRemark>(CI, "OpenMPRuntimeDeduplicated", Remark);
1420
1421 CGUpdater.removeCallSite(*CI);
1422 CI->replaceAllUsesWith(ReplVal);
1423 CI->eraseFromParent();
1424 ++NumOpenMPRuntimeCallsDeduplicated;
1425 Changed = true;
1426 return true;
1427 };
1428 RFI.foreachUse(SCC, ReplaceAndDeleteCB);
1429
1430 return Changed;
1431 }
1432
1433 /// Collect arguments that represent the global thread id in \p GTIdArgs.
1434 void collectGlobalThreadIdArguments(SmallSetVector<Value *, 16> &GTIdArgs) {
1435 // TODO: Below we basically perform a fixpoint iteration with a pessimistic
1436 // initialization. We could define an AbstractAttribute instead and
1437 // run the Attributor here once it can be run as an SCC pass.
1438
1439 // Helper to check the argument \p ArgNo at all call sites of \p F for
1440 // a GTId.
1441 auto CallArgOpIsGTId = [&](Function &F, unsigned ArgNo, CallInst &RefCI) {
1442 if (!F.hasLocalLinkage())
1443 return false;
1444 for (Use &U : F.uses()) {
1445 if (CallInst *CI = getCallIfRegularCall(U)) {
1446 Value *ArgOp = CI->getArgOperand(ArgNo);
1447 if (CI == &RefCI || GTIdArgs.count(ArgOp) ||
1448 getCallIfRegularCall(
1449 *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]))
1450 continue;
1451 }
1452 return false;
1453 }
1454 return true;
1455 };
1456
1457 // Helper to identify uses of a GTId as GTId arguments.
1458 auto AddUserArgs = [&](Value &GTId) {
1459 for (Use &U : GTId.uses())
1460 if (CallInst *CI = dyn_cast<CallInst>(U.getUser()))
1461 if (CI->isArgOperand(&U))
1462 if (Function *Callee = CI->getCalledFunction())
1463 if (CallArgOpIsGTId(*Callee, U.getOperandNo(), *CI))
1464 GTIdArgs.insert(Callee->getArg(U.getOperandNo()));
1465 };
1466
1467 // The argument users of __kmpc_global_thread_num calls are GTIds.
1468 OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI =
1469 OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num];
1470
1471 GlobThreadNumRFI.foreachUse(SCC, [&](Use &U, Function &F) {
1472 if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI))
1473 AddUserArgs(*CI);
1474 return false;
1475 });
1476
1477 // Transitively search for more arguments by looking at the users of the
1478 // ones we know already. During the search the GTIdArgs vector is extended
1479 // so we cannot cache the size nor can we use a range based for.
1480 for (unsigned u = 0; u < GTIdArgs.size(); ++u)
1481 AddUserArgs(*GTIdArgs[u]);
1482 }
1483
1484 /// Kernel (=GPU) optimizations and utility functions
1485 ///
1486 ///{{
1487
1488 /// Check if \p F is a kernel, hence entry point for target offloading.
1489 bool isKernel(Function &F) { return OMPInfoCache.Kernels.count(&F); }
1490
1491 /// Cache to remember the unique kernel for a function.
1492 DenseMap<Function *, Optional<Kernel>> UniqueKernelMap;
1493
1494 /// Find the unique kernel that will execute \p F, if any.
1495 Kernel getUniqueKernelFor(Function &F);
1496
1497 /// Find the unique kernel that will execute \p I, if any.
1498 Kernel getUniqueKernelFor(Instruction &I) {
1499 return getUniqueKernelFor(*I.getFunction());
1500 }
1501
1502 /// Rewrite the device (=GPU) code state machine created in non-SPMD mode in
1503 /// the cases where we can avoid taking the address of a function.
1504 bool rewriteDeviceCodeStateMachine();
1505
1506 ///
1507 ///}}
1508
1509 /// Emit a remark generically
1510 ///
1511 /// This template function can be used to generically emit a remark. The
1512 /// RemarkKind should be one of the following:
1513 /// - OptimizationRemark to indicate a successful optimization attempt
1514 /// - OptimizationRemarkMissed to report a failed optimization attempt
1515 /// - OptimizationRemarkAnalysis to provide additional information about an
1516 /// optimization attempt
1517 ///
1518 /// The remark is built using a callback function provided by the caller that
1519 /// takes a RemarkKind as input and returns a RemarkKind.
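/// A typical use, as seen elsewhere in this file:
///   auto Remark = [&](OptimizationRemark OR) { return OR << "..."; };
///   emitRemark<OptimizationRemark>(CI, "OpenMPRuntimeDeduplicated", Remark);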
1520 template <typename RemarkKind,
1521 typename RemarkCallBack = function_ref<RemarkKind(RemarkKind &&)>>
1522 void emitRemark(Instruction *Inst, StringRef RemarkName,
1523 RemarkCallBack &&RemarkCB) const {
1524 Function *F = Inst->getParent()->getParent();
1525 auto &ORE = OREGetter(F);
1526
1527 ORE.emit(
1528 [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, Inst)); });
1529 }
1530
1531 /// Emit a remark on a function. Since only OptimizationRemark supports this,
1532 /// it can't be made generic.
1533 void
1534 emitRemarkOnFunction(Function *F, StringRef RemarkName,
1535 function_ref<OptimizationRemark(OptimizationRemark &&)>
1536 &&RemarkCB) const {
1537 auto &ORE = OREGetter(F);
1538
1539 ORE.emit([&]() {
1540 return RemarkCB(OptimizationRemark(DEBUG_TYPE, RemarkName, F));
1541 });
1542 }
1543
1544 /// The underlying module.
1545 Module &M;
1546
1547 /// The SCC we are operating on.
1548 SmallVectorImpl<Function *> &SCC;
1549
1550 /// Callback to update the call graph; the first argument is a removed call,
1551 /// the second an optional replacement call.
1552 CallGraphUpdater &CGUpdater;
1553
1554 /// Callback to get an OptimizationRemarkEmitter from a Function *
1555 OptimizationRemarkGetter OREGetter;
1556
1557 /// OpenMP-specific information cache. Also used for Attributor runs.
1558 OMPInformationCache &OMPInfoCache;
1559
1560 /// Attributor instance.
1561 Attributor &A;
1562
1563 /// Helper function to run Attributor on SCC.
1564 bool runAttributor() {
1565 if (SCC.empty())
1566 return false;
1567
1568 registerAAs();
1569
1570 ChangeStatus Changed = A.run();
1571
1572 LLVM_DEBUG(dbgs() << "[Attributor] Done with " << SCC.size()
1573 << " functions, result: " << Changed << ".\n");
1574
1575 return Changed == ChangeStatus::CHANGED;
1576 }
1577
1578 /// Populate the Attributor with abstract attribute opportunities in the
1579 /// function.
1580 void registerAAs() {
1581 if (SCC.empty())
1582 return;
1583
1584 // Create CallSite AA for all Getters.
1585 for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) {
1586 auto ICVInfo = OMPInfoCache.ICVs[static_cast<InternalControlVar>(Idx)];
1587
1588 auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter];
1589
1590 auto CreateAA = [&](Use &U, Function &Caller) {
1591 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI);
1592 if (!CI)
1593 return false;
1594
1595 auto &CB = cast<CallBase>(*CI);
1596
1597 IRPosition CBPos = IRPosition::callsite_function(CB);
1598 A.getOrCreateAAFor<AAICVTracker>(CBPos);
1599 return false;
1600 };
1601
1602 GetterRFI.foreachUse(SCC, CreateAA);
1603 }
1604 }
1605 };
1606
1607 Kernel OpenMPOpt::getUniqueKernelFor(Function &F) {
1608 if (!OMPInfoCache.ModuleSlice.count(&F))
1609 return nullptr;
1610
1611 // Use a scope to keep the lifetime of the CachedKernel short.
1612 {
1613 Optional<Kernel> &CachedKernel = UniqueKernelMap[&F];
1614 if (CachedKernel)
1615 return *CachedKernel;
1616
1617 // TODO: We should use an AA to create an (optimistic and callback
1618 // call-aware) call graph. For now we stick to simple patterns that
1619 // are less powerful, basically the worst fixpoint.
1620 if (isKernel(F)) {
1621 CachedKernel = Kernel(&F);
1622 return *CachedKernel;
1623 }
1624
1625 CachedKernel = nullptr;
1626 if (!F.hasLocalLinkage()) {
1627
1628 // See https://openmp.llvm.org/remarks/OptimizationRemarks.html
1629 auto Remark = [&](OptimizationRemark OR) {
1630 return OR << "[OMP100] Potentially unknown OpenMP target region caller";
1631 };
1632 emitRemarkOnFunction(&F, "OMP100", Remark);
1633
1634 return nullptr;
1635 }
1636 }
1637
1638 auto GetUniqueKernelForUse = [&](const Use &U) -> Kernel {
1639 if (auto *Cmp = dyn_cast<ICmpInst>(U.getUser())) {
1640 // Allow use in equality comparisons.
1641 if (Cmp->isEquality())
1642 return getUniqueKernelFor(*Cmp);
1643 return nullptr;
1644 }
1645 if (auto *CB = dyn_cast<CallBase>(U.getUser())) {
1646 // Allow direct calls.
1647 if (CB->isCallee(&U))
1648 return getUniqueKernelFor(*CB);
1649 // Allow the use in __kmpc_kernel_prepare_parallel calls.
1650 if (Function *Callee = CB->getCalledFunction())
1651 if (Callee->getName() == "__kmpc_kernel_prepare_parallel")
1652 return getUniqueKernelFor(*CB);
1653 return nullptr;
1654 }
1655 // Disallow every other use.
1656 return nullptr;
1657 };
1658
1659 // TODO: In the future we want to track more than just a unique kernel.
1660 SmallPtrSet<Kernel, 2> PotentialKernels;
1661 OMPInformationCache::foreachUse(F, [&](const Use &U) {
1662 PotentialKernels.insert(GetUniqueKernelForUse(U));
1663 });
1664
1665 Kernel K = nullptr;
1666 if (PotentialKernels.size() == 1)
1667 K = *PotentialKernels.begin();
1668
1669 // Cache the result.
1670 UniqueKernelMap[&F] = K;
1671
1672 return K;
1673 }
1674
1675 bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
1676 OMPInformationCache::RuntimeFunctionInfo &KernelPrepareParallelRFI =
1677 OMPInfoCache.RFIs[OMPRTL___kmpc_kernel_prepare_parallel];
1678
1679 bool Changed = false;
1680 if (!KernelPrepareParallelRFI)
1681 return Changed;
1682
1683 for (Function *F : SCC) {
1684
1685 // Check if the function is used in a __kmpc_kernel_prepare_parallel call at
1686 // all.
1687 bool UnknownUse = false;
1688 bool KernelPrepareUse = false;
1689 unsigned NumDirectCalls = 0;
1690
1691 SmallVector<Use *, 2> ToBeReplacedStateMachineUses;
1692 OMPInformationCache::foreachUse(*F, [&](Use &U) {
1693 if (auto *CB = dyn_cast<CallBase>(U.getUser()))
1694 if (CB->isCallee(&U)) {
1695 ++NumDirectCalls;
1696 return;
1697 }
1698
1699 if (isa<ICmpInst>(U.getUser())) {
1700 ToBeReplacedStateMachineUses.push_back(&U);
1701 return;
1702 }
1703 if (!KernelPrepareUse && OpenMPOpt::getCallIfRegularCall(
1704 *U.getUser(), &KernelPrepareParallelRFI)) {
1705 KernelPrepareUse = true;
1706 ToBeReplacedStateMachineUses.push_back(&U);
1707 return;
1708 }
1709 UnknownUse = true;
1710 });
1711
1712 // Do not emit a remark if we haven't seen a __kmpc_kernel_prepare_parallel
1713 // use.
1714 if (!KernelPrepareUse)
1715 continue;
1716
1717 {
1718 auto Remark = [&](OptimizationRemark OR) {
1719 return OR << "Found a parallel region that is called in a target "
1720 "region but not part of a combined target construct nor "
1721 "nesed inside a target construct without intermediate "
1722 "code. This can lead to excessive register usage for "
1723 "unrelated target regions in the same translation unit "
1724 "due to spurious call edges assumed by ptxas.";
1725 };
1726 emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
1727 }
1728
1729 // If this ever hits, we should investigate.
1730 // TODO: Checking the number of uses is not a necessary restriction and
1731 // should be lifted.
1732 if (UnknownUse || NumDirectCalls != 1 ||
1733 ToBeReplacedStateMachineUses.size() != 2) {
1734 {
1735 auto Remark = [&](OptimizationRemark OR) {
1736 return OR << "Parallel region is used in "
1737 << (UnknownUse ? "unknown" : "unexpected")
1738 << " ways; will not attempt to rewrite the state machine.";
1739 };
1740 emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
1741 }
1742 continue;
1743 }
1744
1745 // Even if we have __kmpc_kernel_prepare_parallel calls, we (for now) give
1746 // up if the function is not called from a unique kernel.
1747 Kernel K = getUniqueKernelFor(*F);
1748 if (!K) {
1749 {
1750 auto Remark = [&](OptimizationRemark OR) {
1751 return OR << "Parallel region is not known to be called from a "
1752 "unique single target region, maybe the surrounding "
1753 "function has external linkage?; will not attempt to "
1754 "rewrite the state machine use.";
1755 };
1756 emitRemarkOnFunction(F, "OpenMPParallelRegionInMultipleKernels",
1757 Remark);
1758 }
1759 continue;
1760 }
1761
1762 // We now know F is a parallel body function called only from the kernel K.
1763 // We also identified the state machine uses in which we replace the
1764 // function pointer by a new global symbol for identification purposes. This
1765 // ensures only direct calls to the function are left.
1766
1767 {
1768 auto RemarkParallelRegion = [&](OptimizationRemark OR) {
1769 return OR << "Specialize parallel region that is only reached from a "
1770 "single target region to avoid spurious call edges and "
1771 "excessive register usage in other target regions. "
1772 "(parallel region ID: "
1773 << ore::NV("OpenMPParallelRegion", F->getName())
1774 << ", kernel ID: "
1775 << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
1776 };
1777 emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD",
1778 RemarkParallelRegion);
1779 auto RemarkKernel = [&](OptimizationRemark OR) {
1780 return OR << "Target region containing the parallel region that is "
1781 "specialized. (parallel region ID: "
1782 << ore::NV("OpenMPParallelRegion", F->getName())
1783 << ", kernel ID: "
1784 << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
1785 };
1786 emitRemarkOnFunction(K, "OpenMPParallelRegionInNonSPMD", RemarkKernel);
1787 }
1788
1789 Module &M = *F->getParent();
1790 Type *Int8Ty = Type::getInt8Ty(M.getContext());
1791
1792 auto *ID = new GlobalVariable(
1793 M, Int8Ty, /* isConstant */ true, GlobalValue::PrivateLinkage,
1794 UndefValue::get(Int8Ty), F->getName() + ".ID");
1795
1796 for (Use *U : ToBeReplacedStateMachineUses)
1797 U->set(ConstantExpr::getBitCast(ID, U->get()->getType()));
1798
1799 ++NumOpenMPParallelRegionsReplacedInGPUStateMachine;
1800
1801 Changed = true;
1802 }
1803
1804 return Changed;
1805 }
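// Schematic illustration of the rewrite above (not from the original source;
// all IR names are hypothetical): for a parallel body @par_fn reached only
// from one kernel, the identified state machine uses
//   call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void ()* @par_fn to i8*))
//   ...
//   %match = icmp eq i8* %work_fn, bitcast (void ()* @par_fn to i8*)
// become uses of a private global identifier
//   @par_fn.ID = private constant i8 undef
//   call void @__kmpc_kernel_prepare_parallel(i8* @par_fn.ID)
//   ...
//   %match = icmp eq i8* %work_fn, @par_fn.ID
// while the single direct call to @par_fn is left untouched, so the function
// address no longer escapes into the state machine.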
1806
1807 /// Abstract Attribute for tracking ICV values.
1808 struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> {
1809 using Base = StateWrapper<BooleanState, AbstractAttribute>;
1810 AAICVTracker(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
1811
1812 void initialize(Attributor &A) override {
1813 Function *F = getAnchorScope();
1814 if (!F || !A.isFunctionIPOAmendable(*F))
1815 indicatePessimisticFixpoint();
1816 }
1817
1818 /// Returns true if value is assumed to be tracked.
1819 bool isAssumedTracked() const { return getAssumed(); }
1820
1821 /// Returns true if value is known to be tracked.
1822 bool isKnownTracked() const { return getAssumed(); }
1823
1824 /// Create an abstract attribute view for the position \p IRP.
1825 static AAICVTracker &createForPosition(const IRPosition &IRP, Attributor &A);
1826
1827 /// Return the value with which \p I can be replaced for specific \p ICV.
1828 virtual Optional<Value *> getReplacementValue(InternalControlVar ICV,
1829 const Instruction *I,
1830 Attributor &A) const {
1831 return None;
1832 }
1833
1834 /// Return an assumed unique ICV value if a single candidate is found. If
1835 /// there cannot be one, return a nullptr. If it is not clear yet, return the
1836 /// Optional::NoneType.
1837 virtual Optional<Value *>
1838 getUniqueReplacementValue(InternalControlVar ICV) const = 0;
1839
1840 // Currently only nthreads is being tracked.
1841 // This array will only grow over time.
1842 InternalControlVar TrackableICVs[1] = {ICV_nthreads};
1843
1844 /// See AbstractAttribute::getName()
1845 const std::string getName() const override { return "AAICVTracker"; }
1846
1847 /// See AbstractAttribute::getIdAddr()
1848 const char *getIdAddr() const override { return &ID; }
1849
1850 /// This function should return true if the type of the \p AA is AAICVTracker
1851 static bool classof(const AbstractAttribute *AA) {
1852 return (AA->getIdAddr() == &ID);
1853 }
1854
1855 static const char ID;
1856 };
1857
1858 struct AAICVTrackerFunction : public AAICVTracker {
1859 AAICVTrackerFunction(const IRPosition &IRP, Attributor &A)
1860 : AAICVTracker(IRP, A) {}
1861
1862 // FIXME: come up with better string.
1863 const std::string getAsStr() const override { return "ICVTrackerFunction"; }
1864
1865 // FIXME: come up with some stats.
1866 void trackStatistics() const override {}
1867
1868 /// We don't manifest anything for this AA.
1869 ChangeStatus manifest(Attributor &A) override {
1870 return ChangeStatus::UNCHANGED;
1871 }
1872
1873 // Map of ICV to their values at specific program point.
1874 EnumeratedArray<DenseMap<Instruction *, Value *>, InternalControlVar,
1875 InternalControlVar::ICV___last>
1876 ICVReplacementValuesMap;
1877
1878 ChangeStatus updateImpl(Attributor &A) override {
1879 ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
1880
1881 Function *F = getAnchorScope();
1882
1883 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
1884
1885 for (InternalControlVar ICV : TrackableICVs) {
1886 auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
1887
1888 auto &ValuesMap = ICVReplacementValuesMap[ICV];
1889 auto TrackValues = [&](Use &U, Function &) {
1890 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);
1891 if (!CI)
1892 return false;
1893
1894 // FIXME: handle setters with more than one argument.
1895 /// Track new value.
1896 if (ValuesMap.insert(std::make_pair(CI, CI->getArgOperand(0))).second)
1897 HasChanged = ChangeStatus::CHANGED;
1898
1899 return false;
1900 };
1901
1902 auto CallCheck = [&](Instruction &I) {
1903 Optional<Value *> ReplVal = getValueForCall(A, &I, ICV);
1904 if (ReplVal.hasValue() &&
1905 ValuesMap.insert(std::make_pair(&I, *ReplVal)).second)
1906 HasChanged = ChangeStatus::CHANGED;
1907
1908 return true;
1909 };
1910
1911 // Track all changes of an ICV.
1912 SetterRFI.foreachUse(TrackValues, F);
1913
1914 A.checkForAllInstructions(CallCheck, *this, {Instruction::Call},
1915 /* CheckBBLivenessOnly */ true);
1916
1917 /// TODO: Figure out a way to avoid adding an entry in
1918 /// ICVReplacementValuesMap for the function's entry block.
1919 Instruction *Entry = &F->getEntryBlock().front();
1920 if (HasChanged == ChangeStatus::CHANGED && !ValuesMap.count(Entry))
1921 ValuesMap.insert(std::make_pair(Entry, nullptr));
1922 }
1923
1924 return HasChanged;
1925 }
1926
1927 /// Helper to check if \p I is a call and get the value for it if it is
1928 /// unique.
1929 Optional<Value *> getValueForCall(Attributor &A, const Instruction *I,
1930 InternalControlVar &ICV) const {
1931
1932 const auto *CB = dyn_cast<CallBase>(I);
1933 if (!CB || CB->hasFnAttr("no_openmp") ||
1934 CB->hasFnAttr("no_openmp_routines"))
1935 return None;
1936
1937 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
1938 auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];
1939 auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
1940 Function *CalledFunction = CB->getCalledFunction();
1941
1942 // Indirect call, assume ICV changes.
1943 if (CalledFunction == nullptr)
1944 return nullptr;
1945 if (CalledFunction == GetterRFI.Declaration)
1946 return None;
1947 if (CalledFunction == SetterRFI.Declaration) {
1948 if (ICVReplacementValuesMap[ICV].count(I))
1949 return ICVReplacementValuesMap[ICV].lookup(I);
1950
1951 return nullptr;
1952 }
1953
1954 // Since we don't know, assume it changes the ICV.
1955 if (CalledFunction->isDeclaration())
1956 return nullptr;
1957
1958 const auto &ICVTrackingAA =
1959 A.getAAFor<AAICVTracker>(*this, IRPosition::callsite_returned(*CB));
1960
1961 if (ICVTrackingAA.isAssumedTracked())
1962 return ICVTrackingAA.getUniqueReplacementValue(ICV);
1963
1964 // If we don't know, assume it changes.
1965 return nullptr;
1966 }
1967
1968 // We don't check unique value for a function, so return None.
1969 Optional<Value *>
1970 getUniqueReplacementValue(InternalControlVar ICV) const override {
1971 return None;
1972 }
1973
1974 /// Return the value with which \p I can be replaced for specific \p ICV.
1975 Optional<Value *> getReplacementValue(InternalControlVar ICV,
1976 const Instruction *I,
1977 Attributor &A) const override {
1978 const auto &ValuesMap = ICVReplacementValuesMap[ICV];
1979 if (ValuesMap.count(I))
1980 return ValuesMap.lookup(I);
1981
1982 SmallVector<const Instruction *, 16> Worklist;
1983 SmallPtrSet<const Instruction *, 16> Visited;
1984 Worklist.push_back(I);
1985
1986 Optional<Value *> ReplVal;
1987
1988 while (!Worklist.empty()) {
1989 const Instruction *CurrInst = Worklist.pop_back_val();
1990 if (!Visited.insert(CurrInst).second)
1991 continue;
1992
1993 const BasicBlock *CurrBB = CurrInst->getParent();
1994
1995 // Go up and look for all potential setters/calls that might change the
1996 // ICV.
1997 while ((CurrInst = CurrInst->getPrevNode())) {
1998 if (ValuesMap.count(CurrInst)) {
1999 Optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst);
2000 // Unknown value, track new.
2001 if (!ReplVal.hasValue()) {
2002 ReplVal = NewReplVal;
2003 break;
2004 }
2005
2006 // If we found a new value, we can't know the ICV value anymore.
2007 if (NewReplVal.hasValue())
2008 if (ReplVal != NewReplVal)
2009 return nullptr;
2010
2011 break;
2012 }
2013
2014 Optional<Value *> NewReplVal = getValueForCall(A, CurrInst, ICV);
2015 if (!NewReplVal.hasValue())
2016 continue;
2017
2018 // Unknown value, track new.
2019 if (!ReplVal.hasValue()) {
2020 ReplVal = NewReplVal;
2021 break;
2022 }
2023
2024 // We found a new value, we can't know the ICV value anymore.
2026 if (ReplVal != NewReplVal)
2027 return nullptr;
2028 }
2029
2030 // If we are in the same BB and we have a value, we are done.
2031 if (CurrBB == I->getParent() && ReplVal.hasValue())
2032 return ReplVal;
2033
2034 // Go through all predecessors and add terminators for analysis.
2035 for (const BasicBlock *Pred : predecessors(CurrBB))
2036 if (const Instruction *Terminator = Pred->getTerminator())
2037 Worklist.push_back(Terminator);
2038 }
2039
2040 return ReplVal;
2041 }
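// Worked example (illustrative; @unknown is a hypothetical external
// declaration): for the nthreads ICV and the sequence
//   call void @omp_set_num_threads(i32 4)   ; tracked setter, value i32 4
//   call void @unknown()
//   %n = call i32 @omp_get_max_threads()
// the backward walk from the getter reaches the call to @unknown first,
// getValueForCall returns nullptr for it, and no replacement is reported even
// though a setter with value 4 precedes it.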
2042 };
2043
2044 struct AAICVTrackerFunctionReturned : AAICVTracker {
2045 AAICVTrackerFunctionReturned(const IRPosition &IRP, Attributor &A)
2046 : AAICVTracker(IRP, A) {}
2047
2048 // FIXME: come up with better string.
2049 const std::string getAsStr() const override {
2050 return "ICVTrackerFunctionReturned";
2051 }
2052
2053 // FIXME: come up with some stats.
2054 void trackStatistics() const override {}
2055
2056 /// We don't manifest anything for this AA.
2057 ChangeStatus manifest(Attributor &A) override {
2058 return ChangeStatus::UNCHANGED;
2059 }
2060
2061 // Map of ICV to their values at specific program point.
2062 EnumeratedArray<Optional<Value *>, InternalControlVar,
2063 InternalControlVar::ICV___last>
2064 ICVReplacementValuesMap;
2065
2066 /// Return the value with which \p I can be replaced for specific \p ICV.
2067 Optional<Value *>
2068 getUniqueReplacementValue(InternalControlVar ICV) const override {
2069 return ICVReplacementValuesMap[ICV];
2070 }
2071
2072 ChangeStatus updateImpl(Attributor &A) override {
2073 ChangeStatus Changed = ChangeStatus::UNCHANGED;
2074 const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
2075 *this, IRPosition::function(*getAnchorScope()));
2076
2077 if (!ICVTrackingAA.isAssumedTracked())
2078 return indicatePessimisticFixpoint();
2079
2080 for (InternalControlVar ICV : TrackableICVs) {
2081 Optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
2082 Optional<Value *> UniqueICVValue;
2083
2084 auto CheckReturnInst = [&](Instruction &I) {
2085 Optional<Value *> NewReplVal =
2086 ICVTrackingAA.getReplacementValue(ICV, &I, A);
2087
2088 // If we found a second ICV value there is no unique returned value.
2089 if (UniqueICVValue.hasValue() && UniqueICVValue != NewReplVal)
2090 return false;
2091
2092 UniqueICVValue = NewReplVal;
2093
2094 return true;
2095 };
2096
2097 if (!A.checkForAllInstructions(CheckReturnInst, *this, {Instruction::Ret},
2098 /* CheckBBLivenessOnly */ true))
2099 UniqueICVValue = nullptr;
2100
2101 if (UniqueICVValue == ReplVal)
2102 continue;
2103
2104 ReplVal = UniqueICVValue;
2105 Changed = ChangeStatus::CHANGED;
2106 }
2107
2108 return Changed;
2109 }
2110 };
2111
2112 struct AAICVTrackerCallSite : AAICVTracker {
2113 AAICVTrackerCallSite(const IRPosition &IRP, Attributor &A)
2114 : AAICVTracker(IRP, A) {}
2115
2116 void initialize(Attributor &A) override {
2117 Function *F = getAnchorScope();
2118 if (!F || !A.isFunctionIPOAmendable(*F))
2119 indicatePessimisticFixpoint();
2120
2121 // We only initialize this AA for getters, so we need to know which ICV it
2122 // gets.
2123 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2124 for (InternalControlVar ICV : TrackableICVs) {
2125 auto ICVInfo = OMPInfoCache.ICVs[ICV];
2126 auto &Getter = OMPInfoCache.RFIs[ICVInfo.Getter];
2127 if (Getter.Declaration == getAssociatedFunction()) {
2128 AssociatedICV = ICVInfo.Kind;
2129 return;
2130 }
2131 }
2132
2133 /// Unknown ICV.
2134 indicatePessimisticFixpoint();
2135 }
2136
2137 ChangeStatus manifest(Attributor &A) override {
2138 if (!ReplVal.hasValue() || !ReplVal.getValue())
2139 return ChangeStatus::UNCHANGED;
2140
2141 A.changeValueAfterManifest(*getCtxI(), **ReplVal);
2142 A.deleteAfterManifest(*getCtxI());
2143
2144 return ChangeStatus::CHANGED;
2145 }
2146
2147 // FIXME: come up with better string.
2148 const std::string getAsStr() const override { return "ICVTrackerCallSite"; }
2149
2150 // FIXME: come up with some stats.
2151 void trackStatistics() const override {}
2152
2153 InternalControlVar AssociatedICV;
2154 Optional<Value *> ReplVal;
2155
2156 ChangeStatus updateImpl(Attributor &A) override {
2157 const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
2158 *this, IRPosition::function(*getAnchorScope()));
2159
2160 // We don't have any information, so we assume it changes the ICV.
2161 if (!ICVTrackingAA.isAssumedTracked())
2162 return indicatePessimisticFixpoint();
2163
2164 Optional<Value *> NewReplVal =
2165 ICVTrackingAA.getReplacementValue(AssociatedICV, getCtxI(), A);
2166
2167 if (ReplVal == NewReplVal)
2168 return ChangeStatus::UNCHANGED;
2169
2170 ReplVal = NewReplVal;
2171 return ChangeStatus::CHANGED;
2172 }
2173
2174 // Return the value with which associated value can be replaced for specific
2175 // \p ICV.
2176 Optional<Value *>
2177 getUniqueReplacementValue(InternalControlVar ICV) const override {
2178 return ReplVal;
2179 }
2180 };
2181
2182 struct AAICVTrackerCallSiteReturned : AAICVTracker {
2183 AAICVTrackerCallSiteReturned(const IRPosition &IRP, Attributor &A)
2184 : AAICVTracker(IRP, A) {}
2185
2186 // FIXME: come up with better string.
2187 const std::string getAsStr() const override {
2188 return "ICVTrackerCallSiteReturned";
2189 }
2190
2191 // FIXME: come up with some stats.
2192 void trackStatistics() const override {}
2193
2194 /// We don't manifest anything for this AA.
2195 ChangeStatus manifest(Attributor &A) override {
2196 return ChangeStatus::UNCHANGED;
2197 }
2198
2199 // Map of ICV to their values at specific program point.
2200 EnumeratedArray<Optional<Value *>, InternalControlVar,
2201 InternalControlVar::ICV___last>
2202 ICVReplacementValuesMap;
2203
2204 /// Return the value with which associated value can be replaced for specific
2205 /// \p ICV.
2206 Optional<Value *>
2207 getUniqueReplacementValue(InternalControlVar ICV) const override {
2208 return ICVReplacementValuesMap[ICV];
2209 }
2210
2211 ChangeStatus updateImpl(Attributor &A) override {
2212 ChangeStatus Changed = ChangeStatus::UNCHANGED;
2213 const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
2214 *this, IRPosition::returned(*getAssociatedFunction()));
2215
2216 // We don't have any information, so we assume it changes the ICV.
2217 if (!ICVTrackingAA.isAssumedTracked())
2218 return indicatePessimisticFixpoint();
2219
2220 for (InternalControlVar ICV : TrackableICVs) {
2221 Optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
2222 Optional<Value *> NewReplVal =
2223 ICVTrackingAA.getUniqueReplacementValue(ICV);
2224
2225 if (ReplVal == NewReplVal)
2226 continue;
2227
2228 ReplVal = NewReplVal;
2229 Changed = ChangeStatus::CHANGED;
2230 }
2231 return Changed;
2232 }
2233 };
2234 } // namespace
2235
2236 const char AAICVTracker::ID = 0;
2237
2238 AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP,
2239 Attributor &A) {
2240 AAICVTracker *AA = nullptr;
2241 switch (IRP.getPositionKind()) {
2242 case IRPosition::IRP_INVALID:
2243 case IRPosition::IRP_FLOAT:
2244 case IRPosition::IRP_ARGUMENT:
2245 case IRPosition::IRP_CALL_SITE_ARGUMENT:
2246 llvm_unreachable("ICVTracker can only be created for function position!");
2247 case IRPosition::IRP_RETURNED:
2248 AA = new (A.Allocator) AAICVTrackerFunctionReturned(IRP, A);
2249 break;
2250 case IRPosition::IRP_CALL_SITE_RETURNED:
2251 AA = new (A.Allocator) AAICVTrackerCallSiteReturned(IRP, A);
2252 break;
2253 case IRPosition::IRP_CALL_SITE:
2254 AA = new (A.Allocator) AAICVTrackerCallSite(IRP, A);
2255 break;
2256 case IRPosition::IRP_FUNCTION:
2257 AA = new (A.Allocator) AAICVTrackerFunction(IRP, A);
2258 break;
2259 }
2260
2261 return *AA;
2262 }
2263
2264 PreservedAnalyses OpenMPOptPass::run(LazyCallGraph::SCC &C,
2265 CGSCCAnalysisManager &AM,
2266 LazyCallGraph &CG, CGSCCUpdateResult &UR) {
2267 if (!containsOpenMP(*C.begin()->getFunction().getParent(), OMPInModule))
2268 return PreservedAnalyses::all();
2269
2270 if (DisableOpenMPOptimizations)
2271 return PreservedAnalyses::all();
2272
2273 SmallVector<Function *, 16> SCC;
2274 // If there are kernels in the module, we have to run on all SCCs.
2275 bool SCCIsInteresting = !OMPInModule.getKernels().empty();
2276 for (LazyCallGraph::Node &N : C) {
2277 Function *Fn = &N.getFunction();
2278 SCC.push_back(Fn);
2279
2280 // Do we already know that the SCC contains kernels,
2281 // or that OpenMP functions are called from this SCC?
2282 if (SCCIsInteresting)
2283 continue;
2284 // If not, let's check that.
2285 SCCIsInteresting |= OMPInModule.containsOMPRuntimeCalls(Fn);
2286 }
2287
2288 if (!SCCIsInteresting || SCC.empty())
2289 return PreservedAnalyses::all();
2290
2291 FunctionAnalysisManager &FAM =
2292 AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
2293
2294 AnalysisGetter AG(FAM);
2295
2296 auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & {
2297 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
2298 };
2299
2300 CallGraphUpdater CGUpdater;
2301 CGUpdater.initialize(CG, C, AM, UR);
2302
2303 SetVector<Function *> Functions(SCC.begin(), SCC.end());
2304 BumpPtrAllocator Allocator;
2305 OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
2306 /*CGSCC*/ Functions, OMPInModule.getKernels());
2307
2308 Attributor A(Functions, InfoCache, CGUpdater);
2309
2310 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
2311 bool Changed = OMPOpt.run();
2312 if (Changed)
2313 return PreservedAnalyses::none();
2314
2315 return PreservedAnalyses::all();
2316 }
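// Usage note (illustrative, not from the original source): for manual testing
// the pass can be invoked along these lines, though the exact new-PM pipeline
// spelling depends on the registration in PassRegistry.def for this version:
//   opt -passes='cgscc(openmpopt)' -S input.ll
// The legacy pass below is exposed as -openmpopt (see the INITIALIZE_PASS
// macros at the end of this file).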
2317
2318 namespace {
2319
2320 struct OpenMPOptLegacyPass : public CallGraphSCCPass {
2321 CallGraphUpdater CGUpdater;
2322 OpenMPInModule OMPInModule;
2323 static char ID;
2324
2325 OpenMPOptLegacyPass() : CallGraphSCCPass(ID) {
2326 initializeOpenMPOptLegacyPassPass(*PassRegistry::getPassRegistry());
2327 }
2328
2329 void getAnalysisUsage(AnalysisUsage &AU) const override {
2330 CallGraphSCCPass::getAnalysisUsage(AU);
2331 }
2332
2333 bool doInitialization(CallGraph &CG) override {
2334 // Disable the pass if there is no OpenMP (runtime call) in the module.
2335 containsOpenMP(CG.getModule(), OMPInModule);
2336 return false;
2337 }
2338
2339 bool runOnSCC(CallGraphSCC &CGSCC) override {
2340 if (!containsOpenMP(CGSCC.getCallGraph().getModule(), OMPInModule))
2341 return false;
2342 if (DisableOpenMPOptimizations || skipSCC(CGSCC))
2343 return false;
2344
2345 SmallVector<Function *, 16> SCC;
2346 // If there are kernels in the module, we have to run on all SCCs.
2347 bool SCCIsInteresting = !OMPInModule.getKernels().empty();
2348 for (CallGraphNode *CGN : CGSCC) {
2349 Function *Fn = CGN->getFunction();
2350 if (!Fn || Fn->isDeclaration())
2351 continue;
2352 SCC.push_back(Fn);
2353
2354 // Do we already know that the SCC contains kernels,
2355 // or that OpenMP functions are called from this SCC?
2356 if (SCCIsInteresting)
2357 continue;
2358 // If not, let's check that.
2359 SCCIsInteresting |= OMPInModule.containsOMPRuntimeCalls(Fn);
2360 }
2361
2362 if (!SCCIsInteresting || SCC.empty())
2363 return false;
2364
2365 CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
2366 CGUpdater.initialize(CG, CGSCC);
2367
2368 // Maintain a map of functions to avoid rebuilding the ORE
2369 DenseMap<Function *, std::unique_ptr<OptimizationRemarkEmitter>> OREMap;
2370 auto OREGetter = [&OREMap](Function *F) -> OptimizationRemarkEmitter & {
2371 std::unique_ptr<OptimizationRemarkEmitter> &ORE = OREMap[F];
2372 if (!ORE)
2373 ORE = std::make_unique<OptimizationRemarkEmitter>(F);
2374 return *ORE;
2375 };
2376
2377 AnalysisGetter AG;
2378 SetVector<Function *> Functions(SCC.begin(), SCC.end());
2379 BumpPtrAllocator Allocator;
2380 OMPInformationCache InfoCache(
2381 *(Functions.back()->getParent()), AG, Allocator,
2382 /*CGSCC*/ Functions, OMPInModule.getKernels());
2383
2384 Attributor A(Functions, InfoCache, CGUpdater);
2385
2386 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
2387 return OMPOpt.run();
2388 }
2389
2390 bool doFinalization(CallGraph &CG) override { return CGUpdater.finalize(); }
2391 };
2392
2393 } // end anonymous namespace
2394
2395 void OpenMPInModule::identifyKernels(Module &M) {
2396
2397 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
2398 if (!MD)
2399 return;
2400
2401 for (auto *Op : MD->operands()) {
2402 if (Op->getNumOperands() < 2)
2403 continue;
2404 MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
2405 if (!KindID || KindID->getString() != "kernel")
2406 continue;
2407
2408 Function *KernelFn =
2409 mdconst::dyn_extract_or_null<Function>(Op->getOperand(0));
2410 if (!KernelFn)
2411 continue;
2412
2413 ++NumOpenMPTargetRegionKernels;
2414
2415 Kernels.insert(KernelFn);
2416 }
2417 }
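// Illustrative example (not from the original source; the function name is
// hypothetical) of the nvvm.annotations metadata consumed above:
//   !nvvm.annotations = !{!0}
//   !0 = !{void ()* @__omp_offloading_foo, !"kernel", i32 1}
// Operand 0 names the kernel function, operand 1 must be the MDString
// "kernel" for the function to be recorded.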
2418
2419 bool llvm::omp::containsOpenMP(Module &M, OpenMPInModule &OMPInModule) {
2420 if (OMPInModule.isKnown())
2421 return OMPInModule;
2422
2423 auto RecordFunctionsContainingUsesOf = [&](Function *F) {
2424 for (User *U : F->users())
2425 if (auto *I = dyn_cast<Instruction>(U))
2426 OMPInModule.FuncsWithOMPRuntimeCalls.insert(I->getFunction());
2427 };
2428
2429 // MSVC doesn't like long if-else chains for some reason and instead just
2430 // issues an error. Work around it.
2431 do {
2432 #define OMP_RTL(_Enum, _Name, ...) \
2433 if (Function *F = M.getFunction(_Name)) { \
2434 RecordFunctionsContainingUsesOf(F); \
2435 OMPInModule = true; \
2436 }
2437 #include "llvm/Frontend/OpenMP/OMPKinds.def"
2438 } while (false);
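// Expansion sketch (for illustration, not part of the source): each OMP_RTL
// entry expands to a lookup of one runtime function, e.g. for
// __kmpc_global_thread_num roughly:
//   if (Function *F = M.getFunction("__kmpc_global_thread_num")) {
//     RecordFunctionsContainingUsesOf(F);
//     OMPInModule = true;
//   }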
2439
2440 // Identify kernels once. TODO: We should split the OMPInformationCache into a
2441 // module and an SCC part. The kernel information, among other things, could
2442 // go into the module part.
2443 if (OMPInModule.isKnown() && OMPInModule) {
2444 OMPInModule.identifyKernels(M);
2445 return true;
2446 }
2447
2448 return OMPInModule = false;
2449 }
2450
2451 char OpenMPOptLegacyPass::ID = 0;
2452
2453 INITIALIZE_PASS_BEGIN(OpenMPOptLegacyPass, "openmpopt",
2454 "OpenMP specific optimizations", false, false)
2455 INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
2456 INITIALIZE_PASS_END(OpenMPOptLegacyPass, "openmpopt",
2457 "OpenMP specific optimizations", false, false)
2458
2459 Pass *llvm::createOpenMPOptLegacyPass() { return new OpenMPOptLegacyPass(); }
2460