1 //===-- LoopSink.cpp - Loop Sink Pass -------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass does the inverse transformation of what LICM does.
10 // It traverses all of the instructions in the loop's preheader and sinks
11 // them to the loop body where frequency is lower than the loop's preheader.
12 // This pass is a reverse-transformation of LICM. It differs from the Sink
13 // pass in the following ways:
14 //
15 // * It only handles sinking of instructions from the loop's preheader to the
16 // loop's body
17 // * It uses alias set tracker to get more accurate alias info
18 // * It uses block frequency info to find the optimal sinking locations
19 //
20 // Overall algorithm:
21 //
22 // For I in Preheader:
23 // InsertBBs = BBs that uses I
24 // For BB in sorted(LoopBBs):
25 // DomBBs = BBs in InsertBBs that are dominated by BB
26 // if freq(DomBBs) > freq(BB)
27 // InsertBBs = UseBBs - DomBBs + BB
28 // For BB in InsertBBs:
29 // Insert I at BB's beginning
30 //
31 //===----------------------------------------------------------------------===//
32
33 #include "llvm/Transforms/Scalar/LoopSink.h"
34 #include "llvm/ADT/Statistic.h"
35 #include "llvm/Analysis/AliasAnalysis.h"
36 #include "llvm/Analysis/AliasSetTracker.h"
37 #include "llvm/Analysis/BasicAliasAnalysis.h"
38 #include "llvm/Analysis/BlockFrequencyInfo.h"
39 #include "llvm/Analysis/Loads.h"
40 #include "llvm/Analysis/LoopInfo.h"
41 #include "llvm/Analysis/LoopPass.h"
42 #include "llvm/Analysis/MemorySSA.h"
43 #include "llvm/Analysis/MemorySSAUpdater.h"
44 #include "llvm/Analysis/ScalarEvolution.h"
45 #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
46 #include "llvm/IR/Dominators.h"
47 #include "llvm/IR/Instructions.h"
48 #include "llvm/IR/LLVMContext.h"
49 #include "llvm/IR/Metadata.h"
50 #include "llvm/InitializePasses.h"
51 #include "llvm/Support/CommandLine.h"
52 #include "llvm/Transforms/Scalar.h"
53 #include "llvm/Transforms/Scalar/LoopPassManager.h"
54 #include "llvm/Transforms/Utils/Local.h"
55 #include "llvm/Transforms/Utils/LoopUtils.h"
56 using namespace llvm;
57
58 #define DEBUG_TYPE "loopsink"
59
60 STATISTIC(NumLoopSunk, "Number of instructions sunk into loop");
61 STATISTIC(NumLoopSunkCloned, "Number of cloned instructions sunk into loop");
62
63 static cl::opt<unsigned> SinkFrequencyPercentThreshold(
64 "sink-freq-percent-threshold", cl::Hidden, cl::init(90),
65 cl::desc("Do not sink instructions that require cloning unless they "
66 "execute less than this percent of the time."));
67
68 static cl::opt<unsigned> MaxNumberOfUseBBsForSinking(
69 "max-uses-for-sinking", cl::Hidden, cl::init(30),
70 cl::desc("Do not sink instructions that have too many uses."));
71
72 static cl::opt<bool> EnableMSSAInLoopSink(
73 "enable-mssa-in-loop-sink", cl::Hidden, cl::init(true),
74 cl::desc("Enable MemorySSA for LoopSink in new pass manager"));
75
76 static cl::opt<bool> EnableMSSAInLegacyLoopSink(
77 "enable-mssa-in-legacy-loop-sink", cl::Hidden, cl::init(false),
78 cl::desc("Enable MemorySSA for LoopSink in legacy pass manager"));
79
80 /// Return adjusted total frequency of \p BBs.
81 ///
82 /// * If there is only one BB, sinking instruction will not introduce code
83 /// size increase. Thus there is no need to adjust the frequency.
84 /// * If there are more than one BB, sinking would lead to code size increase.
85 /// In this case, we add some "tax" to the total frequency to make it harder
86 /// to sink. E.g.
87 /// Freq(Preheader) = 100
88 /// Freq(BBs) = sum(50, 49) = 99
89 /// Even if Freq(BBs) < Freq(Preheader), we will not sink from Preheade to
90 /// BBs as the difference is too small to justify the code size increase.
91 /// To model this, The adjusted Freq(BBs) will be:
92 /// AdjustedFreq(BBs) = 99 / SinkFrequencyPercentThreshold%
adjustedSumFreq(SmallPtrSetImpl<BasicBlock * > & BBs,BlockFrequencyInfo & BFI)93 static BlockFrequency adjustedSumFreq(SmallPtrSetImpl<BasicBlock *> &BBs,
94 BlockFrequencyInfo &BFI) {
95 BlockFrequency T = 0;
96 for (BasicBlock *B : BBs)
97 T += BFI.getBlockFreq(B);
98 if (BBs.size() > 1)
99 T /= BranchProbability(SinkFrequencyPercentThreshold, 100);
100 return T;
101 }
102
103 /// Return a set of basic blocks to insert sinked instructions.
104 ///
105 /// The returned set of basic blocks (BBsToSinkInto) should satisfy:
106 ///
107 /// * Inside the loop \p L
108 /// * For each UseBB in \p UseBBs, there is at least one BB in BBsToSinkInto
109 /// that domintates the UseBB
110 /// * Has minimum total frequency that is no greater than preheader frequency
111 ///
112 /// The purpose of the function is to find the optimal sinking points to
113 /// minimize execution cost, which is defined as "sum of frequency of
114 /// BBsToSinkInto".
115 /// As a result, the returned BBsToSinkInto needs to have minimum total
116 /// frequency.
117 /// Additionally, if the total frequency of BBsToSinkInto exceeds preheader
118 /// frequency, the optimal solution is not sinking (return empty set).
119 ///
120 /// \p ColdLoopBBs is used to help find the optimal sinking locations.
121 /// It stores a list of BBs that is:
122 ///
123 /// * Inside the loop \p L
124 /// * Has a frequency no larger than the loop's preheader
125 /// * Sorted by BB frequency
126 ///
127 /// The complexity of the function is O(UseBBs.size() * ColdLoopBBs.size()).
128 /// To avoid expensive computation, we cap the maximum UseBBs.size() in its
129 /// caller.
130 static SmallPtrSet<BasicBlock *, 2>
findBBsToSinkInto(const Loop & L,const SmallPtrSetImpl<BasicBlock * > & UseBBs,const SmallVectorImpl<BasicBlock * > & ColdLoopBBs,DominatorTree & DT,BlockFrequencyInfo & BFI)131 findBBsToSinkInto(const Loop &L, const SmallPtrSetImpl<BasicBlock *> &UseBBs,
132 const SmallVectorImpl<BasicBlock *> &ColdLoopBBs,
133 DominatorTree &DT, BlockFrequencyInfo &BFI) {
134 SmallPtrSet<BasicBlock *, 2> BBsToSinkInto;
135 if (UseBBs.size() == 0)
136 return BBsToSinkInto;
137
138 BBsToSinkInto.insert(UseBBs.begin(), UseBBs.end());
139 SmallPtrSet<BasicBlock *, 2> BBsDominatedByColdestBB;
140
141 // For every iteration:
142 // * Pick the ColdestBB from ColdLoopBBs
143 // * Find the set BBsDominatedByColdestBB that satisfy:
144 // - BBsDominatedByColdestBB is a subset of BBsToSinkInto
145 // - Every BB in BBsDominatedByColdestBB is dominated by ColdestBB
146 // * If Freq(ColdestBB) < Freq(BBsDominatedByColdestBB), remove
147 // BBsDominatedByColdestBB from BBsToSinkInto, add ColdestBB to
148 // BBsToSinkInto
149 for (BasicBlock *ColdestBB : ColdLoopBBs) {
150 BBsDominatedByColdestBB.clear();
151 for (BasicBlock *SinkedBB : BBsToSinkInto)
152 if (DT.dominates(ColdestBB, SinkedBB))
153 BBsDominatedByColdestBB.insert(SinkedBB);
154 if (BBsDominatedByColdestBB.size() == 0)
155 continue;
156 if (adjustedSumFreq(BBsDominatedByColdestBB, BFI) >
157 BFI.getBlockFreq(ColdestBB)) {
158 for (BasicBlock *DominatedBB : BBsDominatedByColdestBB) {
159 BBsToSinkInto.erase(DominatedBB);
160 }
161 BBsToSinkInto.insert(ColdestBB);
162 }
163 }
164
165 // Can't sink into blocks that have no valid insertion point.
166 for (BasicBlock *BB : BBsToSinkInto) {
167 if (BB->getFirstInsertionPt() == BB->end()) {
168 BBsToSinkInto.clear();
169 break;
170 }
171 }
172
173 // If the total frequency of BBsToSinkInto is larger than preheader frequency,
174 // do not sink.
175 if (adjustedSumFreq(BBsToSinkInto, BFI) >
176 BFI.getBlockFreq(L.getLoopPreheader()))
177 BBsToSinkInto.clear();
178 return BBsToSinkInto;
179 }
180
181 // Sinks \p I from the loop \p L's preheader to its uses. Returns true if
182 // sinking is successful.
183 // \p LoopBlockNumber is used to sort the insertion blocks to ensure
184 // determinism.
sinkInstruction(Loop & L,Instruction & I,const SmallVectorImpl<BasicBlock * > & ColdLoopBBs,const SmallDenseMap<BasicBlock *,int,16> & LoopBlockNumber,LoopInfo & LI,DominatorTree & DT,BlockFrequencyInfo & BFI,MemorySSAUpdater * MSSAU)185 static bool sinkInstruction(
186 Loop &L, Instruction &I, const SmallVectorImpl<BasicBlock *> &ColdLoopBBs,
187 const SmallDenseMap<BasicBlock *, int, 16> &LoopBlockNumber, LoopInfo &LI,
188 DominatorTree &DT, BlockFrequencyInfo &BFI, MemorySSAUpdater *MSSAU) {
189 // Compute the set of blocks in loop L which contain a use of I.
190 SmallPtrSet<BasicBlock *, 2> BBs;
191 for (auto &U : I.uses()) {
192 Instruction *UI = cast<Instruction>(U.getUser());
193 // We cannot sink I to PHI-uses.
194 if (dyn_cast<PHINode>(UI))
195 return false;
196 // We cannot sink I if it has uses outside of the loop.
197 if (!L.contains(LI.getLoopFor(UI->getParent())))
198 return false;
199 BBs.insert(UI->getParent());
200 }
201
202 // findBBsToSinkInto is O(BBs.size() * ColdLoopBBs.size()). We cap the max
203 // BBs.size() to avoid expensive computation.
204 // FIXME: Handle code size growth for min_size and opt_size.
205 if (BBs.size() > MaxNumberOfUseBBsForSinking)
206 return false;
207
208 // Find the set of BBs that we should insert a copy of I.
209 SmallPtrSet<BasicBlock *, 2> BBsToSinkInto =
210 findBBsToSinkInto(L, BBs, ColdLoopBBs, DT, BFI);
211 if (BBsToSinkInto.empty())
212 return false;
213
214 // Return if any of the candidate blocks to sink into is non-cold.
215 if (BBsToSinkInto.size() > 1) {
216 for (auto *BB : BBsToSinkInto)
217 if (!LoopBlockNumber.count(BB))
218 return false;
219 }
220
221 // Copy the final BBs into a vector and sort them using the total ordering
222 // of the loop block numbers as iterating the set doesn't give a useful
223 // order. No need to stable sort as the block numbers are a total ordering.
224 SmallVector<BasicBlock *, 2> SortedBBsToSinkInto;
225 llvm::append_range(SortedBBsToSinkInto, BBsToSinkInto);
226 llvm::sort(SortedBBsToSinkInto, [&](BasicBlock *A, BasicBlock *B) {
227 return LoopBlockNumber.find(A)->second < LoopBlockNumber.find(B)->second;
228 });
229
230 BasicBlock *MoveBB = *SortedBBsToSinkInto.begin();
231 // FIXME: Optimize the efficiency for cloned value replacement. The current
232 // implementation is O(SortedBBsToSinkInto.size() * I.num_uses()).
233 for (BasicBlock *N : makeArrayRef(SortedBBsToSinkInto).drop_front(1)) {
234 assert(LoopBlockNumber.find(N)->second >
235 LoopBlockNumber.find(MoveBB)->second &&
236 "BBs not sorted!");
237 // Clone I and replace its uses.
238 Instruction *IC = I.clone();
239 IC->setName(I.getName());
240 IC->insertBefore(&*N->getFirstInsertionPt());
241
242 if (MSSAU && MSSAU->getMemorySSA()->getMemoryAccess(&I)) {
243 // Create a new MemoryAccess and let MemorySSA set its defining access.
244 MemoryAccess *NewMemAcc =
245 MSSAU->createMemoryAccessInBB(IC, nullptr, N, MemorySSA::Beginning);
246 if (NewMemAcc) {
247 if (auto *MemDef = dyn_cast<MemoryDef>(NewMemAcc))
248 MSSAU->insertDef(MemDef, /*RenameUses=*/true);
249 else {
250 auto *MemUse = cast<MemoryUse>(NewMemAcc);
251 MSSAU->insertUse(MemUse, /*RenameUses=*/true);
252 }
253 }
254 }
255
256 // Replaces uses of I with IC in N
257 I.replaceUsesWithIf(IC, [N](Use &U) {
258 return cast<Instruction>(U.getUser())->getParent() == N;
259 });
260 // Replaces uses of I with IC in blocks dominated by N
261 replaceDominatedUsesWith(&I, IC, DT, N);
262 LLVM_DEBUG(dbgs() << "Sinking a clone of " << I << " To: " << N->getName()
263 << '\n');
264 NumLoopSunkCloned++;
265 }
266 LLVM_DEBUG(dbgs() << "Sinking " << I << " To: " << MoveBB->getName() << '\n');
267 NumLoopSunk++;
268 I.moveBefore(&*MoveBB->getFirstInsertionPt());
269
270 if (MSSAU)
271 if (MemoryUseOrDef *OldMemAcc = cast_or_null<MemoryUseOrDef>(
272 MSSAU->getMemorySSA()->getMemoryAccess(&I)))
273 MSSAU->moveToPlace(OldMemAcc, MoveBB, MemorySSA::Beginning);
274
275 return true;
276 }
277
278 /// Sinks instructions from loop's preheader to the loop body if the
279 /// sum frequency of inserted copy is smaller than preheader's frequency.
sinkLoopInvariantInstructions(Loop & L,AAResults & AA,LoopInfo & LI,DominatorTree & DT,BlockFrequencyInfo & BFI,ScalarEvolution * SE,AliasSetTracker * CurAST,MemorySSA * MSSA)280 static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI,
281 DominatorTree &DT,
282 BlockFrequencyInfo &BFI,
283 ScalarEvolution *SE,
284 AliasSetTracker *CurAST,
285 MemorySSA *MSSA) {
286 BasicBlock *Preheader = L.getLoopPreheader();
287 assert(Preheader && "Expected loop to have preheader");
288
289 assert(Preheader->getParent()->hasProfileData() &&
290 "Unexpected call when profile data unavailable.");
291
292 const BlockFrequency PreheaderFreq = BFI.getBlockFreq(Preheader);
293 // If there are no basic blocks with lower frequency than the preheader then
294 // we can avoid the detailed analysis as we will never find profitable sinking
295 // opportunities.
296 if (all_of(L.blocks(), [&](const BasicBlock *BB) {
297 return BFI.getBlockFreq(BB) > PreheaderFreq;
298 }))
299 return false;
300
301 std::unique_ptr<MemorySSAUpdater> MSSAU;
302 std::unique_ptr<SinkAndHoistLICMFlags> LICMFlags;
303 if (MSSA) {
304 MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
305 LICMFlags =
306 std::make_unique<SinkAndHoistLICMFlags>(/*IsSink=*/true, &L, MSSA);
307 }
308
309 bool Changed = false;
310
311 // Sort loop's basic blocks by frequency
312 SmallVector<BasicBlock *, 10> ColdLoopBBs;
313 SmallDenseMap<BasicBlock *, int, 16> LoopBlockNumber;
314 int i = 0;
315 for (BasicBlock *B : L.blocks())
316 if (BFI.getBlockFreq(B) < BFI.getBlockFreq(L.getLoopPreheader())) {
317 ColdLoopBBs.push_back(B);
318 LoopBlockNumber[B] = ++i;
319 }
320 llvm::stable_sort(ColdLoopBBs, [&](BasicBlock *A, BasicBlock *B) {
321 return BFI.getBlockFreq(A) < BFI.getBlockFreq(B);
322 });
323
324 // Traverse preheader's instructions in reverse order becaue if A depends
325 // on B (A appears after B), A needs to be sinked first before B can be
326 // sinked.
327 for (auto II = Preheader->rbegin(), E = Preheader->rend(); II != E;) {
328 Instruction *I = &*II++;
329 // No need to check for instruction's operands are loop invariant.
330 assert(L.hasLoopInvariantOperands(I) &&
331 "Insts in a loop's preheader should have loop invariant operands!");
332 if (!canSinkOrHoistInst(*I, &AA, &DT, &L, CurAST, MSSAU.get(), false,
333 LICMFlags.get()))
334 continue;
335 if (sinkInstruction(L, *I, ColdLoopBBs, LoopBlockNumber, LI, DT, BFI,
336 MSSAU.get()))
337 Changed = true;
338 }
339
340 if (Changed && SE)
341 SE->forgetLoopDispositions(&L);
342 return Changed;
343 }
344
computeAliasSet(Loop & L,BasicBlock & Preheader,AliasSetTracker & CurAST)345 static void computeAliasSet(Loop &L, BasicBlock &Preheader,
346 AliasSetTracker &CurAST) {
347 for (BasicBlock *BB : L.blocks())
348 CurAST.add(*BB);
349 CurAST.add(Preheader);
350 }
351
run(Function & F,FunctionAnalysisManager & FAM)352 PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) {
353 LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
354 // Nothing to do if there are no loops.
355 if (LI.empty())
356 return PreservedAnalyses::all();
357
358 AAResults &AA = FAM.getResult<AAManager>(F);
359 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
360 BlockFrequencyInfo &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
361
362 MemorySSA *MSSA = EnableMSSAInLoopSink
363 ? &FAM.getResult<MemorySSAAnalysis>(F).getMSSA()
364 : nullptr;
365
366 // We want to do a postorder walk over the loops. Since loops are a tree this
367 // is equivalent to a reversed preorder walk and preorder is easy to compute
368 // without recursion. Since we reverse the preorder, we will visit siblings
369 // in reverse program order. This isn't expected to matter at all but is more
370 // consistent with sinking algorithms which generally work bottom-up.
371 SmallVector<Loop *, 4> PreorderLoops = LI.getLoopsInPreorder();
372
373 bool Changed = false;
374 do {
375 Loop &L = *PreorderLoops.pop_back_val();
376
377 BasicBlock *Preheader = L.getLoopPreheader();
378 if (!Preheader)
379 continue;
380
381 // Enable LoopSink only when runtime profile is available.
382 // With static profile, the sinking decision may be sub-optimal.
383 if (!Preheader->getParent()->hasProfileData())
384 continue;
385
386 std::unique_ptr<AliasSetTracker> CurAST;
387 if (!EnableMSSAInLoopSink) {
388 CurAST = std::make_unique<AliasSetTracker>(AA);
389 computeAliasSet(L, *Preheader, *CurAST.get());
390 }
391
392 // Note that we don't pass SCEV here because it is only used to invalidate
393 // loops in SCEV and we don't preserve (or request) SCEV at all making that
394 // unnecessary.
395 Changed |= sinkLoopInvariantInstructions(L, AA, LI, DT, BFI,
396 /*ScalarEvolution*/ nullptr,
397 CurAST.get(), MSSA);
398 } while (!PreorderLoops.empty());
399
400 if (!Changed)
401 return PreservedAnalyses::all();
402
403 PreservedAnalyses PA;
404 PA.preserveSet<CFGAnalyses>();
405
406 if (MSSA) {
407 PA.preserve<MemorySSAAnalysis>();
408
409 if (VerifyMemorySSA)
410 MSSA->verifyMemorySSA();
411 }
412
413 return PA;
414 }
415
416 namespace {
417 struct LegacyLoopSinkPass : public LoopPass {
418 static char ID;
LegacyLoopSinkPass__anon6373213a0511::LegacyLoopSinkPass419 LegacyLoopSinkPass() : LoopPass(ID) {
420 initializeLegacyLoopSinkPassPass(*PassRegistry::getPassRegistry());
421 }
422
runOnLoop__anon6373213a0511::LegacyLoopSinkPass423 bool runOnLoop(Loop *L, LPPassManager &LPM) override {
424 if (skipLoop(L))
425 return false;
426
427 BasicBlock *Preheader = L->getLoopPreheader();
428 if (!Preheader)
429 return false;
430
431 // Enable LoopSink only when runtime profile is available.
432 // With static profile, the sinking decision may be sub-optimal.
433 if (!Preheader->getParent()->hasProfileData())
434 return false;
435
436 AAResults &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
437 auto *SE = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
438 std::unique_ptr<AliasSetTracker> CurAST;
439 MemorySSA *MSSA = nullptr;
440 if (EnableMSSAInLegacyLoopSink)
441 MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
442 else {
443 CurAST = std::make_unique<AliasSetTracker>(AA);
444 computeAliasSet(*L, *Preheader, *CurAST.get());
445 }
446
447 bool Changed = sinkLoopInvariantInstructions(
448 *L, AA, getAnalysis<LoopInfoWrapperPass>().getLoopInfo(),
449 getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
450 getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(),
451 SE ? &SE->getSE() : nullptr, CurAST.get(), MSSA);
452
453 if (MSSA && VerifyMemorySSA)
454 MSSA->verifyMemorySSA();
455
456 return Changed;
457 }
458
getAnalysisUsage__anon6373213a0511::LegacyLoopSinkPass459 void getAnalysisUsage(AnalysisUsage &AU) const override {
460 AU.setPreservesCFG();
461 AU.addRequired<BlockFrequencyInfoWrapperPass>();
462 getLoopAnalysisUsage(AU);
463 if (EnableMSSAInLegacyLoopSink) {
464 AU.addRequired<MemorySSAWrapperPass>();
465 AU.addPreserved<MemorySSAWrapperPass>();
466 }
467 }
468 };
469 }
470
471 char LegacyLoopSinkPass::ID = 0;
472 INITIALIZE_PASS_BEGIN(LegacyLoopSinkPass, "loop-sink", "Loop Sink", false,
473 false)
INITIALIZE_PASS_DEPENDENCY(LoopPass)474 INITIALIZE_PASS_DEPENDENCY(LoopPass)
475 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
476 INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
477 INITIALIZE_PASS_END(LegacyLoopSinkPass, "loop-sink", "Loop Sink", false, false)
478
479 Pass *llvm::createLoopSinkPass() { return new LegacyLoopSinkPass(); }
480