1 //===- LoopCacheAnalysis.cpp - Loop Cache Analysis -------------------------==//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
6 // See https://llvm.org/LICENSE.txt for license information.
7 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8 //
9 //===----------------------------------------------------------------------===//
10 ///
11 /// \file
12 /// This file defines the implementation for the loop cache analysis.
13 /// The implementation is largely based on the following paper:
14 ///
15 ///       Compiler Optimizations for Improving Data Locality
16 ///       By: Steve Carr, Katherine S. McKinley, Chau-Wen Tseng
17 ///       http://www.cs.utexas.edu/users/mckinley/papers/asplos-1994.pdf
18 ///
19 /// The general approach taken to estimate the number of cache lines used by the
20 /// memory references in an inner loop is:
21 ///    1. Partition memory references that exhibit temporal or spacial reuse
22 ///       into reference groups.
23 ///    2. For each loop L in the a loop nest LN:
24 ///       a. Compute the cost of the reference group
25 ///       b. Compute the loop cost by summing up the reference groups costs
26 //===----------------------------------------------------------------------===//
27 
28 #include "llvm/Analysis/LoopCacheAnalysis.h"
29 #include "llvm/ADT/BreadthFirstIterator.h"
30 #include "llvm/ADT/Sequence.h"
31 #include "llvm/ADT/SmallVector.h"
32 #include "llvm/Analysis/AliasAnalysis.h"
33 #include "llvm/Analysis/DependenceAnalysis.h"
34 #include "llvm/Analysis/LoopInfo.h"
35 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
36 #include "llvm/Analysis/TargetTransformInfo.h"
37 #include "llvm/Support/CommandLine.h"
38 #include "llvm/Support/Debug.h"
39 
40 using namespace llvm;
41 
42 #define DEBUG_TYPE "loop-cache-cost"
43 
44 static cl::opt<unsigned> DefaultTripCount(
45     "default-trip-count", cl::init(100), cl::Hidden,
46     cl::desc("Use this to specify the default trip count of a loop"));
47 
48 // In this analysis two array references are considered to exhibit temporal
49 // reuse if they access either the same memory location, or a memory location
50 // with distance smaller than a configurable threshold.
51 static cl::opt<unsigned> TemporalReuseThreshold(
52     "temporal-reuse-threshold", cl::init(2), cl::Hidden,
53     cl::desc("Use this to specify the max. distance between array elements "
54              "accessed in a loop so that the elements are classified to have "
55              "temporal reuse"));
56 
57 /// Retrieve the innermost loop in the given loop nest \p Loops. It returns a
58 /// nullptr if any loops in the loop vector supplied has more than one sibling.
59 /// The loop vector is expected to contain loops collected in breadth-first
60 /// order.
getInnerMostLoop(const LoopVectorTy & Loops)61 static Loop *getInnerMostLoop(const LoopVectorTy &Loops) {
62   assert(!Loops.empty() && "Expecting a non-empy loop vector");
63 
64   Loop *LastLoop = Loops.back();
65   Loop *ParentLoop = LastLoop->getParentLoop();
66 
67   if (ParentLoop == nullptr) {
68     assert(Loops.size() == 1 && "Expecting a single loop");
69     return LastLoop;
70   }
71 
72   return (llvm::is_sorted(Loops,
73                           [](const Loop *L1, const Loop *L2) {
74                             return L1->getLoopDepth() < L2->getLoopDepth();
75                           }))
76              ? LastLoop
77              : nullptr;
78 }
79 
isOneDimensionalArray(const SCEV & AccessFn,const SCEV & ElemSize,const Loop & L,ScalarEvolution & SE)80 static bool isOneDimensionalArray(const SCEV &AccessFn, const SCEV &ElemSize,
81                                   const Loop &L, ScalarEvolution &SE) {
82   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(&AccessFn);
83   if (!AR || !AR->isAffine())
84     return false;
85 
86   assert(AR->getLoop() && "AR should have a loop");
87 
88   // Check that start and increment are not add recurrences.
89   const SCEV *Start = AR->getStart();
90   const SCEV *Step = AR->getStepRecurrence(SE);
91   if (isa<SCEVAddRecExpr>(Start) || isa<SCEVAddRecExpr>(Step))
92     return false;
93 
94   // Check that start and increment are both invariant in the loop.
95   if (!SE.isLoopInvariant(Start, &L) || !SE.isLoopInvariant(Step, &L))
96     return false;
97 
98   const SCEV *StepRec = AR->getStepRecurrence(SE);
99   if (StepRec && SE.isKnownNegative(StepRec))
100     StepRec = SE.getNegativeSCEV(StepRec);
101 
102   return StepRec == &ElemSize;
103 }
104 
105 /// Compute the trip count for the given loop \p L. Return the SCEV expression
106 /// for the trip count or nullptr if it cannot be computed.
computeTripCount(const Loop & L,ScalarEvolution & SE)107 static const SCEV *computeTripCount(const Loop &L, ScalarEvolution &SE) {
108   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(&L);
109   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount) ||
110       !isa<SCEVConstant>(BackedgeTakenCount))
111     return nullptr;
112   return SE.getTripCountFromExitCount(BackedgeTakenCount);
113 }
114 
115 //===----------------------------------------------------------------------===//
116 // IndexedReference implementation
117 //
operator <<(raw_ostream & OS,const IndexedReference & R)118 raw_ostream &llvm::operator<<(raw_ostream &OS, const IndexedReference &R) {
119   if (!R.IsValid) {
120     OS << R.StoreOrLoadInst;
121     OS << ", IsValid=false.";
122     return OS;
123   }
124 
125   OS << *R.BasePointer;
126   for (const SCEV *Subscript : R.Subscripts)
127     OS << "[" << *Subscript << "]";
128 
129   OS << ", Sizes: ";
130   for (const SCEV *Size : R.Sizes)
131     OS << "[" << *Size << "]";
132 
133   return OS;
134 }
135 
IndexedReference(Instruction & StoreOrLoadInst,const LoopInfo & LI,ScalarEvolution & SE)136 IndexedReference::IndexedReference(Instruction &StoreOrLoadInst,
137                                    const LoopInfo &LI, ScalarEvolution &SE)
138     : StoreOrLoadInst(StoreOrLoadInst), SE(SE) {
139   assert((isa<StoreInst>(StoreOrLoadInst) || isa<LoadInst>(StoreOrLoadInst)) &&
140          "Expecting a load or store instruction");
141 
142   IsValid = delinearize(LI);
143   if (IsValid)
144     LLVM_DEBUG(dbgs().indent(2) << "Succesfully delinearized: " << *this
145                                 << "\n");
146 }
147 
hasSpacialReuse(const IndexedReference & Other,unsigned CLS,AAResults & AA) const148 Optional<bool> IndexedReference::hasSpacialReuse(const IndexedReference &Other,
149                                                  unsigned CLS,
150                                                  AAResults &AA) const {
151   assert(IsValid && "Expecting a valid reference");
152 
153   if (BasePointer != Other.getBasePointer() && !isAliased(Other, AA)) {
154     LLVM_DEBUG(dbgs().indent(2)
155                << "No spacial reuse: different base pointers\n");
156     return false;
157   }
158 
159   unsigned NumSubscripts = getNumSubscripts();
160   if (NumSubscripts != Other.getNumSubscripts()) {
161     LLVM_DEBUG(dbgs().indent(2)
162                << "No spacial reuse: different number of subscripts\n");
163     return false;
164   }
165 
166   // all subscripts must be equal, except the leftmost one (the last one).
167   for (auto SubNum : seq<unsigned>(0, NumSubscripts - 1)) {
168     if (getSubscript(SubNum) != Other.getSubscript(SubNum)) {
169       LLVM_DEBUG(dbgs().indent(2) << "No spacial reuse, different subscripts: "
170                                   << "\n\t" << *getSubscript(SubNum) << "\n\t"
171                                   << *Other.getSubscript(SubNum) << "\n");
172       return false;
173     }
174   }
175 
176   // the difference between the last subscripts must be less than the cache line
177   // size.
178   const SCEV *LastSubscript = getLastSubscript();
179   const SCEV *OtherLastSubscript = Other.getLastSubscript();
180   const SCEVConstant *Diff = dyn_cast<SCEVConstant>(
181       SE.getMinusSCEV(LastSubscript, OtherLastSubscript));
182 
183   if (Diff == nullptr) {
184     LLVM_DEBUG(dbgs().indent(2)
185                << "No spacial reuse, difference between subscript:\n\t"
186                << *LastSubscript << "\n\t" << OtherLastSubscript
187                << "\nis not constant.\n");
188     return None;
189   }
190 
191   bool InSameCacheLine = (Diff->getValue()->getSExtValue() < CLS);
192 
193   LLVM_DEBUG({
194     if (InSameCacheLine)
195       dbgs().indent(2) << "Found spacial reuse.\n";
196     else
197       dbgs().indent(2) << "No spacial reuse.\n";
198   });
199 
200   return InSameCacheLine;
201 }
202 
hasTemporalReuse(const IndexedReference & Other,unsigned MaxDistance,const Loop & L,DependenceInfo & DI,AAResults & AA) const203 Optional<bool> IndexedReference::hasTemporalReuse(const IndexedReference &Other,
204                                                   unsigned MaxDistance,
205                                                   const Loop &L,
206                                                   DependenceInfo &DI,
207                                                   AAResults &AA) const {
208   assert(IsValid && "Expecting a valid reference");
209 
210   if (BasePointer != Other.getBasePointer() && !isAliased(Other, AA)) {
211     LLVM_DEBUG(dbgs().indent(2)
212                << "No temporal reuse: different base pointer\n");
213     return false;
214   }
215 
216   std::unique_ptr<Dependence> D =
217       DI.depends(&StoreOrLoadInst, &Other.StoreOrLoadInst, true);
218 
219   if (D == nullptr) {
220     LLVM_DEBUG(dbgs().indent(2) << "No temporal reuse: no dependence\n");
221     return false;
222   }
223 
224   if (D->isLoopIndependent()) {
225     LLVM_DEBUG(dbgs().indent(2) << "Found temporal reuse\n");
226     return true;
227   }
228 
229   // Check the dependence distance at every loop level. There is temporal reuse
230   // if the distance at the given loop's depth is small (|d| <= MaxDistance) and
231   // it is zero at every other loop level.
232   int LoopDepth = L.getLoopDepth();
233   int Levels = D->getLevels();
234   for (int Level = 1; Level <= Levels; ++Level) {
235     const SCEV *Distance = D->getDistance(Level);
236     const SCEVConstant *SCEVConst = dyn_cast_or_null<SCEVConstant>(Distance);
237 
238     if (SCEVConst == nullptr) {
239       LLVM_DEBUG(dbgs().indent(2) << "No temporal reuse: distance unknown\n");
240       return None;
241     }
242 
243     const ConstantInt &CI = *SCEVConst->getValue();
244     if (Level != LoopDepth && !CI.isZero()) {
245       LLVM_DEBUG(dbgs().indent(2)
246                  << "No temporal reuse: distance is not zero at depth=" << Level
247                  << "\n");
248       return false;
249     } else if (Level == LoopDepth && CI.getSExtValue() > MaxDistance) {
250       LLVM_DEBUG(
251           dbgs().indent(2)
252           << "No temporal reuse: distance is greater than MaxDistance at depth="
253           << Level << "\n");
254       return false;
255     }
256   }
257 
258   LLVM_DEBUG(dbgs().indent(2) << "Found temporal reuse\n");
259   return true;
260 }
261 
computeRefCost(const Loop & L,unsigned CLS) const262 CacheCostTy IndexedReference::computeRefCost(const Loop &L,
263                                              unsigned CLS) const {
264   assert(IsValid && "Expecting a valid reference");
265   LLVM_DEBUG({
266     dbgs().indent(2) << "Computing cache cost for:\n";
267     dbgs().indent(4) << *this << "\n";
268   });
269 
270   // If the indexed reference is loop invariant the cost is one.
271   if (isLoopInvariant(L)) {
272     LLVM_DEBUG(dbgs().indent(4) << "Reference is loop invariant: RefCost=1\n");
273     return 1;
274   }
275 
276   const SCEV *TripCount = computeTripCount(L, SE);
277   if (!TripCount) {
278     LLVM_DEBUG(dbgs() << "Trip count of loop " << L.getName()
279                       << " could not be computed, using DefaultTripCount\n");
280     const SCEV *ElemSize = Sizes.back();
281     TripCount = SE.getConstant(ElemSize->getType(), DefaultTripCount);
282   }
283   LLVM_DEBUG(dbgs() << "TripCount=" << *TripCount << "\n");
284 
285   // If the indexed reference is 'consecutive' the cost is
286   // (TripCount*Stride)/CLS, otherwise the cost is TripCount.
287   const SCEV *RefCost = TripCount;
288 
289   if (isConsecutive(L, CLS)) {
290     const SCEV *Coeff = getLastCoefficient();
291     const SCEV *ElemSize = Sizes.back();
292     const SCEV *Stride = SE.getMulExpr(Coeff, ElemSize);
293     const SCEV *CacheLineSize = SE.getConstant(Stride->getType(), CLS);
294     Type *WiderType = SE.getWiderType(Stride->getType(), TripCount->getType());
295     if (SE.isKnownNegative(Stride))
296       Stride = SE.getNegativeSCEV(Stride);
297     Stride = SE.getNoopOrAnyExtend(Stride, WiderType);
298     TripCount = SE.getNoopOrAnyExtend(TripCount, WiderType);
299     const SCEV *Numerator = SE.getMulExpr(Stride, TripCount);
300     RefCost = SE.getUDivExpr(Numerator, CacheLineSize);
301 
302     LLVM_DEBUG(dbgs().indent(4)
303                << "Access is consecutive: RefCost=(TripCount*Stride)/CLS="
304                << *RefCost << "\n");
305   } else
306     LLVM_DEBUG(dbgs().indent(4)
307                << "Access is not consecutive: RefCost=TripCount=" << *RefCost
308                << "\n");
309 
310   // Attempt to fold RefCost into a constant.
311   if (auto ConstantCost = dyn_cast<SCEVConstant>(RefCost))
312     return ConstantCost->getValue()->getSExtValue();
313 
314   LLVM_DEBUG(dbgs().indent(4)
315              << "RefCost is not a constant! Setting to RefCost=InvalidCost "
316                 "(invalid value).\n");
317 
318   return CacheCost::InvalidCost;
319 }
320 
delinearize(const LoopInfo & LI)321 bool IndexedReference::delinearize(const LoopInfo &LI) {
322   assert(Subscripts.empty() && "Subscripts should be empty");
323   assert(Sizes.empty() && "Sizes should be empty");
324   assert(!IsValid && "Should be called once from the constructor");
325   LLVM_DEBUG(dbgs() << "Delinearizing: " << StoreOrLoadInst << "\n");
326 
327   const SCEV *ElemSize = SE.getElementSize(&StoreOrLoadInst);
328   const BasicBlock *BB = StoreOrLoadInst.getParent();
329 
330   if (Loop *L = LI.getLoopFor(BB)) {
331     const SCEV *AccessFn =
332         SE.getSCEVAtScope(getPointerOperand(&StoreOrLoadInst), L);
333 
334     BasePointer = dyn_cast<SCEVUnknown>(SE.getPointerBase(AccessFn));
335     if (BasePointer == nullptr) {
336       LLVM_DEBUG(
337           dbgs().indent(2)
338           << "ERROR: failed to delinearize, can't identify base pointer\n");
339       return false;
340     }
341 
342     AccessFn = SE.getMinusSCEV(AccessFn, BasePointer);
343 
344     LLVM_DEBUG(dbgs().indent(2) << "In Loop '" << L->getName()
345                                 << "', AccessFn: " << *AccessFn << "\n");
346 
347     SE.delinearize(AccessFn, Subscripts, Sizes,
348                    SE.getElementSize(&StoreOrLoadInst));
349 
350     if (Subscripts.empty() || Sizes.empty() ||
351         Subscripts.size() != Sizes.size()) {
352       // Attempt to determine whether we have a single dimensional array access.
353       // before giving up.
354       if (!isOneDimensionalArray(*AccessFn, *ElemSize, *L, SE)) {
355         LLVM_DEBUG(dbgs().indent(2)
356                    << "ERROR: failed to delinearize reference\n");
357         Subscripts.clear();
358         Sizes.clear();
359         return false;
360       }
361 
362       // The array may be accessed in reverse, for example:
363       //   for (i = N; i > 0; i--)
364       //     A[i] = 0;
365       // In this case, reconstruct the access function using the absolute value
366       // of the step recurrence.
367       const SCEVAddRecExpr *AccessFnAR = dyn_cast<SCEVAddRecExpr>(AccessFn);
368       const SCEV *StepRec = AccessFnAR ? AccessFnAR->getStepRecurrence(SE) : nullptr;
369 
370       if (StepRec && SE.isKnownNegative(StepRec))
371         AccessFn = SE.getAddRecExpr(AccessFnAR->getStart(),
372                                     SE.getNegativeSCEV(StepRec),
373                                     AccessFnAR->getLoop(),
374                                     AccessFnAR->getNoWrapFlags());
375       const SCEV *Div = SE.getUDivExactExpr(AccessFn, ElemSize);
376       Subscripts.push_back(Div);
377       Sizes.push_back(ElemSize);
378     }
379 
380     return all_of(Subscripts, [&](const SCEV *Subscript) {
381       return isSimpleAddRecurrence(*Subscript, *L);
382     });
383   }
384 
385   return false;
386 }
387 
isLoopInvariant(const Loop & L) const388 bool IndexedReference::isLoopInvariant(const Loop &L) const {
389   Value *Addr = getPointerOperand(&StoreOrLoadInst);
390   assert(Addr != nullptr && "Expecting either a load or a store instruction");
391   assert(SE.isSCEVable(Addr->getType()) && "Addr should be SCEVable");
392 
393   if (SE.isLoopInvariant(SE.getSCEV(Addr), &L))
394     return true;
395 
396   // The indexed reference is loop invariant if none of the coefficients use
397   // the loop induction variable.
398   bool allCoeffForLoopAreZero = all_of(Subscripts, [&](const SCEV *Subscript) {
399     return isCoeffForLoopZeroOrInvariant(*Subscript, L);
400   });
401 
402   return allCoeffForLoopAreZero;
403 }
404 
isConsecutive(const Loop & L,unsigned CLS) const405 bool IndexedReference::isConsecutive(const Loop &L, unsigned CLS) const {
406   // The indexed reference is 'consecutive' if the only coefficient that uses
407   // the loop induction variable is the last one...
408   const SCEV *LastSubscript = Subscripts.back();
409   for (const SCEV *Subscript : Subscripts) {
410     if (Subscript == LastSubscript)
411       continue;
412     if (!isCoeffForLoopZeroOrInvariant(*Subscript, L))
413       return false;
414   }
415 
416   // ...and the access stride is less than the cache line size.
417   const SCEV *Coeff = getLastCoefficient();
418   const SCEV *ElemSize = Sizes.back();
419   const SCEV *Stride = SE.getMulExpr(Coeff, ElemSize);
420   const SCEV *CacheLineSize = SE.getConstant(Stride->getType(), CLS);
421 
422   Stride = SE.isKnownNegative(Stride) ? SE.getNegativeSCEV(Stride) : Stride;
423   return SE.isKnownPredicate(ICmpInst::ICMP_ULT, Stride, CacheLineSize);
424 }
425 
getLastCoefficient() const426 const SCEV *IndexedReference::getLastCoefficient() const {
427   const SCEV *LastSubscript = getLastSubscript();
428   assert(isa<SCEVAddRecExpr>(LastSubscript) &&
429          "Expecting a SCEV add recurrence expression");
430   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LastSubscript);
431   return AR->getStepRecurrence(SE);
432 }
433 
isCoeffForLoopZeroOrInvariant(const SCEV & Subscript,const Loop & L) const434 bool IndexedReference::isCoeffForLoopZeroOrInvariant(const SCEV &Subscript,
435                                                      const Loop &L) const {
436   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(&Subscript);
437   return (AR != nullptr) ? AR->getLoop() != &L
438                          : SE.isLoopInvariant(&Subscript, &L);
439 }
440 
isSimpleAddRecurrence(const SCEV & Subscript,const Loop & L) const441 bool IndexedReference::isSimpleAddRecurrence(const SCEV &Subscript,
442                                              const Loop &L) const {
443   if (!isa<SCEVAddRecExpr>(Subscript))
444     return false;
445 
446   const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(&Subscript);
447   assert(AR->getLoop() && "AR should have a loop");
448 
449   if (!AR->isAffine())
450     return false;
451 
452   const SCEV *Start = AR->getStart();
453   const SCEV *Step = AR->getStepRecurrence(SE);
454 
455   if (!SE.isLoopInvariant(Start, &L) || !SE.isLoopInvariant(Step, &L))
456     return false;
457 
458   return true;
459 }
460 
isAliased(const IndexedReference & Other,AAResults & AA) const461 bool IndexedReference::isAliased(const IndexedReference &Other,
462                                  AAResults &AA) const {
463   const auto &Loc1 = MemoryLocation::get(&StoreOrLoadInst);
464   const auto &Loc2 = MemoryLocation::get(&Other.StoreOrLoadInst);
465   return AA.isMustAlias(Loc1, Loc2);
466 }
467 
468 //===----------------------------------------------------------------------===//
469 // CacheCost implementation
470 //
operator <<(raw_ostream & OS,const CacheCost & CC)471 raw_ostream &llvm::operator<<(raw_ostream &OS, const CacheCost &CC) {
472   for (const auto &LC : CC.LoopCosts) {
473     const Loop *L = LC.first;
474     OS << "Loop '" << L->getName() << "' has cost = " << LC.second << "\n";
475   }
476   return OS;
477 }
478 
CacheCost(const LoopVectorTy & Loops,const LoopInfo & LI,ScalarEvolution & SE,TargetTransformInfo & TTI,AAResults & AA,DependenceInfo & DI,Optional<unsigned> TRT)479 CacheCost::CacheCost(const LoopVectorTy &Loops, const LoopInfo &LI,
480                      ScalarEvolution &SE, TargetTransformInfo &TTI,
481                      AAResults &AA, DependenceInfo &DI,
482                      Optional<unsigned> TRT)
483     : Loops(Loops), TripCounts(), LoopCosts(),
484       TRT((TRT == None) ? Optional<unsigned>(TemporalReuseThreshold) : TRT),
485       LI(LI), SE(SE), TTI(TTI), AA(AA), DI(DI) {
486   assert(!Loops.empty() && "Expecting a non-empty loop vector.");
487 
488   for (const Loop *L : Loops) {
489     unsigned TripCount = SE.getSmallConstantTripCount(L);
490     TripCount = (TripCount == 0) ? DefaultTripCount : TripCount;
491     TripCounts.push_back({L, TripCount});
492   }
493 
494   calculateCacheFootprint();
495 }
496 
497 std::unique_ptr<CacheCost>
getCacheCost(Loop & Root,LoopStandardAnalysisResults & AR,DependenceInfo & DI,Optional<unsigned> TRT)498 CacheCost::getCacheCost(Loop &Root, LoopStandardAnalysisResults &AR,
499                         DependenceInfo &DI, Optional<unsigned> TRT) {
500   if (!Root.isOutermost()) {
501     LLVM_DEBUG(dbgs() << "Expecting the outermost loop in a loop nest\n");
502     return nullptr;
503   }
504 
505   LoopVectorTy Loops;
506   append_range(Loops, breadth_first(&Root));
507 
508   if (!getInnerMostLoop(Loops)) {
509     LLVM_DEBUG(dbgs() << "Cannot compute cache cost of loop nest with more "
510                          "than one innermost loop\n");
511     return nullptr;
512   }
513 
514   return std::make_unique<CacheCost>(Loops, AR.LI, AR.SE, AR.TTI, AR.AA, DI, TRT);
515 }
516 
calculateCacheFootprint()517 void CacheCost::calculateCacheFootprint() {
518   LLVM_DEBUG(dbgs() << "POPULATING REFERENCE GROUPS\n");
519   ReferenceGroupsTy RefGroups;
520   if (!populateReferenceGroups(RefGroups))
521     return;
522 
523   LLVM_DEBUG(dbgs() << "COMPUTING LOOP CACHE COSTS\n");
524   for (const Loop *L : Loops) {
525     assert((std::find_if(LoopCosts.begin(), LoopCosts.end(),
526                          [L](const LoopCacheCostTy &LCC) {
527                            return LCC.first == L;
528                          }) == LoopCosts.end()) &&
529            "Should not add duplicate element");
530     CacheCostTy LoopCost = computeLoopCacheCost(*L, RefGroups);
531     LoopCosts.push_back(std::make_pair(L, LoopCost));
532   }
533 
534   sortLoopCosts();
535   RefGroups.clear();
536 }
537 
populateReferenceGroups(ReferenceGroupsTy & RefGroups) const538 bool CacheCost::populateReferenceGroups(ReferenceGroupsTy &RefGroups) const {
539   assert(RefGroups.empty() && "Reference groups should be empty");
540 
541   unsigned CLS = TTI.getCacheLineSize();
542   Loop *InnerMostLoop = getInnerMostLoop(Loops);
543   assert(InnerMostLoop != nullptr && "Expecting a valid innermost loop");
544 
545   for (BasicBlock *BB : InnerMostLoop->getBlocks()) {
546     for (Instruction &I : *BB) {
547       if (!isa<StoreInst>(I) && !isa<LoadInst>(I))
548         continue;
549 
550       std::unique_ptr<IndexedReference> R(new IndexedReference(I, LI, SE));
551       if (!R->isValid())
552         continue;
553 
554       bool Added = false;
555       for (ReferenceGroupTy &RefGroup : RefGroups) {
556         const IndexedReference &Representative = *RefGroup.front().get();
557         LLVM_DEBUG({
558           dbgs() << "References:\n";
559           dbgs().indent(2) << *R << "\n";
560           dbgs().indent(2) << Representative << "\n";
561         });
562 
563 
564        // FIXME: Both positive and negative access functions will be placed
565        // into the same reference group, resulting in a bi-directional array
566        // access such as:
567        //   for (i = N; i > 0; i--)
568        //     A[i] = A[N - i];
569        // having the same cost calculation as a single dimention access pattern
570        //   for (i = 0; i < N; i++)
571        //     A[i] = A[i];
572        // when in actuality, depending on the array size, the first example
573        // should have a cost closer to 2x the second due to the two cache
574        // access per iteration from opposite ends of the array
575         Optional<bool> HasTemporalReuse =
576             R->hasTemporalReuse(Representative, *TRT, *InnerMostLoop, DI, AA);
577         Optional<bool> HasSpacialReuse =
578             R->hasSpacialReuse(Representative, CLS, AA);
579 
580         if ((HasTemporalReuse.hasValue() && *HasTemporalReuse) ||
581             (HasSpacialReuse.hasValue() && *HasSpacialReuse)) {
582           RefGroup.push_back(std::move(R));
583           Added = true;
584           break;
585         }
586       }
587 
588       if (!Added) {
589         ReferenceGroupTy RG;
590         RG.push_back(std::move(R));
591         RefGroups.push_back(std::move(RG));
592       }
593     }
594   }
595 
596   if (RefGroups.empty())
597     return false;
598 
599   LLVM_DEBUG({
600     dbgs() << "\nIDENTIFIED REFERENCE GROUPS:\n";
601     int n = 1;
602     for (const ReferenceGroupTy &RG : RefGroups) {
603       dbgs().indent(2) << "RefGroup " << n << ":\n";
604       for (const auto &IR : RG)
605         dbgs().indent(4) << *IR << "\n";
606       n++;
607     }
608     dbgs() << "\n";
609   });
610 
611   return true;
612 }
613 
614 CacheCostTy
computeLoopCacheCost(const Loop & L,const ReferenceGroupsTy & RefGroups) const615 CacheCost::computeLoopCacheCost(const Loop &L,
616                                 const ReferenceGroupsTy &RefGroups) const {
617   if (!L.isLoopSimplifyForm())
618     return InvalidCost;
619 
620   LLVM_DEBUG(dbgs() << "Considering loop '" << L.getName()
621                     << "' as innermost loop.\n");
622 
623   // Compute the product of the trip counts of each other loop in the nest.
624   CacheCostTy TripCountsProduct = 1;
625   for (const auto &TC : TripCounts) {
626     if (TC.first == &L)
627       continue;
628     TripCountsProduct *= TC.second;
629   }
630 
631   CacheCostTy LoopCost = 0;
632   for (const ReferenceGroupTy &RG : RefGroups) {
633     CacheCostTy RefGroupCost = computeRefGroupCacheCost(RG, L);
634     LoopCost += RefGroupCost * TripCountsProduct;
635   }
636 
637   LLVM_DEBUG(dbgs().indent(2) << "Loop '" << L.getName()
638                               << "' has cost=" << LoopCost << "\n");
639 
640   return LoopCost;
641 }
642 
computeRefGroupCacheCost(const ReferenceGroupTy & RG,const Loop & L) const643 CacheCostTy CacheCost::computeRefGroupCacheCost(const ReferenceGroupTy &RG,
644                                                 const Loop &L) const {
645   assert(!RG.empty() && "Reference group should have at least one member.");
646 
647   const IndexedReference *Representative = RG.front().get();
648   return Representative->computeRefCost(L, TTI.getCacheLineSize());
649 }
650 
651 //===----------------------------------------------------------------------===//
652 // LoopCachePrinterPass implementation
653 //
run(Loop & L,LoopAnalysisManager & AM,LoopStandardAnalysisResults & AR,LPMUpdater & U)654 PreservedAnalyses LoopCachePrinterPass::run(Loop &L, LoopAnalysisManager &AM,
655                                             LoopStandardAnalysisResults &AR,
656                                             LPMUpdater &U) {
657   Function *F = L.getHeader()->getParent();
658   DependenceInfo DI(F, &AR.AA, &AR.SE, &AR.LI);
659 
660   if (auto CC = CacheCost::getCacheCost(L, AR, DI))
661     OS << *CC;
662 
663   return PreservedAnalyses::all();
664 }
665