//===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// This file implements SLP analysis based on VPlan, following the ideas
/// described in
///
///   Look-ahead SLP: auto-vectorization in the presence of commutative
///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
///   Luís F. W. Góes
///
//===----------------------------------------------------------------------===//

#include "VPlan.h"
#include "VPlanValue.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "vplan-slp"

// Number of levels to look ahead when re-ordering multi-node operands.
static unsigned LookaheadMaxDepth = 5;

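// Mark the graph construction as failed and return nullptr, so the failure can
// be propagated to users of the bundle being built.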
VPInstruction *VPlanSlp::markFailed() {
  // FIXME: Currently this is used to signal we hit instructions we cannot
  //        trivially SLP'ize.
  CompletelySLP = false;
  return nullptr;
}

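// Record New as the combined VPInstruction for the bundle Operands and, if
// all operands have an underlying IR instruction, update the widest bundle
// width seen so far.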
void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
  if (all_of(Operands, [](VPValue *V) {
        return cast<VPInstruction>(V)->getUnderlyingInstr();
      })) {
    unsigned BundleSize = 0;
    for (VPValue *V : Operands) {
      Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();
      assert(!T->isVectorTy() && "Only scalar types supported for now");
      BundleSize += T->getScalarSizeInBits();
    }
    WidestBundleBits = std::max(WidestBundleBits, BundleSize);
  }

  auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New);
  assert(Res.second &&
         "Already created a combined instruction for the operand bundle");
  (void)Res;
}

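// Returns true if the bundle Operands can be combined into a single, wider
// instruction: all operands must be VPInstructions with an underlying IR
// instruction, agree in opcode and type width, be defined in the same block,
// have at most one unique user each and, for loads and stores, be simple with
// no instruction writing to memory in between the loads.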
bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
  // Currently we only support VPInstructions.
  if (!all_of(Operands, [](VPValue *Op) {
        return Op && isa<VPInstruction>(Op) &&
               cast<VPInstruction>(Op)->getUnderlyingInstr();
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
    return false;
  }

  // Check if opcodes and type width agree for all instructions in the bundle.
  // FIXME: Differing widths/opcodes can be handled by inserting additional
  //        instructions.
  // FIXME: Deal with non-primitive types.
  const Instruction *OriginalInstr =
      cast<VPInstruction>(Operands[0])->getUnderlyingInstr();
  unsigned Opcode = OriginalInstr->getOpcode();
  unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
  if (!all_of(Operands, [Opcode, Width](VPValue *Op) {
        const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr();
        return I->getOpcode() == Opcode &&
               I->getType()->getPrimitiveSizeInBits() == Width;
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: Opcodes or widths do not agree\n");
    return false;
  }

  // For now, all operands must be defined in the same BB.
  if (any_of(Operands, [this](VPValue *Op) {
        return cast<VPInstruction>(Op)->getParent() != &this->BB;
      })) {
    LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
    return false;
  }

  if (any_of(Operands,
             [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) {
    LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
    return false;
  }

  // For loads, check that there are no instructions writing to memory in
  // between them.
  // TODO: We only have to forbid instructions writing to memory that could
  //       interfere with any of the loads in the bundle.
  if (Opcode == Instruction::Load) {
    unsigned LoadsSeen = 0;
    VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
    for (auto &I : *Parent) {
      auto *VPI = dyn_cast<VPInstruction>(&I);
      if (!VPI)
        break;
      if (VPI->getOpcode() == Instruction::Load &&
          llvm::is_contained(Operands, VPI))
        LoadsSeen++;

      if (LoadsSeen == Operands.size())
        break;
      if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {
        LLVM_DEBUG(
            dbgs() << "VPSLP: instruction modifying memory between loads\n");
        return false;
      }
    }

    if (!all_of(Operands, [](VPValue *Op) {
          return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
              ->isSimple();
        })) {
      LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
      return false;
    }
  }

  if (Opcode == Instruction::Store)
    if (!all_of(Operands, [](VPValue *Op) {
          return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
              ->isSimple();
        })) {
      LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
      return false;
    }

  return true;
}

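/// Returns a bundle containing the operand at index OperandIndex of each
/// value in Values.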
static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
                                             unsigned OperandIndex) {
  SmallVector<VPValue *, 4> Operands;
  for (VPValue *V : Values) {
    // Currently we only support VPInstructions.
    auto *U = cast<VPInstruction>(V);
    Operands.push_back(U->getOperand(OperandIndex));
  }
  return Operands;
}

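/// Returns true if the opcode of the first value in Values is commutative.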
static bool areCommutative(ArrayRef<VPValue *> Values) {
  return Instruction::isCommutative(
      cast<VPInstruction>(Values[0])->getOpcode());
}

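/// Builds the operand bundles for Values: stores contribute only their stored
/// value (operand 0), loads terminate the tree, and all other instructions
/// contribute one bundle per operand index.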
static SmallVector<SmallVector<VPValue *, 4>, 4>
getOperands(ArrayRef<VPValue *> Values) {
  SmallVector<SmallVector<VPValue *, 4>, 4> Result;
  auto *VPI = cast<VPInstruction>(Values[0]);

  switch (VPI->getOpcode()) {
  case Instruction::Load:
    llvm_unreachable("Loads terminate a tree, no need to get operands");
  case Instruction::Store:
    Result.push_back(getOperands(Values, 0));
    break;
  default:
    for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I)
      Result.push_back(getOperands(Values, I));
    break;
  }

  return Result;
}

/// Returns the common opcode of Values, or None if they do not all agree.
static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
  unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
  if (any_of(Values, [Opcode](VPValue *V) {
        return cast<VPInstruction>(V)->getOpcode() != Opcode;
      }))
    return None;
  return {Opcode};
}

/// Returns true if A and B have matching opcodes and, if they are loads or
/// stores, also access consecutive memory within the same interleave group.
static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
                                  VPInterleavedAccessInfo &IAI) {
  if (A->getOpcode() != B->getOpcode())
    return false;

  if (A->getOpcode() != Instruction::Load &&
      A->getOpcode() != Instruction::Store)
    return true;
  auto *GA = IAI.getInterleaveGroup(A);
  auto *GB = IAI.getInterleaveGroup(B);

  return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
}

/// Implements getLAScore from Listing 7 in the paper.
/// Traverses and compares operands of V1 and V2 to MaxLevel.
static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
                           VPInterleavedAccessInfo &IAI) {
  auto *I1 = dyn_cast<VPInstruction>(V1);
  auto *I2 = dyn_cast<VPInstruction>(V2);
  // Currently we only support VPInstructions.
  if (!I1 || !I2)
    return 0;

  if (MaxLevel == 0)
    return (unsigned)areConsecutiveOrMatch(I1, I2, IAI);

  unsigned Score = 0;
  for (unsigned I = 0, EV1 = I1->getNumOperands(); I < EV1; ++I)
    for (unsigned J = 0, EV2 = I2->getNumOperands(); J < EV2; ++J)
      Score +=
          getLAScore(I1->getOperand(I), I2->getOperand(J), MaxLevel - 1, IAI);
  return Score;
}

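// Returns the candidate from Candidates that best matches Last: loads and
// stores must access consecutive memory, other instructions must have the
// same opcode. If several candidates match, look-ahead scores of their
// operands are used to break the tie.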
std::pair<VPlanSlp::OpMode, VPValue *>
VPlanSlp::getBest(OpMode Mode, VPValue *Last,
                  SmallPtrSetImpl<VPValue *> &Candidates,
                  VPInterleavedAccessInfo &IAI) {
  assert((Mode == OpMode::Load || Mode == OpMode::Opcode) &&
         "Currently we only handle load and commutative opcodes");
  LLVM_DEBUG(dbgs() << "      getBest\n");

  SmallVector<VPValue *, 4> BestCandidates;
  LLVM_DEBUG(dbgs() << "        Candidates  for "
                    << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
  for (auto *Candidate : Candidates) {
    auto *LastI = cast<VPInstruction>(Last);
    auto *CandidateI = cast<VPInstruction>(Candidate);
    if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
      LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
                        << " ");
      BestCandidates.push_back(Candidate);
    }
  }
  LLVM_DEBUG(dbgs() << "\n");

  if (BestCandidates.empty())
    return {OpMode::Failed, nullptr};

  if (BestCandidates.size() == 1)
    return {Mode, BestCandidates[0]};

  VPValue *Best = nullptr;
  unsigned BestScore = 0;
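  // Multiple candidates match equally well. Compare look-ahead scores at
  // increasing depths until the scores distinguish the candidates (or the
  // maximum depth is reached) and remember the candidate with the highest
  // score.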
  for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {
    unsigned PrevScore = ~0u;
    bool AllSame = true;

    // FIXME: Avoid visiting the same operands multiple times.
    for (auto *Candidate : BestCandidates) {
      unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
      if (PrevScore == ~0u)
        PrevScore = Score;
      if (PrevScore != Score)
        AllSame = false;
      PrevScore = Score;

      if (Score > BestScore) {
        BestScore = Score;
        Best = Candidate;
      }
    }
    if (!AllSame)
      break;
  }
  // All look-ahead scores may be zero, in which case no best candidate was
  // found; return a null value so the caller can handle the failure.
  if (Best) {
    LLVM_DEBUG(dbgs() << "Found best "
                      << *cast<VPInstruction>(Best)->getUnderlyingInstr()
                      << "\n");
    Candidates.erase(Best);
  }

  return {Mode, Best};
}

SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
  SmallVector<MultiNodeOpTy, 4> FinalOrder;
  SmallVector<OpMode, 4> Mode;
  FinalOrder.reserve(MultiNodeOps.size());
  Mode.reserve(MultiNodeOps.size());

  LLVM_DEBUG(dbgs() << "Reordering multinode\n");

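  // Seed the final order with lane 0 of every operand bundle and record
  // whether each bundle consists of loads or of other (commutative)
  // operations.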
  for (auto &Operands : MultiNodeOps) {
    FinalOrder.push_back({Operands.first, {Operands.second[0]}});
    if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
        Instruction::Load)
      Mode.push_back(OpMode::Load);
    else
      Mode.push_back(OpMode::Opcode);
  }

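  // Lane 0 was used to seed the final order above. For each remaining lane,
  // greedily pick the best match for every operand bundle from the candidates
  // of that lane.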
  for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
    LLVM_DEBUG(dbgs() << "  Finding best value for lane " << Lane << "\n");
    SmallPtrSet<VPValue *, 4> Candidates;
    LLVM_DEBUG(dbgs() << "  Candidates  ");
    for (const auto &Ops : MultiNodeOps) {
      LLVM_DEBUG(
          dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
                 << " ");
      Candidates.insert(Ops.second[Lane]);
    }
    LLVM_DEBUG(dbgs() << "\n");

    for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) {
      LLVM_DEBUG(dbgs() << "  Checking " << Op << "\n");
      if (Mode[Op] == OpMode::Failed)
        continue;

      VPValue *Last = FinalOrder[Op].second[Lane - 1];
      std::pair<OpMode, VPValue *> Res =
          getBest(Mode[Op], Last, Candidates, IAI);
      if (Res.second)
        FinalOrder[Op].second.push_back(Res.second);
      else
        // TODO: handle this case
        FinalOrder[Op].second.push_back(markFailed());
    }
  }

  return FinalOrder;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
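// Prints the underlying instructions of the bundle Values to the debug output
// stream.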
void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
  dbgs() << " Ops: ";
  for (auto Op : Values) {
    if (auto *VPInstr = cast_or_null<VPInstruction>(Op))
      if (auto *Instr = VPInstr->getUnderlyingInstr()) {
        dbgs() << *Instr << " | ";
        continue;
      }
    dbgs() << " nullptr | ";
  }
  dbgs() << "\n";
}
#endif

VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
  assert(!Values.empty() && "Need some operands!");

  // If we already visited this instruction bundle, re-use the existing node.
  auto I = BundleToCombined.find(to_vector<4>(Values));
  if (I != BundleToCombined.end()) {
#ifndef NDEBUG
    // Check that the resulting graph is a tree. If we re-use a node, this
    // means its values have multiple users. We only allow this if all users
    // of each value are the same instruction.
    for (auto *V : Values) {
      auto UI = V->user_begin();
      auto *FirstUser = *UI++;
      while (UI != V->user_end()) {
        assert(*UI == FirstUser && "Currently we only support SLP trees.");
        UI++;
      }
    }
#endif
    return I->second;
  }

  // Dump inputs.
  LLVM_DEBUG({
    dbgs() << "buildGraph: ";
    dumpBundle(Values);
  });

  if (!areVectorizable(Values))
    return markFailed();

  assert(getOpcode(Values) && "Opcodes for all values must match");
  unsigned ValuesOpcode = *getOpcode(Values);

  SmallVector<VPValue *, 4> CombinedOperands;
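  // For commutative operations, the operands may need to be re-ordered.
  // Operand bundles whose opcode matches the current bundle continue the
  // graph directly; all other bundles become part of a multi-node, whose
  // operands are re-ordered once the root of the multi-node is reached.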
  if (areCommutative(Values)) {
    bool MultiNodeRoot = !MultiNodeActive;
    MultiNodeActive = true;
    for (auto &Operands : getOperands(Values)) {
      LLVM_DEBUG({
        dbgs() << "  Visiting Commutative";
        dumpBundle(Operands);
      });

      auto OperandsOpcode = getOpcode(Operands);
      if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {
        LLVM_DEBUG(dbgs() << "    Same opcode, continue building\n");
        CombinedOperands.push_back(buildGraph(Operands));
      } else {
        LLVM_DEBUG(dbgs() << "    Adding multinode Ops\n");
        // Create a dummy VPInstruction, which we will replace later with the
        // re-ordered operand.
        VPInstruction *Op = new VPInstruction(0, {});
        CombinedOperands.push_back(Op);
        MultiNodeOps.emplace_back(Op, Operands);
      }
    }

    if (MultiNodeRoot) {
      LLVM_DEBUG(dbgs() << "Reorder\n");
      MultiNodeActive = false;

      auto FinalOrder = reorderMultiNodeOps();

      MultiNodeOps.clear();
      for (auto &Ops : FinalOrder) {
        VPInstruction *NewOp = buildGraph(Ops.second);
        Ops.first->replaceAllUsesWith(NewOp);
        for (unsigned i = 0; i < CombinedOperands.size(); i++)
          if (CombinedOperands[i] == Ops.first)
            CombinedOperands[i] = NewOp;
        delete Ops.first;
        Ops.first = NewOp;
      }
      LLVM_DEBUG(dbgs() << "Found final order\n");
    }
  } else {
    LLVM_DEBUG(dbgs() << "  NonCommutative\n");
    if (ValuesOpcode == Instruction::Load)
      for (VPValue *V : Values)
        CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
    else
      for (auto &Operands : getOperands(Values))
        CombinedOperands.push_back(buildGraph(Operands));
  }

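  // Map loads and stores to the SLP-specific opcodes; all other opcodes are
  // used unchanged for the combined instruction.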
  unsigned Opcode;
  switch (ValuesOpcode) {
  case Instruction::Load:
    Opcode = VPInstruction::SLPLoad;
    break;
  case Instruction::Store:
    Opcode = VPInstruction::SLPStore;
    break;
  default:
    Opcode = ValuesOpcode;
    break;
  }

  if (!CompletelySLP)
    return markFailed();

  assert(!CombinedOperands.empty() && "Need some operands");
  auto *Inst = cast<VPInstruction>(Values[0])->getUnderlyingInstr();
  auto *VPI = new VPInstruction(Opcode, CombinedOperands, Inst->getDebugLoc());
  VPI->setUnderlyingInstr(Inst);

  LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " "
                    << *cast<VPInstruction>(Values[0]) << "\n");
  addCombined(Values, VPI);
  return VPI;
}