//===- MVELaneInterleaving.cpp - Interleave for MVE instructions ----------===//
2*da58b97aSjoerg //
3*da58b97aSjoerg // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*da58b97aSjoerg // See https://llvm.org/LICENSE.txt for license information.
5*da58b97aSjoerg // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*da58b97aSjoerg //
7*da58b97aSjoerg //===----------------------------------------------------------------------===//
8*da58b97aSjoerg //
9*da58b97aSjoerg // This pass interleaves around sext/zext/trunc instructions. MVE does not have
10*da58b97aSjoerg // a single sext/zext or trunc instruction that takes the bottom half of a
11*da58b97aSjoerg // vector and extends to a full width, like NEON has with MOVL. Instead it is
12*da58b97aSjoerg // expected that this happens through top/bottom instructions. So the MVE
13*da58b97aSjoerg // equivalent VMOVLT/B instructions take either the even or odd elements of the
14*da58b97aSjoerg // input and extend them to the larger type, producing a vector with half the
15*da58b97aSjoerg // number of elements each of double the bitwidth. As there is no simple
16*da58b97aSjoerg // instruction, we often have to turn sext/zext/trunc into a series of lane
17*da58b97aSjoerg // moves (or stack loads/stores, which we do not do yet).
18*da58b97aSjoerg //
19*da58b97aSjoerg // This pass takes vector code that starts at truncs, looks for interconnected
20*da58b97aSjoerg // blobs of operations that end with sext/zext (or constants/splats) of the
21*da58b97aSjoerg // form:
22*da58b97aSjoerg //   %sa = sext v8i16 %a to v8i32
23*da58b97aSjoerg //   %sb = sext v8i16 %b to v8i32
24*da58b97aSjoerg //   %add = add v8i32 %sa, %sb
25*da58b97aSjoerg //   %r = trunc %add to v8i16
// And adds shuffles to allow the use of VMOVL/VMOVN instructions:
27*da58b97aSjoerg //   %sha = shuffle v8i16 %a, undef, <0, 2, 4, 6, 1, 3, 5, 7>
28*da58b97aSjoerg //   %sa = sext v8i16 %sha to v8i32
29*da58b97aSjoerg //   %shb = shuffle v8i16 %b, undef, <0, 2, 4, 6, 1, 3, 5, 7>
30*da58b97aSjoerg //   %sb = sext v8i16 %shb to v8i32
31*da58b97aSjoerg //   %add = add v8i32 %sa, %sb
32*da58b97aSjoerg //   %r = trunc %add to v8i16
33*da58b97aSjoerg //   %shr = shuffle v8i16 %r, undef, <0, 4, 1, 5, 2, 6, 3, 7>
34*da58b97aSjoerg // Which can then be split and lowered to MVE instructions efficiently:
35*da58b97aSjoerg //   %sa_b = VMOVLB.s16 %a
36*da58b97aSjoerg //   %sa_t = VMOVLT.s16 %a
37*da58b97aSjoerg //   %sb_b = VMOVLB.s16 %b
38*da58b97aSjoerg //   %sb_t = VMOVLT.s16 %b
39*da58b97aSjoerg //   %add_b = VADD.i32 %sa_b, %sb_b
40*da58b97aSjoerg //   %add_t = VADD.i32 %sa_t, %sb_t
41*da58b97aSjoerg //   %r = VMOVNT.i16 %add_b, %add_t
42*da58b97aSjoerg //
43*da58b97aSjoerg //===----------------------------------------------------------------------===//
44*da58b97aSjoerg 
45*da58b97aSjoerg #include "ARM.h"
46*da58b97aSjoerg #include "ARMBaseInstrInfo.h"
47*da58b97aSjoerg #include "ARMSubtarget.h"
48*da58b97aSjoerg #include "llvm/Analysis/TargetTransformInfo.h"
49*da58b97aSjoerg #include "llvm/CodeGen/TargetLowering.h"
50*da58b97aSjoerg #include "llvm/CodeGen/TargetPassConfig.h"
51*da58b97aSjoerg #include "llvm/CodeGen/TargetSubtargetInfo.h"
52*da58b97aSjoerg #include "llvm/IR/BasicBlock.h"
53*da58b97aSjoerg #include "llvm/IR/Constant.h"
54*da58b97aSjoerg #include "llvm/IR/Constants.h"
55*da58b97aSjoerg #include "llvm/IR/DerivedTypes.h"
56*da58b97aSjoerg #include "llvm/IR/Function.h"
57*da58b97aSjoerg #include "llvm/IR/IRBuilder.h"
58*da58b97aSjoerg #include "llvm/IR/InstIterator.h"
59*da58b97aSjoerg #include "llvm/IR/InstrTypes.h"
60*da58b97aSjoerg #include "llvm/IR/Instruction.h"
61*da58b97aSjoerg #include "llvm/IR/Instructions.h"
62*da58b97aSjoerg #include "llvm/IR/IntrinsicInst.h"
63*da58b97aSjoerg #include "llvm/IR/Intrinsics.h"
64*da58b97aSjoerg #include "llvm/IR/IntrinsicsARM.h"
65*da58b97aSjoerg #include "llvm/IR/PatternMatch.h"
66*da58b97aSjoerg #include "llvm/IR/Type.h"
67*da58b97aSjoerg #include "llvm/IR/Value.h"
68*da58b97aSjoerg #include "llvm/InitializePasses.h"
69*da58b97aSjoerg #include "llvm/Pass.h"
70*da58b97aSjoerg #include "llvm/Support/Casting.h"
71*da58b97aSjoerg #include <algorithm>
72*da58b97aSjoerg #include <cassert>
73*da58b97aSjoerg 
74*da58b97aSjoerg using namespace llvm;
75*da58b97aSjoerg 
76*da58b97aSjoerg #define DEBUG_TYPE "mve-laneinterleave"
77*da58b97aSjoerg 
78*da58b97aSjoerg cl::opt<bool> EnableInterleave(
79*da58b97aSjoerg     "enable-mve-interleave", cl::Hidden, cl::init(true),
80*da58b97aSjoerg     cl::desc("Enable interleave MVE vector operation lowering"));
81*da58b97aSjoerg 
82*da58b97aSjoerg namespace {
83*da58b97aSjoerg 
// Legacy-PM function pass that inserts interleaving shuffles around
// ext -> op -> trunc groups so they can later be lowered to the MVE
// top/bottom VMOVL/VMOVN instructions (see the file header for the
// full transformation description).
class MVELaneInterleaving : public FunctionPass {
public:
  static char ID; // Pass identification, replacement for typeid

  explicit MVELaneInterleaving() : FunctionPass(ID) {
    initializeMVELaneInterleavingPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "MVE lane interleaving"; }

  // Only instructions inside blocks are rewritten, so the CFG is preserved.
  // TargetPassConfig is required to query the ARM subtarget features.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<TargetPassConfig>();
    FunctionPass::getAnalysisUsage(AU);
  }
};
102*da58b97aSjoerg 
103*da58b97aSjoerg } // end anonymous namespace
104*da58b97aSjoerg 
char MVELaneInterleaving::ID = 0;

// Register the pass with the legacy pass manager under the DEBUG_TYPE name.
INITIALIZE_PASS(MVELaneInterleaving, DEBUG_TYPE, "MVE lane interleaving", false,
                false)
109*da58b97aSjoerg 
createMVELaneInterleavingPass()110*da58b97aSjoerg Pass *llvm::createMVELaneInterleavingPass() {
111*da58b97aSjoerg   return new MVELaneInterleaving();
112*da58b97aSjoerg }
113*da58b97aSjoerg 
isProfitableToInterleave(SmallSetVector<Instruction *,4> & Exts,SmallSetVector<Instruction *,4> & Truncs)114*da58b97aSjoerg static bool isProfitableToInterleave(SmallSetVector<Instruction *, 4> &Exts,
115*da58b97aSjoerg                                      SmallSetVector<Instruction *, 4> &Truncs) {
116*da58b97aSjoerg   // This is not always beneficial to transform. Exts can be incorporated into
117*da58b97aSjoerg   // loads, Truncs can be folded into stores.
118*da58b97aSjoerg   // Truncs are usually the same number of instructions,
119*da58b97aSjoerg   //  VSTRH.32(A);VSTRH.32(B) vs VSTRH.16(VMOVNT A, B) with interleaving
120*da58b97aSjoerg   // Exts are unfortunately more instructions in the general case:
121*da58b97aSjoerg   //  A=VLDRH.32; B=VLDRH.32;
122*da58b97aSjoerg   // vs with interleaving:
123*da58b97aSjoerg   //  T=VLDRH.16; A=VMOVNB T; B=VMOVNT T
124*da58b97aSjoerg   // But those VMOVL may be folded into a VMULL.
125*da58b97aSjoerg 
126*da58b97aSjoerg   // But expensive extends/truncs are always good to remove. FPExts always
127*da58b97aSjoerg   // involve extra VCVT's so are always considered to be beneficial to convert.
128*da58b97aSjoerg   for (auto *E : Exts) {
129*da58b97aSjoerg     if (isa<FPExtInst>(E) || !isa<LoadInst>(E->getOperand(0))) {
130*da58b97aSjoerg       LLVM_DEBUG(dbgs() << "Beneficial due to " << *E << "\n");
131*da58b97aSjoerg       return true;
132*da58b97aSjoerg     }
133*da58b97aSjoerg   }
134*da58b97aSjoerg   for (auto *T : Truncs) {
135*da58b97aSjoerg     if (T->hasOneUse() && !isa<StoreInst>(*T->user_begin())) {
136*da58b97aSjoerg       LLVM_DEBUG(dbgs() << "Beneficial due to " << *T << "\n");
137*da58b97aSjoerg       return true;
138*da58b97aSjoerg     }
139*da58b97aSjoerg   }
140*da58b97aSjoerg 
141*da58b97aSjoerg   // Otherwise, we know we have a load(ext), see if any of the Extends are a
142*da58b97aSjoerg   // vmull. This is a simple heuristic and certainly not perfect.
143*da58b97aSjoerg   for (auto *E : Exts) {
144*da58b97aSjoerg     if (!E->hasOneUse() ||
145*da58b97aSjoerg         cast<Instruction>(*E->user_begin())->getOpcode() != Instruction::Mul) {
146*da58b97aSjoerg       LLVM_DEBUG(dbgs() << "Not beneficial due to " << *E << "\n");
147*da58b97aSjoerg       return false;
148*da58b97aSjoerg     }
149*da58b97aSjoerg   }
150*da58b97aSjoerg   return true;
151*da58b97aSjoerg }
152*da58b97aSjoerg 
// Starting from the vector trunc `Start`, walk the connected graph of vector
// operations (upwards through operands, downwards through users), collecting:
//   - Exts: sext/zext/fpext leaves that feed the group,
//   - Ops: the interior lane-wise operations,
//   - OtherLeafs: non-instruction vector operands (constants/splats),
//   - Truncs: the truncs that terminate the group.
// If the whole group is lane-wise, correctly typed and looks profitable,
// insert an interleaving shuffle on every leaf and a de-interleaving shuffle
// after every trunc, enabling VMOVLB/VMOVLT/VMOVNB/VMOVNT lowering.
// Returns true if the IR was changed. `Visited` accumulates the truncs that
// have been processed so the caller does not restart a walk from them.
static bool tryInterleave(Instruction *Start,
                          SmallPtrSetImpl<Instruction *> &Visited) {
  LLVM_DEBUG(dbgs() << "tryInterleave from " << *Start << "\n");
  auto *VT = cast<FixedVectorType>(Start->getType());

  // The trunc's input must be an instruction we can walk into.
  if (!isa<Instruction>(Start->getOperand(0)))
    return false;

  // Look for connected operations starting from Ext's, terminating at Truncs.
  std::vector<Instruction *> Worklist;
  Worklist.push_back(Start);
  Worklist.push_back(cast<Instruction>(Start->getOperand(0)));

  SmallSetVector<Instruction *, 4> Truncs;
  SmallSetVector<Instruction *, 4> Exts;
  SmallSetVector<Use *, 4> OtherLeafs;
  SmallSetVector<Instruction *, 4> Ops;

  while (!Worklist.empty()) {
    Instruction *I = Worklist.back();
    Worklist.pop_back();

    switch (I->getOpcode()) {
    // Truncs terminate the graph downwards. Record them and mark them
    // visited so runOnFunction does not start a fresh walk from them.
    case Instruction::Trunc:
    case Instruction::FPTrunc:
      if (Truncs.count(I))
        continue;
      Truncs.insert(I);
      Visited.insert(I);
      break;

    // Extend leafs terminate the graph upwards, but all of their users must
    // belong to the group too, so keep walking down through them.
    case Instruction::SExt:
    case Instruction::ZExt:
    case Instruction::FPExt:
      if (Exts.count(I))
        continue;
      for (auto *Use : I->users())
        Worklist.push_back(cast<Instruction>(Use));
      Exts.insert(I);
      break;

    case Instruction::Call: {
      // Only calls to known lane-wise intrinsics may appear in the group.
      IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
      if (!II)
        return false;

      switch (II->getIntrinsicID()) {
      case Intrinsic::abs:
      case Intrinsic::smin:
      case Intrinsic::smax:
      case Intrinsic::umin:
      case Intrinsic::umax:
      case Intrinsic::sadd_sat:
      case Intrinsic::ssub_sat:
      case Intrinsic::uadd_sat:
      case Intrinsic::usub_sat:
      case Intrinsic::minnum:
      case Intrinsic::maxnum:
      case Intrinsic::fabs:
      case Intrinsic::fma:
      case Intrinsic::ceil:
      case Intrinsic::floor:
      case Intrinsic::rint:
      case Intrinsic::round:
      case Intrinsic::trunc:
        break;
      default:
        return false;
      }
      LLVM_FALLTHROUGH; // Fall through to treating these like an operator below.
    }
    // Binary/tertiary ops
    case Instruction::Add:
    case Instruction::Sub:
    case Instruction::Mul:
    case Instruction::AShr:
    case Instruction::LShr:
    case Instruction::Shl:
    case Instruction::ICmp:
    case Instruction::FCmp:
    case Instruction::FAdd:
    case Instruction::FMul:
    case Instruction::Select:
      if (Ops.count(I))
        continue;
      Ops.insert(I);

      // Walk up through every vector operand. Non-instruction operands
      // (constants etc) become leaves that will need their own shuffle.
      for (Use &Op : I->operands()) {
        if (!isa<FixedVectorType>(Op->getType()))
          continue;
        if (isa<Instruction>(Op))
          Worklist.push_back(cast<Instruction>(&Op));
        else
          OtherLeafs.insert(&Op);
      }

      // And walk down through every user, which must eventually reach truncs
      // (or more ops) for the group to remain closed.
      for (auto *Use : I->users())
        Worklist.push_back(cast<Instruction>(Use));
      break;

    case Instruction::ShuffleVector:
      // A shuffle of a splat is a splat. Lane order is irrelevant for a
      // splat, so it can safely terminate the walk here.
      if (cast<ShuffleVectorInst>(I)->isZeroEltSplat())
        continue;
      LLVM_FALLTHROUGH;

    default:
      LLVM_DEBUG(dbgs() << "  Unhandled instruction: " << *I << "\n");
      return false;
    }
  }

  // With no leaves at all there is nothing to interleave around.
  if (Exts.empty() && OtherLeafs.empty())
    return false;

  LLVM_DEBUG({
    dbgs() << "Found group:\n  Exts:";
    for (auto *I : Exts)
      dbgs() << "  " << *I << "\n";
    dbgs() << "  Ops:";
    for (auto *I : Ops)
      dbgs() << "  " << *I << "\n";
    dbgs() << "  OtherLeafs:";
    for (auto *I : OtherLeafs)
      dbgs() << "  " << *I->get() << " of " << *I->getUser() << "\n";
    dbgs() << "Truncs:";
    for (auto *I : Truncs)
      dbgs() << "  " << *I << "\n";
  });

  assert(!Truncs.empty() && "Expected some truncs");

  // Check types. Only i16 (8 base elements) and i8 (16 base elements) narrow
  // types are handled, in whole multiples of a 128-bit vector.
  unsigned NumElts = VT->getNumElements();
  unsigned BaseElts = VT->getScalarSizeInBits() == 16
                          ? 8
                          : (VT->getScalarSizeInBits() == 8 ? 16 : 0);
  if (BaseElts == 0 || NumElts % BaseElts != 0) {
    LLVM_DEBUG(dbgs() << "  Type is unsupported\n");
    return false;
  }
  // The wide type must be exactly double the narrow width, matching what a
  // single VMOVL/VMOVN step produces.
  if (Start->getOperand(0)->getType()->getScalarSizeInBits() !=
      VT->getScalarSizeInBits() * 2) {
    LLVM_DEBUG(dbgs() << "  Type not double sized\n");
    return false;
  }
  // Every ext must extend from, and every trunc must truncate to, the same
  // narrow vector type as Start.
  for (Instruction *I : Exts)
    if (I->getOperand(0)->getType() != VT) {
      LLVM_DEBUG(dbgs() << "  Wrong type on " << *I << "\n");
      return false;
    }
  for (Instruction *I : Truncs)
    if (I->getType() != VT) {
      LLVM_DEBUG(dbgs() << "  Wrong type on " << *I << "\n");
      return false;
    }

  // Check that it looks beneficial
  if (!isProfitableToInterleave(Exts, Truncs))
    return false;

  // Create new shuffles around the extends / truncs / other leaves.
  IRBuilder<> Builder(Start);

  SmallVector<int, 16> LeafMask;
  SmallVector<int, 16> TruncMask;
  // LeafMask : 0, 2, 4, 6, 1, 3, 5, 7   8, 10, 12, 14,  9, 11, 13, 15
  // TruncMask: 0, 4, 1, 5, 2, 6, 3, 7   8, 12,  9, 13, 10, 14, 11, 15
  // LeafMask gathers the even lanes then the odd lanes within each base
  // chunk (so the ext halves map to VMOVLB/VMOVLT); TruncMask is its inverse,
  // restoring the original lane order after the trunc.
  for (unsigned Base = 0; Base < NumElts; Base += BaseElts) {
    for (unsigned i = 0; i < BaseElts / 2; i++)
      LeafMask.push_back(Base + i * 2);
    for (unsigned i = 0; i < BaseElts / 2; i++)
      LeafMask.push_back(Base + i * 2 + 1);
  }
  for (unsigned Base = 0; Base < NumElts; Base += BaseElts) {
    for (unsigned i = 0; i < BaseElts / 2; i++) {
      TruncMask.push_back(Base + i);
      TruncMask.push_back(Base + i + BaseElts / 2);
    }
  }

  // Rebuild each ext as ext(shuffle(input)) and replace its uses.
  for (Instruction *I : Exts) {
    LLVM_DEBUG(dbgs() << "Replacing ext " << *I << "\n");
    Builder.SetInsertPoint(I);
    Value *Shuffle = Builder.CreateShuffleVector(I->getOperand(0), LeafMask);
    bool FPext = isa<FPExtInst>(I);
    bool Sext = isa<SExtInst>(I);
    Value *Ext = FPext ? Builder.CreateFPExt(Shuffle, I->getType())
                       : Sext ? Builder.CreateSExt(Shuffle, I->getType())
                              : Builder.CreateZExt(Shuffle, I->getType());
    I->replaceAllUsesWith(Ext);
    LLVM_DEBUG(dbgs() << "  with " << *Shuffle << "\n");
  }

  // Shuffle each non-instruction leaf operand in place at its use.
  for (Use *I : OtherLeafs) {
    LLVM_DEBUG(dbgs() << "Replacing leaf " << *I << "\n");
    Builder.SetInsertPoint(cast<Instruction>(I->getUser()));
    Value *Shuffle = Builder.CreateShuffleVector(I->get(), LeafMask);
    I->getUser()->setOperand(I->getOperandNo(), Shuffle);
    LLVM_DEBUG(dbgs() << "  with " << *Shuffle << "\n");
  }

  // Insert the inverse shuffle directly after each trunc and redirect the
  // trunc's users to it.
  for (Instruction *I : Truncs) {
    LLVM_DEBUG(dbgs() << "Replacing trunc " << *I << "\n");

    Builder.SetInsertPoint(I->getParent(), ++I->getIterator());
    Value *Shuf = Builder.CreateShuffleVector(I, TruncMask);
    I->replaceAllUsesWith(Shuf);
    // replaceAllUsesWith also rewrote the new shuffle's own input; point it
    // back at the trunc.
    cast<Instruction>(Shuf)->setOperand(0, I);

    LLVM_DEBUG(dbgs() << "  with " << *Shuf << "\n");
  }

  return true;
}
370*da58b97aSjoerg 
runOnFunction(Function & F)371*da58b97aSjoerg bool MVELaneInterleaving::runOnFunction(Function &F) {
372*da58b97aSjoerg   if (!EnableInterleave)
373*da58b97aSjoerg     return false;
374*da58b97aSjoerg   auto &TPC = getAnalysis<TargetPassConfig>();
375*da58b97aSjoerg   auto &TM = TPC.getTM<TargetMachine>();
376*da58b97aSjoerg   auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
377*da58b97aSjoerg   if (!ST->hasMVEIntegerOps())
378*da58b97aSjoerg     return false;
379*da58b97aSjoerg 
380*da58b97aSjoerg   bool Changed = false;
381*da58b97aSjoerg 
382*da58b97aSjoerg   SmallPtrSet<Instruction *, 16> Visited;
383*da58b97aSjoerg   for (Instruction &I : reverse(instructions(F))) {
384*da58b97aSjoerg     if (I.getType()->isVectorTy() &&
385*da58b97aSjoerg         (isa<TruncInst>(I) || isa<FPTruncInst>(I)) && !Visited.count(&I))
386*da58b97aSjoerg       Changed |= tryInterleave(&I, Visited);
387*da58b97aSjoerg   }
388*da58b97aSjoerg 
389*da58b97aSjoerg   return Changed;
390*da58b97aSjoerg }
391