1 //=== AArch64PostSelectOptimize.cpp ---------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass does post-instruction-selection optimizations in the GlobalISel
10 // pipeline, before the rest of codegen runs.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AArch64.h"
15 #include "AArch64TargetMachine.h"
16 #include "MCTargetDesc/AArch64MCTargetDesc.h"
17 #include "llvm/ADT/STLExtras.h"
18 #include "llvm/CodeGen/GlobalISel/Utils.h"
19 #include "llvm/CodeGen/MachineBasicBlock.h"
20 #include "llvm/CodeGen/MachineFunctionPass.h"
21 #include "llvm/CodeGen/MachineInstr.h"
22 #include "llvm/CodeGen/MachineOperand.h"
23 #include "llvm/CodeGen/TargetPassConfig.h"
24 #include "llvm/Support/Debug.h"
25 #include "llvm/Support/ErrorHandling.h"
26 
27 #define DEBUG_TYPE "aarch64-post-select-optimize"
28 
29 using namespace llvm;
30 
31 namespace {
32 class AArch64PostSelectOptimize : public MachineFunctionPass {
33 public:
34   static char ID;
35 
36   AArch64PostSelectOptimize();
37 
38   StringRef getPassName() const override {
39     return "AArch64 Post Select Optimizer";
40   }
41 
42   bool runOnMachineFunction(MachineFunction &MF) override;
43 
44   void getAnalysisUsage(AnalysisUsage &AU) const override;
45 
46 private:
47   bool optimizeNZCVDefs(MachineBasicBlock &MBB);
48   bool doPeepholeOpts(MachineBasicBlock &MBB);
49   /// Look for cross regclass copies that can be trivially eliminated.
50   bool foldSimpleCrossClassCopies(MachineInstr &MI);
51 };
52 } // end anonymous namespace
53 
54 void AArch64PostSelectOptimize::getAnalysisUsage(AnalysisUsage &AU) const {
55   AU.addRequired<TargetPassConfig>();
56   AU.setPreservesCFG();
57   getSelectionDAGFallbackAnalysisUsage(AU);
58   MachineFunctionPass::getAnalysisUsage(AU);
59 }
60 
61 AArch64PostSelectOptimize::AArch64PostSelectOptimize()
62     : MachineFunctionPass(ID) {
63   initializeAArch64PostSelectOptimizePass(*PassRegistry::getPassRegistry());
64 }
65 
66 unsigned getNonFlagSettingVariant(unsigned Opc) {
67   switch (Opc) {
68   default:
69     return 0;
70   case AArch64::SUBSXrr:
71     return AArch64::SUBXrr;
72   case AArch64::SUBSWrr:
73     return AArch64::SUBWrr;
74   case AArch64::SUBSXrs:
75     return AArch64::SUBXrs;
76   case AArch64::SUBSWrs:
77     return AArch64::SUBWrs;
78   case AArch64::SUBSXri:
79     return AArch64::SUBXri;
80   case AArch64::SUBSWri:
81     return AArch64::SUBWri;
82   case AArch64::ADDSXrr:
83     return AArch64::ADDXrr;
84   case AArch64::ADDSWrr:
85     return AArch64::ADDWrr;
86   case AArch64::ADDSXrs:
87     return AArch64::ADDXrs;
88   case AArch64::ADDSWrs:
89     return AArch64::ADDWrs;
90   case AArch64::ADDSXri:
91     return AArch64::ADDXri;
92   case AArch64::ADDSWri:
93     return AArch64::ADDWri;
94   case AArch64::SBCSXr:
95     return AArch64::SBCXr;
96   case AArch64::SBCSWr:
97     return AArch64::SBCWr;
98   case AArch64::ADCSXr:
99     return AArch64::ADCXr;
100   case AArch64::ADCSWr:
101     return AArch64::ADCWr;
102   }
103 }
104 
105 bool AArch64PostSelectOptimize::doPeepholeOpts(MachineBasicBlock &MBB) {
106   bool Changed = false;
107   for (auto &MI : make_early_inc_range(make_range(MBB.begin(), MBB.end()))) {
108     Changed |= foldSimpleCrossClassCopies(MI);
109   }
110   return Changed;
111 }
112 
113 bool AArch64PostSelectOptimize::foldSimpleCrossClassCopies(MachineInstr &MI) {
114   auto *MF = MI.getMF();
115   auto &MRI = MF->getRegInfo();
116 
117   if (!MI.isCopy())
118     return false;
119 
120   if (MI.getOperand(1).getSubReg())
121     return false; // Don't deal with subreg copies
122 
123   Register Src = MI.getOperand(1).getReg();
124   Register Dst = MI.getOperand(0).getReg();
125 
126   if (Src.isPhysical() || Dst.isPhysical())
127     return false;
128 
129   const TargetRegisterClass *SrcRC = MRI.getRegClass(Src);
130   const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
131 
132   if (SrcRC == DstRC)
133     return false;
134 
135 
136   if (SrcRC->hasSubClass(DstRC)) {
137     // This is the case where the source class is a superclass of the dest, so
138     // if the copy is the only user of the source, we can just constrain the
139     // source reg to the dest class.
140 
141     if (!MRI.hasOneNonDBGUse(Src))
142       return false; // Only constrain single uses of the source.
143 
144     // Constrain to dst reg class as long as it's not a weird class that only
145     // has a few registers.
146     if (!MRI.constrainRegClass(Src, DstRC, /* MinNumRegs */ 25))
147       return false;
148   } else if (DstRC->hasSubClass(SrcRC)) {
149     // This is the inverse case, where the destination class is a superclass of
150     // the source. Here, if the copy is the only user, we can just constrain
151     // the user of the copy to use the smaller class of the source.
152   } else {
153     return false;
154   }
155 
156   MRI.replaceRegWith(Dst, Src);
157   MI.eraseFromParent();
158   return true;
159 }
160 
161 bool AArch64PostSelectOptimize::optimizeNZCVDefs(MachineBasicBlock &MBB) {
162   // If we find a dead NZCV implicit-def, we
163   // - try to convert the operation to a non-flag-setting equivalent
164   // - or mark the def as dead to aid later peephole optimizations.
165 
166   // Use cases:
167   // 1)
168   // Consider the following code:
169   //  FCMPSrr %0, %1, implicit-def $nzcv
170   //  %sel1:gpr32 = CSELWr %_, %_, 12, implicit $nzcv
171   //  %sub:gpr32 = SUBSWrr %_, %_, implicit-def $nzcv
172   //  FCMPSrr %0, %1, implicit-def $nzcv
173   //  %sel2:gpr32 = CSELWr %_, %_, 12, implicit $nzcv
174   // This kind of code where we have 2 FCMPs each feeding a CSEL can happen
175   // when we have a single IR fcmp being used by two selects. During selection,
176   // to ensure that there can be no clobbering of nzcv between the fcmp and the
177   // csel, we have to generate an fcmp immediately before each csel is
178   // selected.
179   // However, often we can essentially CSE these together later in MachineCSE.
180   // This doesn't work though if there are unrelated flag-setting instructions
181   // in between the two FCMPs. In this case, the SUBS defines NZCV
182   // but it doesn't have any users, being overwritten by the second FCMP.
183   //
184   // 2)
185   // The instruction selector always emits the flag-setting variant of ADC/SBC
186   // while selecting G_UADDE/G_SADDE/G_USUBE/G_SSUBE. If the carry-out of these
187   // instructions is never used, we can switch to the non-flag-setting variant.
188 
189   bool Changed = false;
190   auto &MF = *MBB.getParent();
191   auto &Subtarget = MF.getSubtarget();
192   const auto &TII = Subtarget.getInstrInfo();
193   auto TRI = Subtarget.getRegisterInfo();
194   auto RBI = Subtarget.getRegBankInfo();
195   auto &MRI = MF.getRegInfo();
196 
197   LiveRegUnits LRU(*MBB.getParent()->getSubtarget().getRegisterInfo());
198   LRU.addLiveOuts(MBB);
199 
200   for (auto &II : instructionsWithoutDebug(MBB.rbegin(), MBB.rend())) {
201     bool NZCVDead = LRU.available(AArch64::NZCV);
202     if (NZCVDead && II.definesRegister(AArch64::NZCV)) {
203       // The instruction defines NZCV, but NZCV is dead.
204       unsigned NewOpc = getNonFlagSettingVariant(II.getOpcode());
205       int DeadNZCVIdx = II.findRegisterDefOperandIdx(AArch64::NZCV);
206       if (DeadNZCVIdx != -1) {
207         if (NewOpc) {
208           // If there is an equivalent non-flag-setting op, we convert.
209           LLVM_DEBUG(dbgs() << "Post-select optimizer: converting flag-setting "
210                                "op: "
211                             << II);
212           II.setDesc(TII->get(NewOpc));
213           II.removeOperand(DeadNZCVIdx);
214           // Changing the opcode can result in differing regclass requirements,
215           // e.g. SUBSWri uses gpr32 for the dest, whereas SUBWri uses gpr32sp.
216           // Constrain the regclasses, possibly introducing a copy.
217           constrainOperandRegClass(MF, *TRI, MRI, *TII, *RBI, II, II.getDesc(),
218                                    II.getOperand(0), 0);
219           Changed |= true;
220         } else {
221           // Otherwise, we just set the nzcv imp-def operand to be dead, so the
222           // peephole optimizations can optimize them further.
223           II.getOperand(DeadNZCVIdx).setIsDead();
224         }
225       }
226     }
227     LRU.stepBackward(II);
228   }
229   return Changed;
230 }
231 
232 bool AArch64PostSelectOptimize::runOnMachineFunction(MachineFunction &MF) {
233   if (MF.getProperties().hasProperty(
234           MachineFunctionProperties::Property::FailedISel))
235     return false;
236   assert(MF.getProperties().hasProperty(
237              MachineFunctionProperties::Property::Selected) &&
238          "Expected a selected MF");
239 
240   bool Changed = false;
241   for (auto &BB : MF) {
242     Changed |= optimizeNZCVDefs(BB);
243     Changed |= doPeepholeOpts(BB);
244   }
245   return Changed;
246 }
247 
248 char AArch64PostSelectOptimize::ID = 0;
249 INITIALIZE_PASS_BEGIN(AArch64PostSelectOptimize, DEBUG_TYPE,
250                       "Optimize AArch64 selected instructions",
251                       false, false)
252 INITIALIZE_PASS_END(AArch64PostSelectOptimize, DEBUG_TYPE,
253                     "Optimize AArch64 selected instructions", false,
254                     false)
255 
256 namespace llvm {
257 FunctionPass *createAArch64PostSelectOptimize() {
258   return new AArch64PostSelectOptimize();
259 }
260 } // end namespace llvm
261