//===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
// The pass runs after the PrologEpilogInserter where we emit the CFI
// instructions. In order to preserve the correctness of the unwind information,
// the pass should not change the relative order of any two instructions when
// one of them has the FrameSetup/FrameDestroy flag, or, alternatively, it
// should apply an ad-hoc fix to the unwind information.
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <limits>
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "aarch64-ldst-opt"

STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
          "Number of load/store pairs generated from unscaled instructions");
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");

DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
              "Controls which pairs are considered for renaming");

// The LdStLimit limits how far we search for load/store pairs.
static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
                                   cl::init(20), cl::Hidden);

// The UpdateLimit limits how far we search for update instructions when we form
// pre-/post-index instructions.
static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
                                     cl::Hidden);

// Enable register renaming to find additional store pairing opportunities.
static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
                                    cl::init(true), cl::Hidden);

#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"

namespace {

using LdStPairFlags = struct LdStPairFlags {
  // If a matching instruction is found, MergeForward is set to true if the
  // merge is to remove the first instruction and replace the second with
  // a pair-wise insn, and false if the reverse is true.
  bool MergeForward = false;

  // SExtIdx gives the index of the result of the load pair that must be
  // extended. The value of SExtIdx assumes that the paired load produces the
  // value in this order: (I, returned iterator), i.e., -1 means no value has
  // to be extended, 0 means I, and 1 means the returned iterator.
  int SExtIdx = -1;

  // If not none, RenameReg can be used to rename the result register of the
  // first store in a pair. Currently this only works when merging stores
  // forward.
  std::optional<MCPhysReg> RenameReg;

  LdStPairFlags() = default;

  void setMergeForward(bool V = true) { MergeForward = V; }
  bool getMergeForward() const { return MergeForward; }

  void setSExtIdx(int V) { SExtIdx = V; }
  int getSExtIdx() const { return SExtIdx; }

  void setRenameReg(MCPhysReg R) { RenameReg = R; }
  void clearRenameReg() { RenameReg = std::nullopt; }
  std::optional<MCPhysReg> getRenameReg() const { return RenameReg; }
};

struct AArch64LoadStoreOpt : public MachineFunctionPass {
  static char ID;

  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
    initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
  }

  AliasAnalysis *AA;
  const AArch64InstrInfo *TII;
  const TargetRegisterInfo *TRI;
  const AArch64Subtarget *Subtarget;

  // Track which register units have been modified and used.
  LiveRegUnits ModifiedRegUnits, UsedRegUnits;
  LiveRegUnits DefinedInBB;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  // Scan the instructions looking for a load/store that can be combined
  // with the current instruction into a load/store pair.
  // Return the matching instruction if one is found, else MBB->end().
  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
                                               LdStPairFlags &Flags,
                                               unsigned Limit,
                                               bool FindNarrowMerge);

  // Scan the instructions looking for a store that writes to the address from
  // which the current load instruction reads. Return true if one is found.
  bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
                         MachineBasicBlock::iterator &StoreI);

  // Merge the two indicated narrow zero stores into a single wider store
  // instruction.
  MachineBasicBlock::iterator
  mergeNarrowZeroStores(MachineBasicBlock::iterator I,
                        MachineBasicBlock::iterator MergeMI,
                        const LdStPairFlags &Flags);

  // Merge the two instructions indicated into a single pair-wise instruction.
  MachineBasicBlock::iterator
  mergePairedInsns(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Paired,
                   const LdStPairFlags &Flags);

  // Promote the load that reads directly from the address stored to.
  MachineBasicBlock::iterator
  promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
                       MachineBasicBlock::iterator StoreI);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan forwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
                                int UnscaledOffset, unsigned Limit);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan backwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);

  // Find an instruction that updates the base register of the ld/st
  // instruction.
  bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
                            unsigned BaseReg, int Offset);

  // Merge a pre- or post-index base register update into a ld/st instruction.
  MachineBasicBlock::iterator
  mergeUpdateInsn(MachineBasicBlock::iterator I,
                  MachineBasicBlock::iterator Update, bool IsPreIdx);

  // Find and merge zero store instructions.
  bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);

  // Find and pair ldr/str instructions.
  bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);

  // Find and promote load instructions which read directly from a store.
  bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);

  // Find and merge base register updates before or after a ld/st instruction.
  bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);

  bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);

  bool runOnMachineFunction(MachineFunction &Fn) override;

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::NoVRegs);
  }

  StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
};

char AArch64LoadStoreOpt::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
                AARCH64_LOAD_STORE_OPT_NAME, false, false)

static bool isNarrowStore(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return true;
  }
}

// These instructions set the memory tag and either keep the memory contents
// unchanged or set them to zero, ignoring the address part of the source
// register.
static bool isTagStore(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::STGi:
  case AArch64::STZGi:
  case AArch64::ST2Gi:
  case AArch64::STZ2Gi:
    return true;
  }
}

static unsigned getMatchingNonSExtOpcode(unsigned Opc,
                                         bool *IsValidLdStrOpc = nullptr) {
  if (IsValidLdStrOpc)
    *IsValidLdStrOpc = true;
  switch (Opc) {
  default:
    if (IsValidLdStrOpc)
      *IsValidLdStrOpc = false;
    return std::numeric_limits<unsigned>::max();
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRDpre:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STRQpre:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
  case AArch64::STRWui:
  case AArch64::STRWpre:
  case AArch64::STURWi:
  case AArch64::STRXui:
  case AArch64::STRXpre:
  case AArch64::STURXi:
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRDpre:
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::LDRQpre:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::LDRWpre:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::LDRXpre:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::STRSpre:
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRSpre:
    return Opc;
  case AArch64::LDRSWui:
    return AArch64::LDRWui;
  case AArch64::LDURSWi:
    return AArch64::LDURWi;
  case AArch64::LDRSWpre:
    return AArch64::LDRWpre;
  }
}

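// Map a narrow store opcode to the store opcode of twice the width. This is
// how two adjacent zero stores become a single wider one, e.g.:
//   strb wzr, [x0]
//   strb wzr, [x0, #1]
//   ; becomes
//   strh wzr, [x0]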
static unsigned getMatchingWideOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no wide equivalent!");
  case AArch64::STRBBui:
    return AArch64::STRHHui;
  case AArch64::STRHHui:
    return AArch64::STRWui;
  case AArch64::STURBBi:
    return AArch64::STURHHi;
  case AArch64::STURHHi:
    return AArch64::STURWi;
  case AArch64::STURWi:
    return AArch64::STURXi;
  case AArch64::STRWui:
    return AArch64::STRXui;
  }
}

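// Map a single load/store opcode to its paired equivalent, e.g. two adjacent
// loads can be rewritten as one paired load:
//   ldr x0, [sp, #8]
//   ldr x1, [sp, #16]
//   ; becomes
//   ldp x0, x1, [sp, #8]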
static unsigned getMatchingPairOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pairwise equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STPSi;
  case AArch64::STRSpre:
    return AArch64::STPSpre;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STPDi;
  case AArch64::STRDpre:
    return AArch64::STPDpre;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STPQi;
  case AArch64::STRQpre:
    return AArch64::STPQpre;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STPWi;
  case AArch64::STRWpre:
    return AArch64::STPWpre;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STPXi;
  case AArch64::STRXpre:
    return AArch64::STPXpre;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDPSi;
  case AArch64::LDRSpre:
    return AArch64::LDPSpre;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDPDi;
  case AArch64::LDRDpre:
    return AArch64::LDPDpre;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDPQi;
  case AArch64::LDRQpre:
    return AArch64::LDPQpre;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDPWi;
  case AArch64::LDRWpre:
    return AArch64::LDPWpre;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDPXi;
  case AArch64::LDRXpre:
    return AArch64::LDPXpre;
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
    return AArch64::LDPSWi;
  case AArch64::LDRSWpre:
    return AArch64::LDPSWpre;
  }
}

static unsigned isMatchingStore(MachineInstr &LoadInst,
                                MachineInstr &StoreInst) {
  unsigned LdOpc = LoadInst.getOpcode();
  unsigned StOpc = StoreInst.getOpcode();
  switch (LdOpc) {
  default:
    llvm_unreachable("Unsupported load instruction!");
  case AArch64::LDRBBui:
    return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
           StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
  case AArch64::LDURBBi:
    return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
           StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
  case AArch64::LDRHHui:
    return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
           StOpc == AArch64::STRXui;
  case AArch64::LDURHHi:
    return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
           StOpc == AArch64::STURXi;
  case AArch64::LDRWui:
    return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
  case AArch64::LDURWi:
    return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
  case AArch64::LDRXui:
    return StOpc == AArch64::STRXui;
  case AArch64::LDURXi:
    return StOpc == AArch64::STURXi;
  }
}

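// Map a load/store opcode to its pre-indexed (writeback) equivalent, used
// when folding a base register update into the memory access, e.g.:
//   add x0, x0, #16
//   str x1, [x0]
//   ; becomes
//   str x1, [x0, #16]!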
static unsigned getPreIndexedOpcode(unsigned Opc) {
  // FIXME: We don't currently support creating pre-indexed loads/stores when
  // the load or store is the unscaled version. If we decide to perform such an
  // optimization in the future the cases for the unscaled loads/stores will
  // need to be added here.
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pre-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpre;
  case AArch64::STRDui:
    return AArch64::STRDpre;
  case AArch64::STRQui:
    return AArch64::STRQpre;
  case AArch64::STRBBui:
    return AArch64::STRBBpre;
  case AArch64::STRHHui:
    return AArch64::STRHHpre;
  case AArch64::STRWui:
    return AArch64::STRWpre;
  case AArch64::STRXui:
    return AArch64::STRXpre;
  case AArch64::LDRSui:
    return AArch64::LDRSpre;
  case AArch64::LDRDui:
    return AArch64::LDRDpre;
  case AArch64::LDRQui:
    return AArch64::LDRQpre;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpre;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpre;
  case AArch64::LDRWui:
    return AArch64::LDRWpre;
  case AArch64::LDRXui:
    return AArch64::LDRXpre;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpre;
  case AArch64::LDPSi:
    return AArch64::LDPSpre;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpre;
  case AArch64::LDPDi:
    return AArch64::LDPDpre;
  case AArch64::LDPQi:
    return AArch64::LDPQpre;
  case AArch64::LDPWi:
    return AArch64::LDPWpre;
  case AArch64::LDPXi:
    return AArch64::LDPXpre;
  case AArch64::STPSi:
    return AArch64::STPSpre;
  case AArch64::STPDi:
    return AArch64::STPDpre;
  case AArch64::STPQi:
    return AArch64::STPQpre;
  case AArch64::STPWi:
    return AArch64::STPWpre;
  case AArch64::STPXi:
    return AArch64::STPXpre;
  case AArch64::STGi:
    return AArch64::STGPreIndex;
  case AArch64::STZGi:
    return AArch64::STZGPreIndex;
  case AArch64::ST2Gi:
    return AArch64::ST2GPreIndex;
  case AArch64::STZ2Gi:
    return AArch64::STZ2GPreIndex;
  case AArch64::STGPi:
    return AArch64::STGPpre;
  }
}

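// Map a load/store opcode to its post-indexed (writeback) equivalent, e.g.:
//   str x1, [x0]
//   add x0, x0, #8
//   ; becomes
//   str x1, [x0], #8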
static unsigned getPostIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no post-indexed equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STRSpost;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STRDpost;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STRQpost;
  case AArch64::STRBBui:
    return AArch64::STRBBpost;
  case AArch64::STRHHui:
    return AArch64::STRHHpost;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STRWpost;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STRXpost;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDRSpost;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDRDpost;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDRQpost;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpost;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpost;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDRWpost;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDRXpost;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpost;
  case AArch64::LDPSi:
    return AArch64::LDPSpost;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpost;
  case AArch64::LDPDi:
    return AArch64::LDPDpost;
  case AArch64::LDPQi:
    return AArch64::LDPQpost;
  case AArch64::LDPWi:
    return AArch64::LDPWpost;
  case AArch64::LDPXi:
    return AArch64::LDPXpost;
  case AArch64::STPSi:
    return AArch64::STPSpost;
  case AArch64::STPDi:
    return AArch64::STPDpost;
  case AArch64::STPQi:
    return AArch64::STPQpost;
  case AArch64::STPWi:
    return AArch64::STPWpost;
  case AArch64::STPXi:
    return AArch64::STPXpost;
  case AArch64::STGi:
    return AArch64::STGPostIndex;
  case AArch64::STZGi:
    return AArch64::STZGPostIndex;
  case AArch64::ST2Gi:
    return AArch64::ST2GPostIndex;
  case AArch64::STZ2Gi:
    return AArch64::STZ2GPostIndex;
  case AArch64::STGPi:
    return AArch64::STGPpost;
  }
}

static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) {

  unsigned OpcA = FirstMI.getOpcode();
  unsigned OpcB = MI.getOpcode();

  switch (OpcA) {
  default:
    return false;
  case AArch64::STRSpre:
    return (OpcB == AArch64::STRSui) || (OpcB == AArch64::STURSi);
  case AArch64::STRDpre:
    return (OpcB == AArch64::STRDui) || (OpcB == AArch64::STURDi);
  case AArch64::STRQpre:
    return (OpcB == AArch64::STRQui) || (OpcB == AArch64::STURQi);
  case AArch64::STRWpre:
    return (OpcB == AArch64::STRWui) || (OpcB == AArch64::STURWi);
  case AArch64::STRXpre:
    return (OpcB == AArch64::STRXui) || (OpcB == AArch64::STURXi);
  case AArch64::LDRSpre:
    return (OpcB == AArch64::LDRSui) || (OpcB == AArch64::LDURSi);
  case AArch64::LDRDpre:
    return (OpcB == AArch64::LDRDui) || (OpcB == AArch64::LDURDi);
  case AArch64::LDRQpre:
    return (OpcB == AArch64::LDRQui) || (OpcB == AArch64::LDURQi);
  case AArch64::LDRWpre:
    return (OpcB == AArch64::LDRWui) || (OpcB == AArch64::LDURWi);
  case AArch64::LDRXpre:
    return (OpcB == AArch64::LDRXui) || (OpcB == AArch64::LDURXi);
  case AArch64::LDRSWpre:
    return (OpcB == AArch64::LDRSWui) || (OpcB == AArch64::LDURSWi);
  }
}

// Returns the scale and offset range of pre/post indexed variants of MI.
static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
                                       int &MinOffset, int &MaxOffset) {
  bool IsPaired = AArch64InstrInfo::isPairedLdSt(MI);
  bool IsTagStore = isTagStore(MI);
  // ST*G and all paired ldst have the same scale in pre/post-indexed variants
  // as in the "unsigned offset" variant.
  // All other pre/post indexed ldst instructions are unscaled.
  Scale = (IsTagStore || IsPaired) ? AArch64InstrInfo::getMemScale(MI) : 1;

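  // Paired instructions encode a 7-bit signed scaled immediate, while all
  // other pre/post-indexed forms take a 9-bit signed unscaled immediate.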
  if (IsPaired) {
    MinOffset = -64;
    MaxOffset = 63;
  } else {
    MinOffset = -256;
    MaxOffset = 255;
  }
}

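// Return the transfer register operand (Rt, or Rt2 for PairedRegOp == 1) of a
// memory instruction. Pre-indexed instructions define the written-back base
// register in operand 0, so the transfer registers are shifted by one slot.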
static MachineOperand &getLdStRegOp(MachineInstr &MI,
                                    unsigned PairedRegOp = 0) {
  assert(PairedRegOp < 2 && "Unexpected register operand idx.");
  bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI);
  if (IsPreLdSt)
    PairedRegOp += 1;
  unsigned Idx =
      AArch64InstrInfo::isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
  return MI.getOperand(Idx);
}

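// Check that the load reads only bytes that the store wrote, e.g. a
//   str w0, [x1, #4]
// covers a subsequent
//   ldrb w2, [x1, #6]
// so the loaded value can be extracted from w0 instead of reloaded.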
static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
                                  MachineInstr &StoreInst,
                                  const AArch64InstrInfo *TII) {
  assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
  int LoadSize = TII->getMemScale(LoadInst);
  int StoreSize = TII->getMemScale(StoreInst);
  int UnscaledStOffset =
      TII->hasUnscaledLdStOffset(StoreInst)
          ? AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm()
          : AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm() * StoreSize;
  int UnscaledLdOffset =
      TII->hasUnscaledLdStOffset(LoadInst)
          ? AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm()
          : AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm() * LoadSize;
  return (UnscaledStOffset <= UnscaledLdOffset) &&
         (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
}

static bool isPromotableZeroStoreInst(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
          isNarrowStore(Opc)) &&
         getLdStRegOp(MI).getReg() == AArch64::WZR;
}

static bool isPromotableLoadFromStore(MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  // Scaled instructions.
  case AArch64::LDRBBui:
  case AArch64::LDRHHui:
  case AArch64::LDRWui:
  case AArch64::LDRXui:
  // Unscaled instructions.
  case AArch64::LDURBBi:
  case AArch64::LDURHHi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
    return true;
  }
}

static bool isMergeableLdStUpdate(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  default:
    return false;
  // Scaled instructions.
  case AArch64::STRSui:
  case AArch64::STRDui:
  case AArch64::STRQui:
  case AArch64::STRXui:
  case AArch64::STRWui:
  case AArch64::STRHHui:
  case AArch64::STRBBui:
  case AArch64::LDRSui:
  case AArch64::LDRDui:
  case AArch64::LDRQui:
  case AArch64::LDRXui:
  case AArch64::LDRWui:
  case AArch64::LDRHHui:
  case AArch64::LDRBBui:
  case AArch64::STGi:
  case AArch64::STZGi:
  case AArch64::ST2Gi:
  case AArch64::STZ2Gi:
  case AArch64::STGPi:
  // Unscaled instructions.
  case AArch64::STURSi:
  case AArch64::STURDi:
  case AArch64::STURQi:
  case AArch64::STURWi:
  case AArch64::STURXi:
  case AArch64::LDURSi:
  case AArch64::LDURDi:
  case AArch64::LDURQi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
  // Paired instructions.
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPDi:
  case AArch64::LDPQi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
  case AArch64::STPSi:
  case AArch64::STPDi:
  case AArch64::STPQi:
  case AArch64::STPWi:
  case AArch64::STPXi:
    // Make sure this is a reg+imm (as opposed to an address reloc).
    if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
      return false;

    return true;
  }
}

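// Implicit defs cannot be renamed in general, but for these opcodes the only
// implicit def is the super-register of the explicit result (e.g. the X
// register of an ORRWrs writing a W register), so it can be rewritten
// consistently with the explicit operand.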
static bool isRewritableImplicitDef(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::ORRWrs:
  case AArch64::ADDWri:
    return true;
  }
}

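// Merge two adjacent zero stores into one zero store of twice the width,
// e.g.:
//   strh wzr, [x0, #4]
//   strh wzr, [x0, #6]
//   ; becomes
//   str wzr, [x0, #4]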
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
                                           MachineBasicBlock::iterator MergeMI,
                                           const LdStPairFlags &Flags) {
  assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
         "Expected promotable zero stores.");

  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way, the merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == MergeMI)
    NextI = next_nodbg(NextI, E);

  unsigned Opc = I->getOpcode();
  unsigned MergeMIOpc = MergeMI->getOpcode();
  bool IsScaled = !TII->hasUnscaledLdStOffset(Opc);
  bool IsMergedMIScaled = !TII->hasUnscaledLdStOffset(MergeMIOpc);
  int OffsetStride = IsScaled ? TII->getMemScale(*I) : 1;
  int MergeMIOffsetStride = IsMergedMIScaled ? TII->getMemScale(*MergeMI) : 1;

  bool MergeForward = Flags.getMergeForward();
  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
  // MergeForward also determines from which instruction we copy the base
  // register operand, so we get flags compatible with the input code.
  const MachineOperand &BaseRegOp =
      MergeForward ? AArch64InstrInfo::getLdStBaseOp(*MergeMI)
                   : AArch64InstrInfo::getLdStBaseOp(*I);

  // Which register is Rt and which is Rt2 depends on the offset order.
  int64_t IOffsetInBytes =
      AArch64InstrInfo::getLdStOffsetOp(*I).getImm() * OffsetStride;
  int64_t MIOffsetInBytes =
      AArch64InstrInfo::getLdStOffsetOp(*MergeMI).getImm() *
      MergeMIOffsetStride;
  // Select final offset based on the offset order.
  int64_t OffsetImm;
  if (IOffsetInBytes > MIOffsetInBytes)
    OffsetImm = MIOffsetInBytes;
  else
    OffsetImm = IOffsetInBytes;

  int NewOpcode = getMatchingWideOpcode(Opc);
  bool FinalIsScaled = !TII->hasUnscaledLdStOffset(NewOpcode);

  // Adjust final offset if the result opcode is a scaled store.
  if (FinalIsScaled) {
    int NewOffsetStride = FinalIsScaled ? TII->getMemScale(NewOpcode) : 1;
    assert(((OffsetImm % NewOffsetStride) == 0) &&
           "Offset should be a multiple of the store memory scale");
    OffsetImm = OffsetImm / NewOffsetStride;
  }

  // Construct the new instruction.
  DebugLoc DL = I->getDebugLoc();
  MachineBasicBlock *MBB = I->getParent();
  MachineInstrBuilder MIB;
  MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
            .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
            .add(BaseRegOp)
            .addImm(OffsetImm)
            .cloneMergedMemRefs({&*I, &*MergeMI})
            .setMIFlags(I->mergeFlagsWith(*MergeMI));
  (void)MIB;

  LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n    ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(MergeMI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
  LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions.
  I->eraseFromParent();
  MergeMI->eraseFromParent();
  return NextI;
}

// Apply Fn to all instructions between MI and the beginning of the block, until
// a def for DefReg is reached. Returns true iff Fn returns true for all
// visited instructions. Stops after visiting Limit instructions.
static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg,
                              const TargetRegisterInfo *TRI, unsigned Limit,
                              std::function<bool(MachineInstr &, bool)> &Fn) {
  auto MBB = MI.getParent();
  for (MachineInstr &I :
       instructionsWithoutDebug(MI.getReverseIterator(), MBB->instr_rend())) {
    if (!Limit)
      return false;
    --Limit;

    bool isDef = any_of(I.operands(), [DefReg, TRI](MachineOperand &MOP) {
      return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() &&
             TRI->regsOverlap(MOP.getReg(), DefReg);
    });
    if (!Fn(I, isDef))
      return false;
    if (isDef)
      break;
  }
  return true;
}

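// Remove the registers killed by MI from Units, then add all of MI's other
// register operands. Kills are processed first so that a register that is
// both killed (by a use) and redefined by MI ends up live afterwards.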
static void updateDefinedRegisters(MachineInstr &MI, LiveRegUnits &Units,
                                   const TargetRegisterInfo *TRI) {

  for (const MachineOperand &MOP : phys_regs_and_masks(MI))
    if (MOP.isReg() && MOP.isKill())
      Units.removeReg(MOP.getReg());

  for (const MachineOperand &MOP : phys_regs_and_masks(MI))
    if (MOP.isReg() && !MOP.isKill())
      Units.addReg(MOP.getReg());
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator Paired,
                                      const LdStPairFlags &Flags) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way, the merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == Paired)
    NextI = next_nodbg(NextI, E);

  int SExtIdx = Flags.getSExtIdx();
  unsigned Opc =
      SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
  bool IsUnscaled = TII->hasUnscaledLdStOffset(Opc);
  int OffsetStride = IsUnscaled ? TII->getMemScale(*I) : 1;

  bool MergeForward = Flags.getMergeForward();

  std::optional<MCPhysReg> RenameReg = Flags.getRenameReg();
  if (RenameReg) {
    MCRegister RegToRename = getLdStRegOp(*I).getReg();
    DefinedInBB.addReg(*RenameReg);

    // Return the sub/super register for RenameReg, matching the size of the
    // registers in class C.
    auto GetMatchingSubReg =
        [this, RenameReg](const TargetRegisterClass *C) -> MCPhysReg {
      for (MCPhysReg SubOrSuper :
           TRI->sub_and_superregs_inclusive(*RenameReg)) {
        if (C->contains(SubOrSuper))
          return SubOrSuper;
      }
      llvm_unreachable("Should have found matching sub or super register!");
    };

    std::function<bool(MachineInstr &, bool)> UpdateMIs =
        [this, RegToRename, GetMatchingSubReg, MergeForward](MachineInstr &MI,
                                                             bool IsDef) {
          if (IsDef) {
            bool SeenDef = false;
            for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) {
              MachineOperand &MOP = MI.getOperand(OpIdx);
              // Rename the first explicit definition and all implicit
              // definitions matching RegToRename.
              if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
                  (!MergeForward || !SeenDef ||
                   (MOP.isDef() && MOP.isImplicit())) &&
                  TRI->regsOverlap(MOP.getReg(), RegToRename)) {
                assert((MOP.isImplicit() ||
                        (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
                       "Need renamable operands");
                Register MatchingReg;
                if (const TargetRegisterClass *RC =
                        MI.getRegClassConstraint(OpIdx, TII, TRI))
                  MatchingReg = GetMatchingSubReg(RC);
                else {
                  if (!isRewritableImplicitDef(MI.getOpcode()))
                    continue;
                  MatchingReg = GetMatchingSubReg(
                      TRI->getMinimalPhysRegClass(MOP.getReg()));
                }
                MOP.setReg(MatchingReg);
                SeenDef = true;
              }
            }
          } else {
            for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) {
              MachineOperand &MOP = MI.getOperand(OpIdx);
              if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
                  TRI->regsOverlap(MOP.getReg(), RegToRename)) {
                assert((MOP.isImplicit() ||
                        (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
                       "Need renamable operands");
                Register MatchingReg;
                if (const TargetRegisterClass *RC =
                        MI.getRegClassConstraint(OpIdx, TII, TRI))
                  MatchingReg = GetMatchingSubReg(RC);
                else
                  MatchingReg = GetMatchingSubReg(
                      TRI->getMinimalPhysRegClass(MOP.getReg()));
                assert(MatchingReg != AArch64::NoRegister &&
                       "Cannot find matching regs for renaming");
                MOP.setReg(MatchingReg);
              }
            }
          }
          LLVM_DEBUG(dbgs() << "Renamed " << MI);
          return true;
        };
    forAllMIsUntilDef(MergeForward ? *I : *std::prev(Paired), RegToRename, TRI,
                      UINT32_MAX, UpdateMIs);

#if !defined(NDEBUG)
    // For forward merging store:
    // Make sure the register used for renaming is not used between the
    // paired instructions. That would trash the content before the new
    // paired instruction.
    MCPhysReg RegToCheck = *RenameReg;
    // For backward merging load:
    // Make sure the register being renamed is not used between the
    // paired instructions. That would trash the content after the new
    // paired instruction.
    if (!MergeForward)
      RegToCheck = RegToRename;
    for (auto &MI :
         iterator_range<MachineInstrBundleIterator<llvm::MachineInstr>>(
             MergeForward ? std::next(I) : I,
             MergeForward ? std::next(Paired) : Paired))
      assert(all_of(MI.operands(),
                    [this, RegToCheck](const MachineOperand &MOP) {
                      return !MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
                             MOP.isUndef() ||
                             !TRI->regsOverlap(MOP.getReg(), RegToCheck);
                    }) &&
             "Rename register used between paired instruction, trashing the "
             "content");
#endif
  }

  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  // MergeForward also determines from which instruction we copy the base
  // register operand, so we get flags compatible with the input code.
  const MachineOperand &BaseRegOp =
      MergeForward ? AArch64InstrInfo::getLdStBaseOp(*Paired)
                   : AArch64InstrInfo::getLdStBaseOp(*I);

  int Offset = AArch64InstrInfo::getLdStOffsetOp(*I).getImm();
  int PairedOffset = AArch64InstrInfo::getLdStOffsetOp(*Paired).getImm();
  bool PairedIsUnscaled = TII->hasUnscaledLdStOffset(Paired->getOpcode());
  if (IsUnscaled != PairedIsUnscaled) {
    // We're trying to pair instructions that differ in how they are scaled. If
    // I is scaled then scale the offset of Paired accordingly. Otherwise, do
    // the opposite (i.e., make Paired's offset unscaled).
    int MemSize = TII->getMemScale(*Paired);
    if (PairedIsUnscaled) {
      // If the unscaled offset isn't a multiple of the MemSize, we can't
      // pair the operations together.
      assert(!(PairedOffset % TII->getMemScale(*Paired)) &&
             "Offset should be a multiple of the stride!");
      PairedOffset /= MemSize;
    } else {
      PairedOffset *= MemSize;
    }
  }

  // Which register is Rt and which is Rt2 depends on the offset order.
  // However, for pre load/stores the Rt should be the one of the pre
  // load/store.
  MachineInstr *RtMI, *Rt2MI;
  if (Offset == PairedOffset + OffsetStride &&
      !AArch64InstrInfo::isPreLdSt(*I)) {
    RtMI = &*Paired;
    Rt2MI = &*I;
    // Here we swapped the assumption made for SExtIdx.
    // I.e., we turn ldp I, Paired into ldp Paired, I.
    // Update the index accordingly.
    if (SExtIdx != -1)
      SExtIdx = (SExtIdx + 1) % 2;
  } else {
    RtMI = &*I;
    Rt2MI = &*Paired;
  }
  int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(*RtMI).getImm();
  // Scale the immediate offset, if necessary.
  if (TII->hasUnscaledLdStOffset(RtMI->getOpcode())) {
    assert(!(OffsetImm % TII->getMemScale(*RtMI)) &&
           "Unscaled offset cannot be scaled.");
    OffsetImm /= TII->getMemScale(*RtMI);
  }

  // Construct the new instruction.
  MachineInstrBuilder MIB;
  DebugLoc DL = I->getDebugLoc();
  MachineBasicBlock *MBB = I->getParent();
  MachineOperand RegOp0 = getLdStRegOp(*RtMI);
  MachineOperand RegOp1 = getLdStRegOp(*Rt2MI);
  MachineOperand &PairedRegOp = RtMI == &*Paired ? RegOp0 : RegOp1;
  // Kill flags may become invalid when moving stores for pairing.
  if (RegOp0.isUse()) {
    if (!MergeForward) {
      // Clear kill flags on store if moving upwards. Example:
      //   STRWui kill %w0, ...
      //   USE %w1
      //   STRWui kill %w1  ; need to clear kill flag when moving STRWui upwards
      // We are about to move the store of w1, so its kill flag may become
      // invalid; not the case for w0.
      // Since w1 is used between the stores, the kill flag on w1 is cleared
      // after merging.
      //   STPWi kill %w0, %w1, ...
      //   USE %w1
      for (auto It = std::next(I); It != Paired && PairedRegOp.isKill(); ++It)
        if (It->readsRegister(PairedRegOp.getReg(), TRI))
          PairedRegOp.setIsKill(false);
    } else {
      // Clear kill flags of the first store's register. Example:
      //   STRWui %w1, ...
      //   USE kill %w1   ; need to clear kill flag when moving STRWui downwards
      //   STRW %w0
      Register Reg = getLdStRegOp(*I).getReg();
      for (MachineInstr &MI : make_range(std::next(I), Paired))
        MI.clearRegisterKills(Reg, TRI);
    }
  }

  unsigned int MatchPairOpcode = getMatchingPairOpcode(Opc);
  MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(MatchPairOpcode));

  // Adds the pre-index operand for pre-indexed ld/st pairs.
  if (AArch64InstrInfo::isPreLdSt(*RtMI))
    MIB.addReg(BaseRegOp.getReg(), RegState::Define);

  MIB.add(RegOp0)
      .add(RegOp1)
      .add(BaseRegOp)
      .addImm(OffsetImm)
      .cloneMergedMemRefs({&*I, &*Paired})
      .setMIFlags(I->mergeFlagsWith(*Paired));

  (void)MIB;

  LLVM_DEBUG(
      dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(Paired->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
  if (SExtIdx != -1) {
    // Generate the sign extension for the proper result of the ldp.
    // I.e., with X1, that would be:
    //   %w1 = KILL %w1, implicit-def %x1
    //   %x1 = SBFMXri killed %x1, 0, 31
    MachineOperand &DstMO = MIB->getOperand(SExtIdx);
    // Right now, DstMO has the extended register, since it comes from an
    // extended opcode.
    Register DstRegX = DstMO.getReg();
    // Get the W variant of that register.
    Register DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
    // Update the result of LDP to use the W instead of the X variant.
    DstMO.setReg(DstRegW);
    LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    LLVM_DEBUG(dbgs() << "\n");
    // Make the machine verifier happy by providing a definition for
    // the X register.
    // Insert this definition right after the generated LDP, i.e., before
    // InsertionPoint.
    MachineInstrBuilder MIBKill =
        BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW)
            .addReg(DstRegW)
            .addReg(DstRegX, RegState::Define);
    MIBKill->getOperand(2).setImplicit();
    // Create the sign extension.
    MachineInstrBuilder MIBSXTW =
        BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX)
            .addReg(DstRegX)
            .addImm(0)
            .addImm(31);
    (void)MIBSXTW;
    LLVM_DEBUG(dbgs() << "  Extend operand:\n    ");
    LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
  } else {
    LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  }
  LLVM_DEBUG(dbgs() << "\n");

  if (MergeForward)
    for (const MachineOperand &MOP : phys_regs_and_masks(*I))
      if (MOP.isReg() && MOP.isKill())
        DefinedInBB.addReg(MOP.getReg());

  // Erase the old instructions.
  I->eraseFromParent();
  Paired->eraseFromParent();

  return NextI;
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
                                          MachineBasicBlock::iterator StoreI) {
  MachineBasicBlock::iterator NextI =
      next_nodbg(LoadI, LoadI->getParent()->end());

  int LoadSize = TII->getMemScale(*LoadI);
  int StoreSize = TII->getMemScale(*StoreI);
  Register LdRt = getLdStRegOp(*LoadI).getReg();
  const MachineOperand &StMO = getLdStRegOp(*StoreI);
  Register StRt = getLdStRegOp(*StoreI).getReg();
  bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);

  assert((IsStoreXReg ||
          TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
         "Unexpected RegClass");

  MachineInstr *BitExtMI;
  if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
    // Remove the load if its destination register is the same as the store's
    // source register.
    if (StRt == LdRt && LoadSize == 8) {
      for (MachineInstr &MI : make_range(StoreI->getIterator(),
                                         LoadI->getIterator())) {
        if (MI.killsRegister(StRt, TRI)) {
          MI.clearRegisterKills(StRt, TRI);
          break;
        }
      }
      LLVM_DEBUG(dbgs() << "Remove load instruction:\n    ");
      LLVM_DEBUG(LoadI->print(dbgs()));
      LLVM_DEBUG(dbgs() << "\n");
      LoadI->eraseFromParent();
      return NextI;
    }
    // Replace the load with a mov if the load and store are the same size.
    BitExtMI =
        BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
                TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
            .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
            .add(StMO)
            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
            .setMIFlags(LoadI->getFlags());
  } else {
    // FIXME: Currently we disable this transformation in big-endian targets as
    // performance and correctness are verified only in little-endian.
    if (!Subtarget->isLittleEndian())
      return NextI;
    bool IsUnscaled = TII->hasUnscaledLdStOffset(*LoadI);
    assert(IsUnscaled == TII->hasUnscaledLdStOffset(*StoreI) &&
           "Unsupported ld/st match");
    assert(LoadSize <= StoreSize && "Invalid load size");
    int UnscaledLdOffset =
        IsUnscaled
            ? AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm()
            : AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm() * LoadSize;
    int UnscaledStOffset =
        IsUnscaled
            ? AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm()
            : AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm() * StoreSize;
    int Width = LoadSize * 8;
    Register DestReg =
        IsStoreXReg ? Register(TRI->getMatchingSuperReg(
                          LdRt, AArch64::sub_32, &AArch64::GPR64RegClass))
                    : LdRt;

    assert((UnscaledLdOffset >= UnscaledStOffset &&
            (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
           "Invalid offset");

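    // Extract the loaded bytes from the stored register with a bitfield move:
    // the field starts at the byte distance between the two accesses. For
    // example, for
    //   str  x0, [x8]
    //   ldrh w1, [x8, #2]
    // we get Immr = 16 and Imms = 31, extracting bits 16..31 of x0.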
    int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
    int Imms = Immr + Width - 1;
    if (UnscaledLdOffset == UnscaledStOffset) {
      uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
                                | ((Immr) << 6)               // immr
                                | ((Imms) << 0)               // imms
          ;

      BitExtMI =
          BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
                  TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
                  DestReg)
              .add(StMO)
              .addImm(AndMaskEncoded)
              .setMIFlags(LoadI->getFlags());
    } else {
      BitExtMI =
          BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
                  TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
                  DestReg)
              .add(StMO)
              .addImm(Immr)
              .addImm(Imms)
              .setMIFlags(LoadI->getFlags());
    }
  }

  // Clear kill flags between store and load.
  for (MachineInstr &MI : make_range(StoreI->getIterator(),
                                     BitExtMI->getIterator()))
    if (MI.killsRegister(StRt, TRI)) {
      MI.clearRegisterKills(StRt, TRI);
      break;
    }

  LLVM_DEBUG(dbgs() << "Promoting load by replacing:\n    ");
  LLVM_DEBUG(StoreI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(LoadI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instructions:\n    ");
  LLVM_DEBUG(StoreI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG((BitExtMI)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions.
  LoadI->eraseFromParent();
  return NextI;
}

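// Paired instructions take a 7-bit signed scaled immediate, so the element
// offset must fit in [-64, 63] for a pair to be formed.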
static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  // Convert the byte-offset used by unscaled into an "element" offset used
  // by the scaled pair load/store instructions.
  if (IsUnscaled) {
    // If the byte-offset isn't a multiple of the stride, there's no point
    // trying to match it.
    if (Offset % OffsetStride)
      return false;
    Offset /= OffsetStride;
  }
  return Offset <= 63 && Offset >= -64;
}

// Do alignment, specialized to power of 2 and for signed ints,
// avoiding having to do a C-style cast from uint64_t to int when
// using alignTo from include/llvm/Support/MathExtras.h.
// FIXME: Move this function to include/MathExtras.h?
static int alignTo(int Num, int PowOf2) {
  return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
}

static bool mayAlias(MachineInstr &MIa,
                     SmallVectorImpl<MachineInstr *> &MemInsns,
                     AliasAnalysis *AA) {
  for (MachineInstr *MIb : MemInsns) {
    if (MIa.mayAlias(AA, *MIb, /*UseTBAA*/ false)) {
      LLVM_DEBUG(dbgs() << "Aliasing with: "; MIb->dump());
      return true;
    }
  }

  LLVM_DEBUG(dbgs() << "No aliases found\n");
  return false;
}

bool AArch64LoadStoreOpt::findMatchingStore(
    MachineBasicBlock::iterator I, unsigned Limit,
    MachineBasicBlock::iterator &StoreI) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator MBBI = I;
  MachineInstr &LoadMI = *I;
  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(LoadMI).getReg();

  // If the load is the first instruction in the block, there's obviously
  // not any matching store.
  if (MBBI == B)
    return false;

  // Track which register units have been modified and used between the first
  // insn and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();

  unsigned Count = 0;
  do {
    MBBI = prev_nodbg(MBBI, B);
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If the load instruction reads directly from the address to which the
    // store instruction writes and the stored value is not modified, we can
    // promote the load. Since we do not handle stores with pre-/post-index,
    // it's unnecessary to check if BaseReg is modified by the store itself.
    // Also, we can't handle stores without an immediate offset operand,
    // since the operand might be the address of a global variable.
    if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
        BaseReg == AArch64InstrInfo::getLdStBaseOp(MI).getReg() &&
        AArch64InstrInfo::getLdStOffsetOp(MI).isImm() &&
        isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
        ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) {
      StoreI = MBBI;
      return true;
    }

    if (MI.isCall())
      return false;

    // Update modified / uses register units.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(BaseReg))
      return false;

    // If we encounter a store aliased with the load, return early.
    if (MI.mayStore() && LoadMI.mayAlias(AA, MI, /*UseTBAA*/ false))
      return false;
  } while (MBBI != B && Count < Limit);
  return false;
}

static bool needsWinCFI(const MachineFunction *MF) {
  return MF->getTarget().getMCAsmInfo()->usesWindowsCFI() &&
         MF->getFunction().needsUnwindTableEntry();
}

// Returns true if FirstMI and MI are candidates for merging or pairing.
// Otherwise, returns false.
static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
                                       LdStPairFlags &Flags,
                                       const AArch64InstrInfo *TII) {
  // If this is volatile or if pairing is suppressed, not a candidate.
  if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
    return false;

  // We should have already checked FirstMI for pair suppression and volatility.
  assert(!FirstMI.hasOrderedMemoryRef() &&
         !TII->isLdStPairSuppressed(FirstMI) &&
         "FirstMI shouldn't get here if either of these checks are true.");

  if (needsWinCFI(MI.getMF()) && (MI.getFlag(MachineInstr::FrameSetup) ||
                                  MI.getFlag(MachineInstr::FrameDestroy)))
    return false;

  unsigned OpcA = FirstMI.getOpcode();
  unsigned OpcB = MI.getOpcode();

  // Opcodes match: If the opcodes are pre ld/st there is nothing more to check.
  if (OpcA == OpcB)
    return !AArch64InstrInfo::isPreLdSt(FirstMI);

  // Two pre ld/st of different opcodes cannot be merged either.
  if (AArch64InstrInfo::isPreLdSt(FirstMI) && AArch64InstrInfo::isPreLdSt(MI))
    return false;

  // Try to match a sign-extended load/store with a zero-extended load/store.
  bool IsValidLdStrOpc, PairIsValidLdStrOpc;
  unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);
  assert(IsValidLdStrOpc &&
         "Given Opc should be a Load or Store with an immediate");
  // OpcA will be the first instruction in the pair.
  if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
    Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
    return true;
  }

  // If the second instruction isn't even a mergable/pairable load/store, bail
  // out.
  if (!PairIsValidLdStrOpc)
    return false;

  // FIXME: We don't support merging narrow stores with mixed scaled/unscaled
  // offsets.
  if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
    return false;

  // The STR<S,D,Q,W,X>pre - STR<S,D,Q,W,X>ui and
  // LDR<S,D,Q,W,X,SW>pre - LDR<S,D,Q,W,X,SW>ui
  // are candidate pairs that can be merged.
  if (isPreLdStPairCandidate(FirstMI, MI))
    return true;

  // Try to match an unscaled load/store with a scaled load/store.
  return TII->hasUnscaledLdStOffset(OpcA) != TII->hasUnscaledLdStOffset(OpcB) &&
         getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);

  // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
}

static bool canRenameMOP(const MachineOperand &MOP,
                         const TargetRegisterInfo *TRI) {
  if (MOP.isReg()) {
    auto *RegClass = TRI->getMinimalPhysRegClass(MOP.getReg());
    // Renaming registers with multiple disjunct sub-registers (e.g. the
    // result of a LD3) means that all sub-registers are renamed, potentially
    // impacting other instructions we did not check. Bail out.
    // Note that this relies on the structure of the AArch64 register file. In
    // particular, a subregister cannot be written without overwriting the
    // whole register.
    if (RegClass->HasDisjunctSubRegs) {
      LLVM_DEBUG(
          dbgs()
          << "  Cannot rename operands with multiple disjunct subregisters ("
          << MOP << ")\n");
      return false;
    }

    // We cannot rename arbitrary implicit-defs, the specific rule to rewrite
    // them must be known. For example, in ORRWrs the implicit-def
    // corresponds to the result register.
    if (MOP.isImplicit() && MOP.isDef()) {
      if (!isRewritableImplicitDef(MOP.getParent()->getOpcode()))
        return false;
      return TRI->isSuperOrSubRegisterEq(
          MOP.getParent()->getOperand(0).getReg(), MOP.getReg());
    }
  }
  return MOP.isImplicit() ||
         (MOP.isRenamable() && !MOP.isEarlyClobber() && !MOP.isTied());
}

static bool
canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween,
                 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
                 const TargetRegisterInfo *TRI) {
  if (!FirstMI.mayStore())
    return false;

  // Check if we can find an unused register which we can use to rename
  // the register used by the first load/store.

  auto RegToRename = getLdStRegOp(FirstMI).getReg();
  // For now, we only rename if the store operand gets killed at the store.
  if (!getLdStRegOp(FirstMI).isKill() &&
      !any_of(FirstMI.operands(),
              [TRI, RegToRename](const MachineOperand &MOP) {
                return MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
                       MOP.isImplicit() && MOP.isKill() &&
                       TRI->regsOverlap(RegToRename, MOP.getReg());
              })) {
    LLVM_DEBUG(dbgs() << "  Operand not killed at " << FirstMI);
    return false;
  }

  bool FoundDef = false;

  // For each instruction between FirstMI and the previous def for RegToRename,
  // we
  // * check if we can rename RegToRename in this instruction
  // * collect the registers used and required register classes for RegToRename.
  std::function<bool(MachineInstr &, bool)> CheckMIs = [&](MachineInstr &MI,
                                                           bool IsDef) {
    LLVM_DEBUG(dbgs() << "Checking " << MI);
    // Currently we do not try to rename across frame-setup instructions.
    if (MI.getFlag(MachineInstr::FrameSetup)) {
      LLVM_DEBUG(dbgs() << "  Cannot rename framesetup instructions "
                        << "currently\n");
      return false;
    }

    UsedInBetween.accumulate(MI);

    // For a definition, check that we can rename the definition and exit the
    // loop.
    FoundDef = IsDef;

    // For defs, check if we can rename the first def of RegToRename.
    if (FoundDef) {
      // For some pseudo instructions, we might not generate code in the end
      // (e.g. KILL) and we would end up without a correct def for the rename
      // register.
      // TODO: This might be overly conservative and we could handle those cases
      // in multiple ways:
      //       1. Insert an extra copy, to materialize the def.
      //       2. Skip pseudo-defs until we find a non-pseudo def.
      if (MI.isPseudo()) {
        LLVM_DEBUG(dbgs() << "  Cannot rename pseudo/bundle instruction\n");
        return false;
      }

      for (auto &MOP : MI.operands()) {
        if (!MOP.isReg() || !MOP.isDef() || MOP.isDebug() || !MOP.getReg() ||
            !TRI->regsOverlap(MOP.getReg(), RegToRename))
          continue;
        if (!canRenameMOP(MOP, TRI)) {
          LLVM_DEBUG(dbgs() << "  Cannot rename " << MOP << " in " << MI);
          return false;
        }
        RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
      }
      return true;
    } else {
      for (auto &MOP : MI.operands()) {
        if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
            !TRI->regsOverlap(MOP.getReg(), RegToRename))
          continue;

        if (!canRenameMOP(MOP, TRI)) {
          LLVM_DEBUG(dbgs() << "  Cannot rename " << MOP << " in " << MI);
          return false;
        }
        RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
      }
    }
    return true;
  };

  if (!forAllMIsUntilDef(FirstMI, RegToRename, TRI, LdStLimit, CheckMIs))
    return false;

  if (!FoundDef) {
    LLVM_DEBUG(dbgs() << "  Did not find definition for register in BB\n");
    return false;
  }
  return true;
}

1548 // We want to merge the second load into the first by rewriting the usages of
1549 // the same reg between the first (incl.) and the second (excl.). We don't
1550 // need to care about any insns before FirstLoad or after SecondLoad.
1551 // 1. The second load writes a new value into the same reg.
1552 // - The renaming cannot impact later uses of the reg.
1553 // - The second load always trashes the value written by the first load, which
1554 // means the reg must be killed before the second load.
1555 // 2. The first load must be a def of the same reg, so we don't need to look
1556 // into anything before it.
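// A hypothetical sketch (made-up MIR):
// $w1 = LDRWui $x0, 0 ; FirstLoad, def of $w1
// $w2 = ADDWrs $w3, killed $w1, 0 ; use rewritten to the rename register
// $w1 = LDRWui $x0, 1 ; SecondLoad, clobbers $w1
// Only uses in [FirstLoad, SecondLoad) need to be rewritten.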
1557 static bool canRenameUntilSecondLoad(
1558 MachineInstr &FirstLoad, MachineInstr &SecondLoad,
1559 LiveRegUnits &UsedInBetween,
1560 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1561 const TargetRegisterInfo *TRI) {
1562 if (FirstLoad.isPseudo())
1563 return false;
1564
1565 UsedInBetween.accumulate(FirstLoad);
1566 auto RegToRename = getLdStRegOp(FirstLoad).getReg();
1567 bool Success = std::all_of(
1568 FirstLoad.getIterator(), SecondLoad.getIterator(),
1569 [&](MachineInstr &MI) {
1570 LLVM_DEBUG(dbgs() << "Checking " << MI);
1571 // Currently we do not try to rename across frame-setup instructions.
1572 if (MI.getFlag(MachineInstr::FrameSetup)) {
1573 LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions "
1574 << "currently\n");
1575 return false;
1576 }
1577
1578 for (auto &MOP : MI.operands()) {
1579 if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
1580 !TRI->regsOverlap(MOP.getReg(), RegToRename))
1581 continue;
1582 if (!canRenameMOP(MOP, TRI)) {
1583 LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
1584 return false;
1585 }
1586 RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
1587 }
1588
1589 return true;
1590 });
1591 return Success;
1592 }
1593
1594 // Check if we can find a physical register for renaming \p Reg. This register
1595 // must:
1596 // * not be defined already in \p DefinedInBB; DefinedInBB must contain all
1597 // defined registers up to the point where the renamed register will be used,
1598 // * not be used in \p UsedInBetween; UsedInBetween must contain all accessed
1599 // registers in the range in which the rename register will be used,
1600 // * be available in all used register classes (checked using RequiredClasses).
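// E.g. (illustrative): when renaming $w1, a register like $w8 that is not in
// DefinedInBB or UsedInBetween, not reserved, not callee-saved, and covered
// (via a sub- or super-register) by every required class would be returned.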
1601 static std::optional<MCPhysReg> tryToFindRegisterToRename(
1602 const MachineFunction &MF, Register Reg, LiveRegUnits &DefinedInBB,
1603 LiveRegUnits &UsedInBetween,
1604 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1605 const TargetRegisterInfo *TRI) {
1606 const MachineRegisterInfo &RegInfo = MF.getRegInfo();
1607
1608 // Checks if any sub- or super-register of PR is callee saved.
1609 auto AnySubOrSuperRegCalleePreserved = [&MF, TRI](MCPhysReg PR) {
1610 return any_of(TRI->sub_and_superregs_inclusive(PR),
1611 [&MF, TRI](MCPhysReg SubOrSuper) {
1612 return TRI->isCalleeSavedPhysReg(SubOrSuper, MF);
1613 });
1614 };
1615
1616 // Check if PR or one of its sub- or super-registers can be used for all
1617 // required register classes.
1618 auto CanBeUsedForAllClasses = [&RequiredClasses, TRI](MCPhysReg PR) {
1619 return all_of(RequiredClasses, [PR, TRI](const TargetRegisterClass *C) {
1620 return any_of(
1621 TRI->sub_and_superregs_inclusive(PR),
1622 [C](MCPhysReg SubOrSuper) { return C->contains(SubOrSuper); });
1623 });
1624 };
1625
1626 auto *RegClass = TRI->getMinimalPhysRegClass(Reg);
1627 for (const MCPhysReg &PR : *RegClass) {
1628 if (DefinedInBB.available(PR) && UsedInBetween.available(PR) &&
1629 !RegInfo.isReserved(PR) && !AnySubOrSuperRegCalleePreserved(PR) &&
1630 CanBeUsedForAllClasses(PR)) {
1631 DefinedInBB.addReg(PR);
1632 LLVM_DEBUG(dbgs() << "Found rename register " << printReg(PR, TRI)
1633 << "\n");
1634 return {PR};
1635 }
1636 }
1637 LLVM_DEBUG(dbgs() << "No rename register found from "
1638 << TRI->getRegClassName(RegClass) << "\n");
1639 return std::nullopt;
1640 }
1641
1642 // For store pairs: returns a register from FirstMI to the beginning of the
1643 // block that can be renamed.
1644 // For load pairs: returns a register from FirstMI to MI that can be renamed.
1645 static std::optional<MCPhysReg> findRenameRegForSameLdStRegPair(
1646 std::optional<bool> MaybeCanRename, MachineInstr &FirstMI, MachineInstr &MI,
1647 Register Reg, LiveRegUnits &DefinedInBB, LiveRegUnits &UsedInBetween,
1648 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1649 const TargetRegisterInfo *TRI) {
1650 std::optional<MCPhysReg> RenameReg;
1651 if (!DebugCounter::shouldExecute(RegRenamingCounter))
1652 return RenameReg;
1653
1654 auto *RegClass = TRI->getMinimalPhysRegClass(getLdStRegOp(FirstMI).getReg());
1655 MachineFunction &MF = *FirstMI.getParent()->getParent();
1656 if (!RegClass || !MF.getRegInfo().tracksLiveness())
1657 return RenameReg;
1658
1659 const bool IsLoad = FirstMI.mayLoad();
1660
1661 if (!MaybeCanRename) {
1662 if (IsLoad)
1663 MaybeCanRename = {canRenameUntilSecondLoad(FirstMI, MI, UsedInBetween,
1664 RequiredClasses, TRI)};
1665 else
1666 MaybeCanRename = {
1667 canRenameUpToDef(FirstMI, UsedInBetween, RequiredClasses, TRI)};
1668 }
1669
1670 if (*MaybeCanRename) {
1671 RenameReg = tryToFindRegisterToRename(MF, Reg, DefinedInBB, UsedInBetween,
1672 RequiredClasses, TRI);
1673 }
1674 return RenameReg;
1675 }
1676
1677 /// Scan the instructions looking for a load/store that can be combined with the
1678 /// current instruction into a wider equivalent or a load/store pair.
1679 MachineBasicBlock::iterator
1680 AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
1681 LdStPairFlags &Flags, unsigned Limit,
1682 bool FindNarrowMerge) {
1683 MachineBasicBlock::iterator E = I->getParent()->end();
1684 MachineBasicBlock::iterator MBBI = I;
1685 MachineBasicBlock::iterator MBBIWithRenameReg;
1686 MachineInstr &FirstMI = *I;
1687 MBBI = next_nodbg(MBBI, E);
1688
1689 bool MayLoad = FirstMI.mayLoad();
1690 bool IsUnscaled = TII->hasUnscaledLdStOffset(FirstMI);
1691 Register Reg = getLdStRegOp(FirstMI).getReg();
1692 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(FirstMI).getReg();
1693 int Offset = AArch64InstrInfo::getLdStOffsetOp(FirstMI).getImm();
1694 int OffsetStride = IsUnscaled ? TII->getMemScale(FirstMI) : 1;
1695 bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);
1696
1697 std::optional<bool> MaybeCanRename;
1698 if (!EnableRenaming)
1699 MaybeCanRename = {false};
1700
1701 SmallPtrSet<const TargetRegisterClass *, 5> RequiredClasses;
1702 LiveRegUnits UsedInBetween;
1703 UsedInBetween.init(*TRI);
1704
1705 Flags.clearRenameReg();
1706
1707 // Track which register units have been modified and used between the first
1708 // insn (inclusive) and the second insn.
1709 ModifiedRegUnits.clear();
1710 UsedRegUnits.clear();
1711
1712 // Remember any instructions that read/write memory between FirstMI and MI.
1713 SmallVector<MachineInstr *, 4> MemInsns;
1714
1715 LLVM_DEBUG(dbgs() << "Find match for: "; FirstMI.dump());
1716 for (unsigned Count = 0; MBBI != E && Count < Limit;
1717 MBBI = next_nodbg(MBBI, E)) {
1718 MachineInstr &MI = *MBBI;
1719 LLVM_DEBUG(dbgs() << "Analysing 2nd insn: "; MI.dump());
1720
1721 UsedInBetween.accumulate(MI);
1722
1723 // Don't count transient instructions towards the search limit since there
1724 // may be different numbers of them if e.g. debug information is present.
1725 if (!MI.isTransient())
1726 ++Count;
1727
1728 Flags.setSExtIdx(-1);
1729 if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
1730 AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) {
1731 assert(MI.mayLoadOrStore() && "Expected memory operation.");
1732 // If we've found another instruction with the same opcode, check to see
1733 // if the base and offset are compatible with our starting instruction.
1734 // These instructions all have scaled immediate operands, so we just
1735 // check for +1/-1. Make sure to check the new instruction offset is
1736 // actually an immediate and not a symbolic reference destined for
1737 // a relocation.
1738 Register MIBaseReg = AArch64InstrInfo::getLdStBaseOp(MI).getReg();
1739 int MIOffset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
1740 bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI);
1741 if (IsUnscaled != MIIsUnscaled) {
1742 // We're trying to pair instructions that differ in how they are scaled.
1743 // If FirstMI is scaled then scale the offset of MI accordingly.
1744 // Otherwise, do the opposite (i.e., make MI's offset unscaled).
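// E.g. (illustrative): FirstMI = scaled LDRXui at offset 0, MI = unscaled
// LDURXi at offset 8. Since 8 is a multiple of the mem scale (8), MI's
// offset becomes 8 / 8 == 1, adjacent to FirstMI, so pairing can proceed.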
1745 int MemSize = TII->getMemScale(MI);
1746 if (MIIsUnscaled) {
1747 // If the unscaled offset isn't a multiple of the MemSize, we can't
1748 // pair the operations together: bail and keep looking.
1749 if (MIOffset % MemSize) {
1750 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1751 UsedRegUnits, TRI);
1752 MemInsns.push_back(&MI);
1753 continue;
1754 }
1755 MIOffset /= MemSize;
1756 } else {
1757 MIOffset *= MemSize;
1758 }
1759 }
1760
1761 bool IsPreLdSt = isPreLdStPairCandidate(FirstMI, MI);
1762
1763 if (BaseReg == MIBaseReg) {
1764 // If the offset of the second ld/st is not equal to the size of the
1765 // destination register, it can't be paired with a pre-index ld/st
1766 // pair. Additionally, if the base reg is used or modified, the operations
1767 // can't be paired: bail and keep looking.
1768 if (IsPreLdSt) {
1769 bool IsOutOfBounds = MIOffset != TII->getMemScale(MI);
1770 bool IsBaseRegUsed = !UsedRegUnits.available(
1771 AArch64InstrInfo::getLdStBaseOp(MI).getReg());
1772 bool IsBaseRegModified = !ModifiedRegUnits.available(
1773 AArch64InstrInfo::getLdStBaseOp(MI).getReg());
1774 // If the stored value and the address of the second instruction are
1775 // the same, it needs to be using the updated register and therefore
1776 // it must not be folded.
1777 bool IsMIRegTheSame =
1778 TRI->regsOverlap(getLdStRegOp(MI).getReg(),
1779 AArch64InstrInfo::getLdStBaseOp(MI).getReg());
1780 if (IsOutOfBounds || IsBaseRegUsed || IsBaseRegModified ||
1781 IsMIRegTheSame) {
1782 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1783 UsedRegUnits, TRI);
1784 MemInsns.push_back(&MI);
1785 continue;
1786 }
1787 } else {
1788 if ((Offset != MIOffset + OffsetStride) &&
1789 (Offset + OffsetStride != MIOffset)) {
1790 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1791 UsedRegUnits, TRI);
1792 MemInsns.push_back(&MI);
1793 continue;
1794 }
1795 }
1796
1797 int MinOffset = Offset < MIOffset ? Offset : MIOffset;
1798 if (FindNarrowMerge) {
1799 // If the alignment requirements of the scaled wide load/store
1800 // instruction can't express the offset of the scaled narrow input,
1801 // bail and keep looking. For promotable zero stores, allow only when
1802 // the stored value is the same (i.e., WZR).
1803 if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
1804 (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
1805 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1806 UsedRegUnits, TRI);
1807 MemInsns.push_back(&MI);
1808 continue;
1809 }
1810 } else {
1811 // Pairwise instructions have a 7-bit signed offset field. Single
1812 // insns have a 12-bit unsigned offset field. If the resultant
1813 // immediate offset of merging these instructions is out of range for
1814 // a pairwise instruction, bail and keep looking.
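// E.g. (illustrative): two 8-byte loads at scaled offsets 64 and 65 cannot
// become an LDP, since the pair immediate would be min(64, 65) == 64, which
// is outside the signed 7-bit range [-64, 63].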
1815 if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
1816 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1817 UsedRegUnits, TRI);
1818 MemInsns.push_back(&MI);
1819 LLVM_DEBUG(dbgs() << "Offset doesn't fit in immediate, "
1820 << "keep looking.\n");
1821 continue;
1822 }
1823 // If the alignment requirements of the paired (scaled) instruction
1824 // can't express the offset of the unscaled input, bail and keep
1825 // looking.
1826 if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
1827 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1828 UsedRegUnits, TRI);
1829 MemInsns.push_back(&MI);
1830 LLVM_DEBUG(dbgs()
1831 << "Offset doesn't fit due to alignment requirements, "
1832 << "keep looking.\n");
1833 continue;
1834 }
1835 }
1836
1837 // If the BaseReg has been modified, then we cannot do the optimization.
1838 // For example, in the following pattern
1839 // ldr x1 [x2]
1840 // ldr x2 [x3]
1841 // ldr x4 [x2, #8],
1842 // the first and third ldr cannot be converted to ldp x1, x4, [x2]
1843 if (!ModifiedRegUnits.available(BaseReg))
1844 return E;
1845
1846 const bool SameLoadReg = MayLoad && TRI->isSuperOrSubRegisterEq(
1847 Reg, getLdStRegOp(MI).getReg());
1848
1849 // If the Rt of the second instruction (destination register of the
1850 // load) was not modified or used between the two instructions and none
1851 // of the instructions between the second and first alias with the
1852 // second, we can combine the second into the first.
1853 bool RtNotModified =
1854 ModifiedRegUnits.available(getLdStRegOp(MI).getReg());
1855 bool RtNotUsed = !(MI.mayLoad() && !SameLoadReg &&
1856 !UsedRegUnits.available(getLdStRegOp(MI).getReg()));
1857
1858 LLVM_DEBUG(dbgs() << "Checking, can combine 2nd into 1st insn:\n"
1859 << "Reg '" << getLdStRegOp(MI) << "' not modified: "
1860 << (RtNotModified ? "true" : "false") << "\n"
1861 << "Reg '" << getLdStRegOp(MI) << "' not used: "
1862 << (RtNotUsed ? "true" : "false") << "\n");
1863
1864 if (RtNotModified && RtNotUsed && !mayAlias(MI, MemInsns, AA)) {
1865 // For pairs loading into the same reg, try to find a renaming
1866 // opportunity to allow the renaming of Reg between FirstMI and MI
1867 // and combine MI into FirstMI; otherwise bail and keep looking.
1868 if (SameLoadReg) {
1869 std::optional<MCPhysReg> RenameReg =
1870 findRenameRegForSameLdStRegPair(MaybeCanRename, FirstMI, MI,
1871 Reg, DefinedInBB, UsedInBetween,
1872 RequiredClasses, TRI);
1873 if (!RenameReg) {
1874 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1875 UsedRegUnits, TRI);
1876 MemInsns.push_back(&MI);
1877 LLVM_DEBUG(dbgs() << "Can't find reg for renaming, "
1878 << "keep looking.\n");
1879 continue;
1880 }
1881 Flags.setRenameReg(*RenameReg);
1882 }
1883
1884 Flags.setMergeForward(false);
1885 if (!SameLoadReg)
1886 Flags.clearRenameReg();
1887 return MBBI;
1888 }
1889
1890 // Likewise, if the Rt of the first instruction is not modified or used
1891 // between the two instructions and none of the instructions between the
1892 // first and the second alias with the first, we can combine the first
1893 // into the second.
1894 RtNotModified = !(
1895 MayLoad && !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg()));
1896
1897 LLVM_DEBUG(dbgs() << "Checking, can combine 1st into 2nd insn:\n"
1898 << "Reg '" << getLdStRegOp(FirstMI)
1899 << "' not modified: "
1900 << (RtNotModified ? "true" : "false") << "\n");
1901
1902 if (RtNotModified && !mayAlias(FirstMI, MemInsns, AA)) {
1903 if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg())) {
1904 Flags.setMergeForward(true);
1905 Flags.clearRenameReg();
1906 return MBBI;
1907 }
1908
1909 std::optional<MCPhysReg> RenameReg = findRenameRegForSameLdStRegPair(
1910 MaybeCanRename, FirstMI, MI, Reg, DefinedInBB, UsedInBetween,
1911 RequiredClasses, TRI);
1912 if (RenameReg) {
1913 Flags.setMergeForward(true);
1914 Flags.setRenameReg(*RenameReg);
1915 MBBIWithRenameReg = MBBI;
1916 }
1917 }
1918 LLVM_DEBUG(dbgs() << "Unable to combine these instructions due to "
1919 << "interference in between, keep looking.\n");
1920 }
1921 }
1922
1923 if (Flags.getRenameReg())
1924 return MBBIWithRenameReg;
1925
1926 // The instruction wasn't a matching load or store. Stop searching if we
1927 // encounter a call instruction that might modify memory.
1928 if (MI.isCall()) {
1929 LLVM_DEBUG(dbgs() << "Found a call, stop looking.\n");
1930 return E;
1931 }
1932
1933 // Update modified / uses register units.
1934 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1935
1936 // Otherwise, if the base register is modified, we have no match, so
1937 // return early.
1938 if (!ModifiedRegUnits.available(BaseReg)) {
1939 LLVM_DEBUG(dbgs() << "Base reg is modified, stop looking.\n");
1940 return E;
1941 }
1942
1943 // Update list of instructions that read/write memory.
1944 if (MI.mayLoadOrStore())
1945 MemInsns.push_back(&MI);
1946 }
1947 return E;
1948 }
1949
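// Returns MaybeCFI if it is a CFA-defining CFI instruction that belongs to
// the SP-based frame-setup/destroy update MI and therefore has to move along
// with it; otherwise returns the block's end iterator.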
1950 static MachineBasicBlock::iterator
1951 maybeMoveCFI(MachineInstr &MI, MachineBasicBlock::iterator MaybeCFI) {
1952 auto End = MI.getParent()->end();
1953 if (MaybeCFI == End ||
1954 MaybeCFI->getOpcode() != TargetOpcode::CFI_INSTRUCTION ||
1955 !(MI.getFlag(MachineInstr::FrameSetup) ||
1956 MI.getFlag(MachineInstr::FrameDestroy)) ||
1957 AArch64InstrInfo::getLdStBaseOp(MI).getReg() != AArch64::SP)
1958 return End;
1959
1960 const MachineFunction &MF = *MI.getParent()->getParent();
1961 unsigned CFIIndex = MaybeCFI->getOperand(0).getCFIIndex();
1962 const MCCFIInstruction &CFI = MF.getFrameInstructions()[CFIIndex];
1963 switch (CFI.getOperation()) {
1964 case MCCFIInstruction::OpDefCfa:
1965 case MCCFIInstruction::OpDefCfaOffset:
1966 return MaybeCFI;
1967 default:
1968 return End;
1969 }
1970 }
1971
1972 MachineBasicBlock::iterator
1973 AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
1974 MachineBasicBlock::iterator Update,
1975 bool IsPreIdx) {
1976 assert((Update->getOpcode() == AArch64::ADDXri ||
1977 Update->getOpcode() == AArch64::SUBXri) &&
1978 "Unexpected base register update instruction to merge!");
1979 MachineBasicBlock::iterator E = I->getParent()->end();
1980 MachineBasicBlock::iterator NextI = next_nodbg(I, E);
1981
1982 // If we are updating the SP and the following instruction is a CFA-offset-
1983 // related CFI instruction, move it after the merged instruction.
1984 MachineBasicBlock::iterator CFI =
1985 IsPreIdx ? maybeMoveCFI(*Update, next_nodbg(Update, E)) : E;
1986
1987 // Return the instruction following the merged instruction, which is
1988 // the instruction following our unmerged load. Unless that's the add/sub
1989 // instruction we're merging, in which case it's the one after that.
1990 if (NextI == Update)
1991 NextI = next_nodbg(NextI, E);
1992
1993 int Value = Update->getOperand(2).getImm();
1994 assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
1995 "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
1996 if (Update->getOpcode() == AArch64::SUBXri)
1997 Value = -Value;
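// E.g. (illustrative): merging "ldr x0, [x1]" with a following
// "sub x1, x1, #16" yields the post-indexed "ldr x0, [x1], #-16"; merging it
// with a preceding "add x1, x1, #8" yields the pre-indexed "ldr x0, [x1, #8]!".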
1998
1999 unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
2000 : getPostIndexedOpcode(I->getOpcode());
2001 MachineInstrBuilder MIB;
2002 int Scale, MinOffset, MaxOffset;
2003 getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);
2004 if (!AArch64InstrInfo::isPairedLdSt(*I)) {
2005 // Non-paired instruction.
2006 MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
2007 .add(getLdStRegOp(*Update))
2008 .add(getLdStRegOp(*I))
2009 .add(AArch64InstrInfo::getLdStBaseOp(*I))
2010 .addImm(Value / Scale)
2011 .setMemRefs(I->memoperands())
2012 .setMIFlags(I->mergeFlagsWith(*Update));
2013 } else {
2014 // Paired instruction.
2015 MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
2016 .add(getLdStRegOp(*Update))
2017 .add(getLdStRegOp(*I, 0))
2018 .add(getLdStRegOp(*I, 1))
2019 .add(AArch64InstrInfo::getLdStBaseOp(*I))
2020 .addImm(Value / Scale)
2021 .setMemRefs(I->memoperands())
2022 .setMIFlags(I->mergeFlagsWith(*Update));
2023 }
2024 if (CFI != E) {
2025 MachineBasicBlock *MBB = I->getParent();
2026 MBB->splice(std::next(MIB.getInstr()->getIterator()), MBB, CFI);
2027 }
2028
2029 if (IsPreIdx) {
2030 ++NumPreFolded;
2031 LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
2032 } else {
2033 ++NumPostFolded;
2034 LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
2035 }
2036 LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
2037 LLVM_DEBUG(I->print(dbgs()));
2038 LLVM_DEBUG(dbgs() << " ");
2039 LLVM_DEBUG(Update->print(dbgs()));
2040 LLVM_DEBUG(dbgs() << " with instruction:\n ");
2041 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
2042 LLVM_DEBUG(dbgs() << "\n");
2043
2044 // Erase the old instructions for the block.
2045 I->eraseFromParent();
2046 Update->eraseFromParent();
2047
2048 return NextI;
2049 }
2050
2051 bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
2052 MachineInstr &MI,
2053 unsigned BaseReg, int Offset) {
2054 switch (MI.getOpcode()) {
2055 default:
2056 break;
2057 case AArch64::SUBXri:
2058 case AArch64::ADDXri:
2059 // Make sure it's a vanilla immediate operand, not a relocation or
2060 // anything else we can't handle.
2061 if (!MI.getOperand(2).isImm())
2062 break;
2063 // Watch out for 1 << 12 shifted value.
2064 if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
2065 break;
2066
2067 // The update instruction source and destination register must be the
2068 // same as the load/store base register.
2069 if (MI.getOperand(0).getReg() != BaseReg ||
2070 MI.getOperand(1).getReg() != BaseReg)
2071 break;
2072
2073 int UpdateOffset = MI.getOperand(2).getImm();
2074 if (MI.getOpcode() == AArch64::SUBXri)
2075 UpdateOffset = -UpdateOffset;
2076
2077 // The immediate must be a multiple of the scaling factor of the pre/post
2078 // indexed instruction.
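// E.g. (illustrative): for a pair of X registers (Scale == 8), an update of
// #24 remains a candidate (24 / 8 == 3), while an update of #20 is rejected.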
2079 int Scale, MinOffset, MaxOffset;
2080 getPrePostIndexedMemOpInfo(MemMI, Scale, MinOffset, MaxOffset);
2081 if (UpdateOffset % Scale != 0)
2082 break;
2083
2084 // Scaled offset must fit in the instruction immediate.
2085 int ScaledOffset = UpdateOffset / Scale;
2086 if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset)
2087 break;
2088
2089 // If we have a non-zero Offset, we check that it matches the amount
2090 // we're adding to the register.
2091 if (!Offset || Offset == UpdateOffset)
2092 return true;
2093 break;
2094 }
2095 return false;
2096 }
2097
2098 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
2099 MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
2100 MachineBasicBlock::iterator E = I->getParent()->end();
2101 MachineInstr &MemMI = *I;
2102 MachineBasicBlock::iterator MBBI = I;
2103
2104 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
2105 int MIUnscaledOffset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm() *
2106 TII->getMemScale(MemMI);
2107
2108 // Scan forward looking for post-index opportunities. Updating instructions
2109 // can't be formed if the memory instruction doesn't have the offset we're
2110 // looking for.
2111 if (MIUnscaledOffset != UnscaledOffset)
2112 return E;
2113
2114 // If the base register overlaps a source/destination register, we can't
2115 // merge the update. This does not apply to tag store instructions, which
2116 // ignore the address part of the source register.
2117 // Nor does it apply to STGPi, which, unlike normal stores, does not have
2118 // unpredictable behavior in this case and always performs writeback after
2119 // reading the source register value.
2120 if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
2121 bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
2122 for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
2123 Register DestReg = getLdStRegOp(MemMI, i).getReg();
2124 if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
2125 return E;
2126 }
2127 }
2128
2129 // Track which register units have been modified and used between the first
2130 // insn (inclusive) and the second insn.
2131 ModifiedRegUnits.clear();
2132 UsedRegUnits.clear();
2133 MBBI = next_nodbg(MBBI, E);
2134
2135 // We can't post-increment the stack pointer if any instruction between
2136 // the memory access (I) and the increment (MBBI) can access the memory
2137 // region defined by [SP, MBBI].
2138 const bool BaseRegSP = BaseReg == AArch64::SP;
2139 if (BaseRegSP && needsWinCFI(I->getMF())) {
2140 // FIXME: For now, we always block the optimization over SP in Windows
2141 // targets as it requires adjusting the unwind/debug info; messing up
2142 // the unwind info can actually cause a miscompile.
2143 return E;
2144 }
2145
2146 for (unsigned Count = 0; MBBI != E && Count < Limit;
2147 MBBI = next_nodbg(MBBI, E)) {
2148 MachineInstr &MI = *MBBI;
2149
2150 // Don't count transient instructions towards the search limit since there
2151 // may be different numbers of them if e.g. debug information is present.
2152 if (!MI.isTransient())
2153 ++Count;
2154
2155 // If we found a match, return it.
2156 if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
2157 return MBBI;
2158
2159 // Update the status of what the instruction clobbered and used.
2160 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
2161
2162 // Otherwise, if the base register is used or modified, we have no match, so
2163 // return early.
2164 // If we are optimizing SP, do not allow instructions that may load or store
2165 // in between the load and the optimized value update.
2166 if (!ModifiedRegUnits.available(BaseReg) ||
2167 !UsedRegUnits.available(BaseReg) ||
2168 (BaseRegSP && MBBI->mayLoadOrStore()))
2169 return E;
2170 }
2171 return E;
2172 }
2173
2174 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
2175 MachineBasicBlock::iterator I, unsigned Limit) {
2176 MachineBasicBlock::iterator B = I->getParent()->begin();
2177 MachineBasicBlock::iterator E = I->getParent()->end();
2178 MachineInstr &MemMI = *I;
2179 MachineBasicBlock::iterator MBBI = I;
2180 MachineFunction &MF = *MemMI.getMF();
2181
2182 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
2183 int Offset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm();
2184
2185 // If the load/store is the first instruction in the block, there's obviously
2186 // not any matching update. Ditto if the memory offset isn't zero.
2187 if (MBBI == B || Offset != 0)
2188 return E;
2189 // If the base register overlaps a destination register, we can't
2190 // merge the update.
2191 if (!isTagStore(MemMI)) {
2192 bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
2193 for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
2194 Register DestReg = getLdStRegOp(MemMI, i).getReg();
2195 if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
2196 return E;
2197 }
2198 }
2199
2200 const bool BaseRegSP = BaseReg == AArch64::SP;
2201 if (BaseRegSP && needsWinCFI(I->getMF())) {
2202 // FIXME: For now, we always block the optimization over SP in Windows
2203 // targets as it requires adjusting the unwind/debug info; messing up
2204 // the unwind info can actually cause a miscompile.
2205 return E;
2206 }
2207
2208 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2209 unsigned RedZoneSize =
2210 Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction());
2211
2212 // Track which register units have been modified and used between the first
2213 // insn (inclusive) and the second insn.
2214 ModifiedRegUnits.clear();
2215 UsedRegUnits.clear();
2216 unsigned Count = 0;
2217 bool MemAccessBeforeSPPreInc = false;
2218 do {
2219 MBBI = prev_nodbg(MBBI, B);
2220 MachineInstr &MI = *MBBI;
2221
2222 // Don't count transient instructions towards the search limit since there
2223 // may be different numbers of them if e.g. debug information is present.
2224 if (!MI.isTransient())
2225 ++Count;
2226
2227 // If we found a match, return it.
2228 if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset)) {
2229 // Check that the update value is within our red zone limit (which may be
2230 // zero).
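// E.g. (illustrative): folding "sub sp, sp, #256" into a later "str x0, [sp]"
// to form "str x0, [sp, #-256]!" delays the stack allocation; an intervening
// memory access is then only safe if those 256 bytes are covered by the red
// zone.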
2231 if (MemAccessBeforeSPPreInc && MBBI->getOperand(2).getImm() > RedZoneSize)
2232 return E;
2233 return MBBI;
2234 }
2235
2236 // Update the status of what the instruction clobbered and used.
2237 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
2238
2239 // Otherwise, if the base register is used or modified, we have no match, so
2240 // return early.
2241 if (!ModifiedRegUnits.available(BaseReg) ||
2242 !UsedRegUnits.available(BaseReg))
2243 return E;
2244 // Keep track of whether we have seen a memory access before an SP
2245 // pre-increment; in that case we need to validate later that the update
2246 // amount respects the red zone.
2247 if (BaseRegSP && MBBI->mayLoadOrStore())
2248 MemAccessBeforeSPPreInc = true;
2249 } while (MBBI != B && Count < Limit);
2250 return E;
2251 }
2252
2253 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
2254 MachineBasicBlock::iterator &MBBI) {
2255 MachineInstr &MI = *MBBI;
2256 // If this is a volatile load, don't mess with it.
2257 if (MI.hasOrderedMemoryRef())
2258 return false;
2259
2260 if (needsWinCFI(MI.getMF()) && MI.getFlag(MachineInstr::FrameDestroy))
2261 return false;
2262
2263 // Make sure this is a reg+imm.
2264 // FIXME: It is possible to extend it to handle reg+reg cases.
2265 if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
2266 return false;
2267
2268 // Look backward up to LdStLimit instructions.
2269 MachineBasicBlock::iterator StoreI;
2270 if (findMatchingStore(MBBI, LdStLimit, StoreI)) {
2271 ++NumLoadsFromStoresPromoted;
2272 // Promote the load. Keeping the iterator straight is a
2273 // pain, so we let the merge routine tell us what the next instruction
2274 // is after it's done mucking about.
2275 MBBI = promoteLoadFromStore(MBBI, StoreI);
2276 return true;
2277 }
2278 return false;
2279 }
2280
2281 // Merge adjacent zero stores into a wider store.
2282 bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
2283 MachineBasicBlock::iterator &MBBI) {
2284 assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
2285 MachineInstr &MI = *MBBI;
2286 MachineBasicBlock::iterator E = MI.getParent()->end();
2287
2288 if (!TII->isCandidateToMergeOrPair(MI))
2289 return false;
2290
2291 // Look ahead up to LdStLimit instructions for a mergeable instruction.
2292 LdStPairFlags Flags;
2293 MachineBasicBlock::iterator MergeMI =
2294 findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
2295 if (MergeMI != E) {
2296 ++NumZeroStoresPromoted;
2297
2298 // Keeping the iterator straight is a pain, so we let the merge routine tell
2299 // us what the next instruction is after it's done mucking about.
2300 MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags);
2301 return true;
2302 }
2303 return false;
2304 }
2305
2306 // Find loads and stores that can be merged into a single load or store pair
2307 // instruction.
2308 bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
2309 MachineInstr &MI = *MBBI;
2310 MachineBasicBlock::iterator E = MI.getParent()->end();
2311
2312 if (!TII->isCandidateToMergeOrPair(MI))
2313 return false;
2314
2315 // If the disable-ldp feature is set, do not emit ldp.
2316 if (MI.mayLoad() && Subtarget->hasDisableLdp())
2317 return false;
2318
2319 // If the disable-stp feature is set, do not emit stp.
2320 if (MI.mayStore() && Subtarget->hasDisableStp())
2321 return false;
2322
2323 // Early exit if the offset is impossible to match. (The pair immediate has
2324 // 6 bits of positive range; also allow one extra stride in case we find a
2325 // later insn that matches with Offset-1.)
2326 bool IsUnscaled = TII->hasUnscaledLdStOffset(MI);
2327 int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
2328 int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;
2329 // Allow one more for offset.
2330 if (Offset > 0)
2331 Offset -= OffsetStride;
2332 if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
2333 return false;
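// E.g. (illustrative): a scaled 8-byte store at offset 70 can never pair:
// even the adjusted offset 69 is outside the signed 7-bit pair range
// [-64, 63], so we do not bother scanning for a partner.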
2334
2335 // Look ahead up to LdStLimit instructions for a pairable instruction.
2336 LdStPairFlags Flags;
2337 MachineBasicBlock::iterator Paired =
2338 findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
2339 if (Paired != E) {
2340 ++NumPairCreated;
2341 if (TII->hasUnscaledLdStOffset(MI))
2342 ++NumUnscaledPairCreated;
2343 // Keeping the iterator straight is a pain, so we let the merge routine tell
2344 // us what the next instruction is after it's done mucking about.
2345 auto Prev = std::prev(MBBI);
2346
2347 // Fetch the memoperand of the load/store that is a candidate for
2348 // combination.
2349 MachineMemOperand *MemOp =
2350 MI.memoperands_empty() ? nullptr : MI.memoperands().front();
2351
2352 // Get the needed alignments to check them if
2353 // ldp-aligned-only/stp-aligned-only features are set.
2354 uint64_t MemAlignment = MemOp ? MemOp->getAlign().value() : -1;
2355 uint64_t TypeAlignment = MemOp ? Align(MemOp->getSize()).value() : -1;
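// E.g. (illustrative): for an 8-byte load whose memoperand is 16-byte
// aligned, MemAlignment == 16 and TypeAlignment == 8, so the
// ldp-aligned-only check below passes.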
2356
2357 // If this is a load and the ldp-aligned-only feature is set, check that
2358 // the alignment of the source pointer is at least double the alignment of
2359 // the type.
2360 if (MI.mayLoad() && Subtarget->hasLdpAlignedOnly() && MemOp &&
2361 MemAlignment < 2 * TypeAlignment)
2362 return false;
2363
2364 // If this is a store and the stp-aligned-only feature is set, check that
2365 // the alignment of the source pointer is at least double the alignment of
2366 // the type.
2367 if (MI.mayStore() && Subtarget->hasStpAlignedOnly() && MemOp &&
2368 MemAlignment < 2 * TypeAlignment)
2369 return false;
2370
2371 MBBI = mergePairedInsns(MBBI, Paired, Flags);
2372 // Collect liveness info for instructions between Prev and the new position
2373 // MBBI.
2374 for (auto I = std::next(Prev); I != MBBI; I++)
2375 updateDefinedRegisters(*I, DefinedInBB, TRI);
2376
2377 return true;
2378 }
2379 return false;
2380 }
2381
2382 bool AArch64LoadStoreOpt::tryToMergeLdStUpdate(
2383 MachineBasicBlock::iterator &MBBI) {
2384 MachineInstr &MI = *MBBI;
2385 MachineBasicBlock::iterator E = MI.getParent()->end();
2386 MachineBasicBlock::iterator Update;
2387
2388 // Look forward to try to form a post-index instruction. For example,
2389 // ldr x0, [x20]
2390 // add x20, x20, #32
2391 // merged into:
2392 // ldr x0, [x20], #32
2393 Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
2394 if (Update != E) {
2395 // Merge the update into the ld/st.
2396 MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
2397 return true;
2398 }
2399
2400 // Don't know how to handle unscaled pre/post-index versions below, so bail.
2401 if (TII->hasUnscaledLdStOffset(MI.getOpcode()))
2402 return false;
2403
2404 // Look back to try to find a pre-index instruction. For example,
2405 // add x0, x0, #8
2406 // ldr x1, [x0]
2407 // merged into:
2408 // ldr x1, [x0, #8]!
2409 Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
2410 if (Update != E) {
2411 // Merge the update into the ld/st.
2412 MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
2413 return true;
2414 }
2415
2416 // The immediate in the load/store is scaled by the size of the memory
2417 // operation. The immediate in the add we're looking for,
2418 // however, is not, so adjust here.
2419 int UnscaledOffset =
2420 AArch64InstrInfo::getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);
2421
2422 // Look forward to try to find a pre-index instruction. For example,
2423 // ldr x1, [x0, #64]
2424 // add x0, x0, #64
2425 // merged into:
2426 // ldr x1, [x0, #64]!
2427 Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
2428 if (Update != E) {
2429 // Merge the update into the ld/st.
2430 MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
2431 return true;
2432 }
2433
2434 return false;
2435 }
2436
2437 bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
2438 bool EnableNarrowZeroStOpt) {
2439
2440 bool Modified = false;
2441 // Four transformations to do here:
2442 // 1) Find loads that directly read from stores and promote them by
2443 // replacing with mov instructions. If the store is wider than the load,
2444 // the load will be replaced with a bitfield extract.
2445 // e.g.,
2446 // str w1, [x0, #4]
2447 // ldrh w2, [x0, #6]
2448 // ; becomes
2449 // str w1, [x0, #4]
2450 // lsr w2, w1, #16
2451 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2452 MBBI != E;) {
2453 if (isPromotableLoadFromStore(*MBBI) && tryToPromoteLoadFromStore(MBBI))
2454 Modified = true;
2455 else
2456 ++MBBI;
2457 }
2458 // 2) Merge adjacent zero stores into a wider store.
2459 // e.g.,
2460 // strh wzr, [x0]
2461 // strh wzr, [x0, #2]
2462 // ; becomes
2463 // str wzr, [x0]
2464 // e.g.,
2465 // str wzr, [x0]
2466 // str wzr, [x0, #4]
2467 // ; becomes
2468 // str xzr, [x0]
2469 if (EnableNarrowZeroStOpt)
2470 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2471 MBBI != E;) {
2472 if (isPromotableZeroStoreInst(*MBBI) && tryToMergeZeroStInst(MBBI))
2473 Modified = true;
2474 else
2475 ++MBBI;
2476 }
2477 // 3) Find loads and stores that can be merged into a single load or store
2478 // pair instruction.
2479 // e.g.,
2480 // ldr x0, [x2]
2481 // ldr x1, [x2, #8]
2482 // ; becomes
2483 // ldp x0, x1, [x2]
2484
2485 if (MBB.getParent()->getRegInfo().tracksLiveness()) {
2486 DefinedInBB.clear();
2487 DefinedInBB.addLiveIns(MBB);
2488 }
2489
2490 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2491 MBBI != E;) {
2492 // Track currently live registers up to this point, to help with
2493 // searching for a rename register on demand.
2494 updateDefinedRegisters(*MBBI, DefinedInBB, TRI);
2495 if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
2496 Modified = true;
2497 else
2498 ++MBBI;
2499 }
2500 // 4) Find base register updates that can be merged into the load or store
2501 // as a base-reg writeback.
2502 // e.g.,
2503 // ldr x0, [x2]
2504 // add x2, x2, #4
2505 // ; becomes
2506 // ldr x0, [x2], #4
2507 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2508 MBBI != E;) {
2509 if (isMergeableLdStUpdate(*MBBI) && tryToMergeLdStUpdate(MBBI))
2510 Modified = true;
2511 else
2512 ++MBBI;
2513 }
2514
2515 return Modified;
2516 }
2517
2518 bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
2519 if (skipFunction(Fn.getFunction()))
2520 return false;
2521
2522 Subtarget = &Fn.getSubtarget<AArch64Subtarget>();
2523 TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
2524 TRI = Subtarget->getRegisterInfo();
2525 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2526
2527 // Resize the modified and used register unit trackers. We do this once
2528 // per function and then clear the register units each time we optimize a load
2529 // or store.
2530 ModifiedRegUnits.init(*TRI);
2531 UsedRegUnits.init(*TRI);
2532 DefinedInBB.init(*TRI);
2533
2534 bool Modified = false;
2535 bool EnableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
2536 for (auto &MBB : Fn) {
2537 auto M = optimizeBlock(MBB, EnableNarrowZeroStOpt);
2538 Modified |= M;
2539 }
2540
2541 return Modified;
2542 }
2543
2544 // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
2545 // stores near one another? Note: The pre-RA instruction scheduler already has
2546 // hooks to try and schedule pairable loads/stores together to improve pairing
2547 // opportunities. Thus, a pre-RA pairing pass may not be worth the effort.
2548
2549 // FIXME: When pairing store instructions it's very possible for this pass to
2550 // hoist a store with a KILL marker above another use (without a KILL marker).
2551 // The resulting IR is invalid, but nothing uses the KILL markers after this
2552 // pass, so it's never caused a problem in practice.
2553
2554 /// createAArch64LoadStoreOptimizationPass - returns an instance of the
2555 /// load / store optimization pass.
2556 FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
2557 return new AArch64LoadStoreOpt();
2558 }
2559