1 //===-- ParallelSnippetGenerator.cpp ----------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "ParallelSnippetGenerator.h"
10 
11 #include "BenchmarkRunner.h"
12 #include "MCInstrDescView.h"
13 #include "Target.h"
14 
15 // FIXME: Load constants into registers (e.g. with fld1) to not break
16 // instructions like x87.
17 
18 // Ideally we would like the only limitation on executing instructions to be the
19 // availability of the CPU resources (e.g. execution ports) needed to execute
20 // them, instead of the availability of their data dependencies.
21 
22 // To achieve that, one approach is to generate instructions that do not have
23 // data dependencies between them.
24 //
25 // For some instructions, this is trivial:
26 //    mov rax, qword ptr [rsi]
27 //    mov rax, qword ptr [rsi]
28 //    mov rax, qword ptr [rsi]
29 //    mov rax, qword ptr [rsi]
30 // For the above snippet, haswell just renames rax four times and executes the
31 // four instructions two at a time on P23 and P0126.
32 //
33 // For some instructions, we just need to make sure that the source is
34 // different from the destination. For example, IDIV8r reads from GPR and
35 // writes to AX. We just need to ensure that the Var is assigned a
36 // register which is different from AX:
37 //    idiv bx
38 //    idiv bx
39 //    idiv bx
40 //    idiv bx
41 // The above snippet will be able to fully saturate the ports, while the same
42 // with ax would issue one uop every `latency(IDIV8r)` cycles.
43 //
44 // Some instructions make this harder because they both read and write from
45 // the same register:
46 //    inc rax
47 //    inc rax
48 //    inc rax
49 //    inc rax
// This has a data dependency from each instruction to the next, limiting the
// number of instructions that can be issued in parallel.
52 // It turns out that this is not a big issue on recent Intel CPUs because they
53 // have heuristics to balance port pressure. In the snippet above, subsequent
54 // instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
55 // might end up executing them all on P0 (just because they can), or try
56 // avoiding P5 because it's usually under high pressure from vector
57 // instructions.
58 // This issue is even more important for high-latency instructions because
59 // they increase the idle time of the CPU, e.g. :
60 //    imul rax, rbx
61 //    imul rax, rbx
62 //    imul rax, rbx
63 //    imul rax, rbx
64 //
65 // To avoid that, we do the renaming statically by generating as many
66 // independent exclusive assignments as possible (until all possible registers
67 // are exhausted) e.g.:
68 //    imul rax, rbx
69 //    imul rcx, rbx
70 //    imul rdx, rbx
71 //    imul r8,  rbx
72 //
// Some instructions even make the above static renaming impossible because
// they implicitly read from and write to the same operand, e.g. ADC16rr
// reads and writes EFLAGS.
76 // In that case we just use a greedy register assignment and hope for the
77 // best.
78 
79 namespace llvm {
80 namespace exegesis {
81 
82 static bool hasVariablesWithTiedOperands(const Instruction &Instr) {
83   SmallVector<const Variable *, 8> Result;
84   for (const auto &Var : Instr.Variables)
85     if (Var.hasTiedOperands())
86       return true;
87   return false;
88 }
89 
// Defaulted out-of-line rather than in the header — presumably so the
// destructor (and any vtable/key-function emission) is anchored in this
// translation unit; confirm against the class declaration in the header.
ParallelSnippetGenerator::~ParallelSnippetGenerator() = default;
91 
92 void ParallelSnippetGenerator::instantiateMemoryOperands(
93     const unsigned ScratchSpacePointerInReg,
94     std::vector<InstructionTemplate> &Instructions) const {
95   if (ScratchSpacePointerInReg == 0)
96     return; // no memory operands.
97   const auto &ET = State.getExegesisTarget();
98   const unsigned MemStep = ET.getMaxMemoryAccessSize();
99   const size_t OriginalInstructionsSize = Instructions.size();
100   size_t I = 0;
101   for (InstructionTemplate &IT : Instructions) {
102     ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
103     ++I;
104   }
105 
106   while (Instructions.size() < kMinNumDifferentAddresses) {
107     InstructionTemplate IT = Instructions[I % OriginalInstructionsSize];
108     ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
109     ++I;
110     Instructions.push_back(std::move(IT));
111   }
112   assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize &&
113          "not enough scratch space");
114 }
115 
// How registers are assigned to the explicit use operands of the generated
// instructions.  Iterated via enum_seq (see the enum_iteration_traits
// specialization below).
enum class RegRandomizationStrategy : uint8_t {
  // Pick a fresh random register for every operand of every instruction.
  PickRandomRegs,
  // Pick one register per operand position, reused across all instructions.
  SingleStaticRegPerOperand,
  // Pick a single register and reuse it for all use positions.
  SingleStaticReg,

  FIRST = PickRandomRegs,
  LAST = SingleStaticReg,
};
124 
125 } // namespace exegesis
126 
// Opt RegRandomizationStrategy into llvm::enum_seq-style iteration so the
// strategies can be enumerated generically.
template <> struct enum_iteration_traits<exegesis::RegRandomizationStrategy> {
  static constexpr bool is_iterable = true;
};
130 
131 namespace exegesis {
132 
133 const char *getDescription(RegRandomizationStrategy S) {
134   switch (S) {
135   case RegRandomizationStrategy::PickRandomRegs:
136     return "randomizing registers";
137   case RegRandomizationStrategy::SingleStaticRegPerOperand:
138     return "one unique register for each position";
139   case RegRandomizationStrategy::SingleStaticReg:
140     return "reusing the same register for all positions";
141   }
142   llvm_unreachable("Unknown UseRegRandomizationStrategy enum");
143 }
144 
// Chooses a value for the single explicit register operand `Op` of `IT` such
// that defs and uses across the snippet do not alias (which would serialize
// execution).  The result is one of:
//  - std::nullopt_t : no acceptable register remains; caller should stop,
//  - MCOperand      : reuse the value already assigned in a prior instruction,
//  - Register       : a freshly chosen register.
// `Uses`/`Defs` are the registers already picked for earlier operands;
// `Instructions` are the templates generated so far for this strategy.
static std::variant<std::nullopt_t, MCOperand, Register>
generateSingleRegisterForInstrAvoidingDefUseOverlap(
    const LLVMState &State, const BitVector &ForbiddenRegisters,
    const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases,
    const BitVector &Uses, const BitVector &Defs, const InstructionTemplate &IT,
    const Operand &Op, const ArrayRef<InstructionTemplate> Instructions,
    RegRandomizationStrategy S) {
  const Instruction &Instr = IT.getInstr();
  assert(Op.isReg() && Op.isExplicit() && !Op.isMemory() &&
         !IT.getValueFor(Op).isValid());
  assert((!Op.isUse() || !Op.isTied()) &&
         "Not expecting to see a tied use reg");

  if (Op.isUse()) {
    switch (S) {
    case RegRandomizationStrategy::PickRandomRegs:
      break;
    case RegRandomizationStrategy::SingleStaticReg:
    case RegRandomizationStrategy::SingleStaticRegPerOperand: {
      // Static strategies keep each operand position stable across all
      // instructions: reuse whatever the first instruction was assigned.
      if (!Instructions.empty())
        return Instructions.front().getValueFor(Op);
      if (S != RegRandomizationStrategy::SingleStaticReg)
        break;
      // SingleStaticReg only: prefer a register that some already-assigned
      // use operand aliases, so all use positions share one register.
      // NOTE(review): ForbiddenRegisters is not removed from
      // PossibleRegisters on this path (unlike the fallback below) —
      // confirm this is intentional.
      BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits();
      const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
      if (std::optional<int> CommonBit =
              getFirstCommonBit(PossibleRegisters, UseAliases))
        return *CommonBit;
      break;
    }
    }
  }

  // Fallback: pick a random register among those that do not create a
  // def/use overlap.
  BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits();
  remove(PossibleRegisters, ForbiddenRegisters);

  if (Op.isDef()) {
    // A def must not alias any implicit use or any use already chosen,
    // otherwise the next instruction would depend on this one.
    remove(PossibleRegisters, ImplicitUseAliases);
    const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
    remove(PossibleRegisters, UseAliases);
  }

  if (Op.isUse()) {
    remove(PossibleRegisters, ImplicitDefAliases);
    // NOTE: in general, using same reg for multiple Use's is fine.
    if (S == RegRandomizationStrategy::SingleStaticRegPerOperand) {
      // ... but this strategy wants each operand position to get its own
      // distinct register, so exclude the ones already used.
      const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
      remove(PossibleRegisters, UseAliases);
    }
  }

  bool IsDefWithTiedUse =
      Instr.Variables[Op.getVariableIndex()].hasTiedOperands();
  if (Op.isUse() || IsDefWithTiedUse) {
    // Now, important bit: if we have used some register for def,
    // then we can not use that same register for *any* use,
    // be it either an untied use, or an use tied to a def.
    // But def-ing same regs is fine, as long as there are no uses!
    const BitVector DefsAliases = getAliasedBits(State.getRegInfo(), Defs);
    remove(PossibleRegisters, DefsAliases);
  }

  // Everything was filtered out: signal failure to the caller.
  if (!PossibleRegisters.any())
    return std::nullopt;

  return randomBit(PossibleRegisters);
}
212 
213 static std::optional<InstructionTemplate>
214 generateSingleSnippetForInstrAvoidingDefUseOverlap(
215     const LLVMState &State, const BitVector &ForbiddenRegisters,
216     const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases,
217     BitVector &Uses, BitVector &Defs, InstructionTemplate IT,
218     const ArrayRef<InstructionTemplate> Instructions,
219     RegRandomizationStrategy S) {
220   const Instruction &Instr = IT.getInstr();
221   for (const Operand &Op : Instr.Operands) {
222     if (!Op.isReg() || !Op.isExplicit() || Op.isMemory() ||
223         IT.getValueFor(Op).isValid())
224       continue;
225     assert((!Op.isUse() || !Op.isTied()) && "Will not get tied uses.");
226 
227     std::variant<std::nullopt_t, MCOperand, Register> R =
228         generateSingleRegisterForInstrAvoidingDefUseOverlap(
229             State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases,
230             Uses, Defs, IT, Op, Instructions, S);
231 
232     if (std::holds_alternative<std::nullopt_t>(R))
233       return {};
234 
235     MCOperand MCOp;
236     if (std::holds_alternative<MCOperand>(R))
237       MCOp = std::get<MCOperand>(R);
238     else {
239       Register RandomReg = std::get<Register>(R);
240       if (Op.isDef())
241         Defs.set(RandomReg);
242       if (Op.isUse())
243         Uses.set(RandomReg);
244       MCOp = MCOperand::createReg(RandomReg);
245     }
246     IT.getValueFor(Op) = MCOp;
247   }
248   return IT;
249 }
250 
// Generates as many mutually-independent instances of `IT` as the register
// file allows under strategy `S`.  For instructions without tied operands a
// single instance is produced; otherwise instances are generated until no
// conflict-free register assignment remains.
static std::vector<InstructionTemplate>
generateSnippetForInstrAvoidingDefUseOverlap(
    const LLVMState &State, const InstructionTemplate &IT,
    RegRandomizationStrategy S, const BitVector &ForbiddenRegisters) {
  // We don't want to accidentally serialize the instruction,
  // so we must be sure that we don't pick a def that is an implicit use,
  // or a use that is an implicit def, so record implicit regs now.
  BitVector ImplicitUses(State.getRegInfo().getNumRegs());
  BitVector ImplicitDefs(State.getRegInfo().getNumRegs());
  for (const auto &Op : IT.getInstr().Operands) {
    if (Op.isReg() && Op.isImplicit() && !Op.isMemory()) {
      assert(Op.isImplicitReg() && "Not an implicit register operand?");
      if (Op.isUse())
        ImplicitUses.set(Op.getImplicitReg());
      else {
        assert(Op.isDef() && "Not a use and not a def?");
        ImplicitDefs.set(Op.getImplicitReg());
      }
    }
  }
  // Expand implicit regs to everything they alias (sub/super registers).
  const BitVector ImplicitUseAliases =
      getAliasedBits(State.getRegInfo(), ImplicitUses);
  const BitVector ImplicitDefAliases =
      getAliasedBits(State.getRegInfo(), ImplicitDefs);

  // Registers picked so far across all generated instructions.
  BitVector Defs(State.getRegInfo().getNumRegs());
  BitVector Uses(State.getRegInfo().getNumRegs());
  std::vector<InstructionTemplate> Instructions;

  while (true) {
    std::optional<InstructionTemplate> TmpIT =
        generateSingleSnippetForInstrAvoidingDefUseOverlap(
            State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases,
            Uses, Defs, IT, Instructions, S);
    // Register file exhausted: return whatever we managed to produce
    // (possibly empty — the caller treats that as failure).
    if (!TmpIT)
      return Instructions;
    Instructions.push_back(std::move(*TmpIT));
    // Without tied operands one instance suffices; renaming happens in HW.
    if (!hasVariablesWithTiedOperands(IT.getInstr()))
      return Instructions;
    assert(Instructions.size() <= 128 && "Stuck in endless loop?");
  }
}
293 
// Builds the candidate code templates for `Variant`: a single repeated
// instruction when the instruction is self-parallel (or unavoidably serial
// via implicit operands), otherwise one template per applicable register
// randomization strategy.  Returns an error if no strategy yields a snippet.
Expected<std::vector<CodeTemplate>>
ParallelSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();
  CodeTemplate CT;
  // Reserve a register to point at scratch memory iff the instruction
  // actually has memory operands (0 = none).
  CT.ScratchSpacePointerInReg =
      Instr.hasMemoryOperands()
          ? State.getExegesisTarget().getScratchMemoryRegister(
                State.getTargetMachine().getTargetTriple())
          : 0;
  const AliasingConfigurations SelfAliasing(Instr, Instr, ForbiddenRegisters);
  // No self-aliasing: repeating the instruction as-is is already parallel.
  if (SelfAliasing.empty()) {
    CT.Info = "instruction is parallel, repeating a random one.";
    CT.Instructions.push_back(std::move(Variant));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  // Implicit aliasing cannot be avoided by register choice; accept serial.
  if (SelfAliasing.hasImplicitAliasing()) {
    CT.Info = "instruction is serial, repeating a random one.";
    CT.Instructions.push_back(std::move(Variant));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  std::vector<CodeTemplate> Result;
  bool HasTiedOperands = hasVariablesWithTiedOperands(Instr);
  // If there are no tied operands, then we don't want to "saturate backedge",
  // and the template we will produce will have only a single instruction.
  unsigned NumUntiedUseRegs = count_if(Instr.Operands, [](const Operand &Op) {
    return Op.isReg() && Op.isExplicit() && !Op.isMemory() && Op.isUse() &&
           !Op.isTied();
  });
  // Pick the strategies worth trying for this operand mix; SingleStaticReg
  // is always applicable, the others need enough untied use registers (or
  // tied operands) to be meaningfully different.
  SmallVector<RegRandomizationStrategy, 3> Strategies;
  if (HasTiedOperands || NumUntiedUseRegs >= 3)
    Strategies.push_back(RegRandomizationStrategy::PickRandomRegs);
  if (NumUntiedUseRegs >= 2)
    Strategies.push_back(RegRandomizationStrategy::SingleStaticRegPerOperand);
  Strategies.push_back(RegRandomizationStrategy::SingleStaticReg);
  for (RegRandomizationStrategy S : Strategies) {
    CodeTemplate CurrCT = CT.clone();
    CurrCT.Info =
        Twine("instruction has ")
            .concat(HasTiedOperands ? "" : "no ")
            .concat("tied variables, avoiding "
                    "Read-After-Write issue, picking random def and use "
                    "registers not aliasing each other, for uses, ")
            .concat(getDescription(S))
            .str();
    CurrCT.Instructions = generateSnippetForInstrAvoidingDefUseOverlap(
        State, Variant, S, ForbiddenRegisters);
    if (CurrCT.Instructions.empty())
      return make_error<StringError>(
          Twine("Failed to produce any snippet via: ").concat(CurrCT.Info),
          inconvertibleErrorCode());
    instantiateMemoryOperands(CurrCT.ScratchSpacePointerInReg,
                              CurrCT.Instructions);
    Result.push_back(std::move(CurrCT));
  }
  return Result;
}
353 
// Out-of-line definition of the class-scope constant.  NOTE(review): under
// C++17 a static constexpr data member is implicitly inline, making this
// definition redundant (though harmless); confirm the project's minimum
// language standard before removing.
constexpr const size_t ParallelSnippetGenerator::kMinNumDifferentAddresses;
355 
356 } // namespace exegesis
357 } // namespace llvm
358