1 //===-- Uops.cpp ------------------------------------------------*- C++ -*-===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 
10 #include "Uops.h"
11 
12 #include "Assembler.h"
13 #include "BenchmarkRunner.h"
14 #include "MCInstrDescView.h"
15 #include "PerfHelper.h"
16 
17 // FIXME: Load constants into registers (e.g. with fld1) to not break
18 // instructions like x87.
19 
20 // Ideally we would like the only limitation on executing uops to be the issue
21 // ports. Maximizing port pressure increases the likelihood that the load is
22 // distributed evenly across possible ports.
23 
24 // To achieve that, one approach is to generate instructions that do not have
25 // data dependencies between them.
26 //
27 // For some instructions, this is trivial:
28 //    mov rax, qword ptr [rsi]
29 //    mov rax, qword ptr [rsi]
30 //    mov rax, qword ptr [rsi]
31 //    mov rax, qword ptr [rsi]
32 // For the above snippet, haswell just renames rax four times and executes the
33 // four instructions two at a time on P23 and P0126.
34 //
35 // For some instructions, we just need to make sure that the source is
36 // different from the destination. For example, IDIV8r reads from GPR and
37 // writes to AX. We just need to ensure that the Var is assigned a
38 // register which is different from AX:
39 //    idiv bx
40 //    idiv bx
41 //    idiv bx
42 //    idiv bx
43 // The above snippet will be able to fully saturate the ports, while the same
44 // with ax would issue one uop every `latency(IDIV8r)` cycles.
45 //
46 // Some instructions make this harder because they both read and write from
47 // the same register:
48 //    inc rax
49 //    inc rax
50 //    inc rax
51 //    inc rax
52 // This has a data dependency from each instruction to the next, limit the
53 // number of instructions that can be issued in parallel.
54 // It turns out that this is not a big issue on recent Intel CPUs because they
55 // have heuristics to balance port pressure. In the snippet above, subsequent
56 // instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
57 // might end up executing them all on P0 (just because they can), or try
58 // avoiding P5 because it's usually under high pressure from vector
59 // instructions.
60 // This issue is even more important for high-latency instructions because
61 // they increase the idle time of the CPU, e.g. :
62 //    imul rax, rbx
63 //    imul rax, rbx
64 //    imul rax, rbx
65 //    imul rax, rbx
66 //
67 // To avoid that, we do the renaming statically by generating as many
68 // independent exclusive assignments as possible (until all possible registers
69 // are exhausted) e.g.:
70 //    imul rax, rbx
71 //    imul rcx, rbx
72 //    imul rdx, rbx
73 //    imul r8,  rbx
74 //
75 // Some instruction even make the above static renaming impossible because
76 // they implicitly read and write from the same operand, e.g. ADC16rr reads
77 // and writes from EFLAGS.
78 // In that case we just use a greedy register assignment and hope for the
79 // best.
80 
81 namespace exegesis {
82 
hasUnknownOperand(const llvm::MCOperandInfo & OpInfo)83 static bool hasUnknownOperand(const llvm::MCOperandInfo &OpInfo) {
84   return OpInfo.OperandType == llvm::MCOI::OPERAND_UNKNOWN;
85 }
86 
87 // FIXME: Handle memory, see PR36905.
hasMemoryOperand(const llvm::MCOperandInfo & OpInfo)88 static bool hasMemoryOperand(const llvm::MCOperandInfo &OpInfo) {
89   return OpInfo.OperandType == llvm::MCOI::OPERAND_MEMORY;
90 }
91 
92 llvm::Error
isInfeasible(const llvm::MCInstrDesc & MCInstrDesc) const93 UopsBenchmarkRunner::isInfeasible(const llvm::MCInstrDesc &MCInstrDesc) const {
94   if (llvm::any_of(MCInstrDesc.operands(), hasUnknownOperand))
95     return llvm::make_error<BenchmarkFailure>(
96         "Infeasible : has unknown operands");
97   if (llvm::any_of(MCInstrDesc.operands(), hasMemoryOperand))
98     return llvm::make_error<BenchmarkFailure>(
99         "Infeasible : has memory operands");
100   return llvm::Error::success();
101 }
102 
103 // Returns whether this Variable ties Use and Def operands together.
hasTiedOperands(const Instruction & Instr,const Variable & Var)104 static bool hasTiedOperands(const Instruction &Instr, const Variable &Var) {
105   bool HasUse = false;
106   bool HasDef = false;
107   for (const unsigned OpIndex : Var.TiedOperands) {
108     const Operand &Op = Instr.Operands[OpIndex];
109     if (Op.IsDef)
110       HasDef = true;
111     else
112       HasUse = true;
113   }
114   return HasUse && HasDef;
115 }
116 
117 static llvm::SmallVector<const Variable *, 8>
getTiedVariables(const Instruction & Instr)118 getTiedVariables(const Instruction &Instr) {
119   llvm::SmallVector<const Variable *, 8> Result;
120   for (const auto &Var : Instr.Variables)
121     if (hasTiedOperands(Instr, Var))
122       Result.push_back(&Var);
123   return Result;
124 }
125 
remove(llvm::BitVector & a,const llvm::BitVector & b)126 static void remove(llvm::BitVector &a, const llvm::BitVector &b) {
127   assert(a.size() == b.size());
128   for (auto I : b.set_bits())
129     a.reset(I);
130 }
131 
132 UopsBenchmarkRunner::~UopsBenchmarkRunner() = default;
133 
134 llvm::Expected<SnippetPrototype>
generatePrototype(unsigned Opcode) const135 UopsBenchmarkRunner::generatePrototype(unsigned Opcode) const {
136   const auto &InstrDesc = State.getInstrInfo().get(Opcode);
137   if (auto E = isInfeasible(InstrDesc))
138     return std::move(E);
139   const Instruction Instr(InstrDesc, RATC);
140   const AliasingConfigurations SelfAliasing(Instr, Instr);
141   if (SelfAliasing.empty()) {
142     return generateUnconstrainedPrototype(Instr, "instruction is parallel");
143   }
144   if (SelfAliasing.hasImplicitAliasing()) {
145     return generateUnconstrainedPrototype(Instr, "instruction is serial");
146   }
147   const auto TiedVariables = getTiedVariables(Instr);
148   if (!TiedVariables.empty()) {
149     if (TiedVariables.size() > 1)
150       return llvm::make_error<llvm::StringError>(
151           "Infeasible : don't know how to handle several tied variables",
152           llvm::inconvertibleErrorCode());
153     const Variable *Var = TiedVariables.front();
154     assert(Var);
155     assert(!Var->TiedOperands.empty());
156     const Operand &Op = Instr.Operands[Var->TiedOperands.front()];
157     assert(Op.Tracker);
158     SnippetPrototype Prototype;
159     Prototype.Explanation =
160         "instruction has tied variables using static renaming.";
161     for (const llvm::MCPhysReg Reg : Op.Tracker->sourceBits().set_bits()) {
162       Prototype.Snippet.emplace_back(Instr);
163       Prototype.Snippet.back().getValueFor(*Var) =
164           llvm::MCOperand::createReg(Reg);
165     }
166     return std::move(Prototype);
167   }
168   InstructionInstance II(Instr);
169   // No tied variables, we pick random values for defs.
170   llvm::BitVector Defs(State.getRegInfo().getNumRegs());
171   for (const auto &Op : Instr.Operands) {
172     if (Op.Tracker && Op.IsExplicit && Op.IsDef) {
173       auto PossibleRegisters = Op.Tracker->sourceBits();
174       remove(PossibleRegisters, RATC.reservedRegisters());
175       assert(PossibleRegisters.any() && "No register left to choose from");
176       const auto RandomReg = randomBit(PossibleRegisters);
177       Defs.set(RandomReg);
178       II.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
179     }
180   }
181   // And pick random use values that are not reserved and don't alias with defs.
182   const auto DefAliases = getAliasedBits(State.getRegInfo(), Defs);
183   for (const auto &Op : Instr.Operands) {
184     if (Op.Tracker && Op.IsExplicit && !Op.IsDef) {
185       auto PossibleRegisters = Op.Tracker->sourceBits();
186       remove(PossibleRegisters, RATC.reservedRegisters());
187       remove(PossibleRegisters, DefAliases);
188       assert(PossibleRegisters.any() && "No register left to choose from");
189       const auto RandomReg = randomBit(PossibleRegisters);
190       II.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
191     }
192   }
193   SnippetPrototype Prototype;
194   Prototype.Explanation =
195       "instruction has no tied variables picking Uses different from defs";
196   Prototype.Snippet.push_back(std::move(II));
197   return std::move(Prototype);
198 }
199 
200 std::vector<BenchmarkMeasure>
runMeasurements(const ExecutableFunction & Function,const unsigned NumRepetitions) const201 UopsBenchmarkRunner::runMeasurements(const ExecutableFunction &Function,
202                                      const unsigned NumRepetitions) const {
203   const auto &SchedModel = State.getSubtargetInfo().getSchedModel();
204 
205   std::vector<BenchmarkMeasure> Result;
206   for (unsigned ProcResIdx = 1;
207        ProcResIdx < SchedModel.getNumProcResourceKinds(); ++ProcResIdx) {
208     const char *const PfmCounters = SchedModel.getExtraProcessorInfo()
209                                         .PfmCounters.IssueCounters[ProcResIdx];
210     if (!PfmCounters)
211       continue;
212     // We sum counts when there are several counters for a single ProcRes
213     // (e.g. P23 on SandyBridge).
214     int64_t CounterValue = 0;
215     llvm::SmallVector<llvm::StringRef, 2> CounterNames;
216     llvm::StringRef(PfmCounters).split(CounterNames, ',');
217     for (const auto &CounterName : CounterNames) {
218       pfm::PerfEvent UopPerfEvent(CounterName);
219       if (!UopPerfEvent.valid())
220         llvm::report_fatal_error(
221             llvm::Twine("invalid perf event ").concat(PfmCounters));
222       pfm::Counter Counter(UopPerfEvent);
223       Counter.start();
224       Function();
225       Counter.stop();
226       CounterValue += Counter.read();
227     }
228     Result.push_back({llvm::itostr(ProcResIdx),
229                       static_cast<double>(CounterValue) / NumRepetitions,
230                       SchedModel.getProcResource(ProcResIdx)->Name});
231   }
232   return Result;
233 }
234 
235 } // namespace exegesis
236