1 //===-- Uops.cpp ------------------------------------------------*- C++ -*-===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9
10 #include "Uops.h"
11
12 #include "Assembler.h"
13 #include "BenchmarkRunner.h"
14 #include "MCInstrDescView.h"
15 #include "PerfHelper.h"
16
17 // FIXME: Load constants into registers (e.g. with fld1) to not break
18 // instructions like x87.
19
20 // Ideally we would like the only limitation on executing uops to be the issue
21 // ports. Maximizing port pressure increases the likelihood that the load is
22 // distributed evenly across possible ports.
23
24 // To achieve that, one approach is to generate instructions that do not have
25 // data dependencies between them.
26 //
27 // For some instructions, this is trivial:
28 // mov rax, qword ptr [rsi]
29 // mov rax, qword ptr [rsi]
30 // mov rax, qword ptr [rsi]
31 // mov rax, qword ptr [rsi]
32 // For the above snippet, haswell just renames rax four times and executes the
33 // four instructions two at a time on P23 and P0126.
34 //
35 // For some instructions, we just need to make sure that the source is
36 // different from the destination. For example, IDIV8r reads from GPR and
37 // writes to AX. We just need to ensure that the Var is assigned a
38 // register which is different from AX:
39 // idiv bx
40 // idiv bx
41 // idiv bx
42 // idiv bx
43 // The above snippet will be able to fully saturate the ports, while the same
44 // with ax would issue one uop every `latency(IDIV8r)` cycles.
45 //
46 // Some instructions make this harder because they both read and write from
47 // the same register:
48 // inc rax
49 // inc rax
50 // inc rax
51 // inc rax
52 // This has a data dependency from each instruction to the next, limit the
53 // number of instructions that can be issued in parallel.
54 // It turns out that this is not a big issue on recent Intel CPUs because they
55 // have heuristics to balance port pressure. In the snippet above, subsequent
56 // instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
57 // might end up executing them all on P0 (just because they can), or try
58 // avoiding P5 because it's usually under high pressure from vector
59 // instructions.
60 // This issue is even more important for high-latency instructions because
61 // they increase the idle time of the CPU, e.g. :
62 // imul rax, rbx
63 // imul rax, rbx
64 // imul rax, rbx
65 // imul rax, rbx
66 //
67 // To avoid that, we do the renaming statically by generating as many
68 // independent exclusive assignments as possible (until all possible registers
69 // are exhausted) e.g.:
70 // imul rax, rbx
71 // imul rcx, rbx
72 // imul rdx, rbx
73 // imul r8, rbx
74 //
75 // Some instruction even make the above static renaming impossible because
76 // they implicitly read and write from the same operand, e.g. ADC16rr reads
77 // and writes from EFLAGS.
78 // In that case we just use a greedy register assignment and hope for the
79 // best.
80
81 namespace exegesis {
82
hasUnknownOperand(const llvm::MCOperandInfo & OpInfo)83 static bool hasUnknownOperand(const llvm::MCOperandInfo &OpInfo) {
84 return OpInfo.OperandType == llvm::MCOI::OPERAND_UNKNOWN;
85 }
86
87 // FIXME: Handle memory, see PR36905.
hasMemoryOperand(const llvm::MCOperandInfo & OpInfo)88 static bool hasMemoryOperand(const llvm::MCOperandInfo &OpInfo) {
89 return OpInfo.OperandType == llvm::MCOI::OPERAND_MEMORY;
90 }
91
92 llvm::Error
isInfeasible(const llvm::MCInstrDesc & MCInstrDesc) const93 UopsBenchmarkRunner::isInfeasible(const llvm::MCInstrDesc &MCInstrDesc) const {
94 if (llvm::any_of(MCInstrDesc.operands(), hasUnknownOperand))
95 return llvm::make_error<BenchmarkFailure>(
96 "Infeasible : has unknown operands");
97 if (llvm::any_of(MCInstrDesc.operands(), hasMemoryOperand))
98 return llvm::make_error<BenchmarkFailure>(
99 "Infeasible : has memory operands");
100 return llvm::Error::success();
101 }
102
103 // Returns whether this Variable ties Use and Def operands together.
hasTiedOperands(const Instruction & Instr,const Variable & Var)104 static bool hasTiedOperands(const Instruction &Instr, const Variable &Var) {
105 bool HasUse = false;
106 bool HasDef = false;
107 for (const unsigned OpIndex : Var.TiedOperands) {
108 const Operand &Op = Instr.Operands[OpIndex];
109 if (Op.IsDef)
110 HasDef = true;
111 else
112 HasUse = true;
113 }
114 return HasUse && HasDef;
115 }
116
117 static llvm::SmallVector<const Variable *, 8>
getTiedVariables(const Instruction & Instr)118 getTiedVariables(const Instruction &Instr) {
119 llvm::SmallVector<const Variable *, 8> Result;
120 for (const auto &Var : Instr.Variables)
121 if (hasTiedOperands(Instr, Var))
122 Result.push_back(&Var);
123 return Result;
124 }
125
remove(llvm::BitVector & a,const llvm::BitVector & b)126 static void remove(llvm::BitVector &a, const llvm::BitVector &b) {
127 assert(a.size() == b.size());
128 for (auto I : b.set_bits())
129 a.reset(I);
130 }
131
132 UopsBenchmarkRunner::~UopsBenchmarkRunner() = default;
133
134 llvm::Expected<SnippetPrototype>
generatePrototype(unsigned Opcode) const135 UopsBenchmarkRunner::generatePrototype(unsigned Opcode) const {
136 const auto &InstrDesc = State.getInstrInfo().get(Opcode);
137 if (auto E = isInfeasible(InstrDesc))
138 return std::move(E);
139 const Instruction Instr(InstrDesc, RATC);
140 const AliasingConfigurations SelfAliasing(Instr, Instr);
141 if (SelfAliasing.empty()) {
142 return generateUnconstrainedPrototype(Instr, "instruction is parallel");
143 }
144 if (SelfAliasing.hasImplicitAliasing()) {
145 return generateUnconstrainedPrototype(Instr, "instruction is serial");
146 }
147 const auto TiedVariables = getTiedVariables(Instr);
148 if (!TiedVariables.empty()) {
149 if (TiedVariables.size() > 1)
150 return llvm::make_error<llvm::StringError>(
151 "Infeasible : don't know how to handle several tied variables",
152 llvm::inconvertibleErrorCode());
153 const Variable *Var = TiedVariables.front();
154 assert(Var);
155 assert(!Var->TiedOperands.empty());
156 const Operand &Op = Instr.Operands[Var->TiedOperands.front()];
157 assert(Op.Tracker);
158 SnippetPrototype Prototype;
159 Prototype.Explanation =
160 "instruction has tied variables using static renaming.";
161 for (const llvm::MCPhysReg Reg : Op.Tracker->sourceBits().set_bits()) {
162 Prototype.Snippet.emplace_back(Instr);
163 Prototype.Snippet.back().getValueFor(*Var) =
164 llvm::MCOperand::createReg(Reg);
165 }
166 return std::move(Prototype);
167 }
168 InstructionInstance II(Instr);
169 // No tied variables, we pick random values for defs.
170 llvm::BitVector Defs(State.getRegInfo().getNumRegs());
171 for (const auto &Op : Instr.Operands) {
172 if (Op.Tracker && Op.IsExplicit && Op.IsDef) {
173 auto PossibleRegisters = Op.Tracker->sourceBits();
174 remove(PossibleRegisters, RATC.reservedRegisters());
175 assert(PossibleRegisters.any() && "No register left to choose from");
176 const auto RandomReg = randomBit(PossibleRegisters);
177 Defs.set(RandomReg);
178 II.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
179 }
180 }
181 // And pick random use values that are not reserved and don't alias with defs.
182 const auto DefAliases = getAliasedBits(State.getRegInfo(), Defs);
183 for (const auto &Op : Instr.Operands) {
184 if (Op.Tracker && Op.IsExplicit && !Op.IsDef) {
185 auto PossibleRegisters = Op.Tracker->sourceBits();
186 remove(PossibleRegisters, RATC.reservedRegisters());
187 remove(PossibleRegisters, DefAliases);
188 assert(PossibleRegisters.any() && "No register left to choose from");
189 const auto RandomReg = randomBit(PossibleRegisters);
190 II.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
191 }
192 }
193 SnippetPrototype Prototype;
194 Prototype.Explanation =
195 "instruction has no tied variables picking Uses different from defs";
196 Prototype.Snippet.push_back(std::move(II));
197 return std::move(Prototype);
198 }
199
200 std::vector<BenchmarkMeasure>
runMeasurements(const ExecutableFunction & Function,const unsigned NumRepetitions) const201 UopsBenchmarkRunner::runMeasurements(const ExecutableFunction &Function,
202 const unsigned NumRepetitions) const {
203 const auto &SchedModel = State.getSubtargetInfo().getSchedModel();
204
205 std::vector<BenchmarkMeasure> Result;
206 for (unsigned ProcResIdx = 1;
207 ProcResIdx < SchedModel.getNumProcResourceKinds(); ++ProcResIdx) {
208 const char *const PfmCounters = SchedModel.getExtraProcessorInfo()
209 .PfmCounters.IssueCounters[ProcResIdx];
210 if (!PfmCounters)
211 continue;
212 // We sum counts when there are several counters for a single ProcRes
213 // (e.g. P23 on SandyBridge).
214 int64_t CounterValue = 0;
215 llvm::SmallVector<llvm::StringRef, 2> CounterNames;
216 llvm::StringRef(PfmCounters).split(CounterNames, ',');
217 for (const auto &CounterName : CounterNames) {
218 pfm::PerfEvent UopPerfEvent(CounterName);
219 if (!UopPerfEvent.valid())
220 llvm::report_fatal_error(
221 llvm::Twine("invalid perf event ").concat(PfmCounters));
222 pfm::Counter Counter(UopPerfEvent);
223 Counter.start();
224 Function();
225 Counter.stop();
226 CounterValue += Counter.read();
227 }
228 Result.push_back({llvm::itostr(ProcResIdx),
229 static_cast<double>(CounterValue) / NumRepetitions,
230 SchedModel.getProcResource(ProcResIdx)->Name});
231 }
232 return Result;
233 }
234
235 } // namespace exegesis
236