//===-- Target.cpp ----------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "../Target.h"

#include "../Error.h"
#include "../ParallelSnippetGenerator.h"
#include "../SerialSnippetGenerator.h"
#include "../SnippetGenerator.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "X86.h"
#include "X86Counter.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/Host.h"

#include <memory>
#include <string>
#include <vector>
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
#include <immintrin.h>
#include <intrin.h>
#endif

namespace llvm {
namespace exegesis {

static cl::OptionCategory
    BenchmarkOptions("llvm-exegesis benchmark x86-options");

// If a positive value is specified, we are going to use the LBR in
// latency-mode.
//
// Note:
//  - A small value is preferred, but too low a value could result in
//    throttling.
//  - A prime number is preferred to avoid always skipping certain blocks.
//
static cl::opt<unsigned> LbrSamplingPeriod(
    "x86-lbr-sample-period",
    cl::desc("The sample period (nbranches/sample), used for LBR sampling"),
    cl::cat(BenchmarkOptions), cl::init(0));
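
// A hypothetical invocation using LBR-based latency measurement might look
// like this (flag values are illustrative only):
//   llvm-exegesis -mode=latency -repetition-mode=loop \
//     -x86-lbr-sample-period=199 -opcode-name=ADD64rr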

// FIXME: Validate that repetition-mode is loop if LBR is requested.

// Returns a non-null reason if we cannot handle the memory references in this
// instruction.
static const char *isInvalidMemoryInstr(const Instruction &Instr) {
  switch (Instr.Description.TSFlags & X86II::FormMask) {
  default:
    return "Unknown FormMask value";
  // These have no memory access.
  case X86II::Pseudo:
  case X86II::RawFrm:
  case X86II::AddCCFrm:
  case X86II::PrefixByte:
  case X86II::MRMDestReg:
  case X86II::MRMSrcReg:
  case X86II::MRMSrcReg4VOp3:
  case X86II::MRMSrcRegOp4:
  case X86II::MRMSrcRegCC:
  case X86II::MRMXrCC:
  case X86II::MRMr0:
  case X86II::MRMXr:
  case X86II::MRM0r:
  case X86II::MRM1r:
  case X86II::MRM2r:
  case X86II::MRM3r:
  case X86II::MRM4r:
  case X86II::MRM5r:
  case X86II::MRM6r:
  case X86II::MRM7r:
  case X86II::MRM0X:
  case X86II::MRM1X:
  case X86II::MRM2X:
  case X86II::MRM3X:
  case X86II::MRM4X:
  case X86II::MRM5X:
  case X86II::MRM6X:
  case X86II::MRM7X:
  case X86II::MRM_C0:
  case X86II::MRM_C1:
  case X86II::MRM_C2:
  case X86II::MRM_C3:
  case X86II::MRM_C4:
  case X86II::MRM_C5:
  case X86II::MRM_C6:
  case X86II::MRM_C7:
  case X86II::MRM_C8:
  case X86II::MRM_C9:
  case X86II::MRM_CA:
  case X86II::MRM_CB:
  case X86II::MRM_CC:
  case X86II::MRM_CD:
  case X86II::MRM_CE:
  case X86II::MRM_CF:
  case X86II::MRM_D0:
  case X86II::MRM_D1:
  case X86II::MRM_D2:
  case X86II::MRM_D3:
  case X86II::MRM_D4:
  case X86II::MRM_D5:
  case X86II::MRM_D6:
  case X86II::MRM_D7:
  case X86II::MRM_D8:
  case X86II::MRM_D9:
  case X86II::MRM_DA:
  case X86II::MRM_DB:
  case X86II::MRM_DC:
  case X86II::MRM_DD:
  case X86II::MRM_DE:
  case X86II::MRM_DF:
  case X86II::MRM_E0:
  case X86II::MRM_E1:
  case X86II::MRM_E2:
  case X86II::MRM_E3:
  case X86II::MRM_E4:
  case X86II::MRM_E5:
  case X86II::MRM_E6:
  case X86II::MRM_E7:
  case X86II::MRM_E8:
  case X86II::MRM_E9:
  case X86II::MRM_EA:
  case X86II::MRM_EB:
  case X86II::MRM_EC:
  case X86II::MRM_ED:
  case X86II::MRM_EE:
  case X86II::MRM_EF:
  case X86II::MRM_F0:
  case X86II::MRM_F1:
  case X86II::MRM_F2:
  case X86II::MRM_F3:
  case X86II::MRM_F4:
  case X86II::MRM_F5:
  case X86II::MRM_F6:
  case X86II::MRM_F7:
  case X86II::MRM_F8:
  case X86II::MRM_F9:
  case X86II::MRM_FA:
  case X86II::MRM_FB:
  case X86II::MRM_FC:
  case X86II::MRM_FD:
  case X86II::MRM_FE:
  case X86II::MRM_FF:
  case X86II::RawFrmImm8:
    return nullptr;
  case X86II::AddRegFrm:
    return (Instr.Description.Opcode == X86::POP16r ||
            Instr.Description.Opcode == X86::POP32r ||
            Instr.Description.Opcode == X86::PUSH16r ||
            Instr.Description.Opcode == X86::PUSH32r)
               ? "unsupported opcode: unsupported memory access"
               : nullptr;
  // These access memory and are handled.
  case X86II::MRMDestMem:
  case X86II::MRMSrcMem:
  case X86II::MRMSrcMem4VOp3:
  case X86II::MRMSrcMemOp4:
  case X86II::MRMSrcMemCC:
  case X86II::MRMXmCC:
  case X86II::MRMXm:
  case X86II::MRM0m:
  case X86II::MRM1m:
  case X86II::MRM2m:
  case X86II::MRM3m:
  case X86II::MRM4m:
  case X86II::MRM5m:
  case X86II::MRM6m:
  case X86II::MRM7m:
    return nullptr;
  // These access memory and are not handled yet.
  case X86II::RawFrmImm16:
  case X86II::RawFrmMemOffs:
  case X86II::RawFrmSrc:
  case X86II::RawFrmDst:
  case X86II::RawFrmDstSrc:
    return "unsupported opcode: non uniform memory access";
  }
}

// If the opcode is invalid, returns a pointer to a string literal indicating
// the reason. nullptr indicates a valid opcode.
static const char *isInvalidOpcode(const Instruction &Instr) {
  const auto OpcodeName = Instr.Name;
  if ((Instr.Description.TSFlags & X86II::FormMask) == X86II::Pseudo)
    return "unsupported opcode: pseudo instruction";
  if ((OpcodeName.startswith("POP") && !OpcodeName.startswith("POPCNT")) ||
      OpcodeName.startswith("PUSH") || OpcodeName.startswith("ADJCALLSTACK") ||
      OpcodeName.startswith("LEAVE"))
    return "unsupported opcode: Push/Pop/AdjCallStack/Leave";
  switch (Instr.Description.Opcode) {
  case X86::LFS16rm:
  case X86::LFS32rm:
  case X86::LFS64rm:
  case X86::LGS16rm:
  case X86::LGS32rm:
  case X86::LGS64rm:
  case X86::LSS16rm:
  case X86::LSS32rm:
  case X86::LSS64rm:
  case X86::SYSENTER:
    return "unsupported opcode";
  default:
    break;
  }
  if (const auto reason = isInvalidMemoryInstr(Instr))
    return reason;
  // We do not handle instructions with OPERAND_PCREL.
  for (const Operand &Op : Instr.Operands)
    if (Op.isExplicit() &&
        Op.getExplicitOperandInfo().OperandType == MCOI::OPERAND_PCREL)
      return "unsupported opcode: PC relative operand";
  // We do not handle second-form X87 instructions. We only handle first-form
  // ones (_Fp), see comment in X86InstrFPStack.td.
  for (const Operand &Op : Instr.Operands)
    if (Op.isReg() && Op.isExplicit() &&
        Op.getExplicitOperandInfo().RegClass == X86::RSTRegClassID)
      return "unsupported second-form X87 instruction";
  return nullptr;
}

static unsigned getX86FPFlags(const Instruction &Instr) {
  return Instr.Description.TSFlags & X86II::FPTypeMask;
}

// Helper to fill a memory operand with a value.
static void setMemOp(InstructionTemplate &IT, int OpIdx,
                     const MCOperand &OpVal) {
  const auto Op = IT.getInstr().Operands[OpIdx];
  assert(Op.isExplicit() && "invalid memory pattern");
  IT.getValueFor(Op) = OpVal;
}

// Common (latency, uops) code for LEA templates. `RestrictDestRegs` narrows
// the set of candidate LEA destination registers given the addressing base
// and index registers.
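// For example, with BaseReg = RAX, IndexReg = RCX, Scale = 2 and Disp = 42,
// the generated configuration string is "42(%RAX, %RCX, 2)".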
static Expected<std::vector<CodeTemplate>> generateLEATemplatesCommon(
    const Instruction &Instr, const BitVector &ForbiddenRegisters,
    const LLVMState &State, const SnippetGenerator::Options &Opts,
    std::function<void(unsigned, unsigned, BitVector &CandidateDestRegs)>
        RestrictDestRegs) {
  assert(Instr.Operands.size() == 6 && "invalid LEA");
  assert(X86II::getMemoryOperandNo(Instr.Description.TSFlags) == 1 &&
         "invalid LEA");

  constexpr const int kDestOp = 0;
  constexpr const int kBaseOp = 1;
  constexpr const int kIndexOp = 3;
  auto PossibleDestRegs =
      Instr.Operands[kDestOp].getRegisterAliasing().sourceBits();
  remove(PossibleDestRegs, ForbiddenRegisters);
  auto PossibleBaseRegs =
      Instr.Operands[kBaseOp].getRegisterAliasing().sourceBits();
  remove(PossibleBaseRegs, ForbiddenRegisters);
  auto PossibleIndexRegs =
      Instr.Operands[kIndexOp].getRegisterAliasing().sourceBits();
  remove(PossibleIndexRegs, ForbiddenRegisters);

  const auto &RegInfo = State.getRegInfo();
  std::vector<CodeTemplate> Result;
  for (const unsigned BaseReg : PossibleBaseRegs.set_bits()) {
    for (const unsigned IndexReg : PossibleIndexRegs.set_bits()) {
      for (int LogScale = 0; LogScale <= 3; ++LogScale) {
        // FIXME: Add an option for controlling how we explore immediates.
        for (const int Disp : {0, 42}) {
          InstructionTemplate IT(&Instr);
          const int64_t Scale = 1ull << LogScale;
          setMemOp(IT, 1, MCOperand::createReg(BaseReg));
          setMemOp(IT, 2, MCOperand::createImm(Scale));
          setMemOp(IT, 3, MCOperand::createReg(IndexReg));
          setMemOp(IT, 4, MCOperand::createImm(Disp));
          // SegmentReg must be 0 for LEA.
          setMemOp(IT, 5, MCOperand::createReg(0));

          // Output reg candidates are selected by the caller.
          auto PossibleDestRegsNow = PossibleDestRegs;
          RestrictDestRegs(BaseReg, IndexReg, PossibleDestRegsNow);
          assert(PossibleDestRegsNow.set_bits().begin() !=
                     PossibleDestRegsNow.set_bits().end() &&
                 "no remaining registers");
          setMemOp(
              IT, 0,
              MCOperand::createReg(*PossibleDestRegsNow.set_bits().begin()));

          CodeTemplate CT;
          CT.Instructions.push_back(std::move(IT));
          CT.Config = formatv("{3}(%{0}, %{1}, {2})", RegInfo.getName(BaseReg),
                              RegInfo.getName(IndexReg), Scale, Disp)
                          .str();
          Result.push_back(std::move(CT));
          if (Result.size() >= Opts.MaxConfigsPerOpcode)
            return std::move(Result);
        }
      }
    }
  }

  return std::move(Result);
}

namespace {
class X86SerialSnippetGenerator : public SerialSnippetGenerator {
public:
  using SerialSnippetGenerator::SerialSnippetGenerator;

  Expected<std::vector<CodeTemplate>>
  generateCodeTemplates(InstructionTemplate Variant,
                        const BitVector &ForbiddenRegisters) const override;
};
} // namespace

Expected<std::vector<CodeTemplate>>
X86SerialSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();

  if (const auto reason = isInvalidOpcode(Instr))
    return make_error<Failure>(reason);

  // LEA gets special attention.
  const auto Opcode = Instr.Description.getOpcode();
  if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) {
    return generateLEATemplatesCommon(
        Instr, ForbiddenRegisters, State, Opts,
        [this](unsigned BaseReg, unsigned IndexReg,
               BitVector &CandidateDestRegs) {
          // We just select a destination register that aliases the base
          // register.
          CandidateDestRegs &=
              State.getRATC().getRegister(BaseReg).aliasedBits();
        });
  }

  if (Instr.hasMemoryOperands())
    return make_error<Failure>(
        "unsupported memory operand in latency measurements");

  switch (getX86FPFlags(Instr)) {
  case X86II::NotFP:
    return SerialSnippetGenerator::generateCodeTemplates(Variant,
                                                         ForbiddenRegisters);
  case X86II::ZeroArgFP:
  case X86II::OneArgFP:
  case X86II::SpecialFP:
  case X86II::CompareFP:
  case X86II::CondMovFP:
    return make_error<Failure>("Unsupported x87 Instruction");
  case X86II::OneArgFPRW:
  case X86II::TwoArgFP:
    // These are instructions like
    //  - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
    //  - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
    // They are intrinsically serial and do not modify the state of the stack.
    return generateSelfAliasingCodeTemplates(Variant);
  default:
    llvm_unreachable("Unknown FP Type!");
  }
}

namespace {
class X86ParallelSnippetGenerator : public ParallelSnippetGenerator {
public:
  using ParallelSnippetGenerator::ParallelSnippetGenerator;

  Expected<std::vector<CodeTemplate>>
  generateCodeTemplates(InstructionTemplate Variant,
                        const BitVector &ForbiddenRegisters) const override;
};

} // namespace

Expected<std::vector<CodeTemplate>>
X86ParallelSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();

  if (const auto reason = isInvalidOpcode(Instr))
    return make_error<Failure>(reason);

  // LEA gets special attention.
  const auto Opcode = Instr.Description.getOpcode();
  if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) {
    return generateLEATemplatesCommon(
        Instr, ForbiddenRegisters, State, Opts,
        [this](unsigned BaseReg, unsigned IndexReg,
               BitVector &CandidateDestRegs) {
          // Any destination register that is not used for addressing is fine.
          remove(CandidateDestRegs,
                 State.getRATC().getRegister(BaseReg).aliasedBits());
          remove(CandidateDestRegs,
                 State.getRATC().getRegister(IndexReg).aliasedBits());
        });
  }

  switch (getX86FPFlags(Instr)) {
  case X86II::NotFP:
    return ParallelSnippetGenerator::generateCodeTemplates(Variant,
                                                           ForbiddenRegisters);
  case X86II::ZeroArgFP:
  case X86II::OneArgFP:
  case X86II::SpecialFP:
    return make_error<Failure>("Unsupported x87 Instruction");
  case X86II::OneArgFPRW:
  case X86II::TwoArgFP:
    // These are instructions like
    //  - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
    //  - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
    // They are intrinsically serial and do not modify the state of the stack.
    // We generate the same code for latency and uops.
    return generateSelfAliasingCodeTemplates(Variant);
  case X86II::CompareFP:
  case X86II::CondMovFP:
    // We can compute uops for any FP instruction that does not grow or shrink
    // the stack (either do not touch the stack or push as much as they pop).
    return generateUnconstrainedCodeTemplates(
        Variant, "instruction does not grow/shrink the FP stack");
  default:
    llvm_unreachable("Unknown FP Type!");
  }
}

static unsigned getLoadImmediateOpcode(unsigned RegBitWidth) {
  switch (RegBitWidth) {
  case 8:
    return X86::MOV8ri;
  case 16:
    return X86::MOV16ri;
  case 32:
    return X86::MOV32ri;
  case 64:
    return X86::MOV64ri;
  }
  llvm_unreachable("Invalid Value Width");
}

// Generates an instruction to load an immediate value into a register.
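// For example, loadImmediate(X86::EAX, 32, APInt(32, 42)) yields
// "movl $42, %eax".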
static MCInst loadImmediate(unsigned Reg, unsigned RegBitWidth,
                            const APInt &Value) {
  if (Value.getBitWidth() > RegBitWidth)
    llvm_unreachable("Value must fit in the Register");
  return MCInstBuilder(getLoadImmediateOpcode(RegBitWidth))
      .addReg(Reg)
      .addImm(Value.getZExtValue());
}

// Allocates scratch memory on the stack.
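// For example, allocateStackSpace(16) emits "subq $16, %rsp".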
static MCInst allocateStackSpace(unsigned Bytes) {
  return MCInstBuilder(X86::SUB64ri8)
      .addReg(X86::RSP)
      .addReg(X86::RSP)
      .addImm(Bytes);
}

// Fills scratch memory at offset `OffsetBytes` with value `Imm`.
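// For example, fillStackSpace(X86::MOV32mi, 4, 0) emits "movl $0, 4(%rsp)".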
static MCInst fillStackSpace(unsigned MovOpcode, unsigned OffsetBytes,
                             uint64_t Imm) {
  return MCInstBuilder(MovOpcode)
      // Address = ESP
      .addReg(X86::RSP)    // BaseReg
      .addImm(1)           // ScaleAmt
      .addReg(0)           // IndexReg
      .addImm(OffsetBytes) // Disp
      .addReg(0)           // Segment
      // Immediate.
      .addImm(Imm);
}

// Loads scratch memory into register `Reg` using opcode `RMOpcode`.
static MCInst loadToReg(unsigned Reg, unsigned RMOpcode) {
  return MCInstBuilder(RMOpcode)
      .addReg(Reg)
      // Address = ESP
      .addReg(X86::RSP) // BaseReg
      .addImm(1)        // ScaleAmt
      .addReg(0)        // IndexReg
      .addImm(0)        // Disp
      .addReg(0);       // Segment
}

// Releases scratch memory.
static MCInst releaseStackSpace(unsigned Bytes) {
  return MCInstBuilder(X86::ADD64ri8)
      .addReg(X86::RSP)
      .addReg(X86::RSP)
      .addImm(Bytes);
}


// Reserves some space on the stack, fills it with the content of the provided
// constant, and provides methods to load the stack value into a register.
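// A typical use, as in setRegTo() below:
//   ConstantInliner CI(Value);
//   return CI.loadAndFinalize(Reg, 128, X86::MOVDQUrm);
// materializes `Value` on the stack and loads it into the 128-bit `Reg`.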
namespace {
struct ConstantInliner {
  explicit ConstantInliner(const APInt &Constant) : Constant_(Constant) {}

  std::vector<MCInst> loadAndFinalize(unsigned Reg, unsigned RegBitWidth,
                                      unsigned Opcode);

  std::vector<MCInst> loadX87STAndFinalize(unsigned Reg);

  std::vector<MCInst> loadX87FPAndFinalize(unsigned Reg);

  std::vector<MCInst> popFlagAndFinalize();

  std::vector<MCInst> loadImplicitRegAndFinalize(unsigned Opcode,
                                                 unsigned Value);

private:
  ConstantInliner &add(const MCInst &Inst) {
    Instructions.push_back(Inst);
    return *this;
  }

  void initStack(unsigned Bytes);

  static constexpr const unsigned kF80Bytes = 10; // 80 bits.

  APInt Constant_;
  std::vector<MCInst> Instructions;
};
} // namespace

std::vector<MCInst> ConstantInliner::loadAndFinalize(unsigned Reg,
                                                     unsigned RegBitWidth,
                                                     unsigned Opcode) {
  assert((RegBitWidth & 7) == 0 && "RegBitWidth must be a multiple of 8 bits");
  initStack(RegBitWidth / 8);
  add(loadToReg(Reg, Opcode));
  add(releaseStackSpace(RegBitWidth / 8));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::loadX87STAndFinalize(unsigned Reg) {
  initStack(kF80Bytes);
  add(MCInstBuilder(X86::LD_F80m)
          // Address = ESP
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  if (Reg != X86::ST0)
    add(MCInstBuilder(X86::ST_Frr).addReg(Reg));
  add(releaseStackSpace(kF80Bytes));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::loadX87FPAndFinalize(unsigned Reg) {
  initStack(kF80Bytes);
  add(MCInstBuilder(X86::LD_Fp80m)
          .addReg(Reg)
          // Address = ESP
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  add(releaseStackSpace(kF80Bytes));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::popFlagAndFinalize() {
  initStack(8);
  add(MCInstBuilder(X86::POPF64));
  return std::move(Instructions);
}

std::vector<MCInst>
ConstantInliner::loadImplicitRegAndFinalize(unsigned Opcode, unsigned Value) {
  add(allocateStackSpace(4));
  add(fillStackSpace(X86::MOV32mi, 0, Value)); // Mask all FP exceptions
  add(MCInstBuilder(Opcode)
          // Address = ESP
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  add(releaseStackSpace(4));
  return std::move(Instructions);
}

void ConstantInliner::initStack(unsigned Bytes) {
  assert(Constant_.getBitWidth() <= Bytes * 8 &&
         "Value does not have the correct size");
  const APInt WideConstant = Constant_.getBitWidth() < Bytes * 8
                                 ? Constant_.sext(Bytes * 8)
                                 : Constant_;
  add(allocateStackSpace(Bytes));
  size_t ByteOffset = 0;
  for (; Bytes - ByteOffset >= 4; ByteOffset += 4)
    add(fillStackSpace(
        X86::MOV32mi, ByteOffset,
        WideConstant.extractBits(32, ByteOffset * 8).getZExtValue()));
  if (Bytes - ByteOffset >= 2) {
    add(fillStackSpace(
        X86::MOV16mi, ByteOffset,
        WideConstant.extractBits(16, ByteOffset * 8).getZExtValue()));
    ByteOffset += 2;
  }
  if (Bytes - ByteOffset >= 1)
    add(fillStackSpace(
        X86::MOV8mi, ByteOffset,
        WideConstant.extractBits(8, ByteOffset * 8).getZExtValue()));
}

#include "X86GenExegesis.inc"

namespace {

class X86SavedState : public ExegesisTarget::SavedState {
public:
  X86SavedState() {
#ifdef __x86_64__
# if defined(_MSC_VER)
    _fxsave64(FPState);
    Eflags = __readeflags();
# elif defined(__GNUC__)
    __builtin_ia32_fxsave64(FPState);
    Eflags = __builtin_ia32_readeflags_u64();
# endif
#else
    llvm_unreachable("X86 exegesis running on non-X86 target");
#endif
  }

  ~X86SavedState() {
    // Restoring the X87 state does not flush pending exceptions; make sure
    // these exceptions are flushed now.
#ifdef __x86_64__
# if defined(_MSC_VER)
    _clearfp();
    _fxrstor64(FPState);
    __writeeflags(Eflags);
# elif defined(__GNUC__)
    asm volatile("fwait");
    __builtin_ia32_fxrstor64(FPState);
    __builtin_ia32_writeeflags_u64(Eflags);
# endif
#else
    llvm_unreachable("X86 exegesis running on non-X86 target");
#endif
  }

private:
#ifdef __x86_64__
  alignas(16) char FPState[512];
  uint64_t Eflags;
#endif
};

class ExegesisX86Target : public ExegesisTarget {
public:
  ExegesisX86Target() : ExegesisTarget(X86CpuPfmCounters) {}

  Expected<std::unique_ptr<pfm::Counter>>
  createCounter(StringRef CounterName, const LLVMState &State) const override {
    // If LbrSamplingPeriod was provided, then ignore the
    // CounterName because we only have one for LBR.
    if (LbrSamplingPeriod > 0) {
      // LBR requires HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, and __linux__
      // (for now).
#if defined(HAVE_LIBPFM) && defined(LIBPFM_HAS_FIELD_CYCLES) &&                \
    defined(__linux__)
      return std::make_unique<X86LbrCounter>(
          X86LbrPerfEvent(LbrSamplingPeriod));
#else
      return llvm::make_error<llvm::StringError>(
          "LBR counter requested without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, "
          "or running on Linux.",
          llvm::errc::invalid_argument);
#endif
    }
    return ExegesisTarget::createCounter(CounterName, State);
  }

private:
  void addTargetSpecificPasses(PassManagerBase &PM) const override;

  unsigned getScratchMemoryRegister(const Triple &TT) const override;

  unsigned getLoopCounterRegister(const Triple &) const override;

  unsigned getMaxMemoryAccessSize() const override { return 64; }

  Error randomizeTargetMCOperand(const Instruction &Instr, const Variable &Var,
                                 MCOperand &AssignedValue,
                                 const BitVector &ForbiddenRegs) const override;

  void fillMemoryOperands(InstructionTemplate &IT, unsigned Reg,
                          unsigned Offset) const override;

  void decrementLoopCounterAndJump(MachineBasicBlock &MBB,
                                   MachineBasicBlock &TargetMBB,
                                   const MCInstrInfo &MII) const override;

  std::vector<MCInst> setRegTo(const MCSubtargetInfo &STI, unsigned Reg,
                               const APInt &Value) const override;

  ArrayRef<unsigned> getUnavailableRegisters() const override {
    return makeArrayRef(kUnavailableRegisters,
                        sizeof(kUnavailableRegisters) /
                            sizeof(kUnavailableRegisters[0]));
  }

  bool allowAsBackToBack(const Instruction &Instr) const override {
    const unsigned Opcode = Instr.Description.Opcode;
    return !isInvalidOpcode(Instr) && Opcode != X86::LEA64r &&
           Opcode != X86::LEA64_32r && Opcode != X86::LEA16r;
  }

  std::vector<InstructionTemplate>
  generateInstructionVariants(const Instruction &Instr,
                              unsigned MaxConfigsPerOpcode) const override;

  std::unique_ptr<SnippetGenerator> createSerialSnippetGenerator(
      const LLVMState &State,
      const SnippetGenerator::Options &Opts) const override {
    return std::make_unique<X86SerialSnippetGenerator>(State, Opts);
  }

  std::unique_ptr<SnippetGenerator> createParallelSnippetGenerator(
      const LLVMState &State,
      const SnippetGenerator::Options &Opts) const override {
    return std::make_unique<X86ParallelSnippetGenerator>(State, Opts);
  }

  bool matchesArch(Triple::ArchType Arch) const override {
    return Arch == Triple::x86_64 || Arch == Triple::x86;
  }

  Error checkFeatureSupport() const override {
    // LBR is the only feature we conditionally support now.
    // So if LBR is not requested, then we should be able to run the benchmarks.
    if (LbrSamplingPeriod == 0)
      return Error::success();

#if defined(__linux__) && defined(HAVE_LIBPFM) &&                             \
    defined(LIBPFM_HAS_FIELD_CYCLES)
    // FIXME: Fix this.
    // https://bugs.llvm.org/show_bug.cgi?id=48918
    // For now, only do the check if we see an Intel machine, because the
    // counter uses some Intel-specific magic and could get confused and think
    // an AMD machine actually has LBR support.
#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) ||            \
    defined(_M_X64)
    using namespace sys::detail::x86;

    if (getVendorSignature() == VendorSignatures::GENUINE_INTEL)
      // If the kernel supports it, the hardware still may not have it.
      return X86LbrCounter::checkLbrSupport();
#else
    llvm_unreachable("Running X86 exegesis on non-X86 target");
#endif
#endif
    return llvm::make_error<llvm::StringError>(
        "LBR not supported on this kernel and/or platform",
        llvm::errc::not_supported);
  }

  std::unique_ptr<SavedState> withSavedState() const override {
    return std::make_unique<X86SavedState>();
  }

  static const unsigned kUnavailableRegisters[4];
};

// We disable a few registers that cannot be encoded on instructions with a REX
// prefix.
const unsigned ExegesisX86Target::kUnavailableRegisters[4] = {X86::AH, X86::BH,
                                                              X86::CH, X86::DH};

// We're using one of R8-R15 because these registers are never hardcoded in
// instructions (e.g. MOVS writes to EDI, ESI, EDX), so they have fewer
// conflicts.
constexpr const unsigned kLoopCounterReg = X86::R8;

} // namespace

void ExegesisX86Target::addTargetSpecificPasses(PassManagerBase &PM) const {
  // Lowers FP pseudo-instructions, e.g. ABS_Fp32 -> ABS_F.
  PM.add(createX86FloatingPointStackifierPass());
}

unsigned ExegesisX86Target::getScratchMemoryRegister(const Triple &TT) const {
  if (!TT.isArch64Bit()) {
    // FIXME: This would require popping from the stack, so we would have to
    // add some additional setup code.
    return 0;
  }
  return TT.isOSWindows() ? X86::RCX : X86::RDI;
}

unsigned ExegesisX86Target::getLoopCounterRegister(const Triple &TT) const {
  if (!TT.isArch64Bit()) {
    return 0;
  }
  return kLoopCounterReg;
}

Error ExegesisX86Target::randomizeTargetMCOperand(
    const Instruction &Instr, const Variable &Var, MCOperand &AssignedValue,
    const BitVector &ForbiddenRegs) const {
  const Operand &Op = Instr.getPrimaryOperand(Var);
  switch (Op.getExplicitOperandInfo().OperandType) {
  case X86::OperandType::OPERAND_ROUNDING_CONTROL:
    AssignedValue =
        MCOperand::createImm(randomIndex(X86::STATIC_ROUNDING::TO_ZERO));
    return Error::success();
  default:
    break;
  }
  return make_error<Failure>(
      Twine("unimplemented operand type ")
          .concat(Twine(Op.getExplicitOperandInfo().OperandType)));
}

void ExegesisX86Target::fillMemoryOperands(InstructionTemplate &IT,
                                           unsigned Reg,
                                           unsigned Offset) const {
  assert(!isInvalidMemoryInstr(IT.getInstr()) &&
         "fillMemoryOperands requires a valid memory instruction");
  int MemOpIdx = X86II::getMemoryOperandNo(IT.getInstr().Description.TSFlags);
  assert(MemOpIdx >= 0 && "invalid memory operand index");
  // getMemoryOperandNo() ignores tied operands, so we have to add them back.
  MemOpIdx += X86II::getOperandBias(IT.getInstr().Description);
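  // The effective address is "Offset(%Reg)": base plus displacement, with no
  // index, scale 1, and the default segment.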
  setMemOp(IT, MemOpIdx + 0, MCOperand::createReg(Reg));    // BaseReg
  setMemOp(IT, MemOpIdx + 1, MCOperand::createImm(1));      // ScaleAmt
  setMemOp(IT, MemOpIdx + 2, MCOperand::createReg(0));      // IndexReg
  setMemOp(IT, MemOpIdx + 3, MCOperand::createImm(Offset)); // Disp
  setMemOp(IT, MemOpIdx + 4, MCOperand::createReg(0));      // Segment
}

void ExegesisX86Target::decrementLoopCounterAndJump(
    MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB,
    const MCInstrInfo &MII) const {
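  // This emits the loop latch: decrement the counter with "addq $-1, %r8"
  // (assuming kLoopCounterReg is R8) and branch back while it is non-zero.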
  BuildMI(&MBB, DebugLoc(), MII.get(X86::ADD64ri8))
      .addDef(kLoopCounterReg)
      .addUse(kLoopCounterReg)
      .addImm(-1);
  BuildMI(&MBB, DebugLoc(), MII.get(X86::JCC_1))
      .addMBB(&TargetMBB)
      .addImm(X86::COND_NE);
}

std::vector<MCInst> ExegesisX86Target::setRegTo(const MCSubtargetInfo &STI,
                                                unsigned Reg,
                                                const APInt &Value) const {
  if (X86::GR8RegClass.contains(Reg))
    return {loadImmediate(Reg, 8, Value)};
  if (X86::GR16RegClass.contains(Reg))
    return {loadImmediate(Reg, 16, Value)};
  if (X86::GR32RegClass.contains(Reg))
    return {loadImmediate(Reg, 32, Value)};
  if (X86::GR64RegClass.contains(Reg))
    return {loadImmediate(Reg, 64, Value)};
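  // General-purpose registers take a direct immediate move; everything below
  // (vector, x87, flags, and control registers) is materialized on the stack
  // and loaded from there via the ConstantInliner.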
  ConstantInliner CI(Value);
  if (X86::VR64RegClass.contains(Reg))
    return CI.loadAndFinalize(Reg, 64, X86::MMX_MOVQ64rm);
  if (X86::VR128XRegClass.contains(Reg)) {
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 128, X86::VMOVDQU32Z128rm);
    if (STI.getFeatureBits()[X86::FeatureAVX])
      return CI.loadAndFinalize(Reg, 128, X86::VMOVDQUrm);
    return CI.loadAndFinalize(Reg, 128, X86::MOVDQUrm);
  }
  if (X86::VR256XRegClass.contains(Reg)) {
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 256, X86::VMOVDQU32Z256rm);
    if (STI.getFeatureBits()[X86::FeatureAVX])
      return CI.loadAndFinalize(Reg, 256, X86::VMOVDQUYrm);
  }
  if (X86::VR512RegClass.contains(Reg))
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 512, X86::VMOVDQU32Zrm);
  if (X86::RSTRegClass.contains(Reg)) {
    return CI.loadX87STAndFinalize(Reg);
  }
  if (X86::RFP32RegClass.contains(Reg) || X86::RFP64RegClass.contains(Reg) ||
      X86::RFP80RegClass.contains(Reg)) {
    return CI.loadX87FPAndFinalize(Reg);
  }
  if (Reg == X86::EFLAGS)
    return CI.popFlagAndFinalize();
  if (Reg == X86::MXCSR)
    return CI.loadImplicitRegAndFinalize(
        STI.getFeatureBits()[X86::FeatureAVX] ? X86::VLDMXCSR : X86::LDMXCSR,
        0x1f80);
  if (Reg == X86::FPCW)
    return CI.loadImplicitRegAndFinalize(X86::FLDCW16m, 0x37f);
  return {}; // Not yet implemented.
}

// Instructions can have variable operands, and we may want to see how
// different operands affect performance. So for each operand position,
// precompute all the possible choices we might care about,
// and greedily generate all the possible combinations of choices.
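// For example, a conditional move such as CMOV64rr carries an
// OPERAND_COND_CODE operand, so it is expanded into one variant per condition
// code (COND_O, COND_NO, ...), up to MaxConfigsPerOpcode variants.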
std::vector<InstructionTemplate> ExegesisX86Target::generateInstructionVariants(
    const Instruction &Instr, unsigned MaxConfigsPerOpcode) const {
  bool Exploration = false;
  SmallVector<SmallVector<MCOperand, 1>, 4> VariableChoices;
  VariableChoices.resize(Instr.Variables.size());
  for (auto I : llvm::zip(Instr.Variables, VariableChoices)) {
    const Variable &Var = std::get<0>(I);
    SmallVectorImpl<MCOperand> &Choices = std::get<1>(I);

    switch (Instr.getPrimaryOperand(Var).getExplicitOperandInfo().OperandType) {
    default:
      // We don't wish to explicitly explore this variable.
      Choices.emplace_back(); // But add invalid MCOperand to simplify logic.
      continue;
    case X86::OperandType::OPERAND_COND_CODE: {
      Exploration = true;
      auto CondCodes =
          seq_inclusive(X86::CondCode::COND_O, X86::CondCode::LAST_VALID_COND);
      Choices.reserve(CondCodes.size());
      for (int CondCode : CondCodes)
        Choices.emplace_back(MCOperand::createImm(CondCode));
      break;
    }
    }
  }

  // If we don't wish to explore any variables, defer to the baseline method.
  if (!Exploration)
    return ExegesisTarget::generateInstructionVariants(Instr,
                                                       MaxConfigsPerOpcode);

  std::vector<InstructionTemplate> Variants;
  size_t NumVariants;
  CombinationGenerator<MCOperand, decltype(VariableChoices)::value_type, 4> G(
      VariableChoices);

  // How many operand combinations can we produce, within the limit?
  NumVariants = std::min(G.numCombinations(), (size_t)MaxConfigsPerOpcode);
  // And actually produce all the wanted operand combinations.
  Variants.reserve(NumVariants);
  G.generate([&](ArrayRef<MCOperand> State) -> bool {
    Variants.emplace_back(&Instr);
    Variants.back().setVariableValues(State);
    // Did we run out of space for variants?
    return Variants.size() >= NumVariants;
  });

  assert(Variants.size() == NumVariants &&
         Variants.size() <= MaxConfigsPerOpcode &&
         "Should not produce too many variants");
  return Variants;
}

static ExegesisTarget *getTheExegesisX86Target() {
  static ExegesisX86Target Target;
  return &Target;
}

void InitializeX86ExegesisTarget() {
  ExegesisTarget::registerTarget(getTheExegesisX86Target());
}

} // namespace exegesis
} // namespace llvm