//===-- Target.cpp ----------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "../Target.h"

#include "../Error.h"
#include "../ParallelSnippetGenerator.h"
#include "../SerialSnippetGenerator.h"
#include "../SnippetGenerator.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "X86.h"
#include "X86Counter.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/Host.h"

#include <memory>
#include <string>
#include <vector>
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
#include <immintrin.h>
#include <intrin.h>
#endif

namespace llvm {
namespace exegesis {

static cl::OptionCategory
    BenchmarkOptions("llvm-exegesis benchmark x86-options");

// If a positive value is specified, we are going to use the LBR in
// latency-mode.
//
// Note:
//  -  A small value is preferred, but too low a value could result in
//     throttling.
//  -  A prime number is preferred to avoid always skipping certain blocks.
//
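// Example invocation (flag names follow the standard llvm-exegesis options;
// adjust as needed):
//   llvm-exegesis -mode=latency -repetition-mode=loop \
//     -x86-lbr-sample-period=199 -opcode-name=ADD64rr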
static cl::opt<unsigned> LbrSamplingPeriod(
    "x86-lbr-sample-period",
    cl::desc("The sample period (nbranches/sample), used for LBR sampling"),
    cl::cat(BenchmarkOptions), cl::init(0));

// FIXME: Validate that repetition-mode is loop if LBR is requested.

// Returns a non-null reason if we cannot handle the memory references in this
// instruction.
static const char *isInvalidMemoryInstr(const Instruction &Instr) {
  switch (Instr.Description.TSFlags & X86II::FormMask) {
  default:
    return "Unknown FormMask value";
  // These have no memory access.
  case X86II::Pseudo:
  case X86II::RawFrm:
  case X86II::AddCCFrm:
  case X86II::PrefixByte:
  case X86II::MRMDestReg:
  case X86II::MRMSrcReg:
  case X86II::MRMSrcReg4VOp3:
  case X86II::MRMSrcRegOp4:
  case X86II::MRMSrcRegCC:
  case X86II::MRMXrCC:
  case X86II::MRMr0:
  case X86II::MRMXr:
  case X86II::MRM0r:
  case X86II::MRM1r:
  case X86II::MRM2r:
  case X86II::MRM3r:
  case X86II::MRM4r:
  case X86II::MRM5r:
  case X86II::MRM6r:
  case X86II::MRM7r:
  case X86II::MRM0X:
  case X86II::MRM1X:
  case X86II::MRM2X:
  case X86II::MRM3X:
  case X86II::MRM4X:
  case X86II::MRM5X:
  case X86II::MRM6X:
  case X86II::MRM7X:
  case X86II::MRM_C0:
  case X86II::MRM_C1:
  case X86II::MRM_C2:
  case X86II::MRM_C3:
  case X86II::MRM_C4:
  case X86II::MRM_C5:
  case X86II::MRM_C6:
  case X86II::MRM_C7:
  case X86II::MRM_C8:
  case X86II::MRM_C9:
  case X86II::MRM_CA:
  case X86II::MRM_CB:
  case X86II::MRM_CC:
  case X86II::MRM_CD:
  case X86II::MRM_CE:
  case X86II::MRM_CF:
  case X86II::MRM_D0:
  case X86II::MRM_D1:
  case X86II::MRM_D2:
  case X86II::MRM_D3:
  case X86II::MRM_D4:
  case X86II::MRM_D5:
  case X86II::MRM_D6:
  case X86II::MRM_D7:
  case X86II::MRM_D8:
  case X86II::MRM_D9:
  case X86II::MRM_DA:
  case X86II::MRM_DB:
  case X86II::MRM_DC:
  case X86II::MRM_DD:
  case X86II::MRM_DE:
  case X86II::MRM_DF:
  case X86II::MRM_E0:
  case X86II::MRM_E1:
  case X86II::MRM_E2:
  case X86II::MRM_E3:
  case X86II::MRM_E4:
  case X86II::MRM_E5:
  case X86II::MRM_E6:
  case X86II::MRM_E7:
  case X86II::MRM_E8:
  case X86II::MRM_E9:
  case X86II::MRM_EA:
  case X86II::MRM_EB:
  case X86II::MRM_EC:
  case X86II::MRM_ED:
  case X86II::MRM_EE:
  case X86II::MRM_EF:
  case X86II::MRM_F0:
  case X86II::MRM_F1:
  case X86II::MRM_F2:
  case X86II::MRM_F3:
  case X86II::MRM_F4:
  case X86II::MRM_F5:
  case X86II::MRM_F6:
  case X86II::MRM_F7:
  case X86II::MRM_F8:
  case X86II::MRM_F9:
  case X86II::MRM_FA:
  case X86II::MRM_FB:
  case X86II::MRM_FC:
  case X86II::MRM_FD:
  case X86II::MRM_FE:
  case X86II::MRM_FF:
  case X86II::RawFrmImm8:
    return nullptr;
  case X86II::AddRegFrm:
    return (Instr.Description.Opcode == X86::POP16r ||
            Instr.Description.Opcode == X86::POP32r ||
            Instr.Description.Opcode == X86::PUSH16r ||
            Instr.Description.Opcode == X86::PUSH32r)
               ? "unsupported opcode: unsupported memory access"
               : nullptr;
  // These access memory and are handled.
  case X86II::MRMDestMem:
  case X86II::MRMSrcMem:
  case X86II::MRMSrcMem4VOp3:
  case X86II::MRMSrcMemOp4:
  case X86II::MRMSrcMemCC:
  case X86II::MRMXmCC:
  case X86II::MRMXm:
  case X86II::MRM0m:
  case X86II::MRM1m:
  case X86II::MRM2m:
  case X86II::MRM3m:
  case X86II::MRM4m:
  case X86II::MRM5m:
  case X86II::MRM6m:
  case X86II::MRM7m:
    return nullptr;
  // These access memory and are not handled yet.
  case X86II::RawFrmImm16:
  case X86II::RawFrmMemOffs:
  case X86II::RawFrmSrc:
  case X86II::RawFrmDst:
  case X86II::RawFrmDstSrc:
    return "unsupported opcode: non uniform memory access";
  }
}

// If the opcode is invalid, returns a pointer to a character literal
// indicating the reason. nullptr indicates a valid opcode.
static const char *isInvalidOpcode(const Instruction &Instr) {
  const auto OpcodeName = Instr.Name;
  if ((Instr.Description.TSFlags & X86II::FormMask) == X86II::Pseudo)
    return "unsupported opcode: pseudo instruction";
  if ((OpcodeName.startswith("POP") && !OpcodeName.startswith("POPCNT")) ||
      OpcodeName.startswith("PUSH") || OpcodeName.startswith("ADJCALLSTACK") ||
      OpcodeName.startswith("LEAVE"))
    return "unsupported opcode: Push/Pop/AdjCallStack/Leave";
  switch (Instr.Description.Opcode) {
  case X86::LFS16rm:
  case X86::LFS32rm:
  case X86::LFS64rm:
  case X86::LGS16rm:
  case X86::LGS32rm:
  case X86::LGS64rm:
  case X86::LSS16rm:
  case X86::LSS32rm:
  case X86::LSS64rm:
  case X86::SYSENTER:
    return "unsupported opcode";
  default:
    break;
  }
  if (const auto reason = isInvalidMemoryInstr(Instr))
    return reason;
  // We do not handle instructions with OPERAND_PCREL.
  for (const Operand &Op : Instr.Operands)
    if (Op.isExplicit() &&
        Op.getExplicitOperandInfo().OperandType == MCOI::OPERAND_PCREL)
      return "unsupported opcode: PC relative operand";
  // We do not handle second-form X87 instructions. We only handle first-form
  // ones (_Fp), see comment in X86InstrFPStack.td.
  for (const Operand &Op : Instr.Operands)
    if (Op.isReg() && Op.isExplicit() &&
        Op.getExplicitOperandInfo().RegClass == X86::RSTRegClassID)
      return "unsupported second-form X87 instruction";
  return nullptr;
}

static unsigned getX86FPFlags(const Instruction &Instr) {
  return Instr.Description.TSFlags & X86II::FPTypeMask;
}

// Helper to fill a memory operand with a value.
static void setMemOp(InstructionTemplate &IT, int OpIdx,
                     const MCOperand &OpVal) {
  const auto Op = IT.getInstr().Operands[OpIdx];
  assert(Op.isExplicit() && "invalid memory pattern");
  IT.getValueFor(Op) = OpVal;
}

// Common (latency, uops) code for LEA templates. `RestrictDestRegs` takes the
// addressing base and index registers and restricts the set of candidate
// destination registers accordingly.
static Expected<std::vector<CodeTemplate>> generateLEATemplatesCommon(
    const Instruction &Instr, const BitVector &ForbiddenRegisters,
    const LLVMState &State, const SnippetGenerator::Options &Opts,
    std::function<void(unsigned, unsigned, BitVector &CandidateDestRegs)>
        RestrictDestRegs) {
  assert(Instr.Operands.size() == 6 && "invalid LEA");
  assert(X86II::getMemoryOperandNo(Instr.Description.TSFlags) == 1 &&
         "invalid LEA");

  constexpr const int kDestOp = 0;
  constexpr const int kBaseOp = 1;
  constexpr const int kIndexOp = 3;
  auto PossibleDestRegs =
      Instr.Operands[kDestOp].getRegisterAliasing().sourceBits();
  remove(PossibleDestRegs, ForbiddenRegisters);
  auto PossibleBaseRegs =
      Instr.Operands[kBaseOp].getRegisterAliasing().sourceBits();
  remove(PossibleBaseRegs, ForbiddenRegisters);
  auto PossibleIndexRegs =
      Instr.Operands[kIndexOp].getRegisterAliasing().sourceBits();
  remove(PossibleIndexRegs, ForbiddenRegisters);

  const auto &RegInfo = State.getRegInfo();
  std::vector<CodeTemplate> Result;
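  // Enumerate all (base register, index register, scale in {1,2,4,8},
  // displacement in {0,42}) combinations, emitting one CodeTemplate per
  // combination until Opts.MaxConfigsPerOpcode is reached.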
  for (const unsigned BaseReg : PossibleBaseRegs.set_bits()) {
    for (const unsigned IndexReg : PossibleIndexRegs.set_bits()) {
      for (int LogScale = 0; LogScale <= 3; ++LogScale) {
        // FIXME: Add an option for controlling how we explore immediates.
        for (const int Disp : {0, 42}) {
          InstructionTemplate IT(&Instr);
          const int64_t Scale = 1ull << LogScale;
          setMemOp(IT, 1, MCOperand::createReg(BaseReg));
          setMemOp(IT, 2, MCOperand::createImm(Scale));
          setMemOp(IT, 3, MCOperand::createReg(IndexReg));
          setMemOp(IT, 4, MCOperand::createImm(Disp));
          // SegmentReg must be 0 for LEA.
          setMemOp(IT, 5, MCOperand::createReg(0));

          // Output reg candidates are selected by the caller.
          auto PossibleDestRegsNow = PossibleDestRegs;
          RestrictDestRegs(BaseReg, IndexReg, PossibleDestRegsNow);
          assert(PossibleDestRegsNow.set_bits().begin() !=
                     PossibleDestRegsNow.set_bits().end() &&
                 "no remaining registers");
          setMemOp(
              IT, 0,
              MCOperand::createReg(*PossibleDestRegsNow.set_bits().begin()));

          CodeTemplate CT;
          CT.Instructions.push_back(std::move(IT));
          CT.Config = formatv("{3}(%{0}, %{1}, {2})", RegInfo.getName(BaseReg),
                              RegInfo.getName(IndexReg), Scale, Disp)
                          .str();
          Result.push_back(std::move(CT));
          if (Result.size() >= Opts.MaxConfigsPerOpcode)
            return std::move(Result);
        }
      }
    }
  }

  return std::move(Result);
}

namespace {
class X86SerialSnippetGenerator : public SerialSnippetGenerator {
public:
  using SerialSnippetGenerator::SerialSnippetGenerator;

  Expected<std::vector<CodeTemplate>>
  generateCodeTemplates(InstructionTemplate Variant,
                        const BitVector &ForbiddenRegisters) const override;
};
} // namespace

Expected<std::vector<CodeTemplate>>
X86SerialSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();

  if (const auto reason = isInvalidOpcode(Instr))
    return make_error<Failure>(reason);

  // LEA gets special attention.
  const auto Opcode = Instr.Description.getOpcode();
  if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) {
    return generateLEATemplatesCommon(
        Instr, ForbiddenRegisters, State, Opts,
        [this](unsigned BaseReg, unsigned IndexReg,
               BitVector &CandidateDestRegs) {
          // We just select a destination register that aliases the base
          // register.
          CandidateDestRegs &=
              State.getRATC().getRegister(BaseReg).aliasedBits();
        });
  }

  if (Instr.hasMemoryOperands())
    return make_error<Failure>(
        "unsupported memory operand in latency measurements");

  switch (getX86FPFlags(Instr)) {
  case X86II::NotFP:
    return SerialSnippetGenerator::generateCodeTemplates(Variant,
                                                         ForbiddenRegisters);
  case X86II::ZeroArgFP:
  case X86II::OneArgFP:
  case X86II::SpecialFP:
  case X86II::CompareFP:
  case X86II::CondMovFP:
    return make_error<Failure>("Unsupported x87 Instruction");
  case X86II::OneArgFPRW:
  case X86II::TwoArgFP:
    // These are instructions like
    //   - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
    //   - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
    // They are intrinsically serial and do not modify the state of the stack.
    return generateSelfAliasingCodeTemplates(Variant);
  default:
    llvm_unreachable("Unknown FP Type!");
  }
}

namespace {
class X86ParallelSnippetGenerator : public ParallelSnippetGenerator {
public:
  using ParallelSnippetGenerator::ParallelSnippetGenerator;

  Expected<std::vector<CodeTemplate>>
  generateCodeTemplates(InstructionTemplate Variant,
                        const BitVector &ForbiddenRegisters) const override;
};

} // namespace

Expected<std::vector<CodeTemplate>>
X86ParallelSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();

  if (const auto reason = isInvalidOpcode(Instr))
    return make_error<Failure>(reason);

  // LEA gets special attention.
  const auto Opcode = Instr.Description.getOpcode();
  if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) {
    return generateLEATemplatesCommon(
        Instr, ForbiddenRegisters, State, Opts,
        [this](unsigned BaseReg, unsigned IndexReg,
               BitVector &CandidateDestRegs) {
          // Any destination register that is not used for addressing is fine.
          remove(CandidateDestRegs,
                 State.getRATC().getRegister(BaseReg).aliasedBits());
          remove(CandidateDestRegs,
                 State.getRATC().getRegister(IndexReg).aliasedBits());
        });
  }

  switch (getX86FPFlags(Instr)) {
  case X86II::NotFP:
    return ParallelSnippetGenerator::generateCodeTemplates(Variant,
                                                           ForbiddenRegisters);
  case X86II::ZeroArgFP:
  case X86II::OneArgFP:
  case X86II::SpecialFP:
    return make_error<Failure>("Unsupported x87 Instruction");
  case X86II::OneArgFPRW:
  case X86II::TwoArgFP:
    // These are instructions like
    //   - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
    //   - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
    // They are intrinsically serial and do not modify the state of the stack.
    // We generate the same code for latency and uops.
    return generateSelfAliasingCodeTemplates(Variant);
  case X86II::CompareFP:
  case X86II::CondMovFP:
    // We can compute uops for any FP instruction that does not grow or shrink
    // the stack (i.e. it either does not touch the stack or pushes as much as
    // it pops).
    return generateUnconstrainedCodeTemplates(
        Variant, "instruction does not grow/shrink the FP stack");
  default:
    llvm_unreachable("Unknown FP Type!");
  }
}

static unsigned getLoadImmediateOpcode(unsigned RegBitWidth) {
  switch (RegBitWidth) {
  case 8:
    return X86::MOV8ri;
  case 16:
    return X86::MOV16ri;
  case 32:
    return X86::MOV32ri;
  case 64:
    return X86::MOV64ri;
  }
  llvm_unreachable("Invalid Value Width");
}

// Generates an instruction to load an immediate value into a register.
static MCInst loadImmediate(unsigned Reg, unsigned RegBitWidth,
                            const APInt &Value) {
  if (Value.getBitWidth() > RegBitWidth)
    llvm_unreachable("Value must fit in the Register");
  return MCInstBuilder(getLoadImmediateOpcode(RegBitWidth))
      .addReg(Reg)
      .addImm(Value.getZExtValue());
}

// Allocates scratch memory on the stack.
static MCInst allocateStackSpace(unsigned Bytes) {
  return MCInstBuilder(X86::SUB64ri8)
      .addReg(X86::RSP)
      .addReg(X86::RSP)
      .addImm(Bytes);
}

// Fills scratch memory at offset `OffsetBytes` with value `Imm`.
static MCInst fillStackSpace(unsigned MovOpcode, unsigned OffsetBytes,
                             uint64_t Imm) {
  return MCInstBuilder(MovOpcode)
      // Address = ESP
      .addReg(X86::RSP)    // BaseReg
      .addImm(1)           // ScaleAmt
      .addReg(0)           // IndexReg
      .addImm(OffsetBytes) // Disp
      .addReg(0)           // Segment
      // Immediate.
      .addImm(Imm);
}

// Loads scratch memory into register `Reg` using opcode `RMOpcode`.
static MCInst loadToReg(unsigned Reg, unsigned RMOpcode) {
  return MCInstBuilder(RMOpcode)
      .addReg(Reg)
      // Address = ESP
      .addReg(X86::RSP) // BaseReg
      .addImm(1)        // ScaleAmt
      .addReg(0)        // IndexReg
      .addImm(0)        // Disp
      .addReg(0);       // Segment
}

// Releases scratch memory.
static MCInst releaseStackSpace(unsigned Bytes) {
  return MCInstBuilder(X86::ADD64ri8)
      .addReg(X86::RSP)
      .addReg(X86::RSP)
      .addImm(Bytes);
}

// Reserves some space on the stack, fills it with the content of the provided
// constant, and provides methods to load the stack value into a register.
namespace {
struct ConstantInliner {
  explicit ConstantInliner(const APInt &Constant) : Constant_(Constant) {}

  std::vector<MCInst> loadAndFinalize(unsigned Reg, unsigned RegBitWidth,
                                      unsigned Opcode);

  std::vector<MCInst> loadX87STAndFinalize(unsigned Reg);

  std::vector<MCInst> loadX87FPAndFinalize(unsigned Reg);

  std::vector<MCInst> popFlagAndFinalize();

  std::vector<MCInst> loadImplicitRegAndFinalize(unsigned Opcode,
                                                 unsigned Value);

private:
  ConstantInliner &add(const MCInst &Inst) {
    Instructions.push_back(Inst);
    return *this;
  }

  void initStack(unsigned Bytes);

  static constexpr const unsigned kF80Bytes = 10; // 80 bits.

  APInt Constant_;
  std::vector<MCInst> Instructions;
};
} // namespace

std::vector<MCInst> ConstantInliner::loadAndFinalize(unsigned Reg,
                                                     unsigned RegBitWidth,
                                                     unsigned Opcode) {
  assert((RegBitWidth & 7) == 0 && "RegBitWidth must be a multiple of 8 bits");
  initStack(RegBitWidth / 8);
  add(loadToReg(Reg, Opcode));
  add(releaseStackSpace(RegBitWidth / 8));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::loadX87STAndFinalize(unsigned Reg) {
  initStack(kF80Bytes);
  add(MCInstBuilder(X86::LD_F80m)
          // Address = ESP
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  if (Reg != X86::ST0)
    add(MCInstBuilder(X86::ST_Frr).addReg(Reg));
  add(releaseStackSpace(kF80Bytes));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::loadX87FPAndFinalize(unsigned Reg) {
  initStack(kF80Bytes);
  add(MCInstBuilder(X86::LD_Fp80m)
          .addReg(Reg)
          // Address = ESP
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  add(releaseStackSpace(kF80Bytes));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::popFlagAndFinalize() {
  initStack(8);
  add(MCInstBuilder(X86::POPF64));
  return std::move(Instructions);
}

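// Writes `Value` into a 4-byte stack slot, then issues `Opcode` to load that
// slot into the corresponding implicit register (the callers pass LDMXCSR /
// VLDMXCSR for MXCSR and FLDCW16m for FPCW).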
std::vector<MCInst>
ConstantInliner::loadImplicitRegAndFinalize(unsigned Opcode, unsigned Value) {
  add(allocateStackSpace(4));
  add(fillStackSpace(X86::MOV32mi, 0, Value)); // Mask all FP exceptions
  add(MCInstBuilder(Opcode)
          // Address = ESP
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  add(releaseStackSpace(4));
  return std::move(Instructions);
}

void ConstantInliner::initStack(unsigned Bytes) {
  assert(Constant_.getBitWidth() <= Bytes * 8 &&
         "Value does not have the correct size");
  const APInt WideConstant = Constant_.getBitWidth() < Bytes * 8
                                 ? Constant_.sext(Bytes * 8)
                                 : Constant_;
  add(allocateStackSpace(Bytes));
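  // Fill the scratch area with the (sign-extended) constant, widest stores
  // first: 4-byte chunks, then a 2-byte chunk, then a trailing byte if needed.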
  size_t ByteOffset = 0;
  for (; Bytes - ByteOffset >= 4; ByteOffset += 4)
    add(fillStackSpace(
        X86::MOV32mi, ByteOffset,
        WideConstant.extractBits(32, ByteOffset * 8).getZExtValue()));
  if (Bytes - ByteOffset >= 2) {
    add(fillStackSpace(
        X86::MOV16mi, ByteOffset,
        WideConstant.extractBits(16, ByteOffset * 8).getZExtValue()));
    ByteOffset += 2;
  }
  if (Bytes - ByteOffset >= 1)
    add(fillStackSpace(
        X86::MOV8mi, ByteOffset,
        WideConstant.extractBits(8, ByteOffset * 8).getZExtValue()));
}

#include "X86GenExegesis.inc"

namespace {

class X86SavedState : public ExegesisTarget::SavedState {
public:
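  // Saves the x87/MMX/SSE state (512-byte FXSAVE area) and EFLAGS on
  // construction; the destructor restores both.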
  X86SavedState() {
#ifdef __x86_64__
# if defined(_MSC_VER)
    _fxsave64(FPState);
    Eflags = __readeflags();
# elif defined(__GNUC__)
    __builtin_ia32_fxsave64(FPState);
    Eflags = __builtin_ia32_readeflags_u64();
# endif
#else
    llvm_unreachable("X86 exegesis running on non-X86 target");
#endif
  }

  ~X86SavedState() {
    // Restoring the X87 state does not flush pending exceptions, so make sure
    // these exceptions are flushed now.
#ifdef __x86_64__
# if defined(_MSC_VER)
    _clearfp();
    _fxrstor64(FPState);
    __writeeflags(Eflags);
# elif defined(__GNUC__)
    asm volatile("fwait");
    __builtin_ia32_fxrstor64(FPState);
    __builtin_ia32_writeeflags_u64(Eflags);
# endif
#else
    llvm_unreachable("X86 exegesis running on non-X86 target");
#endif
  }

private:
#ifdef __x86_64__
  alignas(16) char FPState[512];
  uint64_t Eflags;
#endif
};

class ExegesisX86Target : public ExegesisTarget {
public:
  ExegesisX86Target() : ExegesisTarget(X86CpuPfmCounters) {}

  Expected<std::unique_ptr<pfm::Counter>>
  createCounter(StringRef CounterName, const LLVMState &State) const override {
    // If LbrSamplingPeriod was provided, then ignore the
    // CounterName because we only have one for LBR.
    if (LbrSamplingPeriod > 0) {
      // Can't use LBR without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, or
      // __linux__ (for now).
#if defined(HAVE_LIBPFM) && defined(LIBPFM_HAS_FIELD_CYCLES) &&                \
    defined(__linux__)
      return std::make_unique<X86LbrCounter>(
          X86LbrPerfEvent(LbrSamplingPeriod));
#else
      return llvm::make_error<llvm::StringError>(
          "LBR counter requested without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, "
          "or running on Linux.",
          llvm::errc::invalid_argument);
#endif
    }
    return ExegesisTarget::createCounter(CounterName, State);
  }

private:
  void addTargetSpecificPasses(PassManagerBase &PM) const override;

  unsigned getScratchMemoryRegister(const Triple &TT) const override;

  unsigned getLoopCounterRegister(const Triple &) const override;

  unsigned getMaxMemoryAccessSize() const override { return 64; }

  Error randomizeTargetMCOperand(const Instruction &Instr, const Variable &Var,
                                 MCOperand &AssignedValue,
                                 const BitVector &ForbiddenRegs) const override;

  void fillMemoryOperands(InstructionTemplate &IT, unsigned Reg,
                          unsigned Offset) const override;

  void decrementLoopCounterAndJump(MachineBasicBlock &MBB,
                                   MachineBasicBlock &TargetMBB,
                                   const MCInstrInfo &MII) const override;

  std::vector<MCInst> setRegTo(const MCSubtargetInfo &STI, unsigned Reg,
                               const APInt &Value) const override;

  ArrayRef<unsigned> getUnavailableRegisters() const override {
    return makeArrayRef(kUnavailableRegisters,
                        sizeof(kUnavailableRegisters) /
                            sizeof(kUnavailableRegisters[0]));
  }

  bool allowAsBackToBack(const Instruction &Instr) const override {
    const unsigned Opcode = Instr.Description.Opcode;
    return !isInvalidOpcode(Instr) && Opcode != X86::LEA64r &&
           Opcode != X86::LEA64_32r && Opcode != X86::LEA16r;
  }

  std::vector<InstructionTemplate>
  generateInstructionVariants(const Instruction &Instr,
                              unsigned MaxConfigsPerOpcode) const override;

  std::unique_ptr<SnippetGenerator> createSerialSnippetGenerator(
      const LLVMState &State,
      const SnippetGenerator::Options &Opts) const override {
    return std::make_unique<X86SerialSnippetGenerator>(State, Opts);
  }

  std::unique_ptr<SnippetGenerator> createParallelSnippetGenerator(
      const LLVMState &State,
      const SnippetGenerator::Options &Opts) const override {
    return std::make_unique<X86ParallelSnippetGenerator>(State, Opts);
  }

  bool matchesArch(Triple::ArchType Arch) const override {
    return Arch == Triple::x86_64 || Arch == Triple::x86;
  }

  Error checkFeatureSupport() const override {
    // LBR is the only feature we conditionally support now.
    // So if LBR is not requested, then we should be able to run the benchmarks.
    if (LbrSamplingPeriod == 0)
      return Error::success();

#if defined(__linux__) && defined(HAVE_LIBPFM) &&                              \
    defined(LIBPFM_HAS_FIELD_CYCLES)
      // FIXME: Fix this.
      // https://bugs.llvm.org/show_bug.cgi?id=48918
      // For now, only do the check if we see an Intel machine because
      // the counter uses some intel-specific magic and it could get confused
      // and think an AMD machine actually has LBR support.
#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) ||            \
    defined(_M_X64)
    using namespace sys::detail::x86;

    if (getVendorSignature() == VendorSignatures::GENUINE_INTEL)
      // If the kernel supports it, the hardware still may not have it.
      return X86LbrCounter::checkLbrSupport();
#else
    llvm_unreachable("Running X86 exegesis on non-X86 target");
#endif
#endif
    return llvm::make_error<llvm::StringError>(
        "LBR not supported on this kernel and/or platform",
        llvm::errc::not_supported);
  }

  std::unique_ptr<SavedState> withSavedState() const override {
    return std::make_unique<X86SavedState>();
  }

  static const unsigned kUnavailableRegisters[4];
};

// We disable a few registers that cannot be encoded on instructions with a REX
// prefix.
const unsigned ExegesisX86Target::kUnavailableRegisters[4] = {X86::AH, X86::BH,
                                                              X86::CH, X86::DH};

// We're using one of R8-R15 because these registers are never hardcoded in
// instructions (e.g. MOVS writes to EDI, ESI, EDX), so they have fewer
// conflicts.
constexpr const unsigned kLoopCounterReg = X86::R8;

} // namespace

void ExegesisX86Target::addTargetSpecificPasses(PassManagerBase &PM) const {
  // Lowers FP pseudo-instructions, e.g. ABS_Fp32 -> ABS_F.
  PM.add(createX86FloatingPointStackifierPass());
}

unsigned ExegesisX86Target::getScratchMemoryRegister(const Triple &TT) const {
  if (!TT.isArch64Bit()) {
    // FIXME: This would require popping from the stack, so we would have to
    // add some additional setup code.
    return 0;
  }
  return TT.isOSWindows() ? X86::RCX : X86::RDI;
}

unsigned ExegesisX86Target::getLoopCounterRegister(const Triple &TT) const {
  if (!TT.isArch64Bit()) {
    return 0;
  }
  return kLoopCounterReg;
}

Error ExegesisX86Target::randomizeTargetMCOperand(
    const Instruction &Instr, const Variable &Var, MCOperand &AssignedValue,
    const BitVector &ForbiddenRegs) const {
  const Operand &Op = Instr.getPrimaryOperand(Var);
  switch (Op.getExplicitOperandInfo().OperandType) {
  case X86::OperandType::OPERAND_ROUNDING_CONTROL:
    AssignedValue =
        MCOperand::createImm(randomIndex(X86::STATIC_ROUNDING::TO_ZERO));
    return Error::success();
  default:
    break;
  }
  return make_error<Failure>(
      Twine("unimplemented operand type ")
          .concat(Twine(Op.getExplicitOperandInfo().OperandType)));
}

void ExegesisX86Target::fillMemoryOperands(InstructionTemplate &IT,
                                           unsigned Reg,
                                           unsigned Offset) const {
  assert(!isInvalidMemoryInstr(IT.getInstr()) &&
         "fillMemoryOperands requires a valid memory instruction");
  int MemOpIdx = X86II::getMemoryOperandNo(IT.getInstr().Description.TSFlags);
  assert(MemOpIdx >= 0 && "invalid memory operand index");
  // getMemoryOperandNo() ignores tied operands, so we have to add them back.
  MemOpIdx += X86II::getOperandBias(IT.getInstr().Description);
  setMemOp(IT, MemOpIdx + 0, MCOperand::createReg(Reg));    // BaseReg
  setMemOp(IT, MemOpIdx + 1, MCOperand::createImm(1));      // ScaleAmt
  setMemOp(IT, MemOpIdx + 2, MCOperand::createReg(0));      // IndexReg
  setMemOp(IT, MemOpIdx + 3, MCOperand::createImm(Offset)); // Disp
  setMemOp(IT, MemOpIdx + 4, MCOperand::createReg(0));      // Segment
}

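// Emits "kLoopCounterReg += -1" (which sets the flags) followed by a
// conditional jump back to TargetMBB while the counter is still non-zero.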
void ExegesisX86Target::decrementLoopCounterAndJump(
    MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB,
    const MCInstrInfo &MII) const {
  BuildMI(&MBB, DebugLoc(), MII.get(X86::ADD64ri8))
      .addDef(kLoopCounterReg)
      .addUse(kLoopCounterReg)
      .addImm(-1);
  BuildMI(&MBB, DebugLoc(), MII.get(X86::JCC_1))
      .addMBB(&TargetMBB)
      .addImm(X86::COND_NE);
}

std::vector<MCInst> ExegesisX86Target::setRegTo(const MCSubtargetInfo &STI,
                                                unsigned Reg,
                                                const APInt &Value) const {
  if (X86::GR8RegClass.contains(Reg))
    return {loadImmediate(Reg, 8, Value)};
  if (X86::GR16RegClass.contains(Reg))
    return {loadImmediate(Reg, 16, Value)};
  if (X86::GR32RegClass.contains(Reg))
    return {loadImmediate(Reg, 32, Value)};
  if (X86::GR64RegClass.contains(Reg))
    return {loadImmediate(Reg, 64, Value)};
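  // Non-GPR registers cannot be written with a plain move-immediate; spill the
  // constant to scratch stack space and reload it into the target register.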
  ConstantInliner CI(Value);
  if (X86::VR64RegClass.contains(Reg))
    return CI.loadAndFinalize(Reg, 64, X86::MMX_MOVQ64rm);
  if (X86::VR128XRegClass.contains(Reg)) {
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 128, X86::VMOVDQU32Z128rm);
    if (STI.getFeatureBits()[X86::FeatureAVX])
      return CI.loadAndFinalize(Reg, 128, X86::VMOVDQUrm);
    return CI.loadAndFinalize(Reg, 128, X86::MOVDQUrm);
  }
  if (X86::VR256XRegClass.contains(Reg)) {
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 256, X86::VMOVDQU32Z256rm);
    if (STI.getFeatureBits()[X86::FeatureAVX])
      return CI.loadAndFinalize(Reg, 256, X86::VMOVDQUYrm);
  }
  if (X86::VR512RegClass.contains(Reg))
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 512, X86::VMOVDQU32Zrm);
  if (X86::RSTRegClass.contains(Reg)) {
    return CI.loadX87STAndFinalize(Reg);
  }
  if (X86::RFP32RegClass.contains(Reg) || X86::RFP64RegClass.contains(Reg) ||
      X86::RFP80RegClass.contains(Reg)) {
    return CI.loadX87FPAndFinalize(Reg);
  }
  if (Reg == X86::EFLAGS)
    return CI.popFlagAndFinalize();
  if (Reg == X86::MXCSR)
    return CI.loadImplicitRegAndFinalize(
        STI.getFeatureBits()[X86::FeatureAVX] ? X86::VLDMXCSR : X86::LDMXCSR,
        0x1f80);
  if (Reg == X86::FPCW)
    return CI.loadImplicitRegAndFinalize(X86::FLDCW16m, 0x37f);
  return {}; // Not yet implemented.
}

// An instruction can have some variable operands, and we may want to see how
// different operands affect performance. So for each operand position,
// precompute all the possible choices we might care about,
// and greedily generate all the possible combinations of choices.
std::vector<InstructionTemplate> ExegesisX86Target::generateInstructionVariants(
    const Instruction &Instr, unsigned MaxConfigsPerOpcode) const {
  bool Exploration = false;
  SmallVector<SmallVector<MCOperand, 1>, 4> VariableChoices;
  VariableChoices.resize(Instr.Variables.size());
  for (auto I : llvm::zip(Instr.Variables, VariableChoices)) {
    const Variable &Var = std::get<0>(I);
    SmallVectorImpl<MCOperand> &Choices = std::get<1>(I);

    switch (Instr.getPrimaryOperand(Var).getExplicitOperandInfo().OperandType) {
    default:
      // We don't wish to explicitly explore this variable.
      Choices.emplace_back(); // But add invalid MCOperand to simplify logic.
      continue;
    case X86::OperandType::OPERAND_COND_CODE: {
      Exploration = true;
      auto CondCodes =
          seq_inclusive(X86::CondCode::COND_O, X86::CondCode::LAST_VALID_COND);
      Choices.reserve(CondCodes.size());
      for (int CondCode : CondCodes)
        Choices.emplace_back(MCOperand::createImm(CondCode));
      break;
    }
    }
  }

  // If we don't wish to explore any variables, defer to the baseline method.
  if (!Exploration)
    return ExegesisTarget::generateInstructionVariants(Instr,
                                                       MaxConfigsPerOpcode);

  std::vector<InstructionTemplate> Variants;
  size_t NumVariants;
  CombinationGenerator<MCOperand, decltype(VariableChoices)::value_type, 4> G(
      VariableChoices);

  // How many operand combinations can we produce, within the limit?
  NumVariants = std::min(G.numCombinations(), (size_t)MaxConfigsPerOpcode);
  // And actually produce all the wanted operand combinations.
  Variants.reserve(NumVariants);
  G.generate([&](ArrayRef<MCOperand> State) -> bool {
    Variants.emplace_back(&Instr);
    Variants.back().setVariableValues(State);
    // Did we run out of space for variants?
    return Variants.size() >= NumVariants;
  });

  assert(Variants.size() == NumVariants &&
         Variants.size() <= MaxConfigsPerOpcode &&
         "Should not produce too many variants");
  return Variants;
}

static ExegesisTarget *getTheExegesisX86Target() {
  static ExegesisX86Target Target;
  return &Target;
}

void InitializeX86ExegesisTarget() {
  ExegesisTarget::registerTarget(getTheExegesisX86Target());
}

} // namespace exegesis
} // namespace llvm