1 //===-- ParallelSnippetGenerator.cpp ----------------------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "ParallelSnippetGenerator.h" 10 11 #include "BenchmarkRunner.h" 12 #include "MCInstrDescView.h" 13 #include "Target.h" 14 15 // FIXME: Load constants into registers (e.g. with fld1) to not break 16 // instructions like x87. 17 18 // Ideally we would like the only limitation on executing instructions to be the 19 // availability of the CPU resources (e.g. execution ports) needed to execute 20 // them, instead of the availability of their data dependencies. 21 22 // To achieve that, one approach is to generate instructions that do not have 23 // data dependencies between them. 24 // 25 // For some instructions, this is trivial: 26 // mov rax, qword ptr [rsi] 27 // mov rax, qword ptr [rsi] 28 // mov rax, qword ptr [rsi] 29 // mov rax, qword ptr [rsi] 30 // For the above snippet, haswell just renames rax four times and executes the 31 // four instructions two at a time on P23 and P0126. 32 // 33 // For some instructions, we just need to make sure that the source is 34 // different from the destination. For example, IDIV8r reads from GPR and 35 // writes to AX. We just need to ensure that the Var is assigned a 36 // register which is different from AX: 37 // idiv bx 38 // idiv bx 39 // idiv bx 40 // idiv bx 41 // The above snippet will be able to fully saturate the ports, while the same 42 // with ax would issue one uop every `latency(IDIV8r)` cycles. 43 // 44 // Some instructions make this harder because they both read and write from 45 // the same register: 46 // inc rax 47 // inc rax 48 // inc rax 49 // inc rax 50 // This has a data dependency from each instruction to the next, limit the 51 // number of instructions that can be issued in parallel. 52 // It turns out that this is not a big issue on recent Intel CPUs because they 53 // have heuristics to balance port pressure. In the snippet above, subsequent 54 // instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs 55 // might end up executing them all on P0 (just because they can), or try 56 // avoiding P5 because it's usually under high pressure from vector 57 // instructions. 58 // This issue is even more important for high-latency instructions because 59 // they increase the idle time of the CPU, e.g. : 60 // imul rax, rbx 61 // imul rax, rbx 62 // imul rax, rbx 63 // imul rax, rbx 64 // 65 // To avoid that, we do the renaming statically by generating as many 66 // independent exclusive assignments as possible (until all possible registers 67 // are exhausted) e.g.: 68 // imul rax, rbx 69 // imul rcx, rbx 70 // imul rdx, rbx 71 // imul r8, rbx 72 // 73 // Some instruction even make the above static renaming impossible because 74 // they implicitly read and write from the same operand, e.g. ADC16rr reads 75 // and writes from EFLAGS. 76 // In that case we just use a greedy register assignment and hope for the 77 // best. 78 79 namespace llvm { 80 namespace exegesis { 81 82 static SmallVector<const Variable *, 8> 83 getVariablesWithTiedOperands(const Instruction &Instr) { 84 SmallVector<const Variable *, 8> Result; 85 for (const auto &Var : Instr.Variables) 86 if (Var.hasTiedOperands()) 87 Result.push_back(&Var); 88 return Result; 89 } 90 91 ParallelSnippetGenerator::~ParallelSnippetGenerator() = default; 92 93 void ParallelSnippetGenerator::instantiateMemoryOperands( 94 const unsigned ScratchSpacePointerInReg, 95 std::vector<InstructionTemplate> &Instructions) const { 96 if (ScratchSpacePointerInReg == 0) 97 return; // no memory operands. 98 const auto &ET = State.getExegesisTarget(); 99 const unsigned MemStep = ET.getMaxMemoryAccessSize(); 100 const size_t OriginalInstructionsSize = Instructions.size(); 101 size_t I = 0; 102 for (InstructionTemplate &IT : Instructions) { 103 ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep); 104 ++I; 105 } 106 107 while (Instructions.size() < kMinNumDifferentAddresses) { 108 InstructionTemplate IT = Instructions[I % OriginalInstructionsSize]; 109 ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep); 110 ++I; 111 Instructions.push_back(std::move(IT)); 112 } 113 assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize && 114 "not enough scratch space"); 115 } 116 117 static std::vector<InstructionTemplate> generateSnippetUsingStaticRenaming( 118 const LLVMState &State, const InstructionTemplate &IT, 119 const ArrayRef<const Variable *> TiedVariables, 120 const BitVector &ForbiddenRegisters) { 121 std::vector<InstructionTemplate> Instructions; 122 // Assign registers to variables in a round-robin manner. This is simple but 123 // ensures that the most register-constrained variable does not get starved. 124 std::vector<BitVector> PossibleRegsForVar; 125 for (const Variable *Var : TiedVariables) { 126 assert(Var); 127 const Operand &Op = IT.getInstr().getPrimaryOperand(*Var); 128 assert(Op.isReg()); 129 BitVector PossibleRegs = Op.getRegisterAliasing().sourceBits(); 130 remove(PossibleRegs, ForbiddenRegisters); 131 PossibleRegsForVar.push_back(std::move(PossibleRegs)); 132 } 133 SmallVector<int, 2> Iterators(TiedVariables.size(), 0); 134 while (true) { 135 InstructionTemplate TmpIT = IT; 136 // Find a possible register for each variable in turn, marking the 137 // register as taken. 138 for (size_t VarId = 0; VarId < TiedVariables.size(); ++VarId) { 139 const int NextPossibleReg = 140 PossibleRegsForVar[VarId].find_next(Iterators[VarId]); 141 if (NextPossibleReg <= 0) { 142 return Instructions; 143 } 144 TmpIT.getValueFor(*TiedVariables[VarId]) = 145 MCOperand::createReg(NextPossibleReg); 146 // Bump iterator. 147 Iterators[VarId] = NextPossibleReg; 148 // Prevent other variables from using the register. 149 for (BitVector &OtherPossibleRegs : PossibleRegsForVar) { 150 OtherPossibleRegs.reset(NextPossibleReg); 151 } 152 } 153 Instructions.push_back(std::move(TmpIT)); 154 } 155 } 156 157 Expected<std::vector<CodeTemplate>> 158 ParallelSnippetGenerator::generateCodeTemplates( 159 InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const { 160 const Instruction &Instr = Variant.getInstr(); 161 CodeTemplate CT; 162 CT.ScratchSpacePointerInReg = 163 Instr.hasMemoryOperands() 164 ? State.getExegesisTarget().getScratchMemoryRegister( 165 State.getTargetMachine().getTargetTriple()) 166 : 0; 167 const AliasingConfigurations SelfAliasing(Instr, Instr); 168 if (SelfAliasing.empty()) { 169 CT.Info = "instruction is parallel, repeating a random one."; 170 CT.Instructions.push_back(std::move(Variant)); 171 instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions); 172 return getSingleton(std::move(CT)); 173 } 174 if (SelfAliasing.hasImplicitAliasing()) { 175 CT.Info = "instruction is serial, repeating a random one."; 176 CT.Instructions.push_back(std::move(Variant)); 177 instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions); 178 return getSingleton(std::move(CT)); 179 } 180 const auto TiedVariables = getVariablesWithTiedOperands(Instr); 181 if (!TiedVariables.empty()) { 182 CT.Info = "instruction has tied variables, using static renaming."; 183 CT.Instructions = generateSnippetUsingStaticRenaming( 184 State, Variant, TiedVariables, ForbiddenRegisters); 185 instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions); 186 return getSingleton(std::move(CT)); 187 } 188 // No tied variables, we pick random values for defs. 189 BitVector Defs(State.getRegInfo().getNumRegs()); 190 for (const auto &Op : Instr.Operands) { 191 if (Op.isReg() && Op.isExplicit() && Op.isDef() && !Op.isMemory()) { 192 auto PossibleRegisters = Op.getRegisterAliasing().sourceBits(); 193 // Do not use forbidden registers. 194 remove(PossibleRegisters, ForbiddenRegisters); 195 assert(PossibleRegisters.any() && "No register left to choose from"); 196 const auto RandomReg = randomBit(PossibleRegisters); 197 Defs.set(RandomReg); 198 Variant.getValueFor(Op) = MCOperand::createReg(RandomReg); 199 } 200 } 201 // And pick random use values that are not reserved and don't alias with defs. 202 const auto DefAliases = getAliasedBits(State.getRegInfo(), Defs); 203 for (const auto &Op : Instr.Operands) { 204 if (Op.isReg() && Op.isExplicit() && Op.isUse() && !Op.isMemory()) { 205 auto PossibleRegisters = Op.getRegisterAliasing().sourceBits(); 206 remove(PossibleRegisters, ForbiddenRegisters); 207 remove(PossibleRegisters, DefAliases); 208 assert(PossibleRegisters.any() && "No register left to choose from"); 209 const auto RandomReg = randomBit(PossibleRegisters); 210 Variant.getValueFor(Op) = MCOperand::createReg(RandomReg); 211 } 212 } 213 CT.Info = 214 "instruction has no tied variables picking Uses different from defs"; 215 CT.Instructions.push_back(std::move(Variant)); 216 instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions); 217 return getSingleton(std::move(CT)); 218 } 219 220 constexpr const size_t ParallelSnippetGenerator::kMinNumDifferentAddresses; 221 222 } // namespace exegesis 223 } // namespace llvm 224