//===-- ParallelSnippetGenerator.cpp ----------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ParallelSnippetGenerator.h"

#include "BenchmarkRunner.h"
#include "MCInstrDescView.h"
#include "Target.h"

#include <optional>
#include <variant>

// FIXME: Load constants into registers (e.g. with fld1) to not break
// instructions like x87.

// Ideally we would like the only limitation on executing instructions to be
// the availability of the CPU resources (e.g. execution ports) needed to
// execute them, instead of the availability of their data dependencies.

// To achieve that, one approach is to generate instructions that do not have
// data dependencies between them.
//
// For some instructions, this is trivial:
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
// For the above snippet, Haswell just renames rax four times and executes the
// four instructions two at a time on P23 and P0126.
//
// For some instructions, we just need to make sure that the source is
// different from the destination. For example, IDIV8r reads from GPR and
// writes to AX. We just need to ensure that the Var is assigned a
// register which is different from AX:
//    idiv bx
//    idiv bx
//    idiv bx
//    idiv bx
// The above snippet will be able to fully saturate the ports, while the same
// with ax would issue one uop every `latency(IDIV8r)` cycles.
//
// Some instructions make this harder because they both read and write from
// the same register:
//    inc rax
//    inc rax
//    inc rax
//    inc rax
// This has a data dependency from each instruction to the next, limiting the
// number of instructions that can be issued in parallel.
// It turns out that this is not a big issue on recent Intel CPUs because they
// have heuristics to balance port pressure. In the snippet above, subsequent
// instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
// might end up executing them all on P0 (just because they can), or try
// avoiding P5 because it's usually under high pressure from vector
// instructions.
// This issue is even more important for high-latency instructions because
// they increase the idle time of the CPU, e.g.:
//    imul rax, rbx
//    imul rax, rbx
//    imul rax, rbx
//    imul rax, rbx
//
// To avoid that, we do the renaming statically by generating as many
// independent exclusive assignments as possible (until all possible registers
// are exhausted), e.g.:
//    imul rax, rbx
//    imul rcx, rbx
//    imul rdx, rbx
//    imul r8,  rbx
//
// Some instructions even make the above static renaming impossible because
// they implicitly read and write the same operand, e.g. ADC16rr reads and
// writes EFLAGS.
// In that case we just use a greedy register assignment and hope for the
// best.
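// For example, even if we rename the GPR operands:
//    adc ax, bx
//    adc cx, dx
//    adc si, di
// every instruction still reads CF and writes EFLAGS, so the chain stays
// serialized through the flags register no matter which GPRs we pick.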

namespace llvm {
namespace exegesis {

static bool hasVariablesWithTiedOperands(const Instruction &Instr) {
  for (const auto &Var : Instr.Variables)
    if (Var.hasTiedOperands())
      return true;
  return false;
}

ParallelSnippetGenerator::~ParallelSnippetGenerator() = default;

void ParallelSnippetGenerator::instantiateMemoryOperands(
    const unsigned ScratchSpacePointerInReg,
    std::vector<InstructionTemplate> &Instructions) const {
  if (ScratchSpacePointerInReg == 0)
    return; // no memory operands.
  const auto &ET = State.getExegesisTarget();
  const unsigned MemStep = ET.getMaxMemoryAccessSize();
  const size_t OriginalInstructionsSize = Instructions.size();
  size_t I = 0;
  for (InstructionTemplate &IT : Instructions) {
    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
    ++I;
  }

  // Duplicate instructions until the snippet accesses at least
  // kMinNumDifferentAddresses distinct addresses, keeping the memory
  // accesses independent of each other.
  while (Instructions.size() < kMinNumDifferentAddresses) {
    InstructionTemplate IT = Instructions[I % OriginalInstructionsSize];
    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
    ++I;
    Instructions.push_back(std::move(IT));
  }
  assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize &&
         "not enough scratch space");
}

enum class RegRandomizationStrategy : uint8_t {
  PickRandomRegs,
  SingleStaticRegPerOperand,
  SingleStaticReg,

  FIRST = PickRandomRegs,
  LAST = SingleStaticReg,
};

} // namespace exegesis

template <> struct enum_iteration_traits<exegesis::RegRandomizationStrategy> {
  static constexpr bool is_iterable = true;
};

namespace exegesis {

const char *getDescription(RegRandomizationStrategy S) {
  switch (S) {
  case RegRandomizationStrategy::PickRandomRegs:
    return "randomizing registers";
  case RegRandomizationStrategy::SingleStaticRegPerOperand:
    return "one unique register for each position";
  case RegRandomizationStrategy::SingleStaticReg:
    return "reusing the same register for all positions";
  }
  llvm_unreachable("Unknown RegRandomizationStrategy enum");
}

static std::variant<std::nullopt_t, MCOperand, Register>
generateSingleRegisterForInstrAvoidingDefUseOverlap(
    const LLVMState &State, const BitVector &ForbiddenRegisters,
    const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases,
    const BitVector &Uses, const BitVector &Defs, const InstructionTemplate &IT,
    const Operand &Op, const ArrayRef<InstructionTemplate> Instructions,
    RegRandomizationStrategy S) {
  const Instruction &Instr = IT.getInstr();
  assert(Op.isReg() && Op.isExplicit() && !Op.isMemory() &&
         !IT.getValueFor(Op).isValid());
  assert((!Op.isUse() || !Op.isTied()) &&
         "Not expecting to see a tied use reg");

  if (Op.isUse()) {
    switch (S) {
    case RegRandomizationStrategy::PickRandomRegs:
      break;
    case RegRandomizationStrategy::SingleStaticReg:
    case RegRandomizationStrategy::SingleStaticRegPerOperand: {
      if (!Instructions.empty())
        return Instructions.front().getValueFor(Op);
      if (S != RegRandomizationStrategy::SingleStaticReg)
        break;
      BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits();
      const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
      if (std::optional<int> CommonBit =
              getFirstCommonBit(PossibleRegisters, UseAliases))
        return *CommonBit;
      break;
    }
    }
  }

  BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits();
  remove(PossibleRegisters, ForbiddenRegisters);

  if (Op.isDef()) {
    remove(PossibleRegisters, ImplicitUseAliases);
    const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
    remove(PossibleRegisters, UseAliases);
  }

  if (Op.isUse()) {
    remove(PossibleRegisters, ImplicitDefAliases);
    // NOTE: in general, using the same reg for multiple uses is fine.
    if (S == RegRandomizationStrategy::SingleStaticRegPerOperand) {
      const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
      remove(PossibleRegisters, UseAliases);
    }
  }

  bool IsDefWithTiedUse =
      Instr.Variables[Op.getVariableIndex()].hasTiedOperands();
  if (Op.isUse() || IsDefWithTiedUse) {
    // Now, the important bit: if we have used some register for a def,
    // then we cannot use that same register for *any* use,
    // be it either an untied use, or a use tied to a def.
    // But def-ing the same regs is fine, as long as there are no uses!
    const BitVector DefsAliases = getAliasedBits(State.getRegInfo(), Defs);
    remove(PossibleRegisters, DefsAliases);
  }

  if (!PossibleRegisters.any())
    return std::nullopt;

  return randomBit(PossibleRegisters);
}

static std::optional<InstructionTemplate>
generateSingleSnippetForInstrAvoidingDefUseOverlap(
    const LLVMState &State, const BitVector &ForbiddenRegisters,
    const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases,
    BitVector &Uses, BitVector &Defs, InstructionTemplate IT,
    const ArrayRef<InstructionTemplate> Instructions,
    RegRandomizationStrategy S) {
  const Instruction &Instr = IT.getInstr();
  for (const Operand &Op : Instr.Operands) {
    if (!Op.isReg() || !Op.isExplicit() || Op.isMemory() ||
        IT.getValueFor(Op).isValid())
      continue;
    assert((!Op.isUse() || !Op.isTied()) && "Will not get tied uses.");

    std::variant<std::nullopt_t, MCOperand, Register> R =
        generateSingleRegisterForInstrAvoidingDefUseOverlap(
            State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases,
            Uses, Defs, IT, Op, Instructions, S);

    if (std::holds_alternative<std::nullopt_t>(R))
      return {};

    MCOperand MCOp;
    if (std::holds_alternative<MCOperand>(R))
      MCOp = std::get<MCOperand>(R);
    else {
      Register RandomReg = std::get<Register>(R);
      if (Op.isDef())
        Defs.set(RandomReg);
      if (Op.isUse())
        Uses.set(RandomReg);
      MCOp = MCOperand::createReg(RandomReg);
    }
    IT.getValueFor(Op) = MCOp;
  }
  return IT;
}

static std::vector<InstructionTemplate>
generateSnippetForInstrAvoidingDefUseOverlap(
    const LLVMState &State, const InstructionTemplate &IT,
    RegRandomizationStrategy S, const BitVector &ForbiddenRegisters) {
  // We don't want to accidentally serialize the instruction,
  // so we must be sure that we don't pick a def that is an implicit use,
  // or a use that is an implicit def, so record implicit regs now.
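  // For example, on X86, SHL64rCL implicitly uses CL; if an explicit def were
  // assigned RCX (which aliases CL), each instruction would have to wait for
  // the previous one to write RCX before reading its shift amount.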
  BitVector ImplicitUses(State.getRegInfo().getNumRegs());
  BitVector ImplicitDefs(State.getRegInfo().getNumRegs());
  for (const auto &Op : IT.getInstr().Operands) {
    if (Op.isReg() && Op.isImplicit() && !Op.isMemory()) {
      assert(Op.isImplicitReg() && "Not an implicit register operand?");
      if (Op.isUse())
        ImplicitUses.set(Op.getImplicitReg());
      else {
        assert(Op.isDef() && "Not a use and not a def?");
        ImplicitDefs.set(Op.getImplicitReg());
      }
    }
  }
  const BitVector ImplicitUseAliases =
      getAliasedBits(State.getRegInfo(), ImplicitUses);
  const BitVector ImplicitDefAliases =
      getAliasedBits(State.getRegInfo(), ImplicitDefs);

  BitVector Defs(State.getRegInfo().getNumRegs());
  BitVector Uses(State.getRegInfo().getNumRegs());
  std::vector<InstructionTemplate> Instructions;

  while (true) {
    std::optional<InstructionTemplate> TmpIT =
        generateSingleSnippetForInstrAvoidingDefUseOverlap(
            State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases,
            Uses, Defs, IT, Instructions, S);
    if (!TmpIT)
      return Instructions;
    Instructions.push_back(std::move(*TmpIT));
    if (!hasVariablesWithTiedOperands(IT.getInstr()))
      return Instructions;
    assert(Instructions.size() <= 128 && "Stuck in endless loop?");
  }
}

Expected<std::vector<CodeTemplate>>
ParallelSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();
  CodeTemplate CT;
  CT.ScratchSpacePointerInReg =
      Instr.hasMemoryOperands()
          ? State.getExegesisTarget().getScratchMemoryRegister(
                State.getTargetMachine().getTargetTriple())
          : 0;
  const AliasingConfigurations SelfAliasing(Instr, Instr, ForbiddenRegisters);
  if (SelfAliasing.empty()) {
    CT.Info = "instruction is parallel, repeating a random one.";
    CT.Instructions.push_back(std::move(Variant));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  if (SelfAliasing.hasImplicitAliasing()) {
    CT.Info = "instruction is serial, repeating a random one.";
    CT.Instructions.push_back(std::move(Variant));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  std::vector<CodeTemplate> Result;
  bool HasTiedOperands = hasVariablesWithTiedOperands(Instr);
  // If there are no tied operands, then we don't want to "saturate backedge",
  // and the template we will produce will have only a single instruction.
  unsigned NumUntiedUseRegs = count_if(Instr.Operands, [](const Operand &Op) {
    return Op.isReg() && Op.isExplicit() && !Op.isMemory() && Op.isUse() &&
           !Op.isTied();
  });
  SmallVector<RegRandomizationStrategy, 3> Strategies;
  if (HasTiedOperands || NumUntiedUseRegs >= 3)
    Strategies.push_back(RegRandomizationStrategy::PickRandomRegs);
  if (NumUntiedUseRegs >= 2)
    Strategies.push_back(RegRandomizationStrategy::SingleStaticRegPerOperand);
  Strategies.push_back(RegRandomizationStrategy::SingleStaticReg);
  for (RegRandomizationStrategy S : Strategies) {
    CodeTemplate CurrCT = CT.clone();
    CurrCT.Info =
        Twine("instruction has ")
            .concat(HasTiedOperands ? "" : "no ")
            .concat("tied variables, avoiding "
                    "Read-After-Write issue, picking random def and use "
                    "registers not aliasing each other, for uses, ")
            .concat(getDescription(S))
            .str();
    CurrCT.Instructions = generateSnippetForInstrAvoidingDefUseOverlap(
        State, Variant, S, ForbiddenRegisters);
    if (CurrCT.Instructions.empty())
      return make_error<StringError>(
          Twine("Failed to produce any snippet via: ").concat(CurrCT.Info),
          inconvertibleErrorCode());
    instantiateMemoryOperands(CurrCT.ScratchSpacePointerInReg,
                              CurrCT.Instructions);
    Result.push_back(std::move(CurrCT));
  }
  return Result;
}

constexpr const size_t ParallelSnippetGenerator::kMinNumDifferentAddresses;

} // namespace exegesis
} // namespace llvm