//===--------------------- InstrBuilder.cpp ---------------------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the InstrBuilder interface.
///
//===----------------------------------------------------------------------===//

#include "InstrBuilder.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/MC/MCInst.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/WithColor.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "llvm-mca"

namespace mca {

using namespace llvm;

static void initializeUsedResources(InstrDesc &ID,
                                    const MCSchedClassDesc &SCDesc,
                                    const MCSubtargetInfo &STI,
                                    ArrayRef<uint64_t> ProcResourceMasks) {
  const MCSchedModel &SM = STI.getSchedModel();

  // Populate resources consumed.
  using ResourcePlusCycles = std::pair<uint64_t, ResourceUsage>;
  std::vector<ResourcePlusCycles> Worklist;

  // Track cycles contributed by resources that are in a "Super" relationship.
  // This is required if we want to correctly match the behavior of method
  // SubtargetEmitter::ExpandProcResource() in Tablegen. When computing the set
  // of "consumed" processor resources and resource cycles, the logic in
  // ExpandProcResource() doesn't update the number of resource cycles
  // contributed by a "Super" resource to a group.
  // We need to take this into account when we find that a processor resource
  // is part of a group, and it is also used as the "Super" of other resources.
  // This map stores the number of cycles contributed by sub-resources that are
  // part of a "Super" resource. The key value is the "Super" resource mask ID.
  DenseMap<uint64_t, unsigned> SuperResources;

  for (unsigned I = 0, E = SCDesc.NumWriteProcResEntries; I < E; ++I) {
    const MCWriteProcResEntry *PRE = STI.getWriteProcResBegin(&SCDesc) + I;
    const MCProcResourceDesc &PR = *SM.getProcResource(PRE->ProcResourceIdx);
    uint64_t Mask = ProcResourceMasks[PRE->ProcResourceIdx];
    if (PR.BufferSize != -1)
      ID.Buffers.push_back(Mask);
    CycleSegment RCy(0, PRE->Cycles, false);
    Worklist.emplace_back(ResourcePlusCycles(Mask, ResourceUsage(RCy)));
    if (PR.SuperIdx) {
      uint64_t Super = ProcResourceMasks[PR.SuperIdx];
      SuperResources[Super] += PRE->Cycles;
    }
  }

  // Sort elements by mask popcount, so that we prioritize resource units over
  // resource groups, and smaller groups over larger groups.
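  // For example (the masks here are purely illustrative): a unit with mask
  // 0b0010 (popcount 1) is visited before a group with mask 0b0111
  // (popcount 3), so cycles consumed on individual units can be subtracted
  // from the enclosing groups that are processed later.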
  llvm::sort(Worklist.begin(), Worklist.end(),
             [](const ResourcePlusCycles &A, const ResourcePlusCycles &B) {
               unsigned popcntA = countPopulation(A.first);
               unsigned popcntB = countPopulation(B.first);
               if (popcntA < popcntB)
                 return true;
               if (popcntA > popcntB)
                 return false;
               return A.first < B.first;
             });

  uint64_t UsedResourceUnits = 0;

  // Remove cycles contributed by smaller resources.
  for (unsigned I = 0, E = Worklist.size(); I < E; ++I) {
    ResourcePlusCycles &A = Worklist[I];
    if (!A.second.size()) {
      A.second.NumUnits = 0;
      A.second.setReserved();
      ID.Resources.emplace_back(A);
      continue;
    }

    ID.Resources.emplace_back(A);
    uint64_t NormalizedMask = A.first;
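    // A mask with a single bit set identifies a simple resource unit. For a
    // resource group, the mask also carries one dedicated bit for the group
    // itself; the code below assumes that this extra bit is the most
    // significant set bit, so removing it leaves only the member units.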
    if (countPopulation(A.first) == 1) {
      UsedResourceUnits |= A.first;
    } else {
      // Remove the leading 1 from the resource group mask.
      NormalizedMask ^= PowerOf2Floor(NormalizedMask);
    }

    for (unsigned J = I + 1; J < E; ++J) {
      ResourcePlusCycles &B = Worklist[J];
      if ((NormalizedMask & B.first) == NormalizedMask) {
        B.second.CS.Subtract(A.second.size() - SuperResources[A.first]);
        if (countPopulation(B.first) > 1)
          B.second.NumUnits++;
      }
    }
  }

  // A SchedWrite may specify a number of cycles in which a resource group
  // is reserved. For example (on target x86; cpu Haswell):
  //
  //  SchedWriteRes<[HWPort0, HWPort1, HWPort01]> {
  //    let ResourceCycles = [2, 2, 3];
  //  }
  //
  // This means:
  // Resource units HWPort0 and HWPort1 are both used for 2cy.
  // Resource group HWPort01 is the union of HWPort0 and HWPort1.
  // Since this write touches both HWPort0 and HWPort1 for 2cy, HWPort01
  // will not be usable for 2 entire cycles from instruction issue.
  //
  // On top of those 2cy, SchedWriteRes explicitly assigns 3 cycles of resource
  // usage to HWPort01. This tool assumes that those 3 cycles are an extra
  // delay on top of the initial 2 cycles.
  // During those extra cycles, HWPort01 is not usable by other instructions.
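  // A group is marked as reserved when every unit of the group is also
  // consumed individually by this write: in that case there is no free unit
  // left that the group could select during the extra cycles.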
  for (ResourcePlusCycles &RPC : ID.Resources) {
    if (countPopulation(RPC.first) > 1 && !RPC.second.isReserved()) {
      // Remove the leading 1 from the resource group mask.
      uint64_t Mask = RPC.first ^ PowerOf2Floor(RPC.first);
      if ((Mask & UsedResourceUnits) == Mask)
        RPC.second.setReserved();
    }
  }

  LLVM_DEBUG({
    for (const std::pair<uint64_t, ResourceUsage> &R : ID.Resources)
      dbgs() << "\t\tMask=" << R.first << ", cy=" << R.second.size() << '\n';
    for (const uint64_t R : ID.Buffers)
      dbgs() << "\t\tBuffer Mask=" << R << '\n';
  });
}

static void computeMaxLatency(InstrDesc &ID, const MCInstrDesc &MCDesc,
                              const MCSchedClassDesc &SCDesc,
                              const MCSubtargetInfo &STI) {
  if (MCDesc.isCall()) {
    // We cannot estimate how long this call will take.
    // Artificially set an arbitrarily high latency (100cy).
    ID.MaxLatency = 100U;
    return;
  }

  int Latency = MCSchedModel::computeInstrLatency(STI, SCDesc);
  // If latency is unknown, then conservatively assume a MaxLatency of 100cy.
  ID.MaxLatency = Latency < 0 ? 100U : static_cast<unsigned>(Latency);
}

void InstrBuilder::populateWrites(InstrDesc &ID, const MCInst &MCI,
                                  unsigned SchedClassID) {
  const MCInstrDesc &MCDesc = MCII.get(MCI.getOpcode());
  const MCSchedModel &SM = STI.getSchedModel();
  const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SchedClassID);

  // These are for now the (strong) assumptions made by this algorithm:
  //  * The number of explicit and implicit register definitions in an MCInst
  //    matches the number of explicit and implicit definitions according to
  //    the opcode descriptor (MCInstrDesc).
  //  * Register definitions take precedence over register uses in the operands
  //    list.
  //  * If an opcode specifies an optional definition, then the optional
  //    definition is always the last operand in the sequence, and it can be
  //    set to zero (i.e. "no register").
  //
  // These assumptions work quite well for most out-of-order in-tree targets
  // like x86. This is mainly because the vast majority of instructions are
  // expanded to MCInst using a straightforward lowering logic that preserves
  // the ordering of the operands.
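  // Illustrative example (the operand layout is hypothetical): an MCInst for a
  // three-address add such as
  //   ADDrr dst, lhs, rhs
  // lists its single explicit definition (dst) before the two explicit uses,
  // which is the ordering the loops below rely on.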
  unsigned NumExplicitDefs = MCDesc.getNumDefs();
  unsigned NumImplicitDefs = MCDesc.getNumImplicitDefs();
  unsigned NumWriteLatencyEntries = SCDesc.NumWriteLatencyEntries;
  unsigned TotalDefs = NumExplicitDefs + NumImplicitDefs;
  if (MCDesc.hasOptionalDef())
    TotalDefs++;
  ID.Writes.resize(TotalDefs);
  // Iterate over the operands list, and skip non-register operands.
  // The first NumExplicitDefs register operands are expected to be register
  // definitions.
  unsigned CurrentDef = 0;
  unsigned i = 0;
  for (; i < MCI.getNumOperands() && CurrentDef < NumExplicitDefs; ++i) {
    const MCOperand &Op = MCI.getOperand(i);
    if (!Op.isReg())
      continue;

    WriteDescriptor &Write = ID.Writes[CurrentDef];
    Write.OpIndex = i;
    if (CurrentDef < NumWriteLatencyEntries) {
      const MCWriteLatencyEntry &WLE =
          *STI.getWriteLatencyEntry(&SCDesc, CurrentDef);
      // Conservatively default to MaxLatency.
      Write.Latency =
          WLE.Cycles < 0 ? ID.MaxLatency : static_cast<unsigned>(WLE.Cycles);
      Write.SClassOrWriteResourceID = WLE.WriteResourceID;
    } else {
      // Assign a default latency for this write.
      Write.Latency = ID.MaxLatency;
      Write.SClassOrWriteResourceID = 0;
    }
    Write.IsOptionalDef = false;
    LLVM_DEBUG({
      dbgs() << "\t\t[Def] OpIdx=" << Write.OpIndex
             << ", Latency=" << Write.Latency
             << ", WriteResourceID=" << Write.SClassOrWriteResourceID << '\n';
    });
    CurrentDef++;
  }

  if (CurrentDef != NumExplicitDefs)
    llvm::report_fatal_error(
        "error: Expected more register operand definitions.");

  for (CurrentDef = 0; CurrentDef < NumImplicitDefs; ++CurrentDef) {
    unsigned Index = NumExplicitDefs + CurrentDef;
    WriteDescriptor &Write = ID.Writes[Index];
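    // Implicit definitions have no corresponding MCInst operand; the bitwise
    // complement of the implicit-def index is stored instead, so that
    // isImplicitWrite() can later distinguish this write from one that names
    // an explicit operand index.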
    Write.OpIndex = ~CurrentDef;
    Write.RegisterID = MCDesc.getImplicitDefs()[CurrentDef];
    if (Index < NumWriteLatencyEntries) {
      const MCWriteLatencyEntry &WLE =
          *STI.getWriteLatencyEntry(&SCDesc, Index);
      // Conservatively default to MaxLatency.
      Write.Latency =
          WLE.Cycles < 0 ? ID.MaxLatency : static_cast<unsigned>(WLE.Cycles);
      Write.SClassOrWriteResourceID = WLE.WriteResourceID;
    } else {
      // Assign a default latency for this write.
      Write.Latency = ID.MaxLatency;
      Write.SClassOrWriteResourceID = 0;
    }

    Write.IsOptionalDef = false;
    assert(Write.RegisterID != 0 && "Expected a valid phys register!");
    LLVM_DEBUG({
      dbgs() << "\t\t[Def] OpIdx=" << Write.OpIndex
             << ", PhysReg=" << MRI.getName(Write.RegisterID)
             << ", Latency=" << Write.Latency
             << ", WriteResourceID=" << Write.SClassOrWriteResourceID << '\n';
    });
  }

  if (MCDesc.hasOptionalDef()) {
    // Always assume that the optional definition is the last operand of the
    // MCInst sequence.
    const MCOperand &Op = MCI.getOperand(MCI.getNumOperands() - 1);
    if (i == MCI.getNumOperands() || !Op.isReg())
      llvm::report_fatal_error(
          "error: expected a register operand for an optional "
          "definition. Instruction has not been correctly analyzed.\n",
          false);

    WriteDescriptor &Write = ID.Writes[TotalDefs - 1];
    Write.OpIndex = MCI.getNumOperands() - 1;
    // Assign a default latency for this write.
    Write.Latency = ID.MaxLatency;
    Write.SClassOrWriteResourceID = 0;
    Write.IsOptionalDef = true;
  }
}

void InstrBuilder::populateReads(InstrDesc &ID, const MCInst &MCI,
                                 unsigned SchedClassID) {
  const MCInstrDesc &MCDesc = MCII.get(MCI.getOpcode());
  unsigned NumExplicitDefs = MCDesc.getNumDefs();

  // Skip explicit definitions.
  unsigned i = 0;
  for (; i < MCI.getNumOperands() && NumExplicitDefs; ++i) {
    const MCOperand &Op = MCI.getOperand(i);
    if (Op.isReg())
      NumExplicitDefs--;
  }

  if (NumExplicitDefs)
    llvm::report_fatal_error(
        "error: Expected more register operand definitions.", false);

  unsigned NumExplicitUses = MCI.getNumOperands() - i;
  unsigned NumImplicitUses = MCDesc.getNumImplicitUses();
  if (MCDesc.hasOptionalDef()) {
    assert(NumExplicitUses);
    NumExplicitUses--;
  }
  unsigned TotalUses = NumExplicitUses + NumImplicitUses;
  if (!TotalUses)
    return;

  ID.Reads.resize(TotalUses);
  for (unsigned CurrentUse = 0; CurrentUse < NumExplicitUses; ++CurrentUse) {
    ReadDescriptor &Read = ID.Reads[CurrentUse];
    Read.OpIndex = i + CurrentUse;
    Read.UseIndex = CurrentUse;
    Read.SchedClassID = SchedClassID;
    LLVM_DEBUG(dbgs() << "\t\t[Use] OpIdx=" << Read.OpIndex
                      << ", UseIndex=" << Read.UseIndex << '\n');
  }

  for (unsigned CurrentUse = 0; CurrentUse < NumImplicitUses; ++CurrentUse) {
    ReadDescriptor &Read = ID.Reads[NumExplicitUses + CurrentUse];
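    // As with implicit definitions above, implicit uses store the bitwise
    // complement of their index so that isImplicitRead() can tell them apart
    // from explicit operand indices.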
    Read.OpIndex = ~CurrentUse;
    Read.UseIndex = NumExplicitUses + CurrentUse;
    Read.RegisterID = MCDesc.getImplicitUses()[CurrentUse];
    Read.SchedClassID = SchedClassID;
    LLVM_DEBUG(dbgs() << "\t\t[Use] OpIdx=" << Read.OpIndex << ", RegisterID="
                      << MRI.getName(Read.RegisterID) << '\n');
  }
}

const InstrDesc &InstrBuilder::createInstrDescImpl(const MCInst &MCI) {
  assert(STI.getSchedModel().hasInstrSchedModel() &&
         "Itineraries are not yet supported!");

  // Obtain the instruction descriptor from the opcode.
  unsigned short Opcode = MCI.getOpcode();
  const MCInstrDesc &MCDesc = MCII.get(Opcode);
  const MCSchedModel &SM = STI.getSchedModel();

  // Then obtain the scheduling class information from the instruction.
  unsigned SchedClassID = MCDesc.getSchedClass();
  unsigned CPUID = SM.getProcessorID();

  // Try to resolve variant scheduling classes.
  if (SchedClassID) {
    while (SchedClassID && SM.getSchedClassDesc(SchedClassID)->isVariant())
      SchedClassID = STI.resolveVariantSchedClass(SchedClassID, &MCI, CPUID);

    if (!SchedClassID)
      llvm::report_fatal_error("unable to resolve this variant class.");
  }

  // Check if this instruction is supported. Otherwise, report a fatal error.
  const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SchedClassID);
  if (SCDesc.NumMicroOps == MCSchedClassDesc::InvalidNumMicroOps) {
    std::string ToString;
    llvm::raw_string_ostream OS(ToString);
    WithColor::error() << "found an unsupported instruction in the input"
                       << " assembly sequence.\n";
    MCIP.printInst(&MCI, OS, "", STI);
    OS.flush();

    WithColor::note() << "instruction: " << ToString << '\n';
    llvm::report_fatal_error(
        "Don't know how to analyze unsupported instructions.");
  }

  // Create a new empty descriptor.
  std::unique_ptr<InstrDesc> ID = llvm::make_unique<InstrDesc>();
  ID->NumMicroOps = SCDesc.NumMicroOps;

  if (MCDesc.isCall()) {
    // We don't correctly model calls.
    WithColor::warning() << "found a call in the input assembly sequence.\n";
    WithColor::note() << "call instructions are not correctly modeled. "
                      << "Assume a latency of 100cy.\n";
  }

  if (MCDesc.isReturn()) {
    WithColor::warning() << "found a return instruction in the input"
                         << " assembly sequence.\n";
    WithColor::note() << "program counter updates are ignored.\n";
  }

  ID->MayLoad = MCDesc.mayLoad();
  ID->MayStore = MCDesc.mayStore();
  ID->HasSideEffects = MCDesc.hasUnmodeledSideEffects();

  initializeUsedResources(*ID, SCDesc, STI, ProcResourceMasks);
  computeMaxLatency(*ID, MCDesc, SCDesc, STI);
  populateWrites(*ID, MCI, SchedClassID);
  populateReads(*ID, MCI, SchedClassID);

  LLVM_DEBUG(dbgs() << "\t\tMaxLatency=" << ID->MaxLatency << '\n');
  LLVM_DEBUG(dbgs() << "\t\tNumMicroOps=" << ID->NumMicroOps << '\n');

  // Now add the new descriptor.
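  // Descriptors for non-variant scheduling classes depend only on the opcode,
  // so they are cached (and looked up) by opcode. Variant classes resolve
  // differently depending on the operands of the specific MCInst, so their
  // descriptors are cached per-MCInst instead.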
  SchedClassID = MCDesc.getSchedClass();
  if (!SM.getSchedClassDesc(SchedClassID)->isVariant()) {
    Descriptors[MCI.getOpcode()] = std::move(ID);
    return *Descriptors[MCI.getOpcode()];
  }

  VariantDescriptors[&MCI] = std::move(ID);
  return *VariantDescriptors[&MCI];
}

const InstrDesc &InstrBuilder::getOrCreateInstrDesc(const MCInst &MCI) {
  if (Descriptors.find_as(MCI.getOpcode()) != Descriptors.end())
    return *Descriptors[MCI.getOpcode()];

  if (VariantDescriptors.find(&MCI) != VariantDescriptors.end())
    return *VariantDescriptors[&MCI];

  return createInstrDescImpl(MCI);
}

std::unique_ptr<Instruction>
InstrBuilder::createInstruction(const MCInst &MCI) {
  const InstrDesc &D = getOrCreateInstrDesc(MCI);
  std::unique_ptr<Instruction> NewIS = llvm::make_unique<Instruction>(D);

  // Initialize Reads first.
  for (const ReadDescriptor &RD : D.Reads) {
    int RegID = -1;
    if (!RD.isImplicitRead()) {
      // Explicit read.
      const MCOperand &Op = MCI.getOperand(RD.OpIndex);
      // Skip non-register operands.
      if (!Op.isReg())
        continue;
      RegID = Op.getReg();
    } else {
      // Implicit read.
      RegID = RD.RegisterID;
    }

    // Skip invalid register operands.
    if (!RegID)
      continue;

    // Okay, this is a register operand. Create a ReadState for it.
    assert(RegID > 0 && "Invalid register ID found!");
    NewIS->getUses().emplace_back(llvm::make_unique<ReadState>(RD, RegID));
  }

  // Early exit if there are no writes.
  if (D.Writes.empty())
    return NewIS;

  // Track register writes that implicitly clear the upper portion of the
  // underlying super-registers using an APInt.
  APInt WriteMask(D.Writes.size(), 0);

  // Now query the MCInstrAnalysis object to obtain information about which
  // register writes implicitly clear the upper portion of a super-register.
  MCIA.clearsSuperRegisters(MRI, MCI, WriteMask);

  // Check if this is a dependency breaking instruction.
  if (MCIA.isDependencyBreaking(STI, MCI))
    NewIS->setDependencyBreaking();

  // Initialize writes.
  unsigned WriteIndex = 0;
  for (const WriteDescriptor &WD : D.Writes) {
    unsigned RegID = WD.isImplicitWrite() ? WD.RegisterID
                                          : MCI.getOperand(WD.OpIndex).getReg();
    // Check if this is an optional definition that references NoReg.
    if (WD.IsOptionalDef && !RegID) {
      ++WriteIndex;
      continue;
    }

    assert(RegID && "Expected a valid register ID!");
    NewIS->getDefs().emplace_back(llvm::make_unique<WriteState>(
        WD, RegID, /* ClearsSuperRegs */ WriteMask[WriteIndex]));
    ++WriteIndex;
  }

  return NewIS;
}
} // namespace mca