1 //===-- Analysis.cpp --------------------------------------------*- C++ -*-===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9
#include "Analysis.h"
#include "BenchmarkResult.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Support/FormatVariadic.h"
#include <cmath>
#include <limits>
#include <unordered_map>
#include <unordered_set>
#include <vector>
18
19 namespace llvm {
20 namespace exegesis {
21
22 static const char kCsvSep = ',';
23
resolveSchedClassId(const llvm::MCSubtargetInfo & STI,unsigned SchedClassId,const llvm::MCInst & MCI)24 static unsigned resolveSchedClassId(const llvm::MCSubtargetInfo &STI,
25 unsigned SchedClassId,
26 const llvm::MCInst &MCI) {
27 const auto &SM = STI.getSchedModel();
28 while (SchedClassId && SM.getSchedClassDesc(SchedClassId)->isVariant())
29 SchedClassId =
30 STI.resolveVariantSchedClass(SchedClassId, &MCI, SM.getProcessorID());
31 return SchedClassId;
32 }
33
34 namespace {
35
36 enum EscapeTag { kEscapeCsv, kEscapeHtml, kEscapeHtmlString };
37
38 template <EscapeTag Tag>
39 void writeEscaped(llvm::raw_ostream &OS, const llvm::StringRef S);
40
41 template <>
writeEscaped(llvm::raw_ostream & OS,const llvm::StringRef S)42 void writeEscaped<kEscapeCsv>(llvm::raw_ostream &OS, const llvm::StringRef S) {
43 if (std::find(S.begin(), S.end(), kCsvSep) == S.end()) {
44 OS << S;
45 } else {
46 // Needs escaping.
47 OS << '"';
48 for (const char C : S) {
49 if (C == '"')
50 OS << "\"\"";
51 else
52 OS << C;
53 }
54 OS << '"';
55 }
56 }
57
58 template <>
writeEscaped(llvm::raw_ostream & OS,const llvm::StringRef S)59 void writeEscaped<kEscapeHtml>(llvm::raw_ostream &OS, const llvm::StringRef S) {
60 for (const char C : S) {
61 if (C == '<')
62 OS << "<";
63 else if (C == '>')
64 OS << ">";
65 else if (C == '&')
66 OS << "&";
67 else
68 OS << C;
69 }
70 }
71
72 template <>
writeEscaped(llvm::raw_ostream & OS,const llvm::StringRef S)73 void writeEscaped<kEscapeHtmlString>(llvm::raw_ostream &OS,
74 const llvm::StringRef S) {
75 for (const char C : S) {
76 if (C == '"')
77 OS << "\\\"";
78 else
79 OS << C;
80 }
81 }
82
83 } // namespace
84
85 template <EscapeTag Tag>
86 static void
writeClusterId(llvm::raw_ostream & OS,const InstructionBenchmarkClustering::ClusterId & CID)87 writeClusterId(llvm::raw_ostream &OS,
88 const InstructionBenchmarkClustering::ClusterId &CID) {
89 if (CID.isNoise())
90 writeEscaped<Tag>(OS, "[noise]");
91 else if (CID.isError())
92 writeEscaped<Tag>(OS, "[error]");
93 else
94 OS << CID.getId();
95 }
96
97 template <EscapeTag Tag>
writeMeasurementValue(llvm::raw_ostream & OS,const double Value)98 static void writeMeasurementValue(llvm::raw_ostream &OS, const double Value) {
99 // Given Value, if we wanted to serialize it to a string,
100 // how many base-10 digits will we need to store, max?
101 static constexpr auto MaxDigitCount =
102 std::numeric_limits<decltype(Value)>::max_digits10;
103 // Also, we will need a decimal separator.
104 static constexpr auto DecimalSeparatorLen = 1; // '.' e.g.
105 // So how long of a string will the serialization produce, max?
106 static constexpr auto SerializationLen = MaxDigitCount + DecimalSeparatorLen;
107
108 // WARNING: when changing the format, also adjust the small-size estimate ^.
109 static constexpr StringLiteral SimpleFloatFormat = StringLiteral("{0:F}");
110
111 writeEscaped<Tag>(
112 OS,
113 llvm::formatv(SimpleFloatFormat.data(), Value).sstr<SerializationLen>());
114 }
115
116 template <typename EscapeTag, EscapeTag Tag>
writeSnippet(llvm::raw_ostream & OS,llvm::ArrayRef<uint8_t> Bytes,const char * Separator) const117 void Analysis::writeSnippet(llvm::raw_ostream &OS,
118 llvm::ArrayRef<uint8_t> Bytes,
119 const char *Separator) const {
120 llvm::SmallVector<std::string, 3> Lines;
121 // Parse the asm snippet and print it.
122 while (!Bytes.empty()) {
123 llvm::MCInst MI;
124 uint64_t MISize = 0;
125 if (!Disasm_->getInstruction(MI, MISize, Bytes, 0, llvm::nulls(),
126 llvm::nulls())) {
127 writeEscaped<Tag>(OS, llvm::join(Lines, Separator));
128 writeEscaped<Tag>(OS, Separator);
129 writeEscaped<Tag>(OS, "[error decoding asm snippet]");
130 return;
131 }
132 llvm::SmallString<128> InstPrinterStr; // FIXME: magic number.
133 llvm::raw_svector_ostream OSS(InstPrinterStr);
134 InstPrinter_->printInst(&MI, OSS, "", *SubtargetInfo_);
135 Bytes = Bytes.drop_front(MISize);
136 Lines.emplace_back(llvm::StringRef(InstPrinterStr).trim());
137 }
138 writeEscaped<Tag>(OS, llvm::join(Lines, Separator));
139 }
140
// Prints a row representing an instruction, along with scheduling info and
// point coordinates (measurements).
void Analysis::printInstructionRowCsv(const size_t PointId,
                                      llvm::raw_ostream &OS) const {
  const InstructionBenchmark &Point = Clustering_.getPoints()[PointId];
  // Column 1: the cluster this point belongs to.
  writeClusterId<kEscapeCsv>(OS, Clustering_.getClusterIdForPoint(PointId));
  OS << kCsvSep;
  // Column 2: the disassembled snippet, instructions separated by "; ".
  writeSnippet<EscapeTag, kEscapeCsv>(OS, Point.AssembledSnippet, "; ");
  OS << kCsvSep;
  // Column 3: the benchmark configuration string.
  writeEscaped<kEscapeCsv>(OS, Point.Key.Config);
  OS << kCsvSep;
  assert(!Point.Key.Instructions.empty());
  // The sched class is derived from the first instruction of the snippet
  // only, resolving variant classes against the actual instruction.
  const llvm::MCInst &MCI = Point.Key.Instructions[0];
  const unsigned SchedClassId = resolveSchedClassId(
      *SubtargetInfo_, InstrInfo_->get(MCI.getOpcode()).getSchedClass(), MCI);

  // Column 4: sched class name when names are compiled in (asserts or dump
  // enabled), numeric id otherwise.
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  const llvm::MCSchedClassDesc *const SCDesc =
      SubtargetInfo_->getSchedModel().getSchedClassDesc(SchedClassId);
  writeEscaped<kEscapeCsv>(OS, SCDesc->Name);
#else
  OS << SchedClassId;
#endif
  // Remaining columns: one per measurement, in point order.
  for (const auto &Measurement : Point.Measurements) {
    OS << kCsvSep;
    writeMeasurementValue<kEscapeCsv>(OS, Measurement.PerInstructionValue);
  }
  OS << "\n";
}
170
// Builds the MC toolchain objects (instr info, registers, subtarget, printer,
// disassembler) needed to render and analyze the benchmark points.
Analysis::Analysis(const llvm::Target &Target,
                   const InstructionBenchmarkClustering &Clustering)
    : Clustering_(Clustering) {
  // Nothing to set up for an empty benchmark set; the run() entry points
  // also bail out early in that case.
  if (Clustering.getPoints().empty())
    return;

  // All points are assumed to share the same triple and CPU, so the MC
  // objects are configured from the first point only.
  const InstructionBenchmark &FirstPoint = Clustering.getPoints().front();
  InstrInfo_.reset(Target.createMCInstrInfo());
  RegInfo_.reset(Target.createMCRegInfo(FirstPoint.LLVMTriple));
  AsmInfo_.reset(Target.createMCAsmInfo(*RegInfo_, FirstPoint.LLVMTriple));
  SubtargetInfo_.reset(Target.createMCSubtargetInfo(FirstPoint.LLVMTriple,
                                                    FirstPoint.CpuName, ""));
  InstPrinter_.reset(Target.createMCInstPrinter(
      llvm::Triple(FirstPoint.LLVMTriple), 0 /*default variant*/, *AsmInfo_,
      *InstrInfo_, *RegInfo_));

  // The context keeps non-owning pointers to AsmInfo_/RegInfo_, so those
  // members must outlive Context_ (member declaration order guarantees this).
  Context_ = llvm::make_unique<llvm::MCContext>(AsmInfo_.get(), RegInfo_.get(),
                                                &ObjectFileInfo_);
  Disasm_.reset(Target.createMCDisassembler(*SubtargetInfo_, *Context_));
  assert(Disasm_ && "cannot create MCDisassembler. missing call to "
                    "InitializeXXXTargetDisassembler ?");
}
193
194 template <>
195 llvm::Error
run(llvm::raw_ostream & OS) const196 Analysis::run<Analysis::PrintClusters>(llvm::raw_ostream &OS) const {
197 if (Clustering_.getPoints().empty())
198 return llvm::Error::success();
199
200 // Write the header.
201 OS << "cluster_id" << kCsvSep << "opcode_name" << kCsvSep << "config"
202 << kCsvSep << "sched_class";
203 for (const auto &Measurement : Clustering_.getPoints().front().Measurements) {
204 OS << kCsvSep;
205 writeEscaped<kEscapeCsv>(OS, Measurement.Key);
206 }
207 OS << "\n";
208
209 // Write the points.
210 const auto &Clusters = Clustering_.getValidClusters();
211 for (size_t I = 0, E = Clusters.size(); I < E; ++I) {
212 for (const size_t PointId : Clusters[I].PointIndices) {
213 printInstructionRowCsv(PointId, OS);
214 }
215 OS << "\n\n";
216 }
217 return llvm::Error::success();
218 }
219
// Takes ownership of the resolved sched class; the PointIds member starts
// empty and is filled in by the caller.
Analysis::ResolvedSchedClassAndPoints::ResolvedSchedClassAndPoints(
    ResolvedSchedClass &&RSC)
    : RSC(std::move(RSC)) {}
223
// Groups benchmark points by their (resolved) scheduling class. Points with
// benchmark errors are skipped. Returns one entry per distinct sched class,
// each listing the ids of the points that map to it.
std::vector<Analysis::ResolvedSchedClassAndPoints>
Analysis::makePointsPerSchedClass() const {
  std::vector<ResolvedSchedClassAndPoints> Entries;
  // Maps SchedClassIds to index in result.
  std::unordered_map<unsigned, size_t> SchedClassIdToIndex;
  const auto &Points = Clustering_.getPoints();
  for (size_t PointId = 0, E = Points.size(); PointId < E; ++PointId) {
    const InstructionBenchmark &Point = Points[PointId];
    // Skip points whose benchmark run failed.
    if (!Point.Error.empty())
      continue;
    assert(!Point.Key.Instructions.empty());
    // FIXME: we should be using the tuple of classes for instructions in the
    // snippet as key.
    const llvm::MCInst &MCI = Point.Key.Instructions[0];
    unsigned SchedClassId = InstrInfo_->get(MCI.getOpcode()).getSchedClass();
    // Remember whether the class was variant before resolution; this is
    // surfaced in the report output.
    const bool WasVariant = SchedClassId && SubtargetInfo_->getSchedModel()
                                                .getSchedClassDesc(SchedClassId)
                                                ->isVariant();
    SchedClassId = resolveSchedClassId(*SubtargetInfo_, SchedClassId, MCI);
    const auto IndexIt = SchedClassIdToIndex.find(SchedClassId);
    if (IndexIt == SchedClassIdToIndex.end()) {
      // Create a new entry.
      SchedClassIdToIndex.emplace(SchedClassId, Entries.size());
      ResolvedSchedClassAndPoints Entry(
          ResolvedSchedClass(*SubtargetInfo_, SchedClassId, WasVariant));
      Entry.PointIds.push_back(PointId);
      Entries.push_back(std::move(Entry));
    } else {
      // Append to the existing entry.
      Entries[IndexIt->second].PointIds.push_back(PointId);
    }
  }
  return Entries;
}
258
259 // Uops repeat the same opcode over again. Just show this opcode and show the
260 // whole snippet only on hover.
writeUopsSnippetHtml(llvm::raw_ostream & OS,const std::vector<llvm::MCInst> & Instructions,const llvm::MCInstrInfo & InstrInfo)261 static void writeUopsSnippetHtml(llvm::raw_ostream &OS,
262 const std::vector<llvm::MCInst> &Instructions,
263 const llvm::MCInstrInfo &InstrInfo) {
264 if (Instructions.empty())
265 return;
266 writeEscaped<kEscapeHtml>(OS, InstrInfo.getName(Instructions[0].getOpcode()));
267 if (Instructions.size() > 1)
268 OS << " (x" << Instructions.size() << ")";
269 }
270
271 // Latency tries to find a serial path. Just show the opcode path and show the
272 // whole snippet only on hover.
273 static void
writeLatencySnippetHtml(llvm::raw_ostream & OS,const std::vector<llvm::MCInst> & Instructions,const llvm::MCInstrInfo & InstrInfo)274 writeLatencySnippetHtml(llvm::raw_ostream &OS,
275 const std::vector<llvm::MCInst> &Instructions,
276 const llvm::MCInstrInfo &InstrInfo) {
277 bool First = true;
278 for (const llvm::MCInst &Instr : Instructions) {
279 if (First)
280 First = false;
281 else
282 OS << " → ";
283 writeEscaped<kEscapeHtml>(OS, InstrInfo.getName(Instr.getOpcode()));
284 }
285 }
286
// Prints an HTML table with one row per cluster of this sched class. Each row
// is classed good-cluster/bad-cluster (colored by kHtmlHead's CSS) depending
// on whether the cluster's measurements match the sched model, and lists the
// snippets plus per-measurement avg/[min;max] statistics.
void Analysis::printSchedClassClustersHtml(
    const std::vector<SchedClassCluster> &Clusters,
    const ResolvedSchedClass &RSC, llvm::raw_ostream &OS) const {
  const auto &Points = Clustering_.getPoints();
  OS << "<table class=\"sched-class-clusters\">";
  OS << "<tr><th>ClusterId</th><th>Opcode/Config</th>";
  assert(!Clusters.empty());
  // Header: one column per measurement key, taken from the first point of the
  // first cluster (all points are expected to share the same keys).
  for (const auto &Measurement :
       Points[Clusters[0].getPointIds()[0]].Measurements) {
    OS << "<th>";
    writeEscaped<kEscapeHtml>(OS, Measurement.Key);
    OS << "</th>";
  }
  OS << "</tr>";
  for (const SchedClassCluster &Cluster : Clusters) {
    OS << "<tr class=\""
       << (Cluster.measurementsMatch(*SubtargetInfo_, RSC, Clustering_)
               ? "good-cluster"
               : "bad-cluster")
       << "\"><td>";
    writeClusterId<kEscapeHtml>(OS, Cluster.id());
    OS << "</td><td><ul>";
    for (const size_t PointId : Cluster.getPointIds()) {
      const auto &Point = Points[PointId];
      // The full disassembled snippet goes into the title attribute so it
      // only shows on hover.
      OS << "<li><span class=\"mono\" title=\"";
      writeSnippet<EscapeTag, kEscapeHtmlString>(OS, Point.AssembledSnippet,
                                                 "\n");
      OS << "\">";
      // The visible text is a compact per-mode summary of the snippet.
      switch (Point.Mode) {
      case InstructionBenchmark::Latency:
        writeLatencySnippetHtml(OS, Point.Key.Instructions, *InstrInfo_);
        break;
      case InstructionBenchmark::Uops:
        writeUopsSnippetHtml(OS, Point.Key.Instructions, *InstrInfo_);
        break;
      default:
        llvm_unreachable("invalid mode");
      }
      OS << "</span> <span class=\"mono\">";
      writeEscaped<kEscapeHtml>(OS, Point.Key.Config);
      OS << "</span></li>";
    }
    OS << "</ul></td>";
    // One cell per measurement: average plus [min;max] range.
    for (const auto &Stats : Cluster.getRepresentative()) {
      OS << "<td class=\"measurement\">";
      writeMeasurementValue<kEscapeHtml>(OS, Stats.avg());
      OS << "<br><span class=\"minmax\">[";
      writeMeasurementValue<kEscapeHtml>(OS, Stats.min());
      OS << ";";
      writeMeasurementValue<kEscapeHtml>(OS, Stats.max());
      OS << "]</span></td>";
    }
    OS << "</tr>";
  }
  OS << "</table>";
}
343
344 // Return the non-redundant list of WriteProcRes used by the given sched class.
345 // The scheduling model for LLVM is such that each instruction has a certain
346 // number of uops which consume resources which are described by WriteProcRes
347 // entries. Each entry describe how many cycles are spent on a specific ProcRes
348 // kind.
349 // For example, an instruction might have 3 uOps, one dispatching on P0
350 // (ProcResIdx=1) and two on P06 (ProcResIdx = 7).
351 // Note that LLVM additionally denormalizes resource consumption to include
352 // usage of super resources by subresources. So in practice if there exists a
353 // P016 (ProcResIdx=10), then the cycles consumed by P0 are also consumed by
354 // P06 (ProcResIdx = 7) and P016 (ProcResIdx = 10), and the resources consumed
355 // by P06 are also consumed by P016. In the figure below, parenthesized cycles
356 // denote implied usage of superresources by subresources:
357 // P0 P06 P016
358 // uOp1 1 (1) (1)
359 // uOp2 1 (1)
360 // uOp3 1 (1)
361 // =============================
362 // 1 3 3
363 // Eventually we end up with three entries for the WriteProcRes of the
364 // instruction:
365 // {ProcResIdx=1, Cycles=1} // P0
366 // {ProcResIdx=7, Cycles=3} // P06
367 // {ProcResIdx=10, Cycles=3} // P016
368 //
369 // Note that in this case, P016 does not contribute any cycles, so it would
370 // be removed by this function.
371 // FIXME: Move this to MCSubtargetInfo and use it in llvm-mca.
static llvm::SmallVector<llvm::MCWriteProcResEntry, 8>
getNonRedundantWriteProcRes(const llvm::MCSchedClassDesc &SCDesc,
                            const llvm::MCSubtargetInfo &STI) {
  llvm::SmallVector<llvm::MCWriteProcResEntry, 8> Result;
  const auto &SM = STI.getSchedModel();
  const unsigned NumProcRes = SM.getNumProcResourceKinds();

  // ProcResUnitUsage[I] accumulates the cycles already attributed to
  // ProcResUnit I by the entries processed so far; group entries subtract it
  // to find their own contribution.
  // This assumes that the ProcResDescs are sorted in topological order, which
  // is guaranteed by the tablegen backend.
  llvm::SmallVector<float, 32> ProcResUnitUsage(NumProcRes);
  for (const auto *WPR = STI.getWriteProcResBegin(&SCDesc),
                  *const WPREnd = STI.getWriteProcResEnd(&SCDesc);
       WPR != WPREnd; ++WPR) {
    const llvm::MCProcResourceDesc *const ProcResDesc =
        SM.getProcResource(WPR->ProcResourceIdx);
    if (ProcResDesc->SubUnitsIdxBegin == nullptr) {
      // This is a ProcResUnit.
      Result.push_back({WPR->ProcResourceIdx, WPR->Cycles});
      ProcResUnitUsage[WPR->ProcResourceIdx] += WPR->Cycles;
    } else {
      // This is a ProcResGroup. First see if it contributes any cycles or if
      // it has cycles just from subunits.
      float RemainingCycles = WPR->Cycles;
      for (const auto *SubResIdx = ProcResDesc->SubUnitsIdxBegin;
           SubResIdx != ProcResDesc->SubUnitsIdxBegin + ProcResDesc->NumUnits;
           ++SubResIdx) {
        RemainingCycles -= ProcResUnitUsage[*SubResIdx];
      }
      // Epsilon comparison: the bookkeeping is done in float, so an exact
      // zero cannot be relied upon.
      if (RemainingCycles < 0.01f) {
        // The ProcResGroup contributes no cycles of its own.
        continue;
      }
      // The ProcResGroup contributes `RemainingCycles` cycles of its own.
      Result.push_back({WPR->ProcResourceIdx,
                        static_cast<uint16_t>(std::round(RemainingCycles))});
      // Spread the remaining cycles over all subunits.
      for (const auto *SubResIdx = ProcResDesc->SubUnitsIdxBegin;
           SubResIdx != ProcResDesc->SubUnitsIdxBegin + ProcResDesc->NumUnits;
           ++SubResIdx) {
        ProcResUnitUsage[*SubResIdx] += RemainingCycles / ProcResDesc->NumUnits;
      }
    }
  }
  return Result;
}
417
// Caches the (already resolved, non-variant) sched class descriptor together
// with its deduplicated WriteProcRes entries and idealized port pressure.
Analysis::ResolvedSchedClass::ResolvedSchedClass(
    const llvm::MCSubtargetInfo &STI, unsigned ResolvedSchedClassId,
    bool WasVariant)
    : SchedClassId(ResolvedSchedClassId), SCDesc(STI.getSchedModel().getSchedClassDesc(ResolvedSchedClassId)),
      WasVariant(WasVariant),
      // NOTE(review): the assert below tolerates SCDesc == nullptr, yet the
      // next two initializers dereference SCDesc unconditionally — confirm
      // that getSchedClassDesc() cannot return null for a resolved id.
      NonRedundantWriteProcRes(getNonRedundantWriteProcRes(*SCDesc, STI)),
      IdealizedProcResPressure(computeIdealizedProcResPressure(
          STI.getSchedModel(), NonRedundantWriteProcRes)) {
  assert((SCDesc == nullptr || !SCDesc->isVariant()) &&
         "ResolvedSchedClass should never be variant");
}
429
addPoint(size_t PointId,const InstructionBenchmarkClustering & Clustering)430 void Analysis::SchedClassCluster::addPoint(
431 size_t PointId, const InstructionBenchmarkClustering &Clustering) {
432 PointIds.push_back(PointId);
433 const auto &Point = Clustering.getPoints()[PointId];
434 if (ClusterId.isUndef()) {
435 ClusterId = Clustering.getClusterIdForPoint(PointId);
436 Representative.resize(Point.Measurements.size());
437 }
438 for (size_t I = 0, E = Point.Measurements.size(); I < E; ++I) {
439 Representative[I].push(Point.Measurements[I]);
440 }
441 assert(ClusterId == Clustering.getClusterIdForPoint(PointId));
442 }
443
444 // Returns a ProxResIdx by id or name.
findProcResIdx(const llvm::MCSubtargetInfo & STI,const llvm::StringRef NameOrId)445 static unsigned findProcResIdx(const llvm::MCSubtargetInfo &STI,
446 const llvm::StringRef NameOrId) {
447 // Interpret the key as an ProcResIdx.
448 unsigned ProcResIdx = 0;
449 if (llvm::to_integer(NameOrId, ProcResIdx, 10))
450 return ProcResIdx;
451 // Interpret the key as a ProcRes name.
452 const auto &SchedModel = STI.getSchedModel();
453 for (int I = 0, E = SchedModel.getNumProcResourceKinds(); I < E; ++I) {
454 if (NameOrId == SchedModel.getProcResource(I)->Name)
455 return I;
456 }
457 return 0;
458 }
459
// Returns true if this cluster's centroid is close enough (per
// Clustering.isNeighbour) to the point that the LLVM sched model predicts for
// the given sched class; false (with a diagnostic on errs) on malformed input.
bool Analysis::SchedClassCluster::measurementsMatch(
    const llvm::MCSubtargetInfo &STI, const ResolvedSchedClass &RSC,
    const InstructionBenchmarkClustering &Clustering) const {
  const size_t NumMeasurements = Representative.size();
  std::vector<BenchmarkMeasure> ClusterCenterPoint(NumMeasurements);
  std::vector<BenchmarkMeasure> SchedClassPoint(NumMeasurements);
  // Latency case.
  assert(!Clustering.getPoints().empty());
  const InstructionBenchmark::ModeE Mode = Clustering.getPoints()[0].Mode;
  if (Mode == InstructionBenchmark::Latency) {
    if (NumMeasurements != 1) {
      llvm::errs()
          << "invalid number of measurements in latency mode: expected 1, got "
          << NumMeasurements << "\n";
      return false;
    }
    // Find the latency: the sched model's prediction is the maximum over all
    // write latency entries of the class.
    SchedClassPoint[0].PerInstructionValue = 0.0;
    for (unsigned I = 0; I < RSC.SCDesc->NumWriteLatencyEntries; ++I) {
      const llvm::MCWriteLatencyEntry *const WLE =
          STI.getWriteLatencyEntry(RSC.SCDesc, I);
      SchedClassPoint[0].PerInstructionValue =
          std::max<double>(SchedClassPoint[0].PerInstructionValue, WLE->Cycles);
    }
    ClusterCenterPoint[0].PerInstructionValue = Representative[0].avg();
  } else if (Mode == InstructionBenchmark::Uops) {
    // Uops case: each measurement key is either a ProcRes index/name (the
    // expected value is the idealized pressure on that resource) or the
    // special key "NumMicroOps".
    for (int I = 0, E = Representative.size(); I < E; ++I) {
      const auto Key = Representative[I].key();
      uint16_t ProcResIdx = findProcResIdx(STI, Key);
      if (ProcResIdx > 0) {
        // Find the pressure on ProcResIdx `Key`.
        const auto ProcResPressureIt =
            std::find_if(RSC.IdealizedProcResPressure.begin(),
                         RSC.IdealizedProcResPressure.end(),
                         [ProcResIdx](const std::pair<uint16_t, float> &WPR) {
                           return WPR.first == ProcResIdx;
                         });
        // A resource absent from the idealized pressure list has pressure 0.
        SchedClassPoint[I].PerInstructionValue =
            ProcResPressureIt == RSC.IdealizedProcResPressure.end()
                ? 0.0
                : ProcResPressureIt->second;
      } else if (Key == "NumMicroOps") {
        SchedClassPoint[I].PerInstructionValue = RSC.SCDesc->NumMicroOps;
      } else {
        llvm::errs() << "expected `key` to be either a ProcResIdx or a ProcRes "
                        "name, got "
                     << Key << "\n";
        return false;
      }
      ClusterCenterPoint[I].PerInstructionValue = Representative[I].avg();
    }
  } else {
    llvm::errs() << "unimplemented measurement matching for mode " << Mode
                 << "\n";
    return false;
  }
  // Match if the model-predicted point would land in this cluster.
  return Clustering.isNeighbour(ClusterCenterPoint, SchedClassPoint);
}
518
// Prints an HTML table describing the sched class itself: validity, variance,
// micro-op count, write latencies, WriteProcRes entries and the idealized
// per-resource pressure.
void Analysis::printSchedClassDescHtml(const ResolvedSchedClass &RSC,
                                       llvm::raw_ostream &OS) const {
  OS << "<table class=\"sched-class-desc\">";
  OS << "<tr><th>Valid</th><th>Variant</th><th>NumMicroOps</th><th>Latency</"
        "th><th>WriteProcRes</th><th title=\"This is the idealized unit "
        "resource (port) pressure assuming ideal distribution\">Idealized "
        "Resource Pressure</th></tr>";
  if (RSC.SCDesc->isValid()) {
    const auto &SM = SubtargetInfo_->getSchedModel();
    OS << "<tr><td>✔</td>";
    OS << "<td>" << (RSC.WasVariant ? "✔" : "✕") << "</td>";
    OS << "<td>" << RSC.SCDesc->NumMicroOps << "</td>";
    // Latencies.
    OS << "<td><ul>";
    for (int I = 0, E = RSC.SCDesc->NumWriteLatencyEntries; I < E; ++I) {
      const auto *const Entry =
          SubtargetInfo_->getWriteLatencyEntry(RSC.SCDesc, I);
      OS << "<li>" << Entry->Cycles;
      if (RSC.SCDesc->NumWriteLatencyEntries > 1) {
        // Disambiguate if more than 1 latency.
        OS << " (WriteResourceID " << Entry->WriteResourceID << ")";
      }
      OS << "</li>";
    }
    OS << "</ul></td>";
    // WriteProcRes.
    OS << "<td><ul>";
    for (const auto &WPR : RSC.NonRedundantWriteProcRes) {
      OS << "<li><span class=\"mono\">";
      writeEscaped<kEscapeHtml>(OS,
                                SM.getProcResource(WPR.ProcResourceIdx)->Name);
      OS << "</span>: " << WPR.Cycles << "</li>";
    }
    OS << "</ul></td>";
    // Idealized port pressure.
    OS << "<td><ul>";
    for (const auto &Pressure : RSC.IdealizedProcResPressure) {
      OS << "<li><span class=\"mono\">";
      writeEscaped<kEscapeHtml>(OS, SubtargetInfo_->getSchedModel()
                                        .getProcResource(Pressure.first)
                                        ->Name);
      OS << "</span>: ";
      writeMeasurementValue<kEscapeHtml>(OS, Pressure.second);
      OS << "</li>";
    }
    OS << "</ul></td>";
    OS << "</tr>";
  } else {
    // Invalid sched class: only the validity cell carries information.
    OS << "<tr><td>✕</td><td></td><td></td></tr>";
  }
  OS << "</table>";
}
571
// Page title and CSS shared by the HTML report, inlined so the generated page
// is fully self-contained. The good-cluster/bad-cluster classes color the
// measurement cells green/red in printSchedClassClustersHtml's output.
static constexpr const char kHtmlHead[] = R"(
<head>
<title>llvm-exegesis Analysis Results</title>
<style>
body {
  font-family: sans-serif
}
span.sched-class-name {
  font-weight: bold;
  font-family: monospace;
}
span.opcode {
  font-family: monospace;
}
span.config {
  font-family: monospace;
}
div.inconsistency {
  margin-top: 50px;
}
table {
  margin-left: 50px;
  border-collapse: collapse;
}
table, table tr,td,th {
  border: 1px solid #444;
}
table ul {
  padding-left: 0px;
  margin: 0px;
  list-style-type: none;
}
table.sched-class-clusters td {
  padding-left: 10px;
  padding-right: 10px;
  padding-top: 10px;
  padding-bottom: 10px;
}
table.sched-class-desc td {
  padding-left: 10px;
  padding-right: 10px;
  padding-top: 2px;
  padding-bottom: 2px;
}
span.mono {
  font-family: monospace;
}
td.measurement {
  text-align: center;
}
tr.good-cluster td.measurement {
  color: #292
}
tr.bad-cluster td.measurement {
  color: #922
}
tr.good-cluster td.measurement span.minmax {
  color: #888;
}
tr.bad-cluster td.measurement span.minmax {
  color: #888;
}
</style>
</head>
)";
637
638 template <>
run(llvm::raw_ostream & OS) const639 llvm::Error Analysis::run<Analysis::PrintSchedClassInconsistencies>(
640 llvm::raw_ostream &OS) const {
641 const auto &FirstPoint = Clustering_.getPoints()[0];
642 // Print the header.
643 OS << "<!DOCTYPE html><html>" << kHtmlHead << "<body>";
644 OS << "<h1><span class=\"mono\">llvm-exegesis</span> Analysis Results</h1>";
645 OS << "<h3>Triple: <span class=\"mono\">";
646 writeEscaped<kEscapeHtml>(OS, FirstPoint.LLVMTriple);
647 OS << "</span></h3><h3>Cpu: <span class=\"mono\">";
648 writeEscaped<kEscapeHtml>(OS, FirstPoint.CpuName);
649 OS << "</span></h3>";
650
651 for (const auto &RSCAndPoints : makePointsPerSchedClass()) {
652 if (!RSCAndPoints.RSC.SCDesc)
653 continue;
654 // Bucket sched class points into sched class clusters.
655 std::vector<SchedClassCluster> SchedClassClusters;
656 for (const size_t PointId : RSCAndPoints.PointIds) {
657 const auto &ClusterId = Clustering_.getClusterIdForPoint(PointId);
658 if (!ClusterId.isValid())
659 continue; // Ignore noise and errors. FIXME: take noise into account ?
660 auto SchedClassClusterIt =
661 std::find_if(SchedClassClusters.begin(), SchedClassClusters.end(),
662 [ClusterId](const SchedClassCluster &C) {
663 return C.id() == ClusterId;
664 });
665 if (SchedClassClusterIt == SchedClassClusters.end()) {
666 SchedClassClusters.emplace_back();
667 SchedClassClusterIt = std::prev(SchedClassClusters.end());
668 }
669 SchedClassClusterIt->addPoint(PointId, Clustering_);
670 }
671
672 // Print any scheduling class that has at least one cluster that does not
673 // match the checked-in data.
674 if (llvm::all_of(SchedClassClusters,
675 [this, &RSCAndPoints](const SchedClassCluster &C) {
676 return C.measurementsMatch(
677 *SubtargetInfo_, RSCAndPoints.RSC, Clustering_);
678 }))
679 continue; // Nothing weird.
680
681 OS << "<div class=\"inconsistency\"><p>Sched Class <span "
682 "class=\"sched-class-name\">";
683 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
684 writeEscaped<kEscapeHtml>(OS, RSCAndPoints.RSC.SCDesc->Name);
685 #else
686 OS << RSCAndPoints.RSC.SchedClassId;
687 #endif
688 OS << "</span> contains instructions whose performance characteristics do"
689 " not match that of LLVM:</p>";
690 printSchedClassClustersHtml(SchedClassClusters, RSCAndPoints.RSC, OS);
691 OS << "<p>llvm SchedModel data:</p>";
692 printSchedClassDescHtml(RSCAndPoints.RSC, OS);
693 OS << "</div>";
694 }
695
696 OS << "</body></html>";
697 return llvm::Error::success();
698 }
699
700 // Distributes a pressure budget as evenly as possible on the provided subunits
701 // given the already existing port pressure distribution.
702 //
703 // The algorithm is as follows: while there is remaining pressure to
704 // distribute, find the subunits with minimal pressure, and distribute
705 // remaining pressure equally up to the pressure of the unit with
706 // second-to-minimal pressure.
707 // For example, let's assume we want to distribute 2*P1256
708 // (Subunits = [P1,P2,P5,P6]), and the starting DensePressure is:
709 // DensePressure = P0 P1 P2 P3 P4 P5 P6 P7
710 // 0.1 0.3 0.2 0.0 0.0 0.5 0.5 0.5
711 // RemainingPressure = 2.0
712 // We sort the subunits by pressure:
713 // Subunits = [(P2,p=0.2), (P1,p=0.3), (P5,p=0.5), (P6, p=0.5)]
714 // We'll first start by the subunits with minimal pressure, which are at
715 // the beginning of the sorted array. In this example there is one (P2).
716 // The subunit with second-to-minimal pressure is the next one in the
717 // array (P1). So we distribute 0.1 pressure to P2, and remove 0.1 cycles
718 // from the budget.
// Subunits = [(P2,p=0.3), (P1,p=0.3), (P5,p=0.5), (P6,p=0.5)]
720 // RemainingPressure = 1.9
721 // We repeat this process: distribute 0.2 pressure on each of the minimal
722 // P2 and P1, decrease budget by 2*0.2:
// Subunits = [(P2,p=0.5), (P1,p=0.5), (P5,p=0.5), (P6,p=0.5)]
724 // RemainingPressure = 1.5
725 // There are no second-to-minimal subunits so we just share the remaining
726 // budget (1.5 cycles) equally:
// Subunits = [(P2,p=0.875), (P1,p=0.875), (P5,p=0.875), (P6,p=0.875)]
728 // RemainingPressure = 0.0
729 // We stop as there is no remaining budget to distribute.
// See the algorithm description in the comment block above. Subunits is taken
// by value because it is re-sorted locally; DensePressure is updated in place.
void distributePressure(float RemainingPressure,
                        llvm::SmallVector<uint16_t, 32> Subunits,
                        llvm::SmallVector<float, 32> &DensePressure) {
  // Find the number of subunits with minimal pressure (they are at the
  // front).
  llvm::sort(Subunits, [&DensePressure](const uint16_t A, const uint16_t B) {
    return DensePressure[A] < DensePressure[B];
  });
  // Accessor into DensePressure through the sorted subunit order.
  const auto getPressureForSubunit = [&DensePressure,
                                      &Subunits](size_t I) -> float & {
    return DensePressure[Subunits[I]];
  };
  // NumMinimalSU is the size of the leading run of subunits sharing the
  // minimal pressure; it only grows as pressure is leveled up.
  size_t NumMinimalSU = 1;
  while (NumMinimalSU < Subunits.size() &&
         getPressureForSubunit(NumMinimalSU) == getPressureForSubunit(0)) {
    ++NumMinimalSU;
  }
  while (RemainingPressure > 0.0f) {
    if (NumMinimalSU == Subunits.size()) {
      // All units are minimal, just distribute evenly and be done.
      for (size_t I = 0; I < NumMinimalSU; ++I) {
        getPressureForSubunit(I) += RemainingPressure / NumMinimalSU;
      }
      return;
    }
    // Distribute the remaining pressure equally.
    const float MinimalPressure = getPressureForSubunit(NumMinimalSU - 1);
    const float SecondToMinimalPressure = getPressureForSubunit(NumMinimalSU);
    assert(MinimalPressure < SecondToMinimalPressure);
    const float Increment = SecondToMinimalPressure - MinimalPressure;
    if (RemainingPressure <= NumMinimalSU * Increment) {
      // There is not enough remaining pressure.
      for (size_t I = 0; I < NumMinimalSU; ++I) {
        getPressureForSubunit(I) += RemainingPressure / NumMinimalSU;
      }
      return;
    }
    // Bump all minimal pressure subunits to `SecondToMinimalPressure`.
    for (size_t I = 0; I < NumMinimalSU; ++I) {
      getPressureForSubunit(I) = SecondToMinimalPressure;
      RemainingPressure -= SecondToMinimalPressure;
    }
    // Extend the minimal run over any subunits that were already at the new
    // level.
    while (NumMinimalSU < Subunits.size() &&
           getPressureForSubunit(NumMinimalSU) == SecondToMinimalPressure) {
      ++NumMinimalSU;
    }
  }
}
778
computeIdealizedProcResPressure(const llvm::MCSchedModel & SM,llvm::SmallVector<llvm::MCWriteProcResEntry,8> WPRS)779 std::vector<std::pair<uint16_t, float>> computeIdealizedProcResPressure(
780 const llvm::MCSchedModel &SM,
781 llvm::SmallVector<llvm::MCWriteProcResEntry, 8> WPRS) {
782 // DensePressure[I] is the port pressure for Proc Resource I.
783 llvm::SmallVector<float, 32> DensePressure(SM.getNumProcResourceKinds());
784 llvm::sort(WPRS, [](const llvm::MCWriteProcResEntry &A,
785 const llvm::MCWriteProcResEntry &B) {
786 return A.ProcResourceIdx < B.ProcResourceIdx;
787 });
788 for (const llvm::MCWriteProcResEntry &WPR : WPRS) {
789 // Get units for the entry.
790 const llvm::MCProcResourceDesc *const ProcResDesc =
791 SM.getProcResource(WPR.ProcResourceIdx);
792 if (ProcResDesc->SubUnitsIdxBegin == nullptr) {
793 // This is a ProcResUnit.
794 DensePressure[WPR.ProcResourceIdx] += WPR.Cycles;
795 } else {
796 // This is a ProcResGroup.
797 llvm::SmallVector<uint16_t, 32> Subunits(ProcResDesc->SubUnitsIdxBegin,
798 ProcResDesc->SubUnitsIdxBegin +
799 ProcResDesc->NumUnits);
800 distributePressure(WPR.Cycles, Subunits, DensePressure);
801 }
802 }
803 // Turn dense pressure into sparse pressure by removing zero entries.
804 std::vector<std::pair<uint16_t, float>> Pressure;
805 for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
806 if (DensePressure[I] > 0.0f)
807 Pressure.emplace_back(I, DensePressure[I]);
808 }
809 return Pressure;
810 }
811
812 } // namespace exegesis
813 } // namespace llvm
814