//===-- Analysis.cpp --------------------------------------------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "Analysis.h"
#include "BenchmarkResult.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Support/FormatVariadic.h"
#include <unordered_set>
#include <vector>

namespace exegesis {

static const char kCsvSep = ',';

namespace {

enum EscapeTag { kEscapeCsv, kEscapeHtml, kEscapeHtmlString };

template <EscapeTag Tag>
void writeEscaped(llvm::raw_ostream &OS, const llvm::StringRef S);

template <>
void writeEscaped<kEscapeCsv>(llvm::raw_ostream &OS, const llvm::StringRef S) {
  if (std::find(S.begin(), S.end(), kCsvSep) == S.end()) {
    OS << S;
  } else {
    // Needs escaping: quote the field and double any embedded quotes.
    OS << '"';
    for (const char C : S) {
      if (C == '"')
        OS << "\"\"";
      else
        OS << C;
    }
    OS << '"';
  }
}

template <>
void writeEscaped<kEscapeHtml>(llvm::raw_ostream &OS, const llvm::StringRef S) {
  for (const char C : S) {
    if (C == '<')
      OS << "&lt;";
    else if (C == '>')
      OS << "&gt;";
    else if (C == '&')
      OS << "&amp;";
    else
      OS << C;
  }
}

template <>
void writeEscaped<kEscapeHtmlString>(llvm::raw_ostream &OS,
                                     const llvm::StringRef S) {
  for (const char C : S) {
    if (C == '"')
      OS << "\\\"";
    else
      OS << C;
  }
}

} // namespace

template <EscapeTag Tag>
static void
writeClusterId(llvm::raw_ostream &OS,
               const InstructionBenchmarkClustering::ClusterId &CID) {
  if (CID.isNoise())
    writeEscaped<Tag>(OS, "[noise]");
  else if (CID.isError())
    writeEscaped<Tag>(OS, "[error]");
  else
    OS << CID.getId();
}

template <EscapeTag Tag>
static void writeMeasurementValue(llvm::raw_ostream &OS, const double Value) {
  writeEscaped<Tag>(OS, llvm::formatv("{0:F}", Value).str());
}

template <typename EscapeTag, EscapeTag Tag>
void Analysis::writeSnippet(llvm::raw_ostream &OS,
                            llvm::ArrayRef<uint8_t> Bytes,
                            const char *Separator) const {
  llvm::SmallVector<std::string, 3> Lines;
  // Parse the asm snippet and print it.
  while (!Bytes.empty()) {
    llvm::MCInst MI;
    uint64_t MISize = 0;
    if (!Disasm_->getInstruction(MI, MISize, Bytes, 0, llvm::nulls(),
                                 llvm::nulls())) {
      writeEscaped<Tag>(OS, llvm::join(Lines, Separator));
      writeEscaped<Tag>(OS, Separator);
      writeEscaped<Tag>(OS, "[error decoding asm snippet]");
      return;
    }
    Lines.emplace_back();
    std::string &Line = Lines.back();
    llvm::raw_string_ostream OSS(Line);
    InstPrinter_->printInst(&MI, OSS, "", *SubtargetInfo_);
    Bytes = Bytes.drop_front(MISize);
    OSS.flush();
    Line = llvm::StringRef(Line).trim().str();
  }
  writeEscaped<Tag>(OS, llvm::join(Lines, Separator));
}

// Prints a row representing an instruction, along with scheduling info and
// point coordinates (measurements).
void Analysis::printInstructionRowCsv(const size_t PointId,
                                      llvm::raw_ostream &OS) const {
  const InstructionBenchmark &Point = Clustering_.getPoints()[PointId];
  writeClusterId<kEscapeCsv>(OS, Clustering_.getClusterIdForPoint(PointId));
  OS << kCsvSep;
  writeSnippet<EscapeTag, kEscapeCsv>(OS, Point.AssembledSnippet, "; ");
  OS << kCsvSep;
  writeEscaped<kEscapeCsv>(OS, Point.Key.Config);
  OS << kCsvSep;
  assert(!Point.Key.Instructions.empty());
  // FIXME: Resolve variant classes.
  const unsigned SchedClassId =
      InstrInfo_->get(Point.Key.Instructions[0].getOpcode()).getSchedClass();
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  const auto &SchedModel = SubtargetInfo_->getSchedModel();
  const llvm::MCSchedClassDesc *const SCDesc =
      SchedModel.getSchedClassDesc(SchedClassId);
  writeEscaped<kEscapeCsv>(OS, SCDesc->Name);
#else
  OS << SchedClassId;
#endif
  for (const auto &Measurement : Point.Measurements) {
    OS << kCsvSep;
    writeMeasurementValue<kEscapeCsv>(OS, Measurement.Value);
  }
  OS << "\n";
}

Analysis::Analysis(const llvm::Target &Target,
                   const InstructionBenchmarkClustering &Clustering)
    : Clustering_(Clustering) {
  if (Clustering.getPoints().empty())
    return;

  const InstructionBenchmark &FirstPoint = Clustering.getPoints().front();
  InstrInfo_.reset(Target.createMCInstrInfo());
  RegInfo_.reset(Target.createMCRegInfo(FirstPoint.LLVMTriple));
  AsmInfo_.reset(Target.createMCAsmInfo(*RegInfo_, FirstPoint.LLVMTriple));
  SubtargetInfo_.reset(Target.createMCSubtargetInfo(FirstPoint.LLVMTriple,
                                                    FirstPoint.CpuName, ""));
  InstPrinter_.reset(Target.createMCInstPrinter(
      llvm::Triple(FirstPoint.LLVMTriple), 0 /*default variant*/, *AsmInfo_,
      *InstrInfo_, *RegInfo_));

  Context_ = llvm::make_unique<llvm::MCContext>(AsmInfo_.get(), RegInfo_.get(),
                                                &ObjectFileInfo_);
  Disasm_.reset(Target.createMCDisassembler(*SubtargetInfo_, *Context_));
  assert(Disasm_ && "cannot create MCDisassembler. missing call to "
                    "InitializeXXXTargetDisassembler ?");
}

template <>
llvm::Error
Analysis::run<Analysis::PrintClusters>(llvm::raw_ostream &OS) const {
  if (Clustering_.getPoints().empty())
    return llvm::Error::success();

  // Write the header.
  OS << "cluster_id" << kCsvSep << "opcode_name" << kCsvSep << "config"
     << kCsvSep << "sched_class";
  for (const auto &Measurement : Clustering_.getPoints().front().Measurements) {
    OS << kCsvSep;
    writeEscaped<kEscapeCsv>(OS, Measurement.Key);
  }
  OS << "\n";

  // Write the points.
  const auto &Clusters = Clustering_.getValidClusters();
  for (size_t I = 0, E = Clusters.size(); I < E; ++I) {
    for (const size_t PointId : Clusters[I].PointIndices) {
      printInstructionRowCsv(PointId, OS);
    }
    OS << "\n\n";
  }
  return llvm::Error::success();
}

std::unordered_map<unsigned, std::vector<size_t>>
Analysis::makePointsPerSchedClass() const {
  std::unordered_map<unsigned, std::vector<size_t>> PointsPerSchedClass;
  const auto &Points = Clustering_.getPoints();
  for (size_t PointId = 0, E = Points.size(); PointId < E; ++PointId) {
    const InstructionBenchmark &Point = Points[PointId];
    if (!Point.Error.empty())
      continue;
    assert(!Point.Key.Instructions.empty());
    const auto Opcode = Point.Key.Instructions[0].getOpcode();
    // FIXME: Resolve variant classes.
    PointsPerSchedClass[InstrInfo_->get(Opcode).getSchedClass()].push_back(
        PointId);
  }
  return PointsPerSchedClass;
}

// Uops benchmarks repeat the same opcode over and over. Just show this opcode
// and show the whole snippet only on hover.
static void writeUopsSnippetHtml(llvm::raw_ostream &OS,
                                 const std::vector<llvm::MCInst> &Instructions,
                                 const llvm::MCInstrInfo &InstrInfo) {
  if (Instructions.empty())
    return;
  writeEscaped<kEscapeHtml>(OS, InstrInfo.getName(Instructions[0].getOpcode()));
  if (Instructions.size() > 1)
    OS << " (x" << Instructions.size() << ")";
}
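// For illustration: a uops snippet that repeats the same instruction five
// times is rendered by writeUopsSnippetHtml above as something like
// "ADD32rr (x5)" (the opcode name here is a hypothetical example; actual
// names come from MCInstrInfo::getName).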
// Latency tries to find a serial path. Just show the opcode path and show the
// whole snippet only on hover.
static void
writeLatencySnippetHtml(llvm::raw_ostream &OS,
                        const std::vector<llvm::MCInst> &Instructions,
                        const llvm::MCInstrInfo &InstrInfo) {
  bool First = true;
  for (const llvm::MCInst &Instr : Instructions) {
    if (First)
      First = false;
    else
      OS << " &rarr; ";
    writeEscaped<kEscapeHtml>(OS, InstrInfo.getName(Instr.getOpcode()));
  }
}

void Analysis::printSchedClassClustersHtml(
    const std::vector<SchedClassCluster> &Clusters, const SchedClass &SC,
    llvm::raw_ostream &OS) const {
  const auto &Points = Clustering_.getPoints();
  OS << "<table class=\"sched-class-clusters\">";
  OS << "<tr><th>ClusterId</th><th>Opcode/Config</th>";
  assert(!Clusters.empty());
  for (const auto &Measurement :
       Points[Clusters[0].getPointIds()[0]].Measurements) {
    OS << "<th>";
    if (Measurement.DebugString.empty())
      writeEscaped<kEscapeHtml>(OS, Measurement.Key);
    else
      writeEscaped<kEscapeHtml>(OS, Measurement.DebugString);
    OS << "</th>";
  }
  OS << "</tr>";
  for (const SchedClassCluster &Cluster : Clusters) {
    OS << "<tr><td>";
    writeClusterId<kEscapeHtml>(OS, Cluster.id());
    OS << "</td><td><ul>";
    for (const size_t PointId : Cluster.getPointIds()) {
      const auto &Point = Points[PointId];
      OS << "<li><span class=\"mono\" title=\"";
      writeSnippet<EscapeTag, kEscapeHtmlString>(OS, Point.AssembledSnippet,
                                                 "\n");
      OS << "\">";
      switch (Point.Mode) {
      case InstructionBenchmark::Latency:
        writeLatencySnippetHtml(OS, Point.Key.Instructions, *InstrInfo_);
        break;
      case InstructionBenchmark::Uops:
        writeUopsSnippetHtml(OS, Point.Key.Instructions, *InstrInfo_);
        break;
      default:
        llvm_unreachable("invalid mode");
      }
      OS << "</span> <span class=\"mono\">";
      writeEscaped<kEscapeHtml>(OS, Point.Key.Config);
      OS << "</span></li>";
    }
    OS << "</ul></td>";
    for (const auto &Stats : Cluster.getRepresentative()) {
      OS << "<td class=\"measurement\">";
      writeMeasurementValue<kEscapeHtml>(OS, Stats.avg());
      OS << "<br><span class=\"minmax\">[";
      writeMeasurementValue<kEscapeHtml>(OS, Stats.min());
      OS << ";";
      writeMeasurementValue<kEscapeHtml>(OS, Stats.max());
      OS << "]</span></td>";
    }
    OS << "</tr>";
  }
  OS << "</table>";
}
// Return the non-redundant list of WriteProcRes used by the given sched
// class. The scheduling model for LLVM is such that each instruction has a
// certain number of uops which consume resources which are described by
// WriteProcRes entries. Each entry describes how many cycles are spent on a
// specific ProcRes kind.
// For example, an instruction might have 3 uOps, one dispatching on P0
// (ProcResIdx=1) and two on P06 (ProcResIdx=7).
// Note that LLVM additionally denormalizes resource consumption to include
// usage of super resources by subresources. So in practice if there exists a
// P016 (ProcResIdx=10), then the cycles consumed by P0 are also consumed by
// P06 (ProcResIdx=7) and P016 (ProcResIdx=10), and the resources consumed
// by P06 are also consumed by P016. In the figure below, parenthesized cycles
// denote implied usage of superresources by subresources:
//            P0      P06    P016
//     uOp1    1      (1)     (1)
//     uOp2            1      (1)
//     uOp3            1      (1)
//     =============================
//             1       3       3
// Eventually we end up with three entries for the WriteProcRes of the
// instruction:
//    {ProcResIdx=1,  Cycles=1}  // P0
//    {ProcResIdx=7,  Cycles=3}  // P06
//    {ProcResIdx=10, Cycles=3}  // P016
//
// Note that in this case, P016 does not contribute any cycles of its own, so
// it would be removed by this function.
// FIXME: Move this to MCSubtargetInfo and use it in llvm-mca.
static llvm::SmallVector<llvm::MCWriteProcResEntry, 8>
getNonRedundantWriteProcRes(const llvm::MCSchedClassDesc &SCDesc,
                            const llvm::MCSubtargetInfo &STI) {
  llvm::SmallVector<llvm::MCWriteProcResEntry, 8> Result;
  const auto &SM = STI.getSchedModel();
  const unsigned NumProcRes = SM.getNumProcResourceKinds();

  // This assumes that the ProcResDescs are sorted in topological order, which
  // is guaranteed by the tablegen backend.
  llvm::SmallVector<float, 32> ProcResUnitUsage(NumProcRes);
  for (const auto *WPR = STI.getWriteProcResBegin(&SCDesc),
                  *const WPREnd = STI.getWriteProcResEnd(&SCDesc);
       WPR != WPREnd; ++WPR) {
    const llvm::MCProcResourceDesc *const ProcResDesc =
        SM.getProcResource(WPR->ProcResourceIdx);
    if (ProcResDesc->SubUnitsIdxBegin == nullptr) {
      // This is a ProcResUnit.
      Result.push_back({WPR->ProcResourceIdx, WPR->Cycles});
      ProcResUnitUsage[WPR->ProcResourceIdx] += WPR->Cycles;
    } else {
      // This is a ProcResGroup. First see if it contributes any cycles or if
      // it has cycles just from subunits.
      float RemainingCycles = WPR->Cycles;
      for (const auto *SubResIdx = ProcResDesc->SubUnitsIdxBegin;
           SubResIdx != ProcResDesc->SubUnitsIdxBegin + ProcResDesc->NumUnits;
           ++SubResIdx) {
        RemainingCycles -= ProcResUnitUsage[*SubResIdx];
      }
      if (RemainingCycles < 0.01f) {
        // The ProcResGroup contributes no cycles of its own.
        continue;
      }
      // The ProcResGroup contributes `RemainingCycles` cycles of its own.
      Result.push_back({WPR->ProcResourceIdx,
                        static_cast<uint16_t>(std::round(RemainingCycles))});
      // Spread the remaining cycles over all subunits.
      for (const auto *SubResIdx = ProcResDesc->SubUnitsIdxBegin;
           SubResIdx != ProcResDesc->SubUnitsIdxBegin + ProcResDesc->NumUnits;
           ++SubResIdx) {
        ProcResUnitUsage[*SubResIdx] += RemainingCycles / ProcResDesc->NumUnits;
      }
    }
  }
  return Result;
}

Analysis::SchedClass::SchedClass(const llvm::MCSchedClassDesc &SD,
                                 const llvm::MCSubtargetInfo &STI)
    : SCDesc(&SD),
      NonRedundantWriteProcRes(getNonRedundantWriteProcRes(SD, STI)),
      IdealizedProcResPressure(computeIdealizedProcResPressure(
          STI.getSchedModel(), NonRedundantWriteProcRes)) {}

void Analysis::SchedClassCluster::addPoint(
    size_t PointId, const InstructionBenchmarkClustering &Clustering) {
  PointIds.push_back(PointId);
  const auto &Point = Clustering.getPoints()[PointId];
  if (ClusterId.isUndef()) {
    ClusterId = Clustering.getClusterIdForPoint(PointId);
    Representative.resize(Point.Measurements.size());
  }
  for (size_t I = 0, E = Point.Measurements.size(); I < E; ++I) {
    Representative[I].push(Point.Measurements[I]);
  }
  assert(ClusterId == Clustering.getClusterIdForPoint(PointId));
}

bool Analysis::SchedClassCluster::measurementsMatch(
    const llvm::MCSubtargetInfo &STI, const SchedClass &SC,
    const InstructionBenchmarkClustering &Clustering) const {
  const size_t NumMeasurements = Representative.size();
  std::vector<BenchmarkMeasure> ClusterCenterPoint(NumMeasurements);
  std::vector<BenchmarkMeasure> SchedClassPoint(NumMeasurements);
  // Latency case.
  assert(!Clustering.getPoints().empty());
  const InstructionBenchmark::ModeE Mode = Clustering.getPoints()[0].Mode;
  if (Mode == InstructionBenchmark::Latency) {
    if (NumMeasurements != 1) {
      llvm::errs()
          << "invalid number of measurements in latency mode: expected 1, got "
          << NumMeasurements << "\n";
      return false;
    }
    // Find the latency.
    SchedClassPoint[0].Value = 0.0;
    for (unsigned I = 0; I < SC.SCDesc->NumWriteLatencyEntries; ++I) {
      const llvm::MCWriteLatencyEntry *const WLE =
          STI.getWriteLatencyEntry(SC.SCDesc, I);
      SchedClassPoint[0].Value =
          std::max<double>(SchedClassPoint[0].Value, WLE->Cycles);
    }
    ClusterCenterPoint[0].Value = Representative[0].avg();
  } else if (Mode == InstructionBenchmark::Uops) {
    for (int I = 0, E = Representative.size(); I < E; ++I) {
      // Find the pressure on ProcResIdx `Key`.
      uint16_t ProcResIdx = 0;
      if (!llvm::to_integer(Representative[I].key(), ProcResIdx, 10)) {
        llvm::errs() << "expected ProcResIdx key, got "
                     << Representative[I].key() << "\n";
        return false;
      }
      const auto ProcResPressureIt =
          std::find_if(SC.IdealizedProcResPressure.begin(),
                       SC.IdealizedProcResPressure.end(),
                       [ProcResIdx](const std::pair<uint16_t, float> &WPR) {
                         return WPR.first == ProcResIdx;
                       });
      SchedClassPoint[I].Value =
          ProcResPressureIt == SC.IdealizedProcResPressure.end()
              ? 0.0
              : ProcResPressureIt->second;
      ClusterCenterPoint[I].Value = Representative[I].avg();
    }
  } else {
    llvm::errs() << "unimplemented measurement matching for mode " << Mode
                 << "\n";
    return false;
  }
  return Clustering.isNeighbour(ClusterCenterPoint, SchedClassPoint);
}

void Analysis::printSchedClassDescHtml(const SchedClass &SC,
                                       llvm::raw_ostream &OS) const {
  OS << "<table class=\"sched-class-desc\">";
  OS << "<tr><th>Valid</th><th>Variant</th><th>uOps</th><th>Latency</"
        "th><th>WriteProcRes</th><th>Idealized "
        "Resource Pressure</th></tr>";
  if (SC.SCDesc->isValid()) {
    const auto &SM = SubtargetInfo_->getSchedModel();
    OS << "<tr><td>&#10004;</td>";
    OS << "<td>" << (SC.SCDesc->isVariant() ? "&#10004;" : "&#10005;")
       << "</td>";
    OS << "<td>" << SC.SCDesc->NumMicroOps << "</td>";
    // Latencies.
    OS << "<td><ul>";
    for (int I = 0, E = SC.SCDesc->NumWriteLatencyEntries; I < E; ++I) {
      const auto *const Entry =
          SubtargetInfo_->getWriteLatencyEntry(SC.SCDesc, I);
      OS << "<li>" << Entry->Cycles;
      if (SC.SCDesc->NumWriteLatencyEntries > 1) {
        // Disambiguate if more than 1 latency.
        OS << " (WriteResourceID " << Entry->WriteResourceID << ")";
      }
      OS << "</li>";
    }
    OS << "</ul></td>";
    // WriteProcRes.
    OS << "<td><ul>";
    for (const auto &WPR : SC.NonRedundantWriteProcRes) {
      OS << "<li><span class=\"mono\">";
      writeEscaped<kEscapeHtml>(OS,
                                SM.getProcResource(WPR.ProcResourceIdx)->Name);
      OS << "</span>: " << WPR.Cycles << "</li>";
    }
    OS << "</ul></td>";
    // Idealized port pressure.
    OS << "<td><ul>";
    for (const auto &Pressure : SC.IdealizedProcResPressure) {
      OS << "<li><span class=\"mono\">";
      writeEscaped<kEscapeHtml>(OS, SubtargetInfo_->getSchedModel()
                                        .getProcResource(Pressure.first)
                                        ->Name);
      OS << "</span>: ";
      writeMeasurementValue<kEscapeHtml>(OS, Pressure.second);
      OS << "</li>";
    }
    OS << "</ul></td>";
    OS << "</tr>";
  } else {
    OS << "<tr><td>&#10005;</td></tr>";
  }
  OS << "</table>";
}

static constexpr const char kHtmlHead[] = R"(<head>
<title>llvm-exegesis Analysis Results</title>
</head>
)";
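// The inspection report emitted below has, roughly, the following shape
// (sketch only; the exact markup is defined by the code that follows):
//   <html>(kHtmlHead)<body>
//     <h1>llvm-exegesis Analysis Results</h1>
//     <h3>Triple: ...</h3><h3>Cpu: ...</h3>
//     ... one <div> per sched class whose measured clusters disagree with
//     the LLVM sched model, holding the cluster table
//     (printSchedClassClustersHtml) and the model description table
//     (printSchedClassDescHtml) ...
//   </body></html>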
"; for (const auto &SchedClassAndPoints : makePointsPerSchedClass()) { const auto SchedClassId = SchedClassAndPoints.first; const std::vector &SchedClassPoints = SchedClassAndPoints.second; const auto &SchedModel = SubtargetInfo_->getSchedModel(); const llvm::MCSchedClassDesc *const SCDesc = SchedModel.getSchedClassDesc(SchedClassId); if (!SCDesc) continue; const SchedClass SC(*SCDesc, *SubtargetInfo_); // Bucket sched class points into sched class clusters. std::vector SchedClassClusters; for (const size_t PointId : SchedClassPoints) { const auto &ClusterId = Clustering_.getClusterIdForPoint(PointId); if (!ClusterId.isValid()) continue; // Ignore noise and errors. FIXME: take noise into account ? auto SchedClassClusterIt = std::find_if(SchedClassClusters.begin(), SchedClassClusters.end(), [ClusterId](const SchedClassCluster &C) { return C.id() == ClusterId; }); if (SchedClassClusterIt == SchedClassClusters.end()) { SchedClassClusters.emplace_back(); SchedClassClusterIt = std::prev(SchedClassClusters.end()); } SchedClassClusterIt->addPoint(PointId, Clustering_); } // Print any scheduling class that has at least one cluster that does not // match the checked-in data. if (std::all_of(SchedClassClusters.begin(), SchedClassClusters.end(), [this, &SC](const SchedClassCluster &C) { return C.measurementsMatch(*SubtargetInfo_, SC, Clustering_); })) continue; // Nothing weird. OS << "

Sched Class "; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) writeEscaped(OS, SCDesc->Name); #else OS << SchedClassId; #endif OS << " contains instructions whose performance characteristics do" " not match that of LLVM:

"; printSchedClassClustersHtml(SchedClassClusters, SC, OS); OS << "

llvm SchedModel data:

"; printSchedClassDescHtml(SC, OS); OS << "
"; } OS << ""; return llvm::Error::success(); } // Distributes a pressure budget as evenly as possible on the provided subunits // given the already existing port pressure distribution. // // The algorithm is as follows: while there is remaining pressure to // distribute, find the subunits with minimal pressure, and distribute // remaining pressure equally up to the pressure of the unit with // second-to-minimal pressure. // For example, let's assume we want to distribute 2*P1256 // (Subunits = [P1,P2,P5,P6]), and the starting DensePressure is: // DensePressure = P0 P1 P2 P3 P4 P5 P6 P7 // 0.1 0.3 0.2 0.0 0.0 0.5 0.5 0.5 // RemainingPressure = 2.0 // We sort the subunits by pressure: // Subunits = [(P2,p=0.2), (P1,p=0.3), (P5,p=0.5), (P6, p=0.5)] // We'll first start by the subunits with minimal pressure, which are at // the beginning of the sorted array. In this example there is one (P2). // The subunit with second-to-minimal pressure is the next one in the // array (P1). So we distribute 0.1 pressure to P2, and remove 0.1 cycles // from the budget. // Subunits = [(P2,p=0.3), (P1,p=0.3), (P5,p=0.5), (P5,p=0.5)] // RemainingPressure = 1.9 // We repeat this process: distribute 0.2 pressure on each of the minimal // P2 and P1, decrease budget by 2*0.2: // Subunits = [(P2,p=0.5), (P1,p=0.5), (P5,p=0.5), (P5,p=0.5)] // RemainingPressure = 1.5 // There are no second-to-minimal subunits so we just share the remaining // budget (1.5 cycles) equally: // Subunits = [(P2,p=0.875), (P1,p=0.875), (P5,p=0.875), (P5,p=0.875)] // RemainingPressure = 0.0 // We stop as there is no remaining budget to distribute. void distributePressure(float RemainingPressure, llvm::SmallVector Subunits, llvm::SmallVector &DensePressure) { // Find the number of subunits with minimal pressure (they are at the // front). llvm::sort(Subunits.begin(), Subunits.end(), [&DensePressure](const uint16_t A, const uint16_t B) { return DensePressure[A] < DensePressure[B]; }); const auto getPressureForSubunit = [&DensePressure, &Subunits](size_t I) -> float & { return DensePressure[Subunits[I]]; }; size_t NumMinimalSU = 1; while (NumMinimalSU < Subunits.size() && getPressureForSubunit(NumMinimalSU) == getPressureForSubunit(0)) { ++NumMinimalSU; } while (RemainingPressure > 0.0f) { if (NumMinimalSU == Subunits.size()) { // All units are minimal, just distribute evenly and be done. for (size_t I = 0; I < NumMinimalSU; ++I) { getPressureForSubunit(I) += RemainingPressure / NumMinimalSU; } return; } // Distribute the remaining pressure equally. const float MinimalPressure = getPressureForSubunit(NumMinimalSU - 1); const float SecondToMinimalPressure = getPressureForSubunit(NumMinimalSU); assert(MinimalPressure < SecondToMinimalPressure); const float Increment = SecondToMinimalPressure - MinimalPressure; if (RemainingPressure <= NumMinimalSU * Increment) { // There is not enough remaining pressure. for (size_t I = 0; I < NumMinimalSU; ++I) { getPressureForSubunit(I) += RemainingPressure / NumMinimalSU; } return; } // Bump all minimal pressure subunits to `SecondToMinimalPressure`. for (size_t I = 0; I < NumMinimalSU; ++I) { getPressureForSubunit(I) = SecondToMinimalPressure; RemainingPressure -= SecondToMinimalPressure; } while (NumMinimalSU < Subunits.size() && getPressureForSubunit(NumMinimalSU) == SecondToMinimalPressure) { ++NumMinimalSU; } } } std::vector> computeIdealizedProcResPressure( const llvm::MCSchedModel &SM, llvm::SmallVector WPRS) { // DensePressure[I] is the port pressure for Proc Resource I. 
std::vector<std::pair<uint16_t, float>> computeIdealizedProcResPressure(
    const llvm::MCSchedModel &SM,
    llvm::SmallVector<llvm::MCWriteProcResEntry, 8> WPRS) {
  // DensePressure[I] is the port pressure for Proc Resource I.
  llvm::SmallVector<float, 32> DensePressure(SM.getNumProcResourceKinds());
  llvm::sort(WPRS.begin(), WPRS.end(),
             [](const llvm::MCWriteProcResEntry &A,
                const llvm::MCWriteProcResEntry &B) {
               return A.ProcResourceIdx < B.ProcResourceIdx;
             });
  for (const llvm::MCWriteProcResEntry &WPR : WPRS) {
    // Get units for the entry.
    const llvm::MCProcResourceDesc *const ProcResDesc =
        SM.getProcResource(WPR.ProcResourceIdx);
    if (ProcResDesc->SubUnitsIdxBegin == nullptr) {
      // This is a ProcResUnit.
      DensePressure[WPR.ProcResourceIdx] += WPR.Cycles;
    } else {
      // This is a ProcResGroup.
      llvm::SmallVector<uint16_t, 32> Subunits(
          ProcResDesc->SubUnitsIdxBegin,
          ProcResDesc->SubUnitsIdxBegin + ProcResDesc->NumUnits);
      distributePressure(WPR.Cycles, Subunits, DensePressure);
    }
  }
  // Turn dense pressure into sparse pressure by removing zero entries.
  std::vector<std::pair<uint16_t, float>> Pressure;
  for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
    if (DensePressure[I] > 0.0f)
      Pressure.emplace_back(I, DensePressure[I]);
  }
  return Pressure;
}

} // namespace exegesis