10b57cec5SDimitry Andric //===- CallGraphSort.cpp --------------------------------------------------===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric ///
95f757f3fSDimitry Andric /// The file is responsible for sorting sections using LLVM call graph profile
105f757f3fSDimitry Andric /// data by placing frequently executed code sections together. The goal of the
115f757f3fSDimitry Andric /// placement is to improve the runtime performance of the final executable by
125f757f3fSDimitry Andric /// arranging code sections so that i-TLB misses and i-cache misses are reduced.
135f757f3fSDimitry Andric ///
145f757f3fSDimitry Andric /// The algorithm first builds a call graph based on the profile data and then
155f757f3fSDimitry Andric /// iteratively merges "chains" (ordered lists) of input sections which will be
165f757f3fSDimitry Andric /// laid out as a unit. There are two implementations for deciding how to
175f757f3fSDimitry Andric /// merge a pair of chains:
185f757f3fSDimitry Andric ///  - a simpler one, referred to as Call-Chain Clustering (C^3), that follows
195f757f3fSDimitry Andric ///    "Optimizing Function Placement for Large-Scale Data-Center Applications"
200b57cec5SDimitry Andric /// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
215f757f3fSDimitry Andric /// - a more advanced one, referred to as Cache-Directed-Sort (CDSort), which
225f757f3fSDimitry Andric ///   typically produces layouts with higher locality, and hence, yields fewer
235f757f3fSDimitry Andric ///   instruction cache misses on large binaries.
240b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
250b57cec5SDimitry Andric 
260b57cec5SDimitry Andric #include "CallGraphSort.h"
2781ad6265SDimitry Andric #include "InputFiles.h"
2881ad6265SDimitry Andric #include "InputSection.h"
290b57cec5SDimitry Andric #include "Symbols.h"
3081ad6265SDimitry Andric #include "llvm/Support/FileSystem.h"
315f757f3fSDimitry Andric #include "llvm/Transforms/Utils/CodeLayout.h"
320b57cec5SDimitry Andric 
3385868e8aSDimitry Andric #include <numeric>
3485868e8aSDimitry Andric 
350b57cec5SDimitry Andric using namespace llvm;
365ffd83dbSDimitry Andric using namespace lld;
375ffd83dbSDimitry Andric using namespace lld::elf;
380b57cec5SDimitry Andric 
390b57cec5SDimitry Andric namespace {
400b57cec5SDimitry Andric struct Edge {
410b57cec5SDimitry Andric   int from;
420b57cec5SDimitry Andric   uint64_t weight;
430b57cec5SDimitry Andric };
440b57cec5SDimitry Andric 
450b57cec5SDimitry Andric struct Cluster {
Cluster__anon464a850f0111::Cluster4685868e8aSDimitry Andric   Cluster(int sec, size_t s) : next(sec), prev(sec), size(s) {}
470b57cec5SDimitry Andric 
getDensity__anon464a850f0111::Cluster480b57cec5SDimitry Andric   double getDensity() const {
490b57cec5SDimitry Andric     if (size == 0)
500b57cec5SDimitry Andric       return 0;
510b57cec5SDimitry Andric     return double(weight) / double(size);
520b57cec5SDimitry Andric   }
530b57cec5SDimitry Andric 
5485868e8aSDimitry Andric   int next;
5585868e8aSDimitry Andric   int prev;
56e8d8bef9SDimitry Andric   uint64_t size;
570b57cec5SDimitry Andric   uint64_t weight = 0;
580b57cec5SDimitry Andric   uint64_t initialWeight = 0;
590b57cec5SDimitry Andric   Edge bestPred = {-1, 0};
600b57cec5SDimitry Andric };
610b57cec5SDimitry Andric 
625f757f3fSDimitry Andric /// Implementation of the Call-Chain Clustering (C^3). The goal of this
635f757f3fSDimitry Andric /// algorithm is to improve runtime performance of the executable by arranging
645f757f3fSDimitry Andric /// code sections such that page table and i-cache misses are minimized.
655f757f3fSDimitry Andric ///
665f757f3fSDimitry Andric /// Definitions:
675f757f3fSDimitry Andric /// * Cluster
685f757f3fSDimitry Andric ///   * An ordered list of input sections which are laid out as a unit. At the
695f757f3fSDimitry Andric ///     beginning of the algorithm each input section has its own cluster and
705f757f3fSDimitry Andric ///     the weight of the cluster is the sum of the weight of all incoming
715f757f3fSDimitry Andric ///     edges.
725f757f3fSDimitry Andric /// * Call-Chain Clustering (C³) Heuristic
735f757f3fSDimitry Andric ///   * Defines when and how clusters are combined. Pick the highest weighted
745f757f3fSDimitry Andric ///     input section then add it to its most likely predecessor if it wouldn't
755f757f3fSDimitry Andric ///     penalize it too much.
765f757f3fSDimitry Andric /// * Density
775f757f3fSDimitry Andric ///   * The weight of the cluster divided by the size of the cluster. This is a
785f757f3fSDimitry Andric ///     proxy for the amount of execution time spent per byte of the cluster.
795f757f3fSDimitry Andric ///
805f757f3fSDimitry Andric /// It does so given a call graph profile by the following:
815f757f3fSDimitry Andric /// * Build a weighted call graph from the call graph profile
825f757f3fSDimitry Andric /// * Sort input sections by weight
835f757f3fSDimitry Andric /// * For each input section starting with the highest weight
845f757f3fSDimitry Andric ///   * Find its most likely predecessor cluster
855f757f3fSDimitry Andric ///   * Check if the combined cluster would be too large, or would have too low
865f757f3fSDimitry Andric ///     a density.
875f757f3fSDimitry Andric ///   * If not, then combine the clusters.
885f757f3fSDimitry Andric /// * Sort non-empty clusters by density
890b57cec5SDimitry Andric class CallGraphSort {
900b57cec5SDimitry Andric public:
910b57cec5SDimitry Andric   CallGraphSort();
920b57cec5SDimitry Andric 
930b57cec5SDimitry Andric   DenseMap<const InputSectionBase *, int> run();
940b57cec5SDimitry Andric 
950b57cec5SDimitry Andric private:
960b57cec5SDimitry Andric   std::vector<Cluster> clusters;
970b57cec5SDimitry Andric   std::vector<const InputSectionBase *> sections;
980b57cec5SDimitry Andric };
990b57cec5SDimitry Andric 
100480093f4SDimitry Andric // Maximum amount the combined cluster density can be worse than the original
1010b57cec5SDimitry Andric // cluster to consider merging.
1020b57cec5SDimitry Andric constexpr int MAX_DENSITY_DEGRADATION = 8;
1030b57cec5SDimitry Andric 
1040b57cec5SDimitry Andric // Maximum cluster size in bytes.
1050b57cec5SDimitry Andric constexpr uint64_t MAX_CLUSTER_SIZE = 1024 * 1024;
1060b57cec5SDimitry Andric } // end anonymous namespace
1070b57cec5SDimitry Andric 
1080b57cec5SDimitry Andric using SectionPair =
1090b57cec5SDimitry Andric     std::pair<const InputSectionBase *, const InputSectionBase *>;
1100b57cec5SDimitry Andric 
1110b57cec5SDimitry Andric // Take the edge list in Config->CallGraphProfile, resolve symbol names to
1120b57cec5SDimitry Andric // Symbols, and generate a graph between InputSections with the provided
1130b57cec5SDimitry Andric // weights.
CallGraphSort()1140b57cec5SDimitry Andric CallGraphSort::CallGraphSort() {
1150b57cec5SDimitry Andric   MapVector<SectionPair, uint64_t> &profile = config->callGraphProfile;
1160b57cec5SDimitry Andric   DenseMap<const InputSectionBase *, int> secToCluster;
1170b57cec5SDimitry Andric 
1180b57cec5SDimitry Andric   auto getOrCreateNode = [&](const InputSectionBase *isec) -> int {
11985868e8aSDimitry Andric     auto res = secToCluster.try_emplace(isec, clusters.size());
1200b57cec5SDimitry Andric     if (res.second) {
1210b57cec5SDimitry Andric       sections.push_back(isec);
1220b57cec5SDimitry Andric       clusters.emplace_back(clusters.size(), isec->getSize());
1230b57cec5SDimitry Andric     }
1240b57cec5SDimitry Andric     return res.first->second;
1250b57cec5SDimitry Andric   };
1260b57cec5SDimitry Andric 
1270b57cec5SDimitry Andric   // Create the graph.
1280b57cec5SDimitry Andric   for (std::pair<SectionPair, uint64_t> &c : profile) {
1290eae32dcSDimitry Andric     const auto *fromSB = cast<InputSectionBase>(c.first.first);
1300eae32dcSDimitry Andric     const auto *toSB = cast<InputSectionBase>(c.first.second);
1310b57cec5SDimitry Andric     uint64_t weight = c.second;
1320b57cec5SDimitry Andric 
1330b57cec5SDimitry Andric     // Ignore edges between input sections belonging to different output
1340b57cec5SDimitry Andric     // sections.  This is done because otherwise we would end up with clusters
1350b57cec5SDimitry Andric     // containing input sections that can't actually be placed adjacently in the
1360b57cec5SDimitry Andric     // output.  This messes with the cluster size and density calculations.  We
1370b57cec5SDimitry Andric     // would also end up moving input sections in other output sections without
1380b57cec5SDimitry Andric     // moving them closer to what calls them.
1390b57cec5SDimitry Andric     if (fromSB->getOutputSection() != toSB->getOutputSection())
1400b57cec5SDimitry Andric       continue;
1410b57cec5SDimitry Andric 
1420b57cec5SDimitry Andric     int from = getOrCreateNode(fromSB);
1430b57cec5SDimitry Andric     int to = getOrCreateNode(toSB);
1440b57cec5SDimitry Andric 
1450b57cec5SDimitry Andric     clusters[to].weight += weight;
1460b57cec5SDimitry Andric 
1470b57cec5SDimitry Andric     if (from == to)
1480b57cec5SDimitry Andric       continue;
1490b57cec5SDimitry Andric 
1500b57cec5SDimitry Andric     // Remember the best edge.
1510b57cec5SDimitry Andric     Cluster &toC = clusters[to];
1520b57cec5SDimitry Andric     if (toC.bestPred.from == -1 || toC.bestPred.weight < weight) {
1530b57cec5SDimitry Andric       toC.bestPred.from = from;
1540b57cec5SDimitry Andric       toC.bestPred.weight = weight;
1550b57cec5SDimitry Andric     }
1560b57cec5SDimitry Andric   }
1570b57cec5SDimitry Andric   for (Cluster &c : clusters)
1580b57cec5SDimitry Andric     c.initialWeight = c.weight;
1590b57cec5SDimitry Andric }
1600b57cec5SDimitry Andric 
1610b57cec5SDimitry Andric // It's bad to merge clusters which would degrade the density too much.
isNewDensityBad(Cluster & a,Cluster & b)1620b57cec5SDimitry Andric static bool isNewDensityBad(Cluster &a, Cluster &b) {
1630b57cec5SDimitry Andric   double newDensity = double(a.weight + b.weight) / double(a.size + b.size);
1640b57cec5SDimitry Andric   return newDensity < a.getDensity() / MAX_DENSITY_DEGRADATION;
1650b57cec5SDimitry Andric }
1660b57cec5SDimitry Andric 
16785868e8aSDimitry Andric // Find the leader of V's belonged cluster (represented as an equivalence
16885868e8aSDimitry Andric // class). We apply union-find path-halving technique (simple to implement) in
16985868e8aSDimitry Andric // the meantime as it decreases depths and the time complexity.
getLeader(int * leaders,int v)170bdd1243dSDimitry Andric static int getLeader(int *leaders, int v) {
17185868e8aSDimitry Andric   while (leaders[v] != v) {
17285868e8aSDimitry Andric     leaders[v] = leaders[leaders[v]];
17385868e8aSDimitry Andric     v = leaders[v];
17485868e8aSDimitry Andric   }
17585868e8aSDimitry Andric   return v;
17685868e8aSDimitry Andric }
17785868e8aSDimitry Andric 
mergeClusters(std::vector<Cluster> & cs,Cluster & into,int intoIdx,Cluster & from,int fromIdx)17885868e8aSDimitry Andric static void mergeClusters(std::vector<Cluster> &cs, Cluster &into, int intoIdx,
17985868e8aSDimitry Andric                           Cluster &from, int fromIdx) {
18085868e8aSDimitry Andric   int tail1 = into.prev, tail2 = from.prev;
18185868e8aSDimitry Andric   into.prev = tail2;
18285868e8aSDimitry Andric   cs[tail2].next = intoIdx;
18385868e8aSDimitry Andric   from.prev = tail1;
18485868e8aSDimitry Andric   cs[tail1].next = fromIdx;
1850b57cec5SDimitry Andric   into.size += from.size;
1860b57cec5SDimitry Andric   into.weight += from.weight;
1870b57cec5SDimitry Andric   from.size = 0;
1880b57cec5SDimitry Andric   from.weight = 0;
1890b57cec5SDimitry Andric }
1900b57cec5SDimitry Andric 
1910b57cec5SDimitry Andric // Group InputSections into clusters using the Call-Chain Clustering heuristic
1920b57cec5SDimitry Andric // then sort the clusters by density.
run()19385868e8aSDimitry Andric DenseMap<const InputSectionBase *, int> CallGraphSort::run() {
19485868e8aSDimitry Andric   std::vector<int> sorted(clusters.size());
195bdd1243dSDimitry Andric   std::unique_ptr<int[]> leaders(new int[clusters.size()]);
1960b57cec5SDimitry Andric 
197bdd1243dSDimitry Andric   std::iota(leaders.get(), leaders.get() + clusters.size(), 0);
19885868e8aSDimitry Andric   std::iota(sorted.begin(), sorted.end(), 0);
19985868e8aSDimitry Andric   llvm::stable_sort(sorted, [&](int a, int b) {
2000b57cec5SDimitry Andric     return clusters[a].getDensity() > clusters[b].getDensity();
2010b57cec5SDimitry Andric   });
2020b57cec5SDimitry Andric 
20385868e8aSDimitry Andric   for (int l : sorted) {
20485868e8aSDimitry Andric     // The cluster index is the same as the index of its leader here because
20585868e8aSDimitry Andric     // clusters[L] has not been merged into another cluster yet.
20685868e8aSDimitry Andric     Cluster &c = clusters[l];
2070b57cec5SDimitry Andric 
2080b57cec5SDimitry Andric     // Don't consider merging if the edge is unlikely.
2090b57cec5SDimitry Andric     if (c.bestPred.from == -1 || c.bestPred.weight * 10 <= c.initialWeight)
2100b57cec5SDimitry Andric       continue;
2110b57cec5SDimitry Andric 
212bdd1243dSDimitry Andric     int predL = getLeader(leaders.get(), c.bestPred.from);
21385868e8aSDimitry Andric     if (l == predL)
2140b57cec5SDimitry Andric       continue;
2150b57cec5SDimitry Andric 
21685868e8aSDimitry Andric     Cluster *predC = &clusters[predL];
2170b57cec5SDimitry Andric     if (c.size + predC->size > MAX_CLUSTER_SIZE)
2180b57cec5SDimitry Andric       continue;
2190b57cec5SDimitry Andric 
2200b57cec5SDimitry Andric     if (isNewDensityBad(*predC, c))
2210b57cec5SDimitry Andric       continue;
2220b57cec5SDimitry Andric 
22385868e8aSDimitry Andric     leaders[l] = predL;
22485868e8aSDimitry Andric     mergeClusters(clusters, *predC, predL, c, l);
2250b57cec5SDimitry Andric   }
2260b57cec5SDimitry Andric 
22785868e8aSDimitry Andric   // Sort remaining non-empty clusters by density.
22885868e8aSDimitry Andric   sorted.clear();
22985868e8aSDimitry Andric   for (int i = 0, e = (int)clusters.size(); i != e; ++i)
23085868e8aSDimitry Andric     if (clusters[i].size > 0)
23185868e8aSDimitry Andric       sorted.push_back(i);
23285868e8aSDimitry Andric   llvm::stable_sort(sorted, [&](int a, int b) {
23385868e8aSDimitry Andric     return clusters[a].getDensity() > clusters[b].getDensity();
2340b57cec5SDimitry Andric   });
2350b57cec5SDimitry Andric 
2360b57cec5SDimitry Andric   DenseMap<const InputSectionBase *, int> orderMap;
23785868e8aSDimitry Andric   int curOrder = 1;
238e8d8bef9SDimitry Andric   for (int leader : sorted) {
23985868e8aSDimitry Andric     for (int i = leader;;) {
24085868e8aSDimitry Andric       orderMap[sections[i]] = curOrder++;
24185868e8aSDimitry Andric       i = clusters[i].next;
24285868e8aSDimitry Andric       if (i == leader)
24385868e8aSDimitry Andric         break;
24485868e8aSDimitry Andric     }
245e8d8bef9SDimitry Andric   }
2460b57cec5SDimitry Andric   if (!config->printSymbolOrder.empty()) {
2470b57cec5SDimitry Andric     std::error_code ec;
24885868e8aSDimitry Andric     raw_fd_ostream os(config->printSymbolOrder, ec, sys::fs::OF_None);
2490b57cec5SDimitry Andric     if (ec) {
2500b57cec5SDimitry Andric       error("cannot open " + config->printSymbolOrder + ": " + ec.message());
2510b57cec5SDimitry Andric       return orderMap;
2520b57cec5SDimitry Andric     }
2530b57cec5SDimitry Andric 
2540b57cec5SDimitry Andric     // Print the symbols ordered by C3, in the order of increasing curOrder
2550b57cec5SDimitry Andric     // Instead of sorting all the orderMap, just repeat the loops above.
25685868e8aSDimitry Andric     for (int leader : sorted)
25785868e8aSDimitry Andric       for (int i = leader;;) {
2580b57cec5SDimitry Andric         // Search all the symbols in the file of the section
2590b57cec5SDimitry Andric         // and find out a Defined symbol with name that is within the section.
26085868e8aSDimitry Andric         for (Symbol *sym : sections[i]->file->getSymbols())
2610b57cec5SDimitry Andric           if (!sym->isSection()) // Filter out section-type symbols here.
2620b57cec5SDimitry Andric             if (auto *d = dyn_cast<Defined>(sym))
26385868e8aSDimitry Andric               if (sections[i] == d->section)
2640b57cec5SDimitry Andric                 os << sym->getName() << "\n";
26585868e8aSDimitry Andric         i = clusters[i].next;
26685868e8aSDimitry Andric         if (i == leader)
26785868e8aSDimitry Andric           break;
26885868e8aSDimitry Andric       }
2690b57cec5SDimitry Andric   }
2700b57cec5SDimitry Andric 
2710b57cec5SDimitry Andric   return orderMap;
2720b57cec5SDimitry Andric }
2730b57cec5SDimitry Andric 
2745f757f3fSDimitry Andric // Sort sections by the profile data using the Cache-Directed Sort algorithm.
2755f757f3fSDimitry Andric // The placement is done by optimizing the locality by co-locating frequently
2765f757f3fSDimitry Andric // executed code sections together.
computeCacheDirectedSortOrder()2775f757f3fSDimitry Andric DenseMap<const InputSectionBase *, int> elf::computeCacheDirectedSortOrder() {
2785f757f3fSDimitry Andric   SmallVector<uint64_t, 0> funcSizes;
2795f757f3fSDimitry Andric   SmallVector<uint64_t, 0> funcCounts;
2805f757f3fSDimitry Andric   SmallVector<codelayout::EdgeCount, 0> callCounts;
2815f757f3fSDimitry Andric   SmallVector<uint64_t, 0> callOffsets;
2825f757f3fSDimitry Andric   SmallVector<const InputSectionBase *, 0> sections;
2835f757f3fSDimitry Andric   DenseMap<const InputSectionBase *, size_t> secToTargetId;
2845f757f3fSDimitry Andric 
2855f757f3fSDimitry Andric   auto getOrCreateNode = [&](const InputSectionBase *inSec) -> size_t {
2865f757f3fSDimitry Andric     auto res = secToTargetId.try_emplace(inSec, sections.size());
2875f757f3fSDimitry Andric     if (res.second) {
2885f757f3fSDimitry Andric       // inSec does not appear before in the graph.
2895f757f3fSDimitry Andric       sections.push_back(inSec);
2905f757f3fSDimitry Andric       funcSizes.push_back(inSec->getSize());
2915f757f3fSDimitry Andric       funcCounts.push_back(0);
2925f757f3fSDimitry Andric     }
2935f757f3fSDimitry Andric     return res.first->second;
2945f757f3fSDimitry Andric   };
2955f757f3fSDimitry Andric 
2965f757f3fSDimitry Andric   // Create the graph.
2975f757f3fSDimitry Andric   for (std::pair<SectionPair, uint64_t> &c : config->callGraphProfile) {
2985f757f3fSDimitry Andric     const InputSectionBase *fromSB = cast<InputSectionBase>(c.first.first);
2995f757f3fSDimitry Andric     const InputSectionBase *toSB = cast<InputSectionBase>(c.first.second);
3005f757f3fSDimitry Andric     // Ignore edges between input sections belonging to different sections.
3015f757f3fSDimitry Andric     if (fromSB->getOutputSection() != toSB->getOutputSection())
3025f757f3fSDimitry Andric       continue;
3035f757f3fSDimitry Andric 
3045f757f3fSDimitry Andric     uint64_t weight = c.second;
3055f757f3fSDimitry Andric     // Ignore edges with zero weight.
3065f757f3fSDimitry Andric     if (weight == 0)
3075f757f3fSDimitry Andric       continue;
3085f757f3fSDimitry Andric 
3095f757f3fSDimitry Andric     size_t from = getOrCreateNode(fromSB);
3105f757f3fSDimitry Andric     size_t to = getOrCreateNode(toSB);
3115f757f3fSDimitry Andric     // Ignore self-edges (recursive calls).
3125f757f3fSDimitry Andric     if (from == to)
3135f757f3fSDimitry Andric       continue;
3145f757f3fSDimitry Andric 
3155f757f3fSDimitry Andric     callCounts.push_back({from, to, weight});
3165f757f3fSDimitry Andric     // Assume that the jump is at the middle of the input section. The profile
3175f757f3fSDimitry Andric     // data does not contain jump offsets.
3185f757f3fSDimitry Andric     callOffsets.push_back((funcSizes[from] + 1) / 2);
3195f757f3fSDimitry Andric     funcCounts[to] += weight;
3205f757f3fSDimitry Andric   }
3215f757f3fSDimitry Andric 
3225f757f3fSDimitry Andric   // Run the layout algorithm.
3235f757f3fSDimitry Andric   std::vector<uint64_t> sortedSections = codelayout::computeCacheDirectedLayout(
3245f757f3fSDimitry Andric       funcSizes, funcCounts, callCounts, callOffsets);
3255f757f3fSDimitry Andric 
3265f757f3fSDimitry Andric   // Create the final order.
3275f757f3fSDimitry Andric   DenseMap<const InputSectionBase *, int> orderMap;
3285f757f3fSDimitry Andric   int curOrder = 1;
3295f757f3fSDimitry Andric   for (uint64_t secIdx : sortedSections)
3305f757f3fSDimitry Andric     orderMap[sections[secIdx]] = curOrder++;
3315f757f3fSDimitry Andric 
3325f757f3fSDimitry Andric   return orderMap;
3335f757f3fSDimitry Andric }
3345f757f3fSDimitry Andric 
335349cc55cSDimitry Andric // Sort sections by the profile data provided by --callgraph-profile-file.
3360b57cec5SDimitry Andric //
3370b57cec5SDimitry Andric // This first builds a call graph based on the profile data then merges sections
3385f757f3fSDimitry Andric // according either to the C³ or Cache-Directed-Sort ordering algorithm.
computeCallGraphProfileOrder()3395ffd83dbSDimitry Andric DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() {
3405f757f3fSDimitry Andric   if (config->callGraphProfileSort == CGProfileSortKind::Cdsort)
3415f757f3fSDimitry Andric     return computeCacheDirectedSortOrder();
3420b57cec5SDimitry Andric   return CallGraphSort().run();
3430b57cec5SDimitry Andric }
344