1 /*
2 Copyright (c) 2009-2020, Intel Corporation
3 All rights reserved.
4 
5 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
6 
7     * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
8     * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
9     * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
10 
11 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
12 */
13 // written by Roman Dementiev
14 //            Thomas Willhalm
15 
16 #ifndef CPUCOUNTERS_HEADER
17 #define CPUCOUNTERS_HEADER
18 
19 /*!     \file cpucounters.h
20         \brief Main CPU counters header
21 
22         Include this header file if you want to access CPU counters (core and uncore - including memory controller chips and QPI)
23 */
24 
25 #include "version.h"
26 
27 #ifndef PCM_API
28 #define PCM_API
29 #endif
30 
31 #undef PCM_HA_REQUESTS_READS_ONLY
32 
33 #include "types.h"
34 #include "msr.h"
35 #include "pci.h"
36 #include "bw.h"
37 #include "width_extender.h"
38 #include "exceptions/unsupported_processor_exception.hpp"
39 
40 #include <vector>
41 #include <array>
42 #include <limits>
43 #include <string>
44 #include <memory>
45 #include <map>
46 #include <unordered_map>
47 #include <string.h>
48 #include <assert.h>
49 
50 #ifdef PCM_USE_PERF
51 #include <linux/perf_event.h>
52 #include <errno.h>
53 #define PCM_PERF_COUNT_HW_REF_CPU_CYCLES (9)
54 #endif
55 
56 #ifndef _MSC_VER
57 #define NOMINMAX
58 #include <semaphore.h>
59 #include <sys/types.h>
60 #include <sys/stat.h>
61 #include <fcntl.h>
62 #include <sys/syscall.h>
63 #include <unistd.h>
64 #endif
65 
66 #ifdef _MSC_VER
67 #if _MSC_VER>= 1600
68 #include <intrin.h>
69 #endif
70 #endif
71 
72 #include "resctrl.h"
73 
74 namespace pcm {
75 
76 #ifdef _MSC_VER
77 void PCM_API restrictDriverAccess(LPCWSTR path);
78 #endif
79 
80 class SystemCounterState;
81 class SocketCounterState;
82 class CoreCounterState;
83 class BasicCounterState;
84 class ServerUncoreCounterState;
85 class PCM;
86 class CoreTaskQueue;
87 class SystemRoot;
88 
89 /*
90         CPU performance monitoring routines
91 
92         A set of performance monitoring routines for recent Intel CPUs
93 */
94 
95 struct PCM_API TopologyEntry // decribes a core
96 {
97     int32 os_id;
98     int32 thread_id;
99     int32 core_id;
100     int32 tile_id; // tile is a constalation of 1 or more cores sharing salem L2 cache. Unique for entire system
101     int32 socket;
102 
TopologyEntryTopologyEntry103     TopologyEntry() : os_id(-1), thread_id (-1), core_id(-1), tile_id(-1), socket(-1) { }
104 };
105 
106 class HWRegister
107 {
108 public:
109     virtual void operator = (uint64 val) = 0; // write operation
110     virtual operator uint64 () = 0; //read operation
~HWRegister()111     virtual ~HWRegister() {}
112 };
113 
114 class PCICFGRegister64 : public HWRegister
115 {
116     std::shared_ptr<PciHandleType> handle;
117     size_t offset;
118 public:
PCICFGRegister64(const std::shared_ptr<PciHandleType> & handle_,size_t offset_)119     PCICFGRegister64(const std::shared_ptr<PciHandleType> & handle_, size_t offset_) :
120         handle(handle_),
121         offset(offset_)
122     {
123     }
124     void operator = (uint64 val) override
125     {
126         cvt_ds cvt;
127         cvt.ui64 = val;
128         handle->write32(offset, cvt.ui32.low);
129         handle->write32(offset + sizeof(uint32), cvt.ui32.high);
130     }
uint64()131     operator uint64 ()  override
132     {
133         uint64 result = 0;
134         handle->read64(offset, &result);
135         return result;
136     }
137 };
138 
139 class PCICFGRegister32 : public HWRegister
140 {
141     std::shared_ptr<PciHandleType> handle;
142     size_t offset;
143 public:
PCICFGRegister32(const std::shared_ptr<PciHandleType> & handle_,size_t offset_)144     PCICFGRegister32(const std::shared_ptr<PciHandleType> & handle_, size_t offset_) :
145         handle(handle_),
146         offset(offset_)
147     {
148     }
149     void operator = (uint64 val) override
150     {
151         handle->write32(offset, (uint32)val);
152     }
uint64()153     operator uint64 () override
154     {
155         uint32 result = 0;
156         handle->read32(offset, &result);
157         return result;
158     }
159 };
160 
161 class MMIORegister64 : public HWRegister
162 {
163     std::shared_ptr<MMIORange> handle;
164     size_t offset;
165 public:
MMIORegister64(const std::shared_ptr<MMIORange> & handle_,size_t offset_)166     MMIORegister64(const std::shared_ptr<MMIORange> & handle_, size_t offset_) :
167         handle(handle_),
168         offset(offset_)
169     {
170     }
171     void operator = (uint64 val) override
172     {
173         handle->write64(offset, val);
174     }
uint64()175     operator uint64 () override
176     {
177         return handle->read64(offset);
178     }
179 };
180 
181 class MMIORegister32 : public HWRegister
182 {
183     std::shared_ptr<MMIORange> handle;
184     size_t offset;
185 public:
MMIORegister32(const std::shared_ptr<MMIORange> & handle_,size_t offset_)186     MMIORegister32(const std::shared_ptr<MMIORange> & handle_, size_t offset_) :
187         handle(handle_),
188         offset(offset_)
189     {
190     }
191     void operator = (uint64 val) override
192     {
193         handle->write32(offset, (uint32)val);
194     }
uint64()195     operator uint64 () override
196     {
197         return (uint64)handle->read32(offset);
198     }
199 };
200 
201 class MSRRegister : public HWRegister
202 {
203     std::shared_ptr<SafeMsrHandle> handle;
204     size_t offset;
205 public:
MSRRegister(const std::shared_ptr<SafeMsrHandle> & handle_,size_t offset_)206     MSRRegister(const std::shared_ptr<SafeMsrHandle> & handle_, size_t offset_) :
207         handle(handle_),
208         offset(offset_)
209     {
210     }
211     void operator = (uint64 val) override
212     {
213         handle->write(offset, val);
214     }
uint64()215     operator uint64 () override
216     {
217         uint64 value = 0;
218         handle->read(offset, &value);
219         return value;
220     }
221 };
222 
223 class CounterWidthExtenderRegister : public HWRegister
224 {
225     std::shared_ptr<CounterWidthExtender> handle;
226 public:
CounterWidthExtenderRegister(const std::shared_ptr<CounterWidthExtender> & handle_)227     CounterWidthExtenderRegister(const std::shared_ptr<CounterWidthExtender> & handle_) :
228         handle(handle_)
229     {
230     }
231     void operator = (uint64 val) override
232     {
233         if (val == 0)
234         {
235             handle->reset();
236         }
237         else
238         {
239             std::cerr << "ERROR: writing non-zero values to CounterWidthExtenderRegister is not supported\n";
240             throw std::exception();
241         }
242     }
uint64()243     operator uint64 () override
244     {
245         return handle->read();;
246     }
247 };
248 
249 #undef PCM_UNCORE_PMON_BOX_CHECK_STATUS // debug only
250 
251 class UncorePMU
252 {
253     typedef std::shared_ptr<HWRegister> HWRegisterPtr;
254     HWRegisterPtr unitControl;
255 public:
256     HWRegisterPtr counterControl[4];
257     HWRegisterPtr counterValue[4];
258     HWRegisterPtr fixedCounterControl;
259     HWRegisterPtr fixedCounterValue;
260     HWRegisterPtr filter[2];
261 
262     UncorePMU(const HWRegisterPtr & unitControl_,
263         const HWRegisterPtr & counterControl0,
264         const HWRegisterPtr & counterControl1,
265         const HWRegisterPtr & counterControl2,
266         const HWRegisterPtr & counterControl3,
267         const HWRegisterPtr & counterValue0,
268         const HWRegisterPtr & counterValue1,
269         const HWRegisterPtr & counterValue2,
270         const HWRegisterPtr & counterValue3,
271         const HWRegisterPtr & fixedCounterControl_ = HWRegisterPtr(),
272         const HWRegisterPtr & fixedCounterValue_ = HWRegisterPtr(),
273         const HWRegisterPtr & filter0 = HWRegisterPtr(),
274         const HWRegisterPtr & filter1 = HWRegisterPtr()
275     ) :
unitControl(unitControl_)276         unitControl(unitControl_),
277         counterControl{ counterControl0, counterControl1, counterControl2, counterControl3 },
278         counterValue{ counterValue0, counterValue1, counterValue2, counterValue3 },
279         fixedCounterControl(fixedCounterControl_),
280         fixedCounterValue(fixedCounterValue_),
281         filter{ filter0 , filter1 }
282     {
283     }
UncorePMU()284     UncorePMU() {}
~UncorePMU()285     virtual ~UncorePMU() {}
valid()286     bool valid() const
287     {
288         return unitControl.get() != nullptr;
289     }
writeUnitControl(const uint32 value)290     void writeUnitControl(const uint32 value)
291     {
292         *unitControl = value;
293     }
294     void cleanup();
295     void freeze(const uint32 extra);
296     bool initFreeze(const uint32 extra, const char* xPICheckMsg = nullptr);
297     void unfreeze(const uint32 extra);
298     void resetUnfreeze(const uint32 extra);
299 };
300 
//! Metric sets selectable in ServerPCICFGUncore::programServerUncoreMemoryMetrics
enum ServerUncoreMemoryMetrics
{
    PartialWrites  = 0,
    Pmem           = 1,
    PmemMemoryMode = 2,
    PmemMixedMode  = 3
};
308 
309 //! Object to access uncore counters in a socket/processor with microarchitecture codename SandyBridge-EP (Jaketown) or Ivytown-EP or Ivytown-EX
310 class ServerPCICFGUncore
311 {
312     friend class PCM;
313     int32 iMCbus,UPIbus,M2Mbus;
314     uint32 groupnr;
315     int32 cpu_model;
316     typedef std::vector<UncorePMU> UncorePMUVector;
317     UncorePMUVector imcPMUs;
318     UncorePMUVector edcPMUs;
319     UncorePMUVector xpiPMUs;
320     UncorePMUVector m3upiPMUs;
321     UncorePMUVector m2mPMUs;
322     UncorePMUVector haPMUs;
323     std::vector<UncorePMUVector*> allPMUs{ &imcPMUs, &edcPMUs, &xpiPMUs, &m3upiPMUs , &m2mPMUs, &haPMUs };
324     std::vector<uint64> qpi_speed;
325     std::vector<uint32> num_imc_channels; // number of memory channels in each memory controller
326     std::vector<std::pair<uint32, uint32> > XPIRegisterLocation; // (device, function)
327     std::vector<std::pair<uint32, uint32> > M3UPIRegisterLocation; // (device, function)
328     std::vector<std::vector< std::pair<uint32, uint32> > > MCRegisterLocation; // MCRegisterLocation[controller]: (device, function)
329     std::vector<std::pair<uint32, uint32> > EDCRegisterLocation; // EDCRegisterLocation: (device, function)
330     std::vector<std::pair<uint32, uint32> > M2MRegisterLocation; // M2MRegisterLocation: (device, function)
331     std::vector<std::pair<uint32, uint32> > HARegisterLocation;  // HARegisterLocation: (device, function)
332 
333     static std::vector<std::pair<uint32, uint32> > socket2iMCbus;
334     static std::vector<std::pair<uint32, uint32> > socket2UPIbus;
335     static std::vector<std::pair<uint32, uint32> > socket2M2Mbus;
336 
337     ServerPCICFGUncore();                                         // forbidden
338     ServerPCICFGUncore(ServerPCICFGUncore &);                     // forbidden
339     ServerPCICFGUncore & operator = (const ServerPCICFGUncore &); // forbidden
340     PciHandleType * createIntelPerfMonDevice(uint32 groupnr, int32 bus, uint32 dev, uint32 func, bool checkVendor = false);
341     void programIMC(const uint32 * MCCntConfig);
342     void programEDC(const uint32 * EDCCntConfig);
343     void programM2M(const uint64 * M2MCntConfig);
344     void programM2M();
345     void programHA(const uint32 * config);
346     void programHA();
347     void programXPI(const uint32 * XPICntConfig);
348     void programM3UPI(const uint32* M3UPICntConfig);
349     typedef std::pair<size_t, std::vector<uint64 *> > MemTestParam;
350     void initMemTest(MemTestParam & param);
351     void doMemTest(const MemTestParam & param);
352     void cleanupMemTest(const MemTestParam & param);
353     void cleanupQPIHandles();
354     void cleanupPMUs();
355     void writeAllUnitControl(const uint32 value);
356     void initDirect(uint32 socket_, const PCM * pcm);
357     void initPerf(uint32 socket_, const PCM * pcm);
358     void initBuses(uint32 socket_, const PCM * pcm);
359     void initRegisterLocations(const PCM * pcm);
360     uint64 getPMUCounter(std::vector<UncorePMU> & pmu, const uint32 id, const uint32 counter);
361 
362 public:
363     enum EventPosition {
364         READ=0,
365         WRITE=1,
366         READ_RANK_A=0,
367         WRITE_RANK_A=1,
368         READ_RANK_B=2,
369         WRITE_RANK_B=3,
370         PARTIAL=2,
371         PMM_READ=2,
372         PMM_WRITE=3,
373         PMM_MM_MISS_CLEAN=2,
374         PMM_MM_MISS_DIRTY=3,
375         NM_HIT=0,  // NM :  Near Memory (DRAM cache) in Memory Mode
376         M2M_CLOCKTICKS=1
377     };
378     //! \brief Initialize access data structures
379     //! \param socket_ socket id
380     //! \param pcm pointer to PCM instance
381     ServerPCICFGUncore(uint32 socket_, const PCM * pcm);
382     //! \brief Program performance counters (disables programming power counters)
383     void program();
384     //! \brief Get the number of integrated controller reads (in cache lines)
385     uint64 getImcReads();
386     //! \brief Get the number of integrated controller reads for given controller (in cache lines)
387     //! \param controller controller ID/number
388     uint64 getImcReadsForController(uint32 controller);
389     //! \brief Get the number of integrated controller reads for given channels (in cache lines)
390     //! \param beginChannel first channel in the range
391     //! \param endChannel last channel + 1: the range is [beginChannel, endChannel). endChannel is not included.
392     uint64 getImcReadsForChannels(uint32 beginChannel, uint32 endChannel);
393     //! \brief Get the number of integrated controller writes (in cache lines)
394     uint64 getImcWrites();
395     //! \brief Get the number of requests to home agent (BDX/HSX only)
396     uint64 getHALocalRequests();
397     //! \brief Get the number of local requests to home agent (BDX/HSX only)
398     uint64 getHARequests();
399 
400     //! \brief Get the number of PMM memory reads (in cache lines)
401     uint64 getPMMReads();
402     //! \brief Get the number of PMM memory writes (in cache lines)
403     uint64 getPMMWrites();
404 
405     //! \brief Get the number of cache lines read by EDC (embedded DRAM controller)
406     uint64 getEdcReads();
407     //! \brief Get the number of cache lines written by EDC (embedded DRAM controller)
408     uint64 getEdcWrites();
409 
410     //! \brief Get the number of incoming data flits to the socket through a port
411     //! \param port QPI port id
412     uint64 getIncomingDataFlits(uint32 port);
413 
414     //! \brief Get the number of outgoing data and non-data or idle flits (depending on the architecture) from the socket through a port
415     //! \param port QPI port id
416     uint64 getOutgoingFlits(uint32 port);
417 
418     ~ServerPCICFGUncore();
419 
420     //! \brief Program power counters (disables programming performance counters)
421     //! \param mc_profile memory controller measurement profile. See description of profiles in pcm-power.cpp
422     void program_power_metrics(int mc_profile);
423 
424     //! \brief Program memory counters (disables programming performance counters)
425     //! \param rankA count DIMM rank1 statistics (disables memory channel monitoring)
426     //! \param rankB count DIMM rank2 statistics (disables memory channel monitoring)
427     //! \brief metrics metric set (see the ServerUncoreMemoryMetrics enum)
428     void programServerUncoreMemoryMetrics(const ServerUncoreMemoryMetrics & metrics, const int rankA = -1, const int rankB = -1);
429 
430     //! \brief Get number of QPI LL clocks on a QPI port
431     //! \param port QPI port number
432     uint64 getQPIClocks(uint32 port);
433 
434     //! \brief Get number cycles on a QPI port when the link was in a power saving half-lane mode
435     //! \param port QPI port number
436     uint64 getQPIL0pTxCycles(uint32 port);
437     //! \brief Get number cycles on a UPI port when the link was in a L0 mode (fully active)
438     //! \param port UPI port number
439     uint64 getUPIL0TxCycles(uint32 port);
440     //! \brief Get number cycles on a QPI port when the link was in a power saving shutdown mode
441     //! \param port QPI port number
442     uint64 getQPIL1Cycles(uint32 port);
443     //! \brief Get number DRAM channel cycles
444     //! \param channel channel number
445     uint64 getDRAMClocks(uint32 channel);
446     //! \brief Get number MCDRAM channel cycles
447     //! \param channel channel number
448     uint64 getMCDRAMClocks(uint32 channel);
449     //! \brief Direct read of memory controller PMU counter (counter meaning depends on the programming: power/performance/etc)
450     //! \param channel channel number
451     //! \param counter counter number
452     uint64 getMCCounter(uint32 channel, uint32 counter);
453     //! \brief Direct read of embedded DRAM memory controller PMU counter (counter meaning depends on the programming: power/performance/etc)
454     //! \param channel channel number
455     //! \param counter counter number
456     uint64 getEDCCounter(uint32 channel, uint32 counter);
457     //! \brief Direct read of QPI LL PMU counter (counter meaning depends on the programming: power/performance/etc)
458     //! \param port port number
459     //! \param counter counter number
460     uint64 getQPILLCounter(uint32 port, uint32 counter);
461     //! \brief Direct read of M3UPI PMU counter (counter meaning depends on the programming: power/performance/etc)
462     //! \param port port number
463     //! \param counter counter number
464     uint64 getM3UPICounter(uint32 port, uint32 counter);
465     //! \brief Direct read of M2M counter
466     //! \param box box ID/number
467     //! \param counter counter number
468     uint64 getM2MCounter(uint32 box, uint32 counter);
469 
470     //! \brief Freezes event counting
471     void freezeCounters();
472     //! \brief Unfreezes event counting
473     void unfreezeCounters();
474 
475     //! \brief Measures/computes the maximum theoretical QPI link bandwidth speed in GByte/seconds
476     uint64 computeQPISpeed(const uint32 ref_core, const int cpumodel);
477 
478     //! \brief Enable correct counting of various LLC events (with memory access perf penalty)
479     void enableJKTWorkaround(bool enable);
480 
481     //! \brief Returns the number of detected QPI ports
getNumQPIPorts()482     size_t getNumQPIPorts() const { return xpiPMUs.size(); }
483 
484     //! \brief Returns the speed of the QPI link
getQPILinkSpeed(const uint32 linkNr)485     uint64 getQPILinkSpeed(const uint32 linkNr) const
486     {
487         return qpi_speed.empty() ? 0 : qpi_speed[linkNr];
488     }
489 
490     //! \brief Print QPI Speeds
491     void reportQPISpeed() const;
492 
493     //! \brief Returns the number of detected integrated memory controllers
getNumMC()494     uint32 getNumMC() const { return (uint32)num_imc_channels.size(); }
495 
496     //! \brief Returns the total number of detected memory channels on all integrated memory controllers
getNumMCChannels()497     size_t getNumMCChannels() const { return (size_t)imcPMUs.size(); }
498 
499     //! \brief Returns the total number of detected memory channels on given integrated memory controller
500     //! \param controller controller number
501     size_t getNumMCChannels(const uint32 controller) const;
502 
503     //! \brief Returns the total number of detected memory channels on all embedded DRAM controllers (EDC)
getNumEDCChannels()504     size_t getNumEDCChannels() const { return edcPMUs.size(); }
505 };
506 
507 class SimpleCounterState
508 {
509     template <class T>
510     friend uint64 getNumberOfEvents(const T & before, const T & after);
511     friend class PCM;
512     uint64 data;
513 
514 public:
SimpleCounterState()515     SimpleCounterState() : data(0)
516     { }
~SimpleCounterState()517     virtual ~SimpleCounterState() { }
518 };
519 
520 typedef SimpleCounterState PCIeCounterState;
521 typedef SimpleCounterState IIOCounterState;
522 typedef std::vector<uint64> eventGroup_t;
523 
524 class PerfVirtualControlRegister;
525 
526 /*!
527         \brief CPU Performance Monitor
528 
529         This singleton object needs to be instantiated for each process
530         before accessing counting and measuring routines
531 */
532 class PCM_API PCM
533 {
534     friend class BasicCounterState;
535     friend class UncoreCounterState;
536     friend class Socket;
537     friend class ServerUncore;
538     friend class PerfVirtualControlRegister;
539     friend class Aggregator;
540     friend class ServerPCICFGUncore;
541     PCM();     // forbidden to call directly because it is a singleton
542     PCM(const PCM &) = delete;
543     PCM & operator = (const PCM &) = delete;
544 
545     int32 cpu_family;
546     int32 cpu_model;
547     int32 cpu_stepping;
548     int64 cpu_microcode_level;
549     int32 max_cpuid;
550     int32 threads_per_core;
551     int32 num_cores;
552     int32 num_sockets;
553     int32 num_phys_cores_per_socket;
554     int32 num_online_cores;
555     int32 num_online_sockets;
556     uint32 core_gen_counter_num_max;
557     uint32 core_gen_counter_num_used;
558     uint32 core_gen_counter_width;
559     uint32 core_fixed_counter_num_max;
560     uint32 core_fixed_counter_num_used;
561     uint32 core_fixed_counter_width;
562     uint32 uncore_gen_counter_num_max;
563     uint32 uncore_gen_counter_num_used;
564     uint32 uncore_gen_counter_width;
565     uint32 uncore_fixed_counter_num_max;
566     uint32 uncore_fixed_counter_num_used;
567     uint32 uncore_fixed_counter_width;
568     uint32 perfmon_version;
569     int32 perfmon_config_anythread;
570     uint64 nominal_frequency;
571     uint64 max_qpi_speed; // in GBytes/second
572     uint32 L3ScalingFactor;
573     int32 pkgThermalSpecPower, pkgMinimumPower, pkgMaximumPower;
574 
575     std::vector<TopologyEntry> topology;
576     SystemRoot* systemTopology;
577     std::string errorMessage;
578 
579     static PCM * instance;
580     bool allow_multiple_instances;
581     bool programmed_pmu;
582     std::vector<std::shared_ptr<SafeMsrHandle> > MSR;
583     std::vector<std::shared_ptr<ServerPCICFGUncore> > server_pcicfg_uncore;
584     std::vector<UncorePMU> pcuPMUs;
585     std::vector<std::map<int32, UncorePMU> > iioPMUs;
586     std::vector<UncorePMU> uboxPMUs;
587     double joulesPerEnergyUnit;
588     std::vector<std::shared_ptr<CounterWidthExtender> > energy_status;
589     std::vector<std::shared_ptr<CounterWidthExtender> > dram_energy_status;
590     std::vector<std::vector<UncorePMU> > cboPMUs;
591 
592     std::vector<std::shared_ptr<CounterWidthExtender> > memory_bw_local;
593     std::vector<std::shared_ptr<CounterWidthExtender> > memory_bw_total;
594 #ifdef __linux__
595     Resctrl resctrl;
596 #endif
597     bool useResctrl;
598 
599     std::shared_ptr<FreeRunningBWCounters> clientBW;
600     std::shared_ptr<CounterWidthExtender> clientImcReads;
601     std::shared_ptr<CounterWidthExtender> clientImcWrites;
602     std::shared_ptr<CounterWidthExtender> clientIoRequests;
603 
604     std::vector<std::shared_ptr<ServerBW> > serverBW;
605 
606     bool disable_JKT_workaround;
607     bool blocked;              // track if time-driven counter update is running or not: PCM is blocked
608 
609     uint64 * coreCStateMsr;    // MSR addresses of core C-state free-running counters
610     uint64 * pkgCStateMsr;     // MSR addresses of package C-state free-running counters
611 
612     std::vector<std::shared_ptr<CoreTaskQueue> > coreTaskQueues;
613 
614     bool L2CacheHitRatioAvailable;
615     bool L3CacheHitRatioAvailable;
616     bool L3CacheMissesAvailable;
617     bool L2CacheMissesAvailable;
618     bool L2CacheHitsAvailable;
619     bool L3CacheHitsNoSnoopAvailable;
620     bool L3CacheHitsSnoopAvailable;
621     bool L3CacheHitsAvailable;
622 
623     bool forceRTMAbortMode;
624 
625     std::vector<uint64> FrontendBoundSlots, BadSpeculationSlots, BackendBoundSlots, RetiringSlots, AllSlotsRaw;
626     bool isFixedCounterSupported(unsigned c);
627     bool vm = false;
628     bool linux_arch_perfmon = false;
629 
630 public:
631     enum { MAX_C_STATE = 10 }; // max C-state on Intel architecture
632 
633     //! \brief Returns true if the specified core C-state residency metric is supported
isCoreCStateResidencySupported(int state)634     bool isCoreCStateResidencySupported(int state)
635     {
636         if (state == 0 || state == 1)
637             return true;
638 
639         return (coreCStateMsr != NULL && state <= ((int)MAX_C_STATE) && coreCStateMsr[state] != 0);
640     }
641 
642     //! \brief Returns true if the specified package C-state residency metric is supported
isPackageCStateResidencySupported(int state)643     bool isPackageCStateResidencySupported(int state)
644     {
645         if (state == 0)
646         {
647             return true;
648         }
649         return (pkgCStateMsr != NULL && state <= ((int)MAX_C_STATE) && pkgCStateMsr[state] != 0);
650     }
651 
652     //! \brief Redirects output destination to provided file, instead of std::cout
653     void setOutput(const std::string filename);
654 
655     //! \brief Restores output, closes output file if opened
656     void restoreOutput();
657 
658     //! \brief Set Run State.
659     // Arguments:
660     //  -- 1 - program is running
    //  -- 0 - program is sleeping
setRunState(int new_state)662     void setRunState(int new_state) { run_state = new_state; }
663 
664     //! \brief Returns program's Run State.
665     // Results:
666     //  -- 1 - program is running
    //  -- 0 - program is sleeping
getRunState(void)668     int getRunState(void) { return run_state; }
669 
isBlocked(void)670     bool isBlocked(void) { return blocked; }
setBlocked(const bool new_blocked)671     void setBlocked(const bool new_blocked) { blocked = new_blocked; }
672 
673     //! \brief Call it before program() to allow multiple running instances of PCM on the same system
allowMultipleInstances()674     void allowMultipleInstances()
675     {
676         allow_multiple_instances = true;
677     }
678 
679     //! Mode of programming (parameter in the program() method)
680     enum ProgramMode {
681         DEFAULT_EVENTS = 0,         /*!< Default choice of events, the additional parameter is not needed and ignored */
682         CUSTOM_CORE_EVENTS = 1,     /*!< Custom set of core events specified in the parameter to the program method. The parameter must be a pointer to array of four \c CustomCoreEventDescription values */
683         EXT_CUSTOM_CORE_EVENTS = 2, /*!< Custom set of core events specified in the parameter to the program method. The parameter must be a pointer to a \c ExtendedCustomCoreEventDescription  data structure */
684         INVALID_MODE                /*!< Non-programmed mode */
685     };
686 
687     //! Return codes (e.g. for program(..) method)
688     enum ErrorCode {
689         Success = 0,
690         MSRAccessDenied = 1,
691         PMUBusy = 2,
692         UnknownError
693     };
694 
695     enum PerfmonField {
696         INVALID, /* Use to parse invalid field */
697         OPCODE,
698         EVENT_SELECT,
699         UMASK,
700         RESET,
701         EDGE_DET,
702         IGNORED,
703         OVERFLOW_ENABLE,
704         ENABLE,
705         INVERT,
706         THRESH,
707         CH_MASK,
708         FC_MASK,
709         /* Below are not part of perfmon definition */
710         H_EVENT_NAME,
711         V_EVENT_NAME,
712         MULTIPLIER,
713         DIVIDER,
714         COUNTER_INDEX
715     };
716 
717     enum PCIeWidthMode {
718         X1,
719         X4,
720         X8,
721         X16,
722         XFF
723     };
724 
725     enum { // offsets/enumeration of IIO stacks
726         IIO_CBDMA = 0, // shared with DMI
727         IIO_PCIe0 = 1,
728         IIO_PCIe1 = 2,
729         IIO_PCIe2 = 3,
730         IIO_MCP0 = 4,
731         IIO_MCP1 = 5,
732         IIO_STACK_COUNT = 6
733     };
734 
735     // Offsets/enumeration of IIO stacks Skylake server.
736     enum SkylakeIIOStacks {
737         SKX_IIO_CBDMA_DMI   = 0,
738         SKX_IIO_PCIe0       = 1,
739         SKX_IIO_PCIe1       = 2,
740         SKX_IIO_PCIe2       = 3,
741         SKX_IIO_MCP0        = 4,
742         SKX_IIO_MCP1        = 5,
743         SKX_IIO_STACK_COUNT = 6
744     };
745 
746      // Offsets/enumeration of IIO stacks for IceLake server.
747     enum IcelakeIIOStacks {
748         ICX_IIO_PCIe0       = 0,
749         ICX_IIO_PCIe1       = 1,
750         ICX_IIO_MCP0        = 2,
751         ICX_IIO_PCIe2       = 3,
752         ICX_IIO_PCIe3       = 4,
753         ICX_IIO_CBDMA_DMI   = 5,
754         ICX_IIO_STACK_COUNT = 6
755     };
756 
    // Offsets/enumeration of IIO stacks for Snowridge server.
758     enum SnowridgeIIOStacks {
759         SNR_IIO_QAT         = 0,
760         SNR_IIO_CBDMA_DMI   = 1,
761         SNR_IIO_NIS         = 2,
762         SNR_IIO_HQM         = 3,
763         SNR_IIO_PCIe0       = 4,
764         SNR_IIO_STACK_COUNT = 5
765     };
766 
767     struct SimplePCIeDevInfo
768     {
769         enum PCIeWidthMode width;
770         std::string pciDevName;
771         std::string busNumber;
772 
SimplePCIeDevInfoSimplePCIeDevInfo773         SimplePCIeDevInfo() : width(XFF) { }
774     };
775 
776     /*! \brief Custom Core event description
777 
778         See "Intel 64 and IA-32 Architectures Software Developers Manual Volume 3B:
779         System Programming Guide, Part 2" for the concrete values of the data structure fields,
780         e.g. Appendix A.2 "Performance Monitoring Events for Intel(r) Core(tm) Processor Family
781         and Xeon Processor Family"
782     */
783     struct CustomCoreEventDescription
784     {
785         int32 event_number, umask_value;
786     };
787 
788     /*! \brief Extended custom core event description
789 
790         In contrast to CustomCoreEventDescription supports configuration of all fields.
791 
792         See "Intel 64 and IA-32 Architectures Software Developers Manual Volume 3B:
793         System Programming Guide, Part 2" for the concrete values of the data structure fields,
794         e.g. Appendix A.2 "Performance Monitoring Events for Intel(r) Core(tm) Processor Family
795         and Xeon Processor Family"
796     */
797     struct ExtendedCustomCoreEventDescription
798     {
799         FixedEventControlRegister * fixedCfg; // if NULL, then default configuration performed for fixed counters
800         uint32 nGPCounters;                   // number of general purpose counters
801         EventSelectRegister * gpCounterCfg;   // general purpose counters, if NULL, then default configuration performed for GP counters
802         uint64 OffcoreResponseMsrValue[2];
ExtendedCustomCoreEventDescriptionExtendedCustomCoreEventDescription803         ExtendedCustomCoreEventDescription() : fixedCfg(NULL), nGPCounters(0), gpCounterCfg(NULL)
804         {
805             OffcoreResponseMsrValue[0] = 0;
806             OffcoreResponseMsrValue[1] = 0;
807         }
808     };
809 
810     struct CustomIIOEventDescription // four parallel arrays; presumably index i of each array describes the same event — TODO confirm
811     {
812         /* The same counters are programmed on every IIO stack */
813         std::string eventNames[4];
814         IIOPMUCNTCTLRegister eventOpcodes[4];
815         int multiplier[4]; // Some IIO events require a transformation to get meaningful output (e.g. DWord to bytes)
816         int divider[4]; // We usually like to have some kind of divider (e.g. /10e6)
817     };
818 
819 private:
820     ProgramMode mode; // NOTE(review): presumably the mode last passed to program() — confirm
821     CustomCoreEventDescription coreEventDesc[PERF_MAX_CUSTOM_COUNTERS]; // one description per custom counter slot (PERF_MAX_CUSTOM_COUNTERS entries)
822 
823 #ifdef _MSC_VER
824     HANDLE numInstancesSemaphore;     // global semaphore that counts the number of PCM instances on the system
825 #else
826     // global semaphore that counts the number of PCM instances on the system
827     sem_t * numInstancesSemaphore;
828 #endif
829 
830     std::vector<int32> socketRefCore; // NOTE(review): presumably one reference core id per socket — confirm
831 
832     bool canUsePerf; // declared unconditionally; the perf members below exist only when PCM_USE_PERF is defined
833 #ifdef PCM_USE_PERF
834     std::vector<std::vector<int> > perfEventHandle; // NOTE(review): looks like [core][event position] — confirm
835     void readPerfData(uint32 core, std::vector<uint64> & data); // data is a non-const reference, i.e. an output/in-out parameter
836 
837     enum {
838         PERF_INST_RETIRED_POS = 0,
839         PERF_CPU_CLK_UNHALTED_THREAD_POS = 1,
840         PERF_CPU_CLK_UNHALTED_REF_POS = 2,
841         PERF_GEN_EVENT_0_POS = 3,
842         PERF_GEN_EVENT_1_POS = 4,
843         PERF_GEN_EVENT_2_POS = 5,
844         PERF_GEN_EVENT_3_POS = 6,
845         PERF_TOPDOWN_SLOTS_POS = PERF_GEN_EVENT_0_POS + PERF_MAX_CUSTOM_COUNTERS,
846         PERF_TOPDOWN_FRONTEND_POS = PERF_TOPDOWN_SLOTS_POS + 1,
847         PERF_TOPDOWN_BADSPEC_POS = PERF_TOPDOWN_SLOTS_POS + 2,
848         PERF_TOPDOWN_BACKEND_POS = PERF_TOPDOWN_SLOTS_POS + 3,
849         PERF_TOPDOWN_RETIRING_POS = PERF_TOPDOWN_SLOTS_POS + 4
850     };
851 
852     std::unordered_map<int, int> perfTopDownPos;
853 
854     enum {
855         PERF_GROUP_LEADER_COUNTER = PERF_INST_RETIRED_POS,
856         PERF_TOPDOWN_GROUP_LEADER_COUNTER = PERF_TOPDOWN_SLOTS_POS
857     };
858 #endif
859     std::ofstream * outfile;       // output file stream
860     std::streambuf * backup_ofile; // backup of the original output buffer (cout)
861     int run_state;                 // either running (1) or sleeping (0)
862 
863     bool needToRestoreNMIWatchdog; // NOTE(review): presumably set when the NMI watchdog was changed and has to be restored later — confirm
864 
865     std::vector<std::vector<EventSelectRegister> > lastProgrammedCustomCounters; // NOTE(review): presumably one inner vector per core — confirm
866     uint32 checkCustomCoreProgramming(std::shared_ptr<SafeMsrHandle> msr);
867     ErrorCode programCoreCounters(int core, const PCM::ProgramMode mode, const ExtendedCustomCoreEventDescription * pExtDesc,
868         std::vector<EventSelectRegister> & programmedCustomCounters); // programmedCustomCounters is passed by non-const reference (output/in-out)
869 
870     bool PMUinUse();
871     void cleanupPMU(const bool silent = false); // silent defaults to false
872     void cleanupRDT(const bool silent = false); // silent defaults to false
873     bool decrementInstanceSemaphore(); // returns true if it was the last instance
874 
875 #ifdef __APPLE__
876     // OSX does not have sem_getvalue, so the number of instances must be obtained by a different method
877     uint32 getNumInstances();
878     uint32 decrementNumInstances();
879     uint32 incrementNumInstances();
880 #endif
881 
882 
883     void computeQPISpeedBeckton(int core_nr);
884     void destroyMSR();
885     void computeNominalFrequency();
886     static bool isCPUModelSupported(const int model_); // static: decides from the passed model_ only, not from this object's state
887     std::string getSupportedUarchCodenames() const;
888     std::string getUnsupportedMessage() const;
889     bool detectModel(); // NOTE(review): meaning of the bool result (presumably success) is not visible here — confirm
890     bool checkModel();  // NOTE(review): same question as for detectModel()
891 
892     void initCStateSupportTables();
893     bool discoverSystemTopology(); // NOTE(review): bool result presumably signals success — confirm
894     void printSystemTopology() const;
895     bool initMSR();                // NOTE(review): bool result presumably signals success — confirm
896     bool detectNominalFrequency(); // NOTE(review): bool result presumably signals success — confirm
897     void showSpecControlMSRs();
898     void initEnergyMonitoring();
899     void initUncoreObjects();
900     /*!
901     *       \brief Initializes RDT
902     *
903     *       \returns nothing
        *
        *       NOTE(review): the text below this declaration originally carried the sentence
        *       "Initializes RDT infrastructure through resctrl Linux driver or direct MSR programming",
        *       which matches this function's name better than initQOSevent — confirm and keep in sync.
904     */
905     void initRDT();
906     /*!
907      *      \brief Initializes a core event MSR with an RMID for a QOS event
908      *
909      *      The QOS event is L3 cache monitoring or memory bandwidth monitoring.
910      *      This is the direct MSR programming path (as opposed to the resctrl Linux driver).
             *      \param event QOS event — NOTE(review): exact encoding not visible here, confirm
             *      \param core  core whose event MSR is initialized — NOTE(review): presumably an OS core id, confirm
911      *      \returns nothing
912     */
913     void initQOSevent(const uint64 event, const int32 core);
914     void programBecktonUncore(int core);
915     void programNehalemEPUncore(int core);
916     void enableJKTWorkaround(bool enable);
        // The templates below take the counter state by non-const reference;
        // NOTE(review): presumably the readings are accumulated into it — confirm.
917     template <class CounterStateType>
918     void readAndAggregateMemoryBWCounters(const uint32 core, CounterStateType & counterState);
919     template <class CounterStateType>
920     void readAndAggregateUncoreMCCounters(const uint32 socket, CounterStateType & counterState);
921     template <class CounterStateType>
922     void readAndAggregateEnergyCounters(const uint32 socket, CounterStateType & counterState);
923     template <class CounterStateType>
924     void readPackageThermalHeadroom(const uint32 socket, CounterStateType & counterState);
925     template <class CounterStateType>
926     void readAndAggregatePackageCStateResidencies(std::shared_ptr<SafeMsrHandle> msr, CounterStateType & result);
927     void readQPICounters(SystemCounterState & counterState);
928     void reportQPISpeed() const;
929     void readCoreCounterConfig(const bool complainAboutMSR = false); // complainAboutMSR defaults to false
930     void readCPUMicrocodeLevel();
931 
        // CBo (see getMaxNumOfCBoxes(): "CBO or CHA units") helpers.
        // NOTE(review): the CX_MSR_PMON_* functions presumably return the MSR address
        // of the named register for box Cbo (and counter/control index) — confirm.
932     uint64 CX_MSR_PMON_CTRY(uint32 Cbo, uint32 Ctr) const;
933     uint64 CX_MSR_PMON_BOX_FILTER(uint32 Cbo) const;
934     uint64 CX_MSR_PMON_BOX_FILTER1(uint32 Cbo) const;
935     uint64 CX_MSR_PMON_CTLY(uint32 Cbo, uint32 Ctl) const;
936     uint64 CX_MSR_PMON_BOX_CTL(uint32 Cbo) const;
937     void programCboOpcodeFilter(const uint32 opc0, UncorePMU & pmu, const uint32 nc_, const uint32 opc1, const uint32 loc, const uint32 rem);
938     void initLLCReadMissLatencyEvents(uint64 * events, uint32 & opCode); // events and opCode are outputs/in-outs (pointer and non-const reference)
939     void initCHARequestEvents(uint64 * events);
940     void programCbo();
941     uint64 getCBOCounterState(const uint32 socket, const uint32 ctr_);
942     template <class Iterator>
program(UncorePMU & pmu,const Iterator & eventsBegin,const Iterator & eventsEnd,const uint32 extra)943     static void program(UncorePMU& pmu, const Iterator& eventsBegin, const Iterator& eventsEnd, const uint32 extra)
944     {
945         if (!eventsBegin) return;
946         Iterator curEvent = eventsBegin;
947         for (int c = 0; curEvent != eventsEnd; ++c, ++curEvent)
948         {
949             auto ctrl = pmu.counterControl[c];
950             if (ctrl.get() != nullptr)
951             {
952                 *ctrl = MC_CH_PCI_PMON_CTL_EN;
953                 *ctrl = MC_CH_PCI_PMON_CTL_EN | *curEvent;
954             }
955         }
956         if (extra)
957         {
958             pmu.resetUnfreeze(extra);
959         }
960     }
961     void programPCU(uint32 * events, const uint64 filter); // NOTE(review): events is a non-const pointer; whether it is modified is not visible here
962     void programUBOX(const uint64* events);                // events is read-only here (pointer to const)
963 
964     void cleanupUncorePMUs(const bool silent = false);     // silent defaults to false
965 
isCLX()966     bool isCLX() const // Cascade Lake-SP
967     {
968         return (PCM::SKX == cpu_model) && (cpu_stepping > 4 && cpu_stepping < 8);
969     }
970 
isCPX(int cpu_model_,int cpu_stepping_)971     static bool isCPX(int cpu_model_, int cpu_stepping_) // Cooper Lake
972     {
973         return (PCM::SKX == cpu_model_) && (cpu_stepping_ >= 10);
974     }
975 
isCPX()976     bool isCPX() const
977     {
978         return isCPX(cpu_model, cpu_stepping);
979     }
980 
981     void initUncorePMUsDirect(); // NOTE(review): presumably the non-perf counterpart of initUncorePMUsPerf() — confirm
982     void initUncorePMUsPerf();   // NOTE(review): presumably used when useLinuxPerfForUncore() is true — confirm
983     bool isRDTDisabled() const;
984 
985 public:
986     //! \brief checks if TMA level 1 metrics are supported
987     bool isHWTMAL1Supported() const;
988 
//! \brief Named event positions; the values are consecutive, starting at zero
enum EventPosition
{
    TOR_OCCUPANCY = 0,
    TOR_INSERTS,     // 1
    REQUESTS_ALL,    // 2
    REQUESTS_LOCAL   // 3
};
996     //! \brief checks if the system is in secure boot mode
997     bool isSecureBoot() const;
998 
999     //! \brief true if Linux perf should AND can be used internally for uncore PMU programming
1000     bool useLinuxPerfForUncore() const;
1001 
1002     /*!
1003              \brief The system, sockets, uncores, cores and threads are structured like a tree
1004 
1005              \returns a reference to a const System object representing the root of the tree
1006      */
getSystemTopology()1007     SystemRoot const & getSystemTopology() const {
1008         return *systemTopology;
1009     }
1010 
1011     /*!
1012              \brief checks if QOS monitoring support is present
1013 
1014              \returns true or false
1015      */
1016     bool QOSMetricAvailable() const;
1017     /*!
1018              \brief checks if L3 cache support for QOS is present
1019 
1020              \returns true or false
1021      */
1022     bool L3QOSMetricAvailable() const;
1023     /*!
1024              \brief checks if L3 cache monitoring is present
1025 
1026              \returns true or false
1027      */
1028     bool L3CacheOccupancyMetricAvailable() const;
1029     /*!
1030             \brief checks if local memory bandwidth monitoring is present
1031 
1032             \returns true or false
1033     */
1034     bool CoreLocalMemoryBWMetricAvailable() const;
1035     /*!
1036     \brief checks if total memory bandwidth monitoring is present
1037 
         NOTE(review): the function name says "Remote" while this text says "total" — confirm which one is checked.
1038     \returns true or false
1039     */
1040     bool CoreRemoteMemoryBWMetricAvailable() const;
1041     /*!
1042      *      \brief returns the maximum number of RMIDs supported by a socket
1043      *
1044      *      \returns maximum number of RMIDs supported by a socket
1045      */
1046     unsigned getMaxRMID() const;
1047 
1048     //! \brief Returns the number of CBO or CHA units per socket
1049     uint32 getMaxNumOfCBoxes() const;
1050 
1051     //! \brief Returns the number of IIO stacks per socket
1052     uint32 getMaxNumOfIIOStacks() const;
1053 
1054     /*!
1055             \brief Returns PCM object
1056 
1057             Returns the PCM object. If the PCM object has not been created before, then
1058             an instance is created. PCM is a singleton.
1059 
1060             \return Pointer to PCM object
1061     */
1062     static PCM * getInstance();        // the only way to get access
1063 
1064     /*!
1065             \brief Checks the status of PCM object
1066 
1067             Call this method to check if PCM gained access to model specific registers. The method is deprecated, see program error code instead.
1068 
1069             \return true iff access to model specific registers works without problems
1070     */
1071     bool good();                       // true if access to CPU counters works
1072 
1073     /*! \brief Returns the error message
1074 
1075                 Call this when good() returns false, otherwise return an empty string
1076     */
getErrorMessage()1077     const std::string & getErrorMessage() const
1078     {
1079         return errorMessage;
1080     }
1081 
1082     /*! \brief Programs performance counters
1083         \param mode_ mode of programming, see ProgramMode definition (default: DEFAULT_EVENTS)
1084         \param parameter_ optional parameter for some of programming modes (default: NULL)
             \param silent defaults to false — NOTE(review): presumably suppresses console output, confirm
             \return error code, see ErrorCode definition
1085 
1086                 Call this method before you start using the performance counting routines.
1087 
1088         \warning Using these routines with other tools that *program* Performance Monitoring
1089         Units (PMUs) on CPUs is not recommended because PMU can not be shared. Tools that are known to
1090         program PMUs: Intel(r) VTune(tm), Intel(r) Performance Tuning Utility (PTU). This code may make
1091         VTune or PTU measurements invalid. VTune or PTU measurement may make measurement with this code invalid. Please enable either usage of these routines or VTune/PTU/etc.
1092     */
1093     ErrorCode program(const ProgramMode mode_ = DEFAULT_EVENTS, const void * parameter_ = NULL, const bool silent = false); // program counters and start counting
1094 
1095     /*! \brief Programs uncore latency counters on microarchitectures codename SandyBridge-EP and later Xeon uarch
1096         \param enable_pmm enables DDR/PMM. See possible profile values in pcm-latency.cpp example
             \return error code, see ErrorCode definition
1097 
1098         Call this method before you start using the latency counter routines on microarchitecture codename SandyBridge-EP and later Xeon uarch
1099 
1100         \warning After this call the memory and QPI bandwidth counters on microarchitecture codename SandyBridge-EP and later Xeon uarch will not work.
1101         \warning Using these routines with other tools that *program* Performance Monitoring
1102         Units (PMUs) on CPUs is not recommended because PMU can not be shared. Tools that are known to
1103         program PMUs: Intel(r) VTune(tm), Intel(r) Performance Tuning Utility (PTU). This code may make
1104         VTune or PTU measurements invalid. VTune or PTU measurement may make measurement with this code invalid. Please enable either usage of these routines or VTune/PTU/etc.
1105     */
1106     ErrorCode programServerUncoreLatencyMetrics(bool enable_pmm);
1107 
1108     /*! \brief Programs uncore power/energy counters on microarchitectures codename SandyBridge-EP and later Xeon uarch
1109         \param mc_profile profile for integrated memory controller PMU. See possible profile values in pcm-power.cpp example
1110         \param pcu_profile profile for power control unit PMU. See possible profile values in pcm-power.cpp example
1111         \param freq_bands array of three integer values for core frequency band monitoring (default: NULL). See usage in pcm-power.cpp example
             \return error code, see ErrorCode definition
1112 
1113         Call this method before you start using the power counter routines on microarchitecture codename SandyBridge-EP and later Xeon uarch
1114 
1115         \warning After this call the memory and QPI bandwidth counters on microarchitecture codename SandyBridge-EP and later Xeon uarch will not work.
1116         \warning Using these routines with other tools that *program* Performance Monitoring
1117         Units (PMUs) on CPUs is not recommended because PMU can not be shared. Tools that are known to
1118         program PMUs: Intel(r) VTune(tm), Intel(r) Performance Tuning Utility (PTU). This code may make
1119         VTune or PTU measurements invalid. VTune or PTU measurement may make measurement with this code invalid. Please enable either usage of these routines or VTune/PTU/etc.
1120     */
1121     ErrorCode programServerUncorePowerMetrics(int mc_profile, int pcu_profile, int * freq_bands = NULL);
1122 
1123     /*! \brief Program memory counters (disables programming performance counters)
1124         \param rankA count DIMM rank1 statistics (disables memory channel monitoring); default -1
1125         \param rankB count DIMM rank2 statistics (disables memory channel monitoring); default -1
1126         \param metrics metric set (see the ServerUncoreMemoryMetrics enum)
             \return error code, see ErrorCode definition
1127 
1128         Call this method before you start using the memory counter routines on microarchitecture codename SandyBridge-EP and later Xeon uarch
1129 
1130         \warning Using these routines with other tools that *program* Performance Monitoring
1131         Units (PMUs) on CPUs is not recommended because PMU can not be shared. Tools that are known to
1132         program PMUs: Intel(r) VTune(tm), Intel(r) Performance Tuning Utility (PTU). This code may make
1133         VTune or PTU measurements invalid. VTune or PTU measurement may make measurement with this code invalid. Please enable either usage of these routines or VTune/PTU/etc.
1134     */
1135     ErrorCode programServerUncoreMemoryMetrics(const ServerUncoreMemoryMetrics & metrics, int rankA = -1, int rankB = -1);
1136 
1137     // vector of IDs. E.g. for core {raw event} or {raw event, offcore response1 msr value, } or {raw event, offcore response1 msr value, offcore response2}
1138     // or for cha/cbo {raw event, filter value}, etc
1139     // + user-supplied name
1140     typedef std::pair<std::array<uint64, 3>, std::string> RawEventConfig;
1141     struct RawPMUConfig
1142     {
1143         std::vector<RawEventConfig> programmable;
1144         std::vector<RawEventConfig> fixed;
1145     };
1146     typedef std::map<std::string, RawPMUConfig> RawPMUConfigs;
1147     ErrorCode program(const RawPMUConfigs& curPMUConfigs, const bool silent = false); // overload taking raw PMU configurations; silent defaults to false
1148 
1149     //! \brief Freezes uncore event counting (works only on microarchitecture codename SandyBridge-EP and IvyTown)
1150     void freezeServerUncoreCounters();
1151 
1152     //! \brief Unfreezes uncore event counting (works only on microarchitecture codename SandyBridge-EP and IvyTown)
1153     void unfreezeServerUncoreCounters();
1154 
1155     /*! \brief Reads the power/energy counter state of a socket (works only on microarchitecture codename SandyBridge-EP)
1156         \param socket socket id
1157         \return State of power counters in the socket
1158     */
1159     ServerUncoreCounterState getServerUncoreCounterState(uint32 socket);
1160 
1161     /*! \brief Cleans up resources and stops performance counting
1162 
1163             Call this method when your program finishes and/or you are not going to use the
1164             performance counting routines anymore.
             \param silent defaults to false — NOTE(review): presumably suppresses console output, confirm
1165 */
1166     void cleanup(const bool silent = false);
1167 
1168     /*! \brief Forces PMU reset
1169 
1170                 If there is no chance to free up the PMU from other applications you might try to call this method at your own risk.
1171     */
1172     void resetPMU();
1173 
1174     /*! \brief Reads all counter states (including system, sockets and cores)
1175 
1176         \param systemState system counter state (return parameter)
1177         \param socketStates socket counter states (return parameter)
1178         \param coreStates core counter states (return parameter)
1179 
1180     */
1181     void getAllCounterStates(SystemCounterState & systemState, std::vector<SocketCounterState> & socketStates, std::vector<CoreCounterState> & coreStates);
1182 
1183     /*! \brief Reads uncore counter states (including system and sockets) but no core counters
1184 
1185     \param systemState system counter state (return parameter)
1186     \param socketStates socket counter states (return parameter)
1187 
1188     */
1189     void getUncoreCounterStates(SystemCounterState & systemState, std::vector<SocketCounterState> & socketStates);
1190 
1191     /*! \brief Returns true if the core is online
1192 
1193         \param os_core_id OS core id
1194     */
1195     bool isCoreOnline(int32 os_core_id) const;
1196 
1197     /*! \brief Returns true if the socket is online
1198 
1199         \param socket_id OS socket id
1200     */
1201     bool isSocketOnline(int32 socket_id) const;
1202 
1203     /*! \brief Reads the counter state of the system
1204 
1205             System consists of several sockets (CPUs).
1206             Socket has a CPU in it. Socket (CPU) consists of several (logical) cores.
1207 
1208             \return State of counters in the entire system
1209     */
1210     SystemCounterState getSystemCounterState();
1211 
1212     /*! \brief Reads the counter state of a socket
1213             \param socket socket id
1214             \return State of counters in the socket
1215     */
1216     SocketCounterState getSocketCounterState(uint32 socket);
1217 
1218     /*! \brief Reads the counter state of a (logical) core
1219 
1220         Be aware that during the measurement other threads may be scheduled on the same core by the operating system (this is called context-switching). The performance events caused by these threads will be counted as well.
1221 
1222 
1223             \param core core id
1224             \return State of counters in the core
1225     */
1226     CoreCounterState getCoreCounterState(uint32 core);
1227 
1228     /*! \brief Reads number of logical cores in the system
1229             \return Number of logical cores in the system
1230     */
1231     uint32 getNumCores() const;
1232 
1233     /*! \brief Reads number of online logical cores in the system
1234             \return Number of online logical cores in the system
1235     */
1236     uint32 getNumOnlineCores() const;
1237 
1238     /*! \brief Reads number of sockets (CPUs) in the system
1239             \return Number of sockets in the system
1240     */
1241     uint32 getNumSockets() const;
1242 
1243     /*! \brief Reads number of online sockets (CPUs) in the system
1244             \return Number of online sockets in the system
1245     */
1246     uint32 getNumOnlineSockets() const;
1247 
1248     /*! \brief Reads how many hardware threads a physical core has
1249             "Hardware thread" is a logical core in a different terminology.
1250             If Intel(r) Hyperthreading(tm) is enabled then this function returns 2.
1251             \return Number of hardware threads per physical core
1252     */
1253     uint32 getThreadsPerCore() const;
1254 
1255     /*! \brief Checks if SMT (HyperThreading) is enabled.
1256             \return true iff SMT (HyperThreading) is enabled.
1257     */
1258     bool getSMT() const; // returns true iff SMT ("Hyperthreading") is on
1259 
1260     /*! \brief Reads the nominal core frequency
1261             \return Nominal frequency in Hz
1262     */
1263     uint64 getNominalFrequency() const; // in Hz
1264 
1265     /*! \brief runs CPUID.0xF.0x01 to get the L3 up scaling factor to calculate L3 Occupancy
1266      *  Scaling factor is returned in EBX register after running the CPU instruction
1267      * \return L3 up scaling factor
1268      */
1269     uint32 getL3ScalingFactor() const;
1270 
1271     /*! \brief runs CPUID.0xB.0x01 to get maximum logical cores (including SMT) per socket.
1272      *  max_lcores_per_socket is returned in EBX[15:0]. Compare this value with number of cores per socket
1273      *  detected in the system to see if some cores are offlined
1274      * \return true iff max_lcores_per_socket == number of cores per socket detected
          *  NOTE(review): the function name suggests true means "some core is offlined", which would be the
          *  opposite of the equality stated above — confirm against the implementation.
1275      */
1276     bool isSomeCoreOfflined();
1277 
1278     /*! \brief Returns the maximum number of custom (general-purpose) core events supported by CPU
1279     */
1280     int32 getMaxCustomCoreEvents();
1281 
//! \brief Identifiers of supported CPU models
//!
//! Listed in ascending order of their numeric value; every value is given
//! explicitly, so the order of the enumerators carries no meaning.
enum SupportedCPUModels
{
    NEHALEM_EP = 26,
    ATOM = 28,
    NEHALEM = 30,
    CLARKDALE = 37,
    SANDY_BRIDGE = 42,
    WESTMERE_EP = 44,
    JAKETOWN = 45,
    NEHALEM_EX = 46,
    WESTMERE_EX = 47,
    ATOM_2 = 53,
    CENTERTON = 54,
    BAYTRAIL = 55,
    IVY_BRIDGE = 58,
    HASWELL = 60,
    BROADWELL = 61,
    IVYTOWN = 62,
    HASWELLX = 63,
    HASWELL_ULT = 69,
    HASWELL_2 = 70,
    BROADWELL_XEON_E3 = 71,
    CHERRYTRAIL = 76,
    AVOTON = 77,
    SKL_UY = 78,
    BDX = 79,
    SKX = 85,
    BDX_DE = 86,
    KNL = 87,
    APOLLO_LAKE = 92,
    SKL = 94,
    DENVERTON = 95,
    ICX = 106,
    ICX_D = 108,
    ICL_1 = 125,
    ICL = 126,
    SNOWRIDGE = 134,
    TGL = 140,
    TGL_1 = 141,
    KBL_1 = 142,
    KBL = 158,
    CML_1 = 165,
    CML = 166,
    RKL = 167,
    END_OF_MODEL_LIST = 0x0ffff
};
1329 
// Expands to a run of `case` labels (no statement after the last one), so it must be
// used inside a switch over a CPU model value and be followed by a statement or further labels.
// CML_1, ICL_1 and TGL_1 are not listed here.
// NOTE(review): presumably those ids are mapped to CML/ICL/TGL before this macro is used — confirm.
1330 #define PCM_SKL_PATH_CASES \
1331         case PCM::SKL_UY:  \
1332         case PCM::KBL:     \
1333         case PCM::KBL_1:   \
1334         case PCM::CML:     \
1335         case PCM::ICL:     \
1336         case PCM::RKL:     \
1337         case PCM::TGL:     \
1338         case PCM::SKL:
1339 
1340 private:
useSKLPath()1341     bool useSKLPath() const
1342     {
1343         switch (cpu_model)
1344         {
1345             PCM_SKL_PATH_CASES
1346                 return true;
1347         }
1348         return false;
1349     }
1350 public:
1351 
1352     //! \brief Reads CPU model id
1353     //! \return CPU model ID
getCPUModel()1354     uint32 getCPUModel() const { return (uint32)cpu_model; }
1355 
1356     //! \brief Reads CPU stepping id
1357     //! \return CPU stepping ID
getCPUStepping()1358     uint32 getCPUStepping() const { return (uint32)cpu_stepping; }
1359 
1360     //! \brief Determines physical thread of given processor ID within a core
1361     //! \param os_id processor identifier
1362     //! \return physical thread identifier
getThreadId(uint32 os_id)1363     int32 getThreadId(uint32 os_id) const { return (int32)topology[os_id].thread_id; }
1364 
1365     //! \brief Determines physical core of given processor ID within a socket
1366     //! \param os_id processor identifier
1367     //! \return physical core identifier
getCoreId(uint32 os_id)1368     int32 getCoreId(uint32 os_id) const { return (int32)topology[os_id].core_id; }
1369 
1370     //! \brief Determines physical tile (cores sharing L2 cache) of given processor ID
1371     //! \param os_id processor identifier
1372     //! \return physical tile identifier
getTileId(uint32 os_id)1373     int32 getTileId(uint32 os_id) const { return (int32)topology[os_id].tile_id; }
1374 
1375     //! \brief Determines socket of given core
1376     //! \param core_id core identifier
1377     //! \return socket identifier
getSocketId(uint32 core_id)1378     int32 getSocketId(uint32 core_id) const { return (int32)topology[core_id].socket; }
1379 
1380     //! \brief Returns the number of Intel(r) Quick Path Interconnect(tm) links per socket
1381     //! \return number of QPI links per socket
getQPILinksPerSocket()1382     uint64 getQPILinksPerSocket() const
1383     {
1384         switch (cpu_model)
1385         {
1386         case NEHALEM_EP:
1387         case WESTMERE_EP:
1388         case CLARKDALE:
1389             if (num_sockets == 2)
1390                 return 2;
1391             else
1392                 return 1;
1393         case NEHALEM_EX:
1394         case WESTMERE_EX:
1395             return 4;
1396         case JAKETOWN:
1397         case IVYTOWN:
1398         case HASWELLX:
1399         case BDX_DE:
1400         case BDX:
1401         case SKX:
1402         case ICX:
1403             return (server_pcicfg_uncore.size() && server_pcicfg_uncore[0].get()) ? (server_pcicfg_uncore[0]->getNumQPIPorts()) : 0;
1404         }
1405         return 0;
1406     }
1407 
1408     //! \brief Returns the number of detected integrated memory controllers per socket
getMCPerSocket()1409     uint32 getMCPerSocket() const
1410     {
1411         switch (cpu_model)
1412         {
1413         case NEHALEM_EP:
1414         case WESTMERE_EP:
1415         case CLARKDALE:
1416             return 1;
1417         case NEHALEM_EX:
1418         case WESTMERE_EX:
1419             return 2;
1420         case JAKETOWN:
1421         case IVYTOWN:
1422         case HASWELLX:
1423         case BDX_DE:
1424         case SKX:
1425         case ICX:
1426         case BDX:
1427         case KNL:
1428             return (server_pcicfg_uncore.size() && server_pcicfg_uncore[0].get()) ? (server_pcicfg_uncore[0]->getNumMC()) : 0;
1429         }
1430         return 0;
1431     }
1432 
1433     //! \brief Returns the total number of detected memory channels on all integrated memory controllers per socket
getMCChannelsPerSocket()1434     size_t getMCChannelsPerSocket() const
1435     {
1436         switch (cpu_model)
1437         {
1438         case NEHALEM_EP:
1439         case WESTMERE_EP:
1440         case CLARKDALE:
1441             return 3;
1442         case NEHALEM_EX:
1443         case WESTMERE_EX:
1444             return 4;
1445         case JAKETOWN:
1446         case IVYTOWN:
1447         case HASWELLX:
1448         case BDX_DE:
1449         case SKX:
1450         case ICX:
1451         case BDX:
1452         case KNL:
1453         case SNOWRIDGE:
1454             return (server_pcicfg_uncore.size() && server_pcicfg_uncore[0].get()) ? (server_pcicfg_uncore[0]->getNumMCChannels()) : 0;
1455         }
1456         return 0;
1457     }
1458 
1459     //! \brief Returns the number of detected memory channels on given integrated memory controllers
1460     //! \param socket socket
1461     //! \param controller controller
getMCChannels(uint32 socket,uint32 controller)1462     size_t getMCChannels(uint32 socket, uint32 controller) const
1463     {
1464         switch (cpu_model)
1465         {
1466         case NEHALEM_EP:
1467         case WESTMERE_EP:
1468         case CLARKDALE:
1469             return 3;
1470         case NEHALEM_EX:
1471         case WESTMERE_EX:
1472             return 4;
1473         case JAKETOWN:
1474         case IVYTOWN:
1475         case HASWELLX:
1476         case BDX_DE:
1477         case SKX:
1478         case ICX:
1479         case BDX:
1480         case KNL:
1481         case SNOWRIDGE:
1482             return (socket < server_pcicfg_uncore.size() && server_pcicfg_uncore[socket].get()) ? (server_pcicfg_uncore[socket]->getNumMCChannels(controller)) : 0;
1483         }
1484         return 0;
1485     }
1486 
1487 
1488     //! \brief Returns the total number of detected memory channels on all integrated memory controllers per socket
getEDCChannelsPerSocket()1489     size_t getEDCChannelsPerSocket() const
1490     {
1491         switch (cpu_model)
1492         {
1493         case KNL:
1494             return (server_pcicfg_uncore.size() && server_pcicfg_uncore[0].get()) ? (server_pcicfg_uncore[0]->getNumEDCChannels()) : 0;
1495         }
1496         return 0;
1497     }
1498 
1499 
1500     //! \brief Returns the max number of instructions per cycle
1501     //! \return max number of instructions per cycle
getMaxIPC()1502     uint32 getMaxIPC() const
1503     {
1504         if (ICL == cpu_model || TGL == cpu_model || RKL == cpu_model) return 5;
1505         switch (cpu_model)
1506         {
1507         case SNOWRIDGE:
1508             return 4;
1509         case DENVERTON:
1510             return 3;
1511         case NEHALEM_EP:
1512         case WESTMERE_EP:
1513         case NEHALEM_EX:
1514         case WESTMERE_EX:
1515         case CLARKDALE:
1516         case SANDY_BRIDGE:
1517         case JAKETOWN:
1518         case IVYTOWN:
1519         case IVY_BRIDGE:
1520         case HASWELL:
1521         case HASWELLX:
1522         case BROADWELL:
1523         case BDX_DE:
1524         case BDX:
1525         PCM_SKL_PATH_CASES
1526         case SKX:
1527             return 4;
1528         case KNL:
1529             return 2;
1530         case ICX:
1531             return 5;
1532         }
1533         if (isAtom())
1534         {
1535             return 2;
1536         }
1537         return 0;
1538     }
1539 
1540     //! \brief Returns the frequency of Power Control Unit
getPCUFrequency()1541     uint64 getPCUFrequency() const
1542     {
1543         switch (cpu_model)
1544         {
1545         case JAKETOWN:
1546         case IVYTOWN:
1547             return 800000000ULL;  // 800 MHz
1548         case HASWELLX:
1549         case BDX_DE:
1550         case BDX:
1551         case KNL:
1552             return 1000000000ULL; // 1 GHz
1553         case SKX:
1554         case ICX:
1555         case SNOWRIDGE:
1556             return 1100000000ULL; // 1.1 GHz
1557         }
1558         return 0;
1559     }
1560 
1561     //! \brief Returns whether it is a server part
isServerCPU()1562     bool isServerCPU() const
1563     {
1564         switch (cpu_model)
1565         {
1566         case NEHALEM_EP:
1567         case NEHALEM_EX:
1568         case WESTMERE_EP:
1569         case WESTMERE_EX:
1570         case JAKETOWN:
1571         case IVYTOWN:
1572         case HASWELLX:
1573         case BDX:
1574         case BDX_DE:
1575         case SKX:
1576         case ICX:
1577         case SNOWRIDGE:
1578         case KNL:
1579             return true;
1580         default:
1581             return false;
1582         };
1583     }
1584 
1585     //! \brief Returns whether it is a client part
isClientCPU()1586     bool isClientCPU() const
1587     {
1588         return !isServerCPU();
1589     }
1590     //! \brief Return TSC timer value in time units
    //! \param multiplier use 1 for seconds, 1000 for ms, 1000000 for us (microseconds), etc (default is 1000: ms)
1592     //! \param core core to read on-chip TSC value (default is 0)
1593     //! \return time counter value
1594     uint64 getTickCount(uint64 multiplier = 1000 /* ms */, uint32 core = 0);
1595 
1596     //! \brief Return TSC timer value in time units using rdtscp instruction from current core
    //! \param multiplier use 1 for seconds, 1000 for ms, 1000000 for us (microseconds), etc (default is 1000: ms)
    //! \warning Processor support is required (bit 27 of cpuid EDX must be set); on Windows, Visual Studio 2010 is required
1599     //! \return time counter value
1600     uint64 getTickCountRDTSCP(uint64 multiplier = 1000 /* ms */);
1601 
1602     //! \brief Returns uncore clock ticks on specified socket
1603     uint64 getUncoreClocks(const uint32 socket_);
1604 
1605     //! \brief Return QPI Link Speed in GBytes/second
1606     //! \warning Works only for Nehalem-EX (Xeon 7500) and Xeon E7 and E5 processors
1607     //! \return QPI Link Speed in GBytes/second
getQPILinkSpeed(uint32 socketNr,uint32 linkNr)1608     uint64 getQPILinkSpeed(uint32 socketNr, uint32 linkNr) const
1609     {
1610         return hasPCICFGUncore() ? server_pcicfg_uncore[socketNr]->getQPILinkSpeed(linkNr) : max_qpi_speed;
1611     }
1612 
1613     //! \brief Returns how many joules are in an internal processor energy unit
getJoulesPerEnergyUnit()1614     double getJoulesPerEnergyUnit() const { return joulesPerEnergyUnit; }
1615 
1616     //! \brief Returns thermal specification power of the package domain in Watt
getPackageThermalSpecPower()1617     int32 getPackageThermalSpecPower() const { return pkgThermalSpecPower; }
1618 
1619     //! \brief Returns minimum power derived from electrical spec of the package domain in Watt
getPackageMinimumPower()1620     int32 getPackageMinimumPower() const { return pkgMinimumPower; }
1621 
1622     //! \brief Returns maximum power derived from electrical spec of the package domain in Watt
getPackageMaximumPower()1623     int32 getPackageMaximumPower() const { return pkgMaximumPower; }
1624 
1625     #ifndef NO_WINRING // In cases where loading the WinRing0 driver is not desirable as a fallback to MSR.sys, add -DNO_WINRING to compile command to remove ability to load driver
1626     //! \brief Loads and initializes Winring0 third party library for access to processor model specific and PCI configuration registers
1627     //! \return returns true in case of success
1628     static bool initWinRing0Lib();
1629     #endif // NO_WINRING
1630 
disableJKTWorkaround()1631     inline void disableJKTWorkaround() { disable_JKT_workaround = true; }
1632 
    //! \brief Event codes for PCIe-related uncore events, grouped by traffic direction (see the per-entry comments)
    enum PCIeEventCode
    {
        // PCIe read events (PCI devices reading from memory - application writes to disk/network/PCIe device)
        PCIeRdCur = 0x19E, // PCIe read current (full cache line)
        PCIeNSRd = 0x1E4,  // PCIe non-snoop read (full cache line)
        // PCIe write events (PCI devices writing to memory - application reads from disk/network/PCIe device)
        PCIeWiLF = 0x194,  // PCIe Write (non-allocating) (full cache line)
        PCIeItoM = 0x19C,  // PCIe Write (allocating) (full cache line)
        PCIeNSWr = 0x1E5,  // PCIe Non-snoop write (partial cache line)
        PCIeNSWrF = 0x1E6, // PCIe Non-snoop write (full cache line)
        // events shared by CPU and IO
        RFO = 0x180,       // Demand Data RFO; share the same code for CPU, use tid to filter PCIe only traffic
        CRd = 0x181,       // Demand Code Read
        DRd = 0x182,       // Demand Data Read
        PRd = 0x187,       // Partial Reads (UC) (MMIO Read)
        WiL = 0x18F,       // Write Invalidate Line - partial (MMIO write), PL: Not documented in HSX/IVT
        ItoM = 0x1C8,      // Request Invalidate Line; share the same code for CPU, use tid to filter PCIe only traffic

        // NOTE(review): presumably the SKX (Skylake server) encodings of the like-named events above - confirm.
        // Each value here equals the corresponding un-prefixed value + 0x80 (SKX_RdCur pairs with PCIeRdCur).
        SKX_RFO = 0x200,
        SKX_CRd = 0x201,
        SKX_DRd = 0x202,
        SKX_PRd = 0x207,
        SKX_WiL = 0x20F,
        SKX_RdCur = 0x21E,
        SKX_ItoM = 0x248,
    };
1659 
    //! \brief Selector for a CHA pipeline queue
    // NOTE(review): the meaning of IRQ/PRQ is not visible in this part of the file - confirm against the CHA programming code.
    enum ChaPipelineQueue
    {
        None, // no queue selected (value 0)
        IRQ,
        PRQ,
    };
1666 
    //! \brief TID values associated with the RFO and ItoM events (see the "use tid to filter" notes in PCIeEventCode)
    enum CBoEventTid
    {
        RFOtid = 0x3E,
        ItoMtid = 0x3E, // same value as RFOtid
    };
1672 
1673     //! \brief Program uncore PCIe monitoring event(s)
    //! \param eventGroup - events to program for the same run
1675     void programPCIeEventGroup(eventGroup_t &eventGroup);
1676     uint64 getPCIeCounterData(const uint32 socket_, const uint32 ctr_);
1677 
1678     //! \brief Program CBO (or CHA on SKX+) counters
1679     //! \param events array with four raw event values
1680     //! \param opCode opcode match filter
1681     //! \param nc_ match non-coherent requests
1682     //! \param llc_lookup_tid_filter filter for LLC lookup event filter and TID filter (core and thread ID)
1683     //! \param loc match on local node target
1684     //! \param rem match on remote node target
1685     void programCbo(const uint64 * events, const uint32 opCode = 0, const uint32 nc_ = 0, const uint32 llc_lookup_tid_filter = 0, const uint32 loc = 1, const uint32 rem = 1);
1686 
1687     //! \brief Program CBO (or CHA on SKX+) counters
1688     //! \param events array with four raw event values
1689     //! \param filter0 raw filter value
1690     //! \param filter1 raw filter1 value
1691     void programCboRaw(const uint64* events, const uint64 filter0, const uint64 filter1);
1692 
1693     //! \brief Get the state of PCIe counter(s)
1694     //! \param socket_ socket of the PCIe controller
1695     //! \return State of PCIe counter(s)
1696     PCIeCounterState getPCIeCounterState(const uint32 socket_, const uint32 ctr_ = 0);
1697 
1698     //! \brief Program uncore IIO events
1699     //! \param rawEvents events to program (raw format)
1700     //! \param IIOStack id of the IIO stack to program (-1 for all, if parameter omitted)
1701     void programIIOCounters(uint64 rawEvents[4], int IIOStack = -1);
1702 
1703     //! \brief Get the state of IIO counter
1704     //! \param socket socket of the IIO stack
1705     //! \param IIOStack id of the IIO stack
1706     //! \return State of IIO counter
1707     IIOCounterState getIIOCounterState(int socket, int IIOStack, int counter);
1708 
1709     //! \brief Get the states of the four IIO counters in bulk (faster than four single reads)
1710     //! \param socket socket of the IIO stack
1711     //! \param IIOStack id of the IIO stack
1712     //! \param result states of IIO counters (array of four IIOCounterState elements)
1713     void getIIOCounterStates(int socket, int IIOStack, IIOCounterState * result);
1714 
1715     uint64 extractCoreGenCounterValue(uint64 val);
1716     uint64 extractCoreFixedCounterValue(uint64 val);
1717     uint64 extractUncoreGenCounterValue(uint64 val);
1718     uint64 extractUncoreFixedCounterValue(uint64 val);
1719     uint64 extractQOSMonitoring(uint64 val);
1720 
1721     //! \brief Get a string describing the codename of the processor microarchitecture
1722     //! \param cpu_model_ cpu model (if no parameter provided the codename of the detected CPU is returned)
1723     const char * getUArchCodename(const int32 cpu_model_ = -1) const;
1724 
1725     //! \brief Get Brand string of processor
1726     static std::string getCPUBrandString();
1727     std::string getCPUFamilyModelString();
1728 
1729 
1730     //! \brief Enables "force all RTM transaction abort" mode also enabling 4+ programmable counters on Skylake generation processors
1731     void enableForceRTMAbortMode(const bool silent = false);
1732 
1733     //! \brief queries status of "force all RTM transaction abort" mode
1734     bool isForceRTMAbortModeEnabled() const;
1735 
1736     //! \brief Disables "force all RTM transaction abort" mode restricting the number of programmable counters on Skylake generation processors to 3
1737     void disableForceRTMAbortMode(const bool silent = false);
1738 
1739     //! \brief queries availability of "force all RTM transaction abort" mode
1740     bool isForceRTMAbortModeAvailable() const;
1741 
1742     //! \brief Get microcode level (returns -1 if retrieval not supported due to some restrictions)
getCPUMicrocodeLevel()1743     int64 getCPUMicrocodeLevel() const { return cpu_microcode_level; }
1744 
1745     //! \brief returns true if CPU model is Atom-based
isAtom(const int32 cpu_model_)1746     static bool isAtom(const int32 cpu_model_)
1747     {
1748         return cpu_model_ == ATOM
1749             || cpu_model_ == ATOM_2
1750             || cpu_model_ == CENTERTON
1751             || cpu_model_ == BAYTRAIL
1752             || cpu_model_ == AVOTON
1753             || cpu_model_ == CHERRYTRAIL
1754             || cpu_model_ == APOLLO_LAKE
1755             || cpu_model_ == DENVERTON
1756             // || cpu_model_ == SNOWRIDGE do not use Atom code for SNOWRIDGE
1757             ;
1758     }
1759 
1760     //! \brief returns true if CPU is Atom-based
isAtom()1761     bool isAtom() const
1762     {
1763         return isAtom(cpu_model);
1764     }
1765 
packageEnergyMetricsAvailable()1766     bool packageEnergyMetricsAvailable() const
1767     {
1768         return (
1769                     cpu_model == PCM::JAKETOWN
1770                  || cpu_model == PCM::IVYTOWN
1771                  || cpu_model == PCM::SANDY_BRIDGE
1772                  || cpu_model == PCM::IVY_BRIDGE
1773                  || cpu_model == PCM::HASWELL
1774                  || cpu_model == PCM::AVOTON
1775                  || cpu_model == PCM::CHERRYTRAIL
1776                  || cpu_model == PCM::BAYTRAIL
1777                  || cpu_model == PCM::APOLLO_LAKE
1778                  || cpu_model == PCM::DENVERTON
1779                  || cpu_model == PCM::SNOWRIDGE
1780                  || cpu_model == PCM::HASWELLX
1781                  || cpu_model == PCM::BROADWELL
1782                  || cpu_model == PCM::BDX_DE
1783                  || cpu_model == PCM::BDX
1784                  || cpu_model == PCM::KNL
1785                  || useSKLPath()
1786                  || cpu_model == PCM::SKX
1787                  || cpu_model == PCM::ICX
1788                );
1789     }
1790 
dramEnergyMetricsAvailable()1791     bool dramEnergyMetricsAvailable() const
1792     {
1793         return (
1794              cpu_model == PCM::JAKETOWN
1795           || cpu_model == PCM::IVYTOWN
1796           || cpu_model == PCM::HASWELLX
1797           || cpu_model == PCM::BDX_DE
1798           || cpu_model == PCM::BDX
1799           || cpu_model == PCM::KNL
1800           || cpu_model == PCM::SKX
1801           || cpu_model == PCM::ICX
1802           );
1803     }
1804 
packageThermalMetricsAvailable()1805     bool packageThermalMetricsAvailable() const
1806     {
1807         return packageEnergyMetricsAvailable();
1808     }
1809 
outgoingQPITrafficMetricsAvailable()1810     bool outgoingQPITrafficMetricsAvailable() const
1811     {
1812         return getQPILinksPerSocket() > 0 &&
1813             (
1814                 cpu_model == PCM::NEHALEM_EX
1815             ||  cpu_model == PCM::WESTMERE_EX
1816             ||  cpu_model == PCM::JAKETOWN
1817             ||  cpu_model == PCM::IVYTOWN
1818             ||  cpu_model == PCM::HASWELLX
1819             ||  cpu_model == PCM::BDX
1820             ||  cpu_model == PCM::SKX
1821             ||  cpu_model == PCM::ICX
1822             );
1823     }
1824 
incomingQPITrafficMetricsAvailable()1825     bool incomingQPITrafficMetricsAvailable() const
1826     {
1827         return getQPILinksPerSocket() > 0 &&
1828             (
1829                 cpu_model == PCM::NEHALEM_EX
1830             ||  cpu_model == PCM::WESTMERE_EX
1831             ||  cpu_model == PCM::JAKETOWN
1832             ||  cpu_model == PCM::IVYTOWN
1833             || (cpu_model == PCM::SKX && cpu_stepping > 1)
1834             ||  cpu_model == PCM::ICX
1835                );
1836     }
1837 
localMemoryRequestRatioMetricAvailable()1838     bool localMemoryRequestRatioMetricAvailable() const
1839     {
1840         return cpu_model == PCM::HASWELLX
1841             || cpu_model == PCM::BDX
1842             || cpu_model == PCM::SKX
1843             || cpu_model == PCM::ICX
1844             ;
1845     }
1846 
qpiUtilizationMetricsAvailable()1847     bool qpiUtilizationMetricsAvailable() const
1848     {
1849         return outgoingQPITrafficMetricsAvailable();
1850     }
1851 
memoryTrafficMetricsAvailable()1852     bool memoryTrafficMetricsAvailable() const
1853     {
1854         return (!(isAtom() || cpu_model == PCM::CLARKDALE))
1855                ;
1856     }
1857 
MCDRAMmemoryTrafficMetricsAvailable()1858     bool MCDRAMmemoryTrafficMetricsAvailable() const
1859     {
1860         return (cpu_model == PCM::KNL);
1861     }
1862 
memoryIOTrafficMetricAvailable()1863     bool memoryIOTrafficMetricAvailable() const
1864     {
1865         if (cpu_model == TGL) return false;
1866         return (
1867             cpu_model == PCM::SANDY_BRIDGE
1868             || cpu_model == PCM::IVY_BRIDGE
1869             || cpu_model == PCM::HASWELL
1870             || cpu_model == PCM::BROADWELL
1871             || useSKLPath()
1872             );
1873     }
1874 
IIOEventsAvailable()1875     bool IIOEventsAvailable() const
1876     {
1877         return (
1878                cpu_model == PCM::SKX
1879             || cpu_model == PCM::ICX
1880 	    || cpu_model  == PCM::SNOWRIDGE
1881         );
1882     }
1883 
LatencyMetricsAvailable()1884     bool LatencyMetricsAvailable() const
1885     {
1886         return (
1887             cpu_model == PCM::HASWELLX
1888             || cpu_model == PCM::BDX
1889             || cpu_model == PCM::SKX
1890             || cpu_model == PCM::ICX
1891             || useSKLPath()
1892             );
1893     }
1894 
DDRLatencyMetricsAvailable()1895     bool DDRLatencyMetricsAvailable() const
1896     {
1897         return (
1898             cpu_model == PCM::SKX
1899             || cpu_model == PCM::ICX
1900             );
1901     }
1902 
PMMTrafficMetricsAvailable()1903     bool PMMTrafficMetricsAvailable() const
1904     {
1905         return (
1906             isCLX()
1907                     ||  isCPX()
1908                      || cpu_model == PCM::ICX
1909                      || cpu_model == PCM::SNOWRIDGE
1910         );
1911     }
1912 
LLCReadMissLatencyMetricsAvailable()1913     bool LLCReadMissLatencyMetricsAvailable() const
1914     {
1915         return (
1916                HASWELLX == cpu_model
1917             || BDX_DE == cpu_model
1918             || BDX == cpu_model
1919             || isCLX()
1920             || isCPX()
1921 #ifdef PCM_ENABLE_LLCRDLAT_SKX_MP
1922             || SKX == cpu_model
1923 #else
1924             || ((SKX == cpu_model) && (num_sockets == 1))
1925 #endif
1926             || ICX == cpu_model
1927             || SNOWRIDGE == cpu_model
1928                );
1929     }
1930 
hasBecktonUncore()1931     bool hasBecktonUncore() const
1932     {
1933         return (
1934             cpu_model == PCM::NEHALEM_EX
1935             || cpu_model == PCM::WESTMERE_EX
1936             );
1937     }
hasPCICFGUncore()1938     bool hasPCICFGUncore() const // has PCICFG uncore PMON
1939     {
1940         return (
1941             cpu_model == PCM::JAKETOWN
1942             || cpu_model == PCM::SNOWRIDGE
1943             || cpu_model == PCM::IVYTOWN
1944             || cpu_model == PCM::HASWELLX
1945             || cpu_model == PCM::BDX_DE
1946             || cpu_model == PCM::SKX
1947             || cpu_model == PCM::ICX
1948             || cpu_model == PCM::BDX
1949             || cpu_model == PCM::KNL
1950             );
1951     }
1952 
isSkxCompatible()1953     bool isSkxCompatible() const
1954     {
1955         return (
1956             cpu_model == PCM::SKX
1957                );
1958     }
1959 
hasUPI(const int32 cpu_model_)1960     static bool hasUPI(const int32 cpu_model_) // Intel(r) Ultra Path Interconnect
1961     {
1962         return (
1963             cpu_model_ == PCM::SKX
1964          || cpu_model_ == PCM::ICX
1965                );
1966     }
1967 
hasUPI()1968     bool hasUPI() const
1969     {
1970         return hasUPI(cpu_model);
1971     }
1972 
xPI()1973     const char * xPI() const
1974     {
1975         if (hasUPI())
1976             return "UPI";
1977 
1978         return "QPI";
1979     }
1980 
hasCHA()1981     bool hasCHA() const
1982     {
1983         return (
1984             cpu_model == PCM::SKX
1985          || cpu_model == PCM::ICX
1986                );
1987     }
1988 
1989     bool supportsHLE() const;
1990     bool supportsRTM() const;
1991 
useSkylakeEvents()1992     bool useSkylakeEvents() const
1993     {
1994         return    useSKLPath()
1995                || PCM::SKX == cpu_model
1996                || PCM::ICX == cpu_model
1997                ;
1998     }
1999 
hasClientMCCounters()2000     bool hasClientMCCounters() const
2001     {
2002         return  cpu_model == SANDY_BRIDGE
2003             || cpu_model == IVY_BRIDGE
2004             || cpu_model == HASWELL
2005             || cpu_model == BROADWELL
2006             || useSKLPath()
2007             ;
2008     }
2009 
getBytesPerFlit(int32 cpu_model_)2010     static double getBytesPerFlit(int32 cpu_model_)
2011     {
2012         if (hasUPI(cpu_model_))
2013         {
2014             // 172 bits per UPI flit
2015             return 172./8.;
2016         }
2017         // 8 bytes per QPI flit
2018         return 8.;
2019     }
2020 
getBytesPerFlit()2021     double getBytesPerFlit() const
2022     {
2023         return getBytesPerFlit(cpu_model);
2024     }
2025 
getDataBytesPerFlit(int32 cpu_model_)2026     static double getDataBytesPerFlit(int32 cpu_model_)
2027     {
2028         if (hasUPI(cpu_model_))
2029         {
2030             // 9 UPI flits to transfer 64 bytes
2031             return 64./9.;
2032         }
2033         // 8 bytes per QPI flit
2034         return 8.;
2035     }
2036 
getDataBytesPerFlit()2037     double getDataBytesPerFlit() const
2038     {
2039         return getDataBytesPerFlit(cpu_model);
2040     }
2041 
getFlitsPerLinkCycle(int32 cpu_model_)2042     static double getFlitsPerLinkCycle(int32 cpu_model_)
2043     {
2044         if (hasUPI(cpu_model_))
2045         {
2046             // 5 UPI flits sent every 6 link cycles
2047             return 5./6.;
2048         }
2049         return 2.;
2050     }
2051 
getBytesPerLinkCycle(int32 cpu_model_)2052     static double getBytesPerLinkCycle(int32 cpu_model_)
2053     {
2054         return getBytesPerFlit(cpu_model_) * getFlitsPerLinkCycle(cpu_model_);
2055     }
2056 
getBytesPerLinkCycle()2057     double getBytesPerLinkCycle() const
2058     {
2059         return getBytesPerLinkCycle(cpu_model);
2060     }
2061 
getLinkTransfersPerLinkCycle()2062     static double getLinkTransfersPerLinkCycle()
2063     {
2064         return 8.;
2065     }
2066 
getBytesPerLinkTransfer()2067     double getBytesPerLinkTransfer() const
2068     {
2069         return getBytesPerLinkCycle() / getLinkTransfersPerLinkCycle();
2070     }
2071 
2072     //! \brief Setup ExtendedCustomCoreEventDescription object to read offcore (numa) counters for each processor type
2073     //! \param conf conf object to setup offcore MSR values
2074     void setupCustomCoreEventsForNuma(PCM::ExtendedCustomCoreEventDescription& conf) const;
2075 
2076     #define PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(m) bool is##m() const { return m; }
2077 
2078     PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L2CacheHitRatioAvailable)
PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheHitRatioAvailable)2079     PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheHitRatioAvailable)
2080     PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheMissesAvailable)
2081     PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L2CacheMissesAvailable)
2082     PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L2CacheHitsAvailable)
2083     PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheHitsNoSnoopAvailable)
2084     PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheHitsSnoopAvailable)
2085     PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheHitsAvailable)
2086 
2087     #undef PCM_GEN_METRIC_AVAILABLE_FUNCTION
2088 
2089     bool isActiveRelativeFrequencyAvailable() const
2090     {
2091         return !isAtom();
2092     }
2093 
2094     ~PCM();
2095 };
2096 
2097 //! \brief Basic core counter state
2098 //!
2099 //! Intended only for derivation, but not for the direct use
2100 class BasicCounterState
2101 {
2102     friend class PCM;
2103     friend class JSONPrinter;
2104     template <class CounterStateType>
2105     friend double getExecUsage(const CounterStateType & before, const CounterStateType & after);
2106     template <class CounterStateType>
2107     friend double getIPC(const CounterStateType & before, const CounterStateType & after);
2108     template <class CounterStateType>
2109     friend double getAverageFrequency(const CounterStateType & before, const CounterStateType & after);
2110     template <class CounterStateType>
2111     friend double getActiveAverageFrequency(const CounterStateType & before, const CounterStateType & after);
2112     template <class CounterStateType>
2113     friend double getRelativeFrequency(const CounterStateType & before, const CounterStateType & after);
2114     template <class CounterStateType>
2115     friend double getActiveRelativeFrequency(const CounterStateType & before, const CounterStateType & after);
2116     template <class CounterStateType>
2117     friend double getL2CacheHitRatio(const CounterStateType & before, const CounterStateType & after);
2118     template <class CounterStateType>
2119     friend double getL3CacheHitRatio(const CounterStateType & before, const CounterStateType & after);
2120     template <class CounterStateType>
2121     friend uint64 getL3CacheMisses(const CounterStateType & before, const CounterStateType & after);
2122     template <class CounterStateType>
2123     friend uint64 getL2CacheMisses(const CounterStateType & before, const CounterStateType & after);
2124     template <class CounterStateType>
2125     friend uint64 getL2CacheHits(const CounterStateType & before, const CounterStateType & after);
2126     template <class CounterStateType>
2127     friend uint64 getL3CacheHitsNoSnoop(const CounterStateType & before, const CounterStateType & after);
2128     template <class CounterStateType>
2129     friend uint64 getL3CacheHitsSnoop(const CounterStateType & before, const CounterStateType & after);
2130     template <class CounterStateType>
2131     friend uint64 getL3CacheHits(const CounterStateType & before, const CounterStateType & after);
2132     template <class CounterStateType>
2133     friend uint64 getL3CacheOccupancy(const CounterStateType & now);
2134     template <class CounterStateType>
2135     friend uint64 getLocalMemoryBW(const CounterStateType & before, const CounterStateType & after);
2136     template <class CounterStateType>
2137     friend uint64 getRemoteMemoryBW(const CounterStateType & before, const CounterStateType & after);
2138     template <class CounterStateType>
2139     friend uint64 getCycles(const CounterStateType & before, const CounterStateType & after);
2140     template <class CounterStateType>
2141     friend uint64 getInstructionsRetired(const CounterStateType & before, const CounterStateType & after);
2142     template <class CounterStateType>
2143     friend uint64 getCycles(const CounterStateType & now);
2144     template <class CounterStateType>
2145     friend uint64 getInstructionsRetired(const CounterStateType & now);
2146     template <class CounterStateType>
2147     friend uint64 getNumberOfCustomEvents(int32 eventCounterNr, const CounterStateType & before, const CounterStateType & after);
2148     template <class CounterStateType>
2149     friend uint64 getInvariantTSC(const CounterStateType & before, const CounterStateType & after);
2150     template <class CounterStateType>
2151     friend uint64 getRefCycles(const CounterStateType & before, const CounterStateType & after);
2152     template <class CounterStateType>
2153     friend double getCoreCStateResidency(int state, const CounterStateType & before, const CounterStateType & after);
2154     template <class CounterStateType>
2155     friend uint64 getCoreCStateResidency(int state, const CounterStateType& now);
2156     template <class CounterStateType>
2157     friend uint64 getSMICount(const CounterStateType & before, const CounterStateType & after);
2158     template <class CounterStateType>
2159     friend uint64 getAllSlotsRaw(const CounterStateType& before, const CounterStateType& after);
2160     template <class CounterStateType>
2161     friend uint64 getAllSlots(const CounterStateType & before, const CounterStateType & after);
2162     template <class CounterStateType>
2163     friend double getBackendBound(const CounterStateType & before, const CounterStateType & after);
2164     template <class CounterStateType>
2165     friend double getFrontendBound(const CounterStateType & before, const CounterStateType & after);
2166     template <class CounterStateType>
2167     friend double getBadSpeculation(const CounterStateType & before, const CounterStateType & after);
2168     template <class CounterStateType>
2169     friend double getRetiring(const CounterStateType & before, const CounterStateType & after);
2170 
2171 protected:
2172     checked_uint64 InstRetiredAny;
2173     checked_uint64 CpuClkUnhaltedThread;
2174     checked_uint64 CpuClkUnhaltedRef;
2175     checked_uint64 Event[PERF_MAX_CUSTOM_COUNTERS];
2176     enum
2177     {
2178                L3MissPos = 0,
2179           ArchLLCMissPos = 0,
2180         L3UnsharedHitPos = 1,
2181            ArchLLCRefPos = 1,
2182              SKLL3HitPos = 1,
2183                L2HitMPos = 2,
2184             SKLL2MissPos = 2,
2185                 L2HitPos = 3
2186     };
2187     uint64 InvariantTSC; // invariant time stamp counter
2188     uint64 CStateResidency[PCM::MAX_C_STATE + 1];
2189     int32 ThermalHeadroom;
2190     uint64 L3Occupancy;
2191     uint64 MemoryBWLocal;
2192     uint64 MemoryBWTotal;
2193     uint64 SMICount;
2194     uint64 FrontendBoundSlots, BadSpeculationSlots, BackendBoundSlots, RetiringSlots, AllSlotsRaw;
2195 
2196 public:
BasicCounterState()2197     BasicCounterState() :
2198         InvariantTSC(0),
2199         ThermalHeadroom(PCM_INVALID_THERMAL_HEADROOM),
2200         L3Occupancy(0),
2201         MemoryBWLocal(0),
2202         MemoryBWTotal(0),
2203         SMICount(0),
2204     FrontendBoundSlots(0),
2205     BadSpeculationSlots(0),
2206     BackendBoundSlots(0),
2207     RetiringSlots(0),
2208     AllSlotsRaw(0)
2209     {
2210         memset(CStateResidency, 0, sizeof(CStateResidency));
2211     }
~BasicCounterState()2212     virtual ~BasicCounterState() { }
2213 
2214     BasicCounterState( const BasicCounterState& ) = default;
2215     BasicCounterState( BasicCounterState&& ) = default;
2216     BasicCounterState & operator = ( BasicCounterState&& ) = default;
2217 
2218     BasicCounterState & operator += (const BasicCounterState & o)
2219     {
2220         InstRetiredAny += o.InstRetiredAny;
2221         CpuClkUnhaltedThread += o.CpuClkUnhaltedThread;
2222         CpuClkUnhaltedRef += o.CpuClkUnhaltedRef;
2223         for (int i = 0; i < PERF_MAX_CUSTOM_COUNTERS; ++i)
2224         {
2225             Event[i] += o.Event[i];
2226         }
2227         InvariantTSC += o.InvariantTSC;
2228         for (int i = 0; i <= (int)PCM::MAX_C_STATE; ++i)
2229             CStateResidency[i] += o.CStateResidency[i];
2230         // ThermalHeadroom is not accumulative
2231         L3Occupancy += o.L3Occupancy;
2232         MemoryBWLocal += o.MemoryBWLocal;
2233         MemoryBWTotal += o.MemoryBWTotal;
2234         SMICount += o.SMICount;
2235         // std::cout << "before PCM debug aggregate "<< FrontendBoundSlots << " " << BadSpeculationSlots << " " << BackendBoundSlots << " " <<RetiringSlots << std::endl;
2236         BasicCounterState old = *this;
2237         FrontendBoundSlots += o.FrontendBoundSlots;
2238         BadSpeculationSlots += o.BadSpeculationSlots;
2239         BackendBoundSlots += o.BackendBoundSlots;
2240         RetiringSlots += o.RetiringSlots;
2241         AllSlotsRaw += o.AllSlotsRaw;
2242         //std::cout << "after PCM debug aggregate "<< FrontendBoundSlots << " " << BadSpeculationSlots << " " << BackendBoundSlots << " " <<RetiringSlots << std::endl;
2243         assert(FrontendBoundSlots >= old.FrontendBoundSlots);
2244         assert(BadSpeculationSlots >= old.BadSpeculationSlots);
2245         assert(BackendBoundSlots >= old.BackendBoundSlots);
2246         assert(RetiringSlots >= old.RetiringSlots);
2247         return *this;
2248     }
2249 
2250     void readAndAggregate(std::shared_ptr<SafeMsrHandle>);
2251     void readAndAggregateTSC(std::shared_ptr<SafeMsrHandle>);
2252 
2253     //! Returns current thermal headroom below TjMax
getThermalHeadroom()2254     int32 getThermalHeadroom() const { return ThermalHeadroom; }
2255 };
2256 
RDTSC()2257 inline uint64 RDTSC()
2258 {
2259         uint64 result = 0;
2260 #ifdef _MSC_VER
2261         // Windows
2262         #if _MSC_VER>= 1600
2263         result = static_cast<uint64>(__rdtsc());
2264         #endif
2265 #else
2266         // Linux
2267         uint32 high = 0, low = 0;
2268         asm volatile("rdtsc" : "=a" (low), "=d" (high));
2269         result = low + (uint64(high)<<32ULL);
2270 #endif
2271         return result;
2272 
2273 }
2274 
RDTSCP()2275 inline uint64 RDTSCP()
2276 {
2277     uint64 result = 0;
2278 #ifdef _MSC_VER
2279     // Windows
2280     #if _MSC_VER>= 1600
2281     unsigned int Aux;
2282     result = __rdtscp(&Aux);
2283     #endif
2284 #else
2285     // Linux and OS X
2286     uint32 high = 0, low = 0;
2287     asm volatile (
2288        "rdtscp\n\t"
2289        "mov %%edx, %0\n\t"
2290        "mov %%eax, %1\n\t":
2291        "=r" (high), "=r" (low) :: "%rax", "%rcx", "%rdx");
2292     result = low + (uint64(high)<<32ULL);
2293 #endif
2294     return result;
2295 }
2296 
2297 template <class CounterStateType>
getThermalHeadroom(const CounterStateType &,const CounterStateType & after)2298 int32 getThermalHeadroom(const CounterStateType & /* before */, const CounterStateType & after)
2299 {
2300     return after.getThermalHeadroom();
2301 }
2302 
2303 /*! \brief Returns the ratio of QPI cycles in power saving half-lane mode
2304     \param port QPI port number
2305     \param before CPU counter state before the experiment
2306     \param after CPU counter state after the experiment
2307     \return 0..1 - ratio of QPI cycles in power saving half-lane mode
2308 */
2309 template <class CounterStateType>
getNormalizedQPIL0pTxCycles(uint32 port,const CounterStateType & before,const CounterStateType & after)2310 double getNormalizedQPIL0pTxCycles(uint32 port, const CounterStateType & before, const CounterStateType & after)
2311 {
2312     return double(getQPIL0pTxCycles(port, before, after)) / double(getQPIClocks(port, before, after));
2313 }
2314 
2315 /*! \brief Returns the ratio of QPI cycles in power saving shutdown mode
2316     \param port QPI port number
2317     \param before CPU counter state before the experiment
2318     \param after CPU counter state after the experiment
2319     \return 0..1 - ratio of QPI cycles in power saving shutdown mode
2320 */
2321 template <class CounterStateType>
getNormalizedQPIL1Cycles(uint32 port,const CounterStateType & before,const CounterStateType & after)2322 double getNormalizedQPIL1Cycles(uint32 port, const CounterStateType & before, const CounterStateType & after)
2323 {
2324     return double(getQPIL1Cycles(port, before, after)) / double(getQPIClocks(port, before, after));
2325 }
2326 
2327 /*! \brief Returns DRAM clock ticks
2328     \param channel DRAM channel number
2329     \param before CPU counter state before the experiment
2330     \param after CPU counter state after the experiment
2331 */
2332 template <class CounterStateType>
getDRAMClocks(uint32 channel,const CounterStateType & before,const CounterStateType & after)2333 uint64 getDRAMClocks(uint32 channel, const CounterStateType & before, const CounterStateType & after)
2334 {
2335     const auto clk = after.DRAMClocks[channel] - before.DRAMClocks[channel];
2336     const auto cpu_model = PCM::getInstance()->getCPUModel();
2337     if (cpu_model == PCM::ICX || cpu_model == PCM::SNOWRIDGE)
2338     {
2339         return 2 * clk;
2340     }
2341     return clk;
2342 }
2343 
2344 /*! \brief Returns MCDRAM clock ticks
2345     \param channel MCDRAM channel number
2346     \param before CPU counter state before the experiment
2347     \param after CPU counter state after the experiment
2348 */
2349 template <class CounterStateType>
getMCDRAMClocks(uint32 channel,const CounterStateType & before,const CounterStateType & after)2350 uint64 getMCDRAMClocks(uint32 channel, const CounterStateType & before, const CounterStateType & after)
2351 {
2352     return after.MCDRAMClocks[channel] - before.MCDRAMClocks[channel];
2353 }
2354 
2355 
2356 /*! \brief Direct read of memory controller PMU counter (counter meaning depends on the programming: power/performance/etc)
2357     \param counter counter number
2358     \param channel channel number
2359     \param before CPU counter state before the experiment
2360     \param after CPU counter state after the experiment
2361 */
2362 template <class CounterStateType>
getMCCounter(uint32 channel,uint32 counter,const CounterStateType & before,const CounterStateType & after)2363 uint64 getMCCounter(uint32 channel, uint32 counter, const CounterStateType & before, const CounterStateType & after)
2364 {
2365     return after.MCCounter[channel][counter] - before.MCCounter[channel][counter];
2366 }
2367 
2368 /*! \brief Direct read of M3UPI PMU counter (counter meaning depends on the programming: power/performance/etc)
2369     \param counter counter number
2370     \param port UPI port number
2371     \param before CPU counter state before the experiment
2372     \param after CPU counter state after the experiment
2373 */
2374 template <class CounterStateType>
getM3UPICounter(uint32 port,uint32 counter,const CounterStateType & before,const CounterStateType & after)2375 uint64 getM3UPICounter(uint32 port, uint32 counter, const CounterStateType& before, const CounterStateType& after)
2376 {
2377     return after.M3UPICounter[port][counter] - before.M3UPICounter[port][counter];
2378 }
2379 
2380 /*! \brief Direct read of CHA or CBO PMU counter (counter meaning depends on the programming: power/performance/etc)
2381     \param counter counter number
2382     \param cbo cbo or cha number
2383     \param before CPU counter state before the experiment
2384     \param after CPU counter state after the experiment
2385 */
2386 template <class CounterStateType>
getCBOCounter(uint32 cbo,uint32 counter,const CounterStateType & before,const CounterStateType & after)2387 uint64 getCBOCounter(uint32 cbo, uint32 counter, const CounterStateType& before, const CounterStateType& after)
2388 {
2389     return after.CBOCounter[cbo][counter] - before.CBOCounter[cbo][counter];
2390 }
2391 
2392 /*! \brief Direct read of UBOX PMU counter (counter meaning depends on the programming: power/performance/etc)
2393     \param counter counter number
2394     \param cbo cbo or cha number
2395     \param before CPU counter state before the experiment
2396     \param after CPU counter state after the experiment
2397 */
2398 template <class CounterStateType>
getUBOXCounter(uint32 counter,const CounterStateType & before,const CounterStateType & after)2399 uint64 getUBOXCounter(uint32 counter, const CounterStateType& before, const CounterStateType& after)
2400 {
2401     return after.UBOXCounter[counter] - before.UBOXCounter[counter];
2402 }
2403 
2404 /*! \brief Direct read of IIO PMU counter (counter meaning depends on the programming: power/performance/etc)
2405     \param counter counter number
2406     \param cbo IIO stack number
2407     \param before CPU counter state before the experiment
2408     \param after CPU counter state after the experiment
2409 */
2410 template <class CounterStateType>
getIIOCounter(uint32 stack,uint32 counter,const CounterStateType & before,const CounterStateType & after)2411 uint64 getIIOCounter(uint32 stack, uint32 counter, const CounterStateType& before, const CounterStateType& after)
2412 {
2413     return after.IIOCounter[stack][counter] - before.IIOCounter[stack][counter];
2414 }
2415 
2416 /*! \brief Direct read of UPI or QPI PMU counter (counter meaning depends on the programming: power/performance/etc)
2417     \param counter counter number
2418     \param port UPI/QPI port number
2419     \param before CPU counter state before the experiment
2420     \param after CPU counter state after the experiment
2421 */
2422 template <class CounterStateType>
getXPICounter(uint32 port,uint32 counter,const CounterStateType & before,const CounterStateType & after)2423 uint64 getXPICounter(uint32 port, uint32 counter, const CounterStateType& before, const CounterStateType& after)
2424 {
2425     return after.xPICounter[port][counter] - before.xPICounter[port][counter];
2426 }
2427 
2428 /*! \brief Direct read of Memory2Mesh controller PMU counter (counter meaning depends on the programming: power/performance/etc)
2429     \param counter counter number
2430     \param controller controller number
2431     \param before CPU counter state before the experiment
2432     \param after CPU counter state after the experiment
2433 */
2434 template <class CounterStateType>
getM2MCounter(uint32 controller,uint32 counter,const CounterStateType & before,const CounterStateType & after)2435 uint64 getM2MCounter(uint32 controller, uint32 counter, const CounterStateType & before, const CounterStateType & after)
2436 {
2437     return after.M2MCounter[controller][counter] - before.M2MCounter[controller][counter];
2438 }
2439 
2440 
2441 /*! \brief Direct read of embedded DRAM memory controller counter (counter meaning depends on the programming: power/performance/etc)
2442     \param counter counter number
2443     \param channel channel number
2444     \param before CPU counter state before the experiment
2445     \param after CPU counter state after the experiment
2446 */
2447 template <class CounterStateType>
getEDCCounter(uint32 channel,uint32 counter,const CounterStateType & before,const CounterStateType & after)2448 uint64 getEDCCounter(uint32 channel, uint32 counter, const CounterStateType & before, const CounterStateType & after)
2449 {
2450     if (PCM::getInstance()->MCDRAMmemoryTrafficMetricsAvailable())
2451         return after.EDCCounter[channel][counter] - before.EDCCounter[channel][counter];
2452     return 0ULL;
2453 }
2454 
2455 /*! \brief Direct read of power control unit PMU counter (counter meaning depends on the programming: power/performance/etc)
2456     \param counter counter number
2457     \param before CPU counter state before the experiment
2458     \param after CPU counter state after the experiment
2459 */
2460 template <class CounterStateType>
getPCUCounter(uint32 counter,const CounterStateType & before,const CounterStateType & after)2461 uint64 getPCUCounter(uint32 counter, const CounterStateType & before, const CounterStateType & after)
2462 {
2463     return after.PCUCounter[counter] - before.PCUCounter[counter];
2464 }
2465 
2466 /*!  \brief Returns clock ticks of power control unit
2467     \param before CPU counter state before the experiment
2468     \param after CPU counter state after the experiment
2469 */
2470 template <class CounterStateType>
getPCUClocks(const CounterStateType & before,const CounterStateType & after)2471 uint64 getPCUClocks(const CounterStateType & before, const CounterStateType & after)
2472 {
2473     return getPCUCounter(0, before, after);
2474 }
2475 
2476 /*!  \brief Returns energy consumed by processor, excluding DRAM (measured in internal units)
2477     \param before CPU counter state before the experiment
2478     \param after CPU counter state after the experiment
2479 */
2480 template <class CounterStateType>
getConsumedEnergy(const CounterStateType & before,const CounterStateType & after)2481 uint64 getConsumedEnergy(const CounterStateType & before, const CounterStateType & after)
2482 {
2483     return after.PackageEnergyStatus - before.PackageEnergyStatus;
2484 }
2485 
2486 /*!  \brief Returns energy consumed by DRAM (measured in internal units)
2487     \param before CPU counter state before the experiment
2488     \param after CPU counter state after the experiment
2489 */
2490 template <class CounterStateType>
getDRAMConsumedEnergy(const CounterStateType & before,const CounterStateType & after)2491 uint64 getDRAMConsumedEnergy(const CounterStateType & before, const CounterStateType & after)
2492 {
2493     return after.DRAMEnergyStatus - before.DRAMEnergyStatus;
2494 }
2495 
2496 
2497 /*!  \brief Returns free running counter if it exists, -1 otherwise
2498  *   \param counter name of the counter
2499  *   \param before CPU counter state before the experiment
2500  *   \param after CPU counter state after the experiment
2501  */
2502 template <class CounterStateType>
getFreeRunningCounter(const typename CounterStateType::FreeRunningCounterID & counter,const CounterStateType & before,const CounterStateType & after)2503 int64 getFreeRunningCounter(const typename CounterStateType::FreeRunningCounterID & counter, const CounterStateType & before, const CounterStateType & after)
2504 {
2505     const auto beforeIt = before.freeRunningCounter.find(counter);
2506     const auto afterIt = after.freeRunningCounter.find(counter);
2507     if (beforeIt != before.freeRunningCounter.end() &&
2508         afterIt != after.freeRunningCounter.end())
2509     {
2510         return afterIt->second - beforeIt->second;
2511     }
2512     return -1;
2513 }
2514 
2515 
2516 /*!  \brief Returns uncore clock ticks
2517     \param before CPU counter state before the experiment
2518     \param after CPU counter state after the experiment
2519 */
2520 template <class CounterStateType>
getUncoreClocks(const CounterStateType & before,const CounterStateType & after)2521 uint64 getUncoreClocks(const CounterStateType& before, const CounterStateType& after)
2522 {
2523     return after.UncClocks - before.UncClocks;
2524 }
2525 
2526 /*!  \brief Returns Joules consumed by processor (excluding DRAM)
2527     \param before CPU counter state before the experiment
2528     \param after CPU counter state after the experiment
2529 */
2530 template <class CounterStateType>
getConsumedJoules(const CounterStateType & before,const CounterStateType & after)2531 double getConsumedJoules(const CounterStateType & before, const CounterStateType & after)
2532 {
2533     PCM * m = PCM::getInstance();
2534     if (!m) return -1.;
2535 
2536     return double(getConsumedEnergy(before, after)) * m->getJoulesPerEnergyUnit();
2537 }
2538 
2539 /*!  \brief Returns Joules consumed by DRAM
2540     \param before CPU counter state before the experiment
2541     \param after CPU counter state after the experiment
2542 */
2543 template <class CounterStateType>
getDRAMConsumedJoules(const CounterStateType & before,const CounterStateType & after)2544 double getDRAMConsumedJoules(const CounterStateType & before, const CounterStateType & after)
2545 {
2546     PCM * m = PCM::getInstance();
2547     if (!m) return -1.;
2548     double dram_joules_per_energy_unit = 0.;
2549     const auto cpu_model = m->getCPUModel();
2550 
2551     if (PCM::HASWELLX == cpu_model
2552         || PCM::BDX_DE == cpu_model
2553         || PCM::BDX == cpu_model
2554         || PCM::SKX == cpu_model
2555         || PCM::ICX == cpu_model
2556         || PCM::KNL == cpu_model
2557         ) {
2558 /* as described in sections 5.3.2 (DRAM_POWER_INFO) and 5.3.3 (DRAM_ENERGY_STATUS) of
2559  * Volume 2 (Registers) of
2560  * Intel Xeon E5-1600 v3 and Intel Xeon E5-2600 v3 (Haswell-EP) Datasheet (Ref 330784-001, Sept.2014)
2561  * ENERGY_UNIT for DRAM domain is fixed to 15.3 uJ for server HSX, BDW and KNL processors.
2562  */
2563         dram_joules_per_energy_unit = 0.0000153;
2564     } else {
2565 /* for all other processors (including Haswell client/mobile SKUs) the ENERGY_UNIT for DRAM domain
2566  * should be read from PACKAGE_POWER_SKU register (usually value around ~61uJ)
2567  */
2568         dram_joules_per_energy_unit = m->getJoulesPerEnergyUnit();
2569     }
2570     return double(getDRAMConsumedEnergy(before, after)) * dram_joules_per_energy_unit;
2571 }
2572 
2573 //! \brief Basic uncore counter state
2574 //!
2575 //! Intended only for derivation, but not for the direct use
2576 class UncoreCounterState
2577 {
2578     friend class PCM;
2579     friend class JSONPrinter;
2580     template <class CounterStateType>
2581     friend uint64 getBytesReadFromMC(const CounterStateType & before, const CounterStateType & after);
2582     template <class CounterStateType>
2583     friend uint64 getBytesWrittenToMC(const CounterStateType & before, const CounterStateType & after);
2584     template <class CounterStateType>
2585     friend uint64 getBytesReadFromPMM(const CounterStateType & before, const CounterStateType & after);
2586     template <class CounterStateType>
2587     friend uint64 getBytesWrittenToPMM(const CounterStateType & before, const CounterStateType & after);
2588     template <class CounterStateType>
2589     friend uint64 getBytesReadFromEDC(const CounterStateType & before, const CounterStateType & after);
2590     template <class CounterStateType>
2591     friend uint64 getBytesWrittenToEDC(const CounterStateType & before, const CounterStateType & after);
2592     template <class CounterStateType>
2593     friend uint64 getIORequestBytesFromMC(const CounterStateType & before, const CounterStateType & after);
2594     template <class CounterStateType>
2595     friend uint64 getConsumedEnergy(const CounterStateType & before, const CounterStateType & after);
2596     template <class CounterStateType>
2597     friend uint64 getDRAMConsumedEnergy(const CounterStateType & before, const CounterStateType & after);
2598     template <class CounterStateType>
2599     friend uint64 getUncoreClocks(const CounterStateType& before, const CounterStateType& after);
2600     template <class CounterStateType>
2601     friend double getPackageCStateResidency(int state, const CounterStateType & before, const CounterStateType & after);
2602     template <class CounterStateType>
2603     friend uint64 getPackageCStateResidency(int state, const CounterStateType& now);
2604     template <class CounterStateType>
2605     friend double getLLCReadMissLatency(const CounterStateType & before, const CounterStateType & after);
2606     template <class CounterStateType>
2607     friend double getLocalMemoryRequestRatio(const CounterStateType & before, const CounterStateType & after);
2608 
2609 protected:
2610     uint64 UncMCFullWrites;
2611     uint64 UncMCNormalReads;
2612     uint64 UncHARequests;
2613     uint64 UncHALocalRequests;
2614     uint64 UncPMMWrites;
2615     uint64 UncPMMReads;
2616     uint64 UncEDCFullWrites;
2617     uint64 UncEDCNormalReads;
2618     uint64 UncMCIORequests;
2619     uint64 PackageEnergyStatus;
2620     uint64 DRAMEnergyStatus;
2621     uint64 TOROccupancyIAMiss;
2622     uint64 TORInsertsIAMiss;
2623     uint64 UncClocks;
2624     uint64 CStateResidency[PCM::MAX_C_STATE + 1];
2625     void readAndAggregate(std::shared_ptr<SafeMsrHandle>);
2626 
2627 public:
UncoreCounterState()2628     UncoreCounterState() :
2629         UncMCFullWrites(0),
2630         UncMCNormalReads(0),
2631         UncHARequests(0),
2632         UncHALocalRequests(0),
2633         UncPMMWrites(0),
2634         UncPMMReads(0),
2635         UncEDCFullWrites(0),
2636         UncEDCNormalReads(0),
2637         UncMCIORequests(0),
2638         PackageEnergyStatus(0),
2639         DRAMEnergyStatus(0),
2640         TOROccupancyIAMiss(0),
2641         TORInsertsIAMiss(0),
2642         UncClocks(0)
2643     {
2644         memset(CStateResidency, 0, sizeof(CStateResidency));
2645     }
~UncoreCounterState()2646     virtual ~UncoreCounterState() { }
2647 
2648     UncoreCounterState( const UncoreCounterState& ) = default;
2649     UncoreCounterState( UncoreCounterState&& ) = default;
2650     UncoreCounterState & operator = ( UncoreCounterState&& ) = default;
2651 
2652     UncoreCounterState & operator += (const UncoreCounterState & o)
2653     {
2654         UncMCFullWrites += o.UncMCFullWrites;
2655         UncMCNormalReads += o.UncMCNormalReads;
2656         UncHARequests += o.UncHARequests;
2657         UncHALocalRequests += o.UncHALocalRequests;
2658         UncPMMReads += o.UncPMMReads;
2659         UncPMMWrites += o.UncPMMWrites;
2660         UncEDCFullWrites += o.UncEDCFullWrites;
2661         UncEDCNormalReads += o.UncEDCNormalReads;
2662         UncMCIORequests += o.UncMCIORequests;
2663         PackageEnergyStatus += o.PackageEnergyStatus;
2664         DRAMEnergyStatus += o.DRAMEnergyStatus;
2665         TOROccupancyIAMiss += o.TOROccupancyIAMiss;
2666         TORInsertsIAMiss += o.TORInsertsIAMiss;
2667         UncClocks += o.UncClocks;
2668         for (int i = 0; i <= (int)PCM::MAX_C_STATE; ++i)
2669             CStateResidency[i] += o.CStateResidency[i];
2670         return *this;
2671     }
2672 };
2673 
2674 
2675 //! \brief Server uncore power counter state
2676 //!
2677 class ServerUncoreCounterState : public UncoreCounterState
2678 {
2679 public:
2680     enum {
2681         maxControllers = 4,
2682         maxChannels = 12,
2683         maxXPILinks = 6,
2684         maxCBOs = 128,
2685         maxIIOStacks = 16,
2686         maxCounters = 4
2687     };
2688     enum EventPosition
2689     {
2690         xPI_TxL0P_POWER_CYCLES = 0,
2691         xPI_L1_POWER_CYCLES = 2,
2692         xPI_CLOCKTICKS = 3
2693     };
2694     enum FreeRunningCounterID
2695     {
2696         ImcReads,
2697         ImcWrites,
2698         PMMReads,
2699         PMMWrites
2700     };
2701 private:
2702     std::array<std::array<uint64, maxCounters>, maxXPILinks> xPICounter;
2703     std::array<std::array<uint64, maxCounters>, maxXPILinks> M3UPICounter;
2704     std::array<std::array<uint64, maxCounters>, maxCBOs> CBOCounter;
2705     std::array<std::array<uint64, maxCounters>, maxIIOStacks> IIOCounter;
2706     std::array<uint64, maxCounters> UBOXCounter;
2707     std::array<uint64, maxChannels> DRAMClocks;
2708     std::array<uint64, maxChannels> MCDRAMClocks;
2709     std::array<std::array<uint64, maxCounters>, maxChannels> MCCounter; // channel X counter
2710     std::array<std::array<uint64, maxCounters>, maxControllers> M2MCounter; // M2M/iMC boxes x counter
2711     std::array<std::array<uint64, maxCounters>, maxChannels> EDCCounter; // EDC controller X counter
2712     std::array<uint64, maxCounters> PCUCounter;
2713     std::unordered_map<int, uint64> freeRunningCounter;
2714     int32 PackageThermalHeadroom;
2715     uint64 InvariantTSC;    // invariant time stamp counter
2716     friend class PCM;
2717     template <class CounterStateType>
2718     friend uint64 getDRAMClocks(uint32 channel, const CounterStateType & before, const CounterStateType & after);
2719     template <class CounterStateType>
2720     friend uint64 getMCDRAMClocks(uint32 channel, const CounterStateType & before, const CounterStateType & after);
2721     template <class CounterStateType>
2722     friend uint64 getMCCounter(uint32 channel, uint32 counter, const CounterStateType & before, const CounterStateType & after);
2723     template <class CounterStateType>
2724     friend uint64 getM3UPICounter(uint32 port, uint32 counter, const CounterStateType& before, const CounterStateType& after);
2725     template <class CounterStateType>
2726     friend uint64 getCBOCounter(uint32 cbo, uint32 counter, const CounterStateType& before, const CounterStateType& after);
2727     template <class CounterStateType>
2728     friend uint64 getUBOXCounter(uint32 counter, const CounterStateType& before, const CounterStateType& after);
2729     template <class CounterStateType>
2730     friend uint64 getIIOCounter(uint32 stack, uint32 counter, const CounterStateType& before, const CounterStateType& after);
2731     template <class CounterStateType>
2732     friend uint64 getXPICounter(uint32 port, uint32 counter, const CounterStateType& before, const CounterStateType& after);
2733     template <class CounterStateType>
2734     friend uint64 getM2MCounter(uint32 controller, uint32 counter, const CounterStateType & before, const CounterStateType & after);
2735     template <class CounterStateType>
2736     friend uint64 getEDCCounter(uint32 channel, uint32 counter, const CounterStateType & before, const CounterStateType & after);
2737     template <class CounterStateType>
2738     friend uint64 getPCUCounter(uint32 counter, const CounterStateType & before, const CounterStateType & after);
2739     template <class CounterStateType>
2740     friend uint64 getConsumedEnergy(const CounterStateType & before, const CounterStateType & after);
2741     template <class CounterStateType>
2742     friend uint64 getDRAMConsumedEnergy(const CounterStateType & before, const CounterStateType & after);
2743     template <class CounterStateType>
2744     friend uint64 getInvariantTSC(const CounterStateType & before, const CounterStateType & after);
2745     template <class CounterStateType>
2746     friend int64 getFreeRunningCounter(const typename CounterStateType::FreeRunningCounterID &, const CounterStateType & before, const CounterStateType & after);
2747 
2748 public:
2749     //! Returns current thermal headroom below TjMax
getPackageThermalHeadroom()2750     int32 getPackageThermalHeadroom() const { return PackageThermalHeadroom; }
ServerUncoreCounterState()2751     ServerUncoreCounterState() :
2752         xPICounter{{}},
2753         M3UPICounter{{}},
2754         CBOCounter{{}},
2755         IIOCounter{{}},
2756         UBOXCounter{{}},
2757         DRAMClocks{{}},
2758         MCDRAMClocks{{}},
2759         MCCounter{{}},
2760         M2MCounter{{}},
2761         EDCCounter{{}},
2762         PCUCounter{{}},
2763         PackageThermalHeadroom(0),
2764         InvariantTSC(0)
2765     {
2766     }
2767 };
2768 
2769 /*! \brief Returns QPI LL clock ticks
2770     \param port QPI port number
2771     \param before CPU counter state before the experiment
2772     \param after CPU counter state after the experiment
2773 */
2774 template <class CounterStateType>
getQPIClocks(uint32 port,const CounterStateType & before,const CounterStateType & after)2775 uint64 getQPIClocks(uint32 port, const CounterStateType& before, const CounterStateType& after)
2776 {
2777     return getXPICounter(port, ServerUncoreCounterState::EventPosition::xPI_CLOCKTICKS, before, after);
2778 }
2779 
2780 /*! \brief Returns the number of QPI cycles in power saving half-lane mode
2781     \param port QPI port number
2782     \param before CPU counter state before the experiment
2783     \param after CPU counter state after the experiment
2784 */
2785 template <class CounterStateType>
getQPIL0pTxCycles(uint32 port,const CounterStateType & before,const CounterStateType & after)2786 uint64 getQPIL0pTxCycles(uint32 port, const CounterStateType& before, const CounterStateType& after)
2787 {
2788     return getXPICounter(port, ServerUncoreCounterState::EventPosition::xPI_TxL0P_POWER_CYCLES, before, after);
2789 }
2790 
2791 /*! \brief Returns the number of QPI cycles in power saving shutdown mode
2792     \param port QPI port number
2793     \param before CPU counter state before the experiment
2794     \param after CPU counter state after the experiment
2795 */
2796 template <class CounterStateType>
getQPIL1Cycles(uint32 port,const CounterStateType & before,const CounterStateType & after)2797 uint64 getQPIL1Cycles(uint32 port, const CounterStateType& before, const CounterStateType& after)
2798 {
2799     return getXPICounter(port, ServerUncoreCounterState::EventPosition::xPI_L1_POWER_CYCLES, before, after);
2800 }
2801 
2802 //! \brief (Logical) core-wide counter state
2803 class CoreCounterState : public BasicCounterState
2804 {
2805     friend class PCM;
2806 
2807 public:
2808     CoreCounterState() = default;
2809     CoreCounterState( const CoreCounterState& ) = default;
2810     CoreCounterState( CoreCounterState&& ) = default;
2811     CoreCounterState & operator= ( CoreCounterState&& ) = default;
2812 };
2813 
2814 //! \brief Socket-wide counter state
2815 class SocketCounterState : public BasicCounterState, public UncoreCounterState
2816 {
2817     friend class PCM;
2818 
2819 protected:
readAndAggregate(std::shared_ptr<SafeMsrHandle> handle)2820     void readAndAggregate(std::shared_ptr<SafeMsrHandle> handle)
2821     {
2822         BasicCounterState::readAndAggregate(handle);
2823         UncoreCounterState::readAndAggregate(handle);
2824     }
2825 
2826 public:
2827     SocketCounterState& operator += ( const BasicCounterState& ccs )
2828     {
2829         BasicCounterState::operator += ( ccs );
2830 
2831         return *this;
2832     }
2833 
2834     SocketCounterState& operator += ( const UncoreCounterState& ucs )
2835     {
2836         UncoreCounterState::operator += ( ucs );
2837 
2838         return *this;
2839     }
2840 
2841     SocketCounterState() = default;
2842     SocketCounterState( const SocketCounterState& ) = default;
2843     SocketCounterState( SocketCounterState&& ) = default;
2844     SocketCounterState & operator = ( SocketCounterState&& ) = default;
2845 
2846     SocketCounterState & operator = ( UncoreCounterState&& ucs ) {
2847         UncoreCounterState::operator = ( std::move(ucs) );
2848         return *this;
2849     }
2850 };
2851 
2852 //! \brief System-wide counter state
2853 class SystemCounterState : public SocketCounterState
2854 {
2855     friend class PCM;
2856 
2857     std::vector<std::vector<uint64> > incomingQPIPackets; // each 64 byte
2858     std::vector<std::vector<uint64> > outgoingQPIFlits; // idle or data/non-data flits depending on the architecture
2859     std::vector<std::vector<uint64> > TxL0Cycles;
2860     uint64 uncoreTSC;
2861 
2862 protected:
readAndAggregate(std::shared_ptr<SafeMsrHandle> handle)2863     void readAndAggregate(std::shared_ptr<SafeMsrHandle> handle)
2864     {
2865         BasicCounterState::readAndAggregate(handle);
2866         UncoreCounterState::readAndAggregate(handle);
2867     }
2868 
2869 public:
2870     friend uint64 getIncomingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after);
2871     friend uint64 getIncomingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & now);
2872     friend double getOutgoingQPILinkUtilization(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after);
2873     friend uint64 getOutgoingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after);
2874     friend uint64 getOutgoingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & now);
2875 
SystemCounterState()2876     SystemCounterState() :
2877         uncoreTSC(0)
2878     {
2879         PCM * m = PCM::getInstance();
2880         incomingQPIPackets.resize(m->getNumSockets(),
2881                                   std::vector<uint64>((uint32)m->getQPILinksPerSocket(), 0));
2882         outgoingQPIFlits.resize(m->getNumSockets(),
2883                                     std::vector<uint64>((uint32)m->getQPILinksPerSocket(), 0));
2884         TxL0Cycles.resize(m->getNumSockets(),
2885                                     std::vector<uint64>((uint32)m->getQPILinksPerSocket(), 0));
2886     }
2887 
2888     SystemCounterState( const SystemCounterState& ) = default;
2889     SystemCounterState( SystemCounterState&& ) = default;
2890     SystemCounterState & operator = ( SystemCounterState&& ) = default;
2891 
2892     SystemCounterState & operator += ( const SocketCounterState& scs )
2893     {
2894         BasicCounterState::operator += ( scs );
2895         UncoreCounterState::operator += ( scs );
2896 
2897         return *this;
2898     }
2899 
2900     SystemCounterState & operator += ( const UncoreCounterState& ucs )
2901     {
2902         UncoreCounterState::operator += ( ucs );
2903 
2904         return *this;
2905     }
2906 };
2907 
/*! \brief Reads the counter state of the system

        Helper function. Uses PCM object to access counters.

        A system consists of several sockets (CPUs).
        Each socket holds one CPU, and a socket (CPU) consists of several (logical) cores.

        \return State of counters in the entire system
*/
PCM_API SystemCounterState getSystemCounterState();
2918 
/*! \brief Reads the counter state of a socket

        Helper function. Uses PCM object to access counters.

        \param socket socket id
        \return State of counters in the given socket
*/
PCM_API SocketCounterState getSocketCounterState(uint32 socket);
2927 
/*! \brief Reads the counter state of a (logical) core

    Helper function. Uses PCM object to access counters.

    \param core core id
    \return State of counters in the given core
*/
PCM_API CoreCounterState getCoreCounterState(uint32 core);
2936 
2937 
2938 /*! \brief Computes average number of retired instructions per core cycle (IPC)
2939 
2940     \param before CPU counter state before the experiment
2941     \param after CPU counter state after the experiment
2942     \return IPC
2943 */
2944 template <class CounterStateType>
getIPC(const CounterStateType & before,const CounterStateType & after)2945 double getIPC(const CounterStateType & before, const CounterStateType & after) // instructions per cycle
2946 {
2947     int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
2948     if (clocks != 0)
2949         return double(after.InstRetiredAny - before.InstRetiredAny) / double(clocks);
2950     return -1;
2951 }
2952 
2953 
2954 /*! \brief Computes the number of retired instructions
2955 
2956     \param before CPU counter state before the experiment
2957     \param after CPU counter state after the experiment
2958     \return number of retired instructions
2959 */
2960 template <class CounterStateType>
getInstructionsRetired(const CounterStateType & before,const CounterStateType & after)2961 uint64 getInstructionsRetired(const CounterStateType & before, const CounterStateType & after) // instructions
2962 {
2963     return after.InstRetiredAny - before.InstRetiredAny;
2964 }
2965 
2966 /*! \brief Computes average number of retired instructions per time intervall
2967 
2968     \param before CPU counter state before the experiment
2969     \param after CPU counter state after the experiment
2970     \return usage
2971 */
2972 template <class CounterStateType>
getExecUsage(const CounterStateType & before,const CounterStateType & after)2973 double getExecUsage(const CounterStateType & before, const CounterStateType & after) // usage
2974 {
2975     int64 timer_clocks = after.InvariantTSC - before.InvariantTSC;
2976     if (timer_clocks != 0)
2977         return double(after.InstRetiredAny - before.InstRetiredAny) / double(timer_clocks);
2978     return -1;
2979 }
2980 
2981 /*! \brief Computes the number of retired instructions
2982 
2983     \param now Current CPU counter state
2984     \return number of retired instructions
2985 */
2986 template <class CounterStateType>
getInstructionsRetired(const CounterStateType & now)2987 uint64 getInstructionsRetired(const CounterStateType & now) // instructions
2988 {
2989     return now.InstRetiredAny.getRawData_NoOverflowProtection();
2990 }
2991 
2992 /*! \brief Computes the number core clock cycles when signal on a specific core is running (not halted)
2993 
2994     Returns number of used cycles (halted cyles are not counted).
2995     The counter does not advance in the following conditions:
2996     - an ACPI C-state is other than C0 for normal operation
2997     - HLT
2998     - STPCLK+ pin is asserted
2999     - being throttled by TM1
3000     - during the frequency switching phase of a performance state transition
3001 
3002     The performance counter for this event counts across performance state
3003     transitions using different core clock frequencies
3004 
3005     \param before CPU counter state before the experiment
3006     \param after CPU counter state after the experiment
3007     \return number core clock cycles
3008 */
3009 template <class CounterStateType>
getCycles(const CounterStateType & before,const CounterStateType & after)3010 uint64 getCycles(const CounterStateType & before, const CounterStateType & after) // clocks
3011 {
3012     return after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
3013 }
3014 
3015 /*! \brief Computes the number of reference clock cycles while clock signal on the core is running
3016 
3017     The reference clock operates at a fixed frequency, irrespective of core
3018     frequency changes due to performance state transitions. See Intel(r) Software
3019     Developer's Manual for more details
3020 
3021     \param before CPU counter state before the experiment
3022     \param after CPU counter state after the experiment
3023     \return number core clock cycles
3024 */
3025 template <class CounterStateType>
getRefCycles(const CounterStateType & before,const CounterStateType & after)3026 uint64 getRefCycles(const CounterStateType & before, const CounterStateType & after) // clocks
3027 {
3028     return after.CpuClkUnhaltedRef - before.CpuClkUnhaltedRef;
3029 }
3030 
3031 /*! \brief Computes the number executed core clock cycles
3032 
3033     Returns number of used cycles (halted cyles are not counted).
3034 
3035     \param now Current CPU counter state
3036     \return number core clock cycles
3037 */
3038 template <class CounterStateType>
getCycles(const CounterStateType & now)3039 uint64 getCycles(const CounterStateType & now) // clocks
3040 {
3041     return now.CpuClkUnhaltedThread.getRawData_NoOverflowProtection();
3042 }
3043 
3044 /*! \brief Computes average number of retired instructions per core cycle for the entire system combining instruction counts from logical cores to corresponding physical cores
3045 
3046         Use this metric to evaluate IPC improvement between SMT(Hyperthreading) on and SMT off.
3047 
3048     \param before CPU counter state before the experiment
3049     \param after CPU counter state after the experiment
3050     \return IPC
3051 */
3052 template <class CounterStateType>
getCoreIPC(const CounterStateType & before,const CounterStateType & after)3053 inline double getCoreIPC(const CounterStateType & before, const CounterStateType & after) // instructions per cycle
3054 {
3055     double ipc = getIPC(before, after);
3056     PCM * m = PCM::getInstance();
3057     if (ipc >= 0. && m && (m->getNumCores() == m->getNumOnlineCores()))
3058         return ipc * double(m->getThreadsPerCore());
3059     return -1;
3060 }
3061 
3062 /*! \brief Computes average number of retired instructions per time intervall for the entire system combining instruction counts from logical cores to corresponding physical cores
3063 
3064         Use this metric to evaluate cores utilization improvement between SMT(Hyperthreading) on and SMT off.
3065 
3066     \param before CPU counter state before the experiment
3067     \param after CPU counter state after the experiment
3068     \return usage
3069 */
3070 template <class CounterStateType>
getTotalExecUsage(const CounterStateType & before,const CounterStateType & after)3071 inline double getTotalExecUsage(const CounterStateType & before, const CounterStateType & after) // usage
3072 {
3073     double usage = getExecUsage(before, after);
3074     PCM * m = PCM::getInstance();
3075     if (usage >= 0. && m && (m->getNumCores() == m->getNumOnlineCores()))
3076         return usage * double(m->getThreadsPerCore());
3077     return -1;
3078 }
3079 
3080 /*! \brief Computes average core frequency also taking Intel Turbo Boost technology into account
3081 
3082     \param before CPU counter state before the experiment
3083     \param after CPU counter state after the experiment
3084     \return frequency in Hz
3085 */
3086 template <class CounterStateType>
getAverageFrequency(const CounterStateType & before,const CounterStateType & after)3087 double getAverageFrequency(const CounterStateType & before, const CounterStateType & after) // in Hz
3088 {
3089     int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
3090     int64 timer_clocks = after.InvariantTSC - before.InvariantTSC;
3091     PCM * m = PCM::getInstance();
3092     if (timer_clocks != 0 && m)
3093         return double(m->getNominalFrequency()) * double(clocks) / double(timer_clocks);
3094     return -1;
3095 }
3096 
3097 /*! \brief Computes average core frequency when not in powersaving C0-state (also taking Intel Turbo Boost technology into account)
3098 
3099     \param before CPU counter state before the experiment
3100     \param after CPU counter state after the experiment
3101     \return frequency in Hz
3102 */
3103 template <class CounterStateType>
getActiveAverageFrequency(const CounterStateType & before,const CounterStateType & after)3104 double getActiveAverageFrequency(const CounterStateType & before, const CounterStateType & after) // in Hz
3105 {
3106     int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
3107     int64 ref_clocks = after.CpuClkUnhaltedRef - before.CpuClkUnhaltedRef;
3108     PCM * m = PCM::getInstance();
3109     if (ref_clocks != 0 && m)
3110         return double(m->getNominalFrequency()) * double(clocks) / double(ref_clocks);
3111     return -1;
3112 }
3113 
3114 /*! \brief Computes average core frequency also taking Intel Turbo Boost technology into account
3115 
3116     \param before CPU counter state before the experiment
3117     \param after CPU counter state after the experiment
3118     \return Fraction of nominal frequency
3119 */
3120 template <class CounterStateType>
getRelativeFrequency(const CounterStateType & before,const CounterStateType & after)3121 double getRelativeFrequency(const CounterStateType & before, const CounterStateType & after) // fraction of nominal frequency
3122 {
3123     int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
3124     int64 timer_clocks = after.InvariantTSC - before.InvariantTSC;
3125     if (timer_clocks != 0)
3126         return double(clocks) / double(timer_clocks);
3127     return -1;
3128 }
3129 
3130 /*! \brief Computes average core frequency when not in powersaving C0-state (also taking Intel Turbo Boost technology into account)
3131 
3132     \param before CPU counter state before the experiment
3133     \param after CPU counter state after the experiment
3134     \return Fraction of nominal frequency (if >1.0 then Turbo was working during the measurement)
3135 */
3136 template <class CounterStateType>
getActiveRelativeFrequency(const CounterStateType & before,const CounterStateType & after)3137 double getActiveRelativeFrequency(const CounterStateType & before, const CounterStateType & after) // fraction of nominal frequency
3138 {
3139     if (!PCM::getInstance()->isActiveRelativeFrequencyAvailable()) return -1.;
3140     int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
3141     int64 ref_clocks = after.CpuClkUnhaltedRef - before.CpuClkUnhaltedRef;
3142     if (ref_clocks != 0)
3143         return double(clocks) / double(ref_clocks);
3144     return -1;
3145 }
3146 
3147 /*! \brief Computes L2 cache hit ratio
3148 
3149     \param before CPU counter state before the experiment
3150     \param after CPU counter state after the experiment
3151     \warning Works only in the DEFAULT_EVENTS programming mode (see program() method)
3152     \return value between 0 and 1
3153 */
3154 template <class CounterStateType>
getL2CacheHitRatio(const CounterStateType & before,const CounterStateType & after)3155 double getL2CacheHitRatio(const CounterStateType& before, const CounterStateType& after) // 0.0 - 1.0
3156 {
3157     if (!PCM::getInstance()->isL2CacheHitRatioAvailable()) return 0;
3158     const auto hits = getL2CacheHits(before, after);
3159     const auto misses = getL2CacheMisses(before, after);
3160     return double(hits) / double(hits + misses);
3161 }
3162 
3163 /*! \brief Computes L3 cache hit ratio
3164 
3165     \param before CPU counter state before the experiment
3166     \param after CPU counter state after the experiment
3167     \warning Works only in the DEFAULT_EVENTS programming mode (see program() method)
3168     \return value between 0 and 1
3169 */
3170 template <class CounterStateType>
getL3CacheHitRatio(const CounterStateType & before,const CounterStateType & after)3171 double getL3CacheHitRatio(const CounterStateType& before, const CounterStateType& after) // 0.0 - 1.0
3172 {
3173     if (!PCM::getInstance()->isL3CacheHitRatioAvailable()) return 0;
3174     const auto hits = getL3CacheHits(before, after);
3175     const auto misses = getL3CacheMisses(before, after);
3176     return double(hits) / double(hits + misses);
3177 }
3178 
3179 /*! \brief Computes number of L3 cache misses
3180 
3181     \param before CPU counter state before the experiment
3182     \param after CPU counter state after the experiment
3183     \warning Works only in the DEFAULT_EVENTS programming mode (see program() method)
3184     \return number of misses
3185 */
3186 template <class CounterStateType>
getL3CacheMisses(const CounterStateType & before,const CounterStateType & after)3187 uint64 getL3CacheMisses(const CounterStateType & before, const CounterStateType & after)
3188 {
3189     if (!PCM::getInstance()->isL3CacheMissesAvailable()) return 0;
3190     return after.Event[BasicCounterState::L3MissPos] - before.Event[BasicCounterState::L3MissPos];
3191 }
3192 
3193 /*! \brief Computes number of L2 cache misses
3194 
3195     \param before CPU counter state before the experiment
3196     \param after CPU counter state after the experiment
3197     \warning Works only in the DEFAULT_EVENTS programming mode (see program() method)
3198     \return number of misses
3199 */
3200 template <class CounterStateType>
getL2CacheMisses(const CounterStateType & before,const CounterStateType & after)3201 uint64 getL2CacheMisses(const CounterStateType & before, const CounterStateType & after)
3202 {
3203     auto pcm = PCM::getInstance();
3204     if (pcm->isL2CacheMissesAvailable() == false) return 0ULL;
3205     const auto cpu_model = pcm->getCPUModel();
3206     if (pcm->useSkylakeEvents() || cpu_model == PCM::SNOWRIDGE) {
3207         return after.Event[BasicCounterState::SKLL2MissPos] - before.Event[BasicCounterState::SKLL2MissPos];
3208     }
3209     if (pcm->isAtom() || cpu_model == PCM::KNL)
3210     {
3211         return after.Event[BasicCounterState::ArchLLCMissPos] - before.Event[BasicCounterState::ArchLLCMissPos];
3212     }
3213     uint64 L3Miss = after.Event[BasicCounterState::L3MissPos] - before.Event[BasicCounterState::L3MissPos];
3214     uint64 L3UnsharedHit = after.Event[BasicCounterState::L3UnsharedHitPos] - before.Event[BasicCounterState::L3UnsharedHitPos];
3215     uint64 L2HitM = after.Event[BasicCounterState::L2HitMPos] - before.Event[BasicCounterState::L2HitMPos];
3216     return L2HitM + L3UnsharedHit + L3Miss;
3217 }
3218 
3219 /*! \brief Computes number of L2 cache hits
3220 
3221     \param before CPU counter state before the experiment
3222     \param after CPU counter state after the experiment
3223     \warning Works only in the DEFAULT_EVENTS programming mode (see program() method)
3224     \return number of hits
3225 */
3226 template <class CounterStateType>
getL2CacheHits(const CounterStateType & before,const CounterStateType & after)3227 uint64 getL2CacheHits(const CounterStateType & before, const CounterStateType & after)
3228 {
3229     auto pcm = PCM::getInstance();
3230     if (pcm->isL2CacheHitsAvailable() == false) return 0ULL;
3231     if (pcm->isAtom() || pcm->getCPUModel() == PCM::KNL)
3232     {
3233         uint64 L2Miss = after.Event[BasicCounterState::ArchLLCMissPos] - before.Event[BasicCounterState::ArchLLCMissPos];
3234         uint64 L2Ref = after.Event[BasicCounterState::ArchLLCRefPos] - before.Event[BasicCounterState::ArchLLCRefPos];
3235         return L2Ref - L2Miss;
3236     }
3237     return after.Event[BasicCounterState::L2HitPos] - before.Event[BasicCounterState::L2HitPos];
3238 }
3239 
3240 /*! \brief Computes L3 Cache Occupancy
3241 
3242 */
3243 template <class CounterStateType>
getL3CacheOccupancy(const CounterStateType & now)3244 uint64 getL3CacheOccupancy(const CounterStateType & now)
3245 {
3246     if (PCM::getInstance()->L3CacheOccupancyMetricAvailable() == false) return 0ULL;
3247     return now.L3Occupancy;
3248 }
3249 /*! \brief Computes Local Memory Bandwidth
3250 
3251  */
3252 template <class CounterStateType>
getLocalMemoryBW(const CounterStateType & before,const CounterStateType & after)3253 uint64 getLocalMemoryBW(const CounterStateType & before, const CounterStateType & after)
3254 {
3255     if (PCM::getInstance()->CoreLocalMemoryBWMetricAvailable() == false) return 0ULL;
3256     return after.MemoryBWLocal - before.MemoryBWLocal;
3257 }
3258 
3259 /*! \brief Computes Remote Memory Bandwidth
3260 
3261  */
3262 template <class CounterStateType>
getRemoteMemoryBW(const CounterStateType & before,const CounterStateType & after)3263 uint64 getRemoteMemoryBW(const CounterStateType & before, const CounterStateType & after)
3264 {
3265     if (PCM::getInstance()->CoreRemoteMemoryBWMetricAvailable() == false) return 0ULL;
3266     const uint64 total = after.MemoryBWTotal - before.MemoryBWTotal;
3267     const uint64 local = getLocalMemoryBW(before, after);
3268     if (total > local)
3269         return total - local;
3270 
3271     return 0;
3272 }
3273 
3274 /*! \brief Computes number of L3 cache hits where no snooping in sibling L2 caches had to be done
3275 
3276     \param before CPU counter state before the experiment
3277     \param after CPU counter state after the experiment
3278     \warning Works only in the DEFAULT_EVENTS programming mode (see program() method)
3279     \return number of hits
3280 */
3281 template <class CounterStateType>
getL3CacheHitsNoSnoop(const CounterStateType & before,const CounterStateType & after)3282 uint64 getL3CacheHitsNoSnoop(const CounterStateType & before, const CounterStateType & after)
3283 {
3284     if (!PCM::getInstance()->isL3CacheHitsNoSnoopAvailable()) return 0;
3285     return after.Event[BasicCounterState::L3UnsharedHitPos] - before.Event[BasicCounterState::L3UnsharedHitPos];
3286 }
3287 
3288 /*! \brief Computes number of L3 cache hits where snooping in sibling L2 caches had to be done
3289 
3290     \param before CPU counter state before the experiment
3291     \param after CPU counter state after the experiment
3292     \warning Works only in the DEFAULT_EVENTS programming mode (see program() method)
3293     \return number of hits
3294 */
3295 template <class CounterStateType>
getL3CacheHitsSnoop(const CounterStateType & before,const CounterStateType & after)3296 uint64 getL3CacheHitsSnoop(const CounterStateType & before, const CounterStateType & after)
3297 {
3298     auto pcm = PCM::getInstance();
3299     if (!pcm->isL3CacheHitsSnoopAvailable()) return 0;
3300     const auto cpu_model = pcm->getCPUModel();
3301     if (cpu_model == PCM::SNOWRIDGE)
3302     {
3303         const int64 misses = getL3CacheMisses(before, after);
3304         const int64 refs = after.Event[BasicCounterState::ArchLLCRefPos] - before.Event[BasicCounterState::ArchLLCRefPos];
3305         const int64 hits = refs - misses;
3306         return (hits > 0)? hits : 0;
3307     }
3308     if (pcm->useSkylakeEvents()) {
3309         return after.Event[BasicCounterState::SKLL3HitPos] - before.Event[BasicCounterState::SKLL3HitPos];
3310     }
3311     return after.Event[BasicCounterState::L2HitMPos] - before.Event[BasicCounterState::L2HitMPos];
3312 }
3313 
3314 
3315 /*! \brief Computes total number of L3 cache hits
3316 
3317     \param before CPU counter state before the experiment
3318     \param after CPU counter state after the experiment
3319     \warning Works only in the DEFAULT_EVENTS programming mode (see program() method)
3320     \return number of hits
3321 */
3322 template <class CounterStateType>
getL3CacheHits(const CounterStateType & before,const CounterStateType & after)3323 uint64 getL3CacheHits(const CounterStateType & before, const CounterStateType & after)
3324 {
3325     if (!PCM::getInstance()->isL3CacheHitsAvailable()) return 0;
3326     return getL3CacheHitsSnoop(before, after) + getL3CacheHitsNoSnoop(before, after);
3327 }
3328 
3329 /*! \brief Computes number of invariant time stamp counter ticks
3330 
3331     This counter counts irrespectively of C-, P- or T-states
3332 
3333     \param before CPU counter state before the experiment
3334     \param after CPU counter state after the experiment
3335     \return number of time stamp counter ticks
3336 */
3337 template <class CounterStateType>
getInvariantTSC(const CounterStateType & before,const CounterStateType & after)3338 uint64 getInvariantTSC(const CounterStateType & before, const CounterStateType & after)
3339 {
3340     return after.InvariantTSC - before.InvariantTSC;
3341 }
3342 
3343 /*! \brief Computes residency in the core C-state
3344 
3345     \param state C-state
3346     \param before CPU counter state before the experiment
3347     \param after CPU counter state after the experiment
3348     \return residence ratio (0..1): 0 - 0%, 1.0 - 100%
3349 */
3350 template <class CounterStateType>
getCoreCStateResidency(int state,const CounterStateType & before,const CounterStateType & after)3351 inline double getCoreCStateResidency(int state, const CounterStateType & before, const CounterStateType & after)
3352 {
3353     const double tsc = double(getInvariantTSC(before, after));
3354 
3355     if (state == 0) return double(getRefCycles(before, after)) / tsc;
3356 
3357     if (state == 1)
3358     {
3359         PCM * m = PCM::getInstance();
3360         double result = 1.0 - double(getRefCycles(before, after)) / tsc; // 1.0 - cC0
3361         for (int i = 2; i <= PCM::MAX_C_STATE; ++i)
3362             if (m->isCoreCStateResidencySupported(state))
3363                 result -= (after.BasicCounterState::CStateResidency[i] - before.BasicCounterState::CStateResidency[i]) / tsc;
3364 
3365         if (result < 0.) result = 0.;       // fix counter dissynchronization
3366         else if (result > 1.) result = 1.;  // fix counter dissynchronization
3367 
3368         return result;
3369     }
3370     return (after.BasicCounterState::CStateResidency[state] - before.BasicCounterState::CStateResidency[state]) / tsc;
3371 }
3372 
3373 /*! \brief Reads raw residency counter for the core C-state
3374 
3375     \param state C-state #
3376     \param now CPU counter state
3377     \return raw residency value
3378 */
3379 template <class CounterStateType>
getCoreCStateResidency(int state,const CounterStateType & now)3380 inline uint64 getCoreCStateResidency(int state, const CounterStateType& now)
3381 {
3382     if (state == 0) return now.CpuClkUnhaltedRef.getRawData_NoOverflowProtection();
3383 
3384     return now.BasicCounterState::CStateResidency[state];
3385 }
3386 
3387 /*! \brief Computes residency in the package C-state
3388 
3389     \param state C-state
3390     \param before CPU counter state before the experiment
3391     \param after CPU counter state after the experiment
3392     \return residence ratio (0..1): 0 - 0%, 1.0 - 100%
3393 */
3394 template <class CounterStateType>
getPackageCStateResidency(int state,const CounterStateType & before,const CounterStateType & after)3395 inline double getPackageCStateResidency(int state, const CounterStateType & before, const CounterStateType & after)
3396 {
3397     const double tsc = double(getInvariantTSC(before, after));
3398     if (state == 0)
3399     {
3400         PCM * m = PCM::getInstance();
3401         double result = 1.0;
3402         for (int i = 1; i <= PCM::MAX_C_STATE; ++i)
3403             if (m->isPackageCStateResidencySupported(state))
3404                 result -= (after.UncoreCounterState::CStateResidency[i] - before.UncoreCounterState::CStateResidency[i]) / tsc;
3405 
3406         if (result < 0.) result = 0.;       // fix counter dissynchronization
3407         else if (result > 1.) result = 1.;  // fix counter dissynchronization
3408 
3409         return result;
3410     }
3411     return double(after.UncoreCounterState::CStateResidency[state] - before.UncoreCounterState::CStateResidency[state]) / tsc;
3412 }
3413 
3414 /*! \brief Reads raw residency counter for the package C-state
3415 
3416     \param state C-state #
3417     \param now CPU counter state
3418     \return raw residency value
3419 */
3420 template <class CounterStateType>
getPackageCStateResidency(int state,const CounterStateType & now)3421 inline uint64 getPackageCStateResidency(int state, const CounterStateType& now)
3422 {
3423     return now.UncoreCounterState::CStateResidency[state];
3424 }
3425 
3426 /*! \brief Computes number of bytes read from DRAM memory controllers
3427 
3428     \param before CPU counter state before the experiment
3429     \param after CPU counter state after the experiment
3430     \return Number of bytes
3431 */
3432 template <class CounterStateType>
getBytesReadFromMC(const CounterStateType & before,const CounterStateType & after)3433 uint64 getBytesReadFromMC(const CounterStateType & before, const CounterStateType & after)
3434 {
3435     if (PCM::getInstance()->memoryTrafficMetricsAvailable())
3436         return (after.UncMCNormalReads - before.UncMCNormalReads) * 64;
3437     return 0ULL;
3438 }
3439 
3440 /*! \brief Computes number of bytes written to DRAM memory controllers
3441 
3442     \param before CPU counter state before the experiment
3443     \param after CPU counter state after the experiment
3444     \return Number of bytes
3445 */
3446 template <class CounterStateType>
getBytesWrittenToMC(const CounterStateType & before,const CounterStateType & after)3447 uint64 getBytesWrittenToMC(const CounterStateType & before, const CounterStateType & after)
3448 {
3449     if (PCM::getInstance()->memoryTrafficMetricsAvailable())
3450         return (after.UncMCFullWrites - before.UncMCFullWrites) * 64;
3451     return 0ULL;
3452 }
3453 
3454 /*! \brief Computes number of bytes read from PMM memory
3455 
3456     \param before CPU counter state before the experiment
3457     \param after CPU counter state after the experiment
3458     \return Number of bytes
3459 */
3460 template <class CounterStateType>
getBytesReadFromPMM(const CounterStateType & before,const CounterStateType & after)3461 uint64 getBytesReadFromPMM(const CounterStateType & before, const CounterStateType & after)
3462 {
3463     if (PCM::getInstance()->PMMTrafficMetricsAvailable())
3464         return (after.UncPMMReads - before.UncPMMReads) * 64;
3465     return 0ULL;
3466 }
3467 
3468 /*! \brief Computes number of bytes written to PMM memory
3469 
3470     \param before CPU counter state before the experiment
3471     \param after CPU counter state after the experiment
3472     \return Number of bytes
3473 */
3474 template <class CounterStateType>
getBytesWrittenToPMM(const CounterStateType & before,const CounterStateType & after)3475 uint64 getBytesWrittenToPMM(const CounterStateType & before, const CounterStateType & after)
3476 {
3477     if (PCM::getInstance()->PMMTrafficMetricsAvailable())
3478         return (after.UncPMMWrites - before.UncPMMWrites) * 64;
3479     return 0ULL;
3480 }
3481 
3482 /*! \brief Computes number of bytes read from MCDRAM memory controllers
3483 
3484     \param before CPU counter state before the experiment
3485     \param after CPU counter state after the experiment
3486     \return Number of bytes
3487 */
3488 template <class CounterStateType>
getBytesReadFromEDC(const CounterStateType & before,const CounterStateType & after)3489 uint64 getBytesReadFromEDC(const CounterStateType & before, const CounterStateType & after)
3490 {
3491     if (PCM::getInstance()->MCDRAMmemoryTrafficMetricsAvailable())
3492         return (after.UncEDCNormalReads - before.UncEDCNormalReads) * 64;
3493     return 0ULL;
3494 }
3495 
3496 /*! \brief Computes number of bytes written to MCDRAM memory controllers
3497 
3498     \param before CPU counter state before the experiment
3499     \param after CPU counter state after the experiment
3500     \return Number of bytes
3501 */
3502 template <class CounterStateType>
getBytesWrittenToEDC(const CounterStateType & before,const CounterStateType & after)3503 uint64 getBytesWrittenToEDC(const CounterStateType & before, const CounterStateType & after)
3504 {
3505     if (PCM::getInstance()->MCDRAMmemoryTrafficMetricsAvailable())
3506         return (after.UncEDCFullWrites - before.UncEDCFullWrites) * 64;
3507     return 0ULL;
3508 }
3509 
3510 
3511 /*! \brief Computes number of bytes of read/write requests from all IO sources
3512 
3513     \param before CPU counter state before the experiment
3514     \param after CPU counter state after the experiment
3515     \return Number of bytes
3516 */
3517 template <class CounterStateType>
getIORequestBytesFromMC(const CounterStateType & before,const CounterStateType & after)3518 uint64 getIORequestBytesFromMC(const CounterStateType & before, const CounterStateType & after)
3519 {
3520     if (PCM::getInstance()->memoryIOTrafficMetricAvailable())
3521         return (after.UncMCIORequests - before.UncMCIORequests) * 64;
3522     return 0ULL;
3523 }
3524 
3525 /*! \brief Returns the number of occured system management interrupts
3526 
3527     \param before CPU counter state before the experiment
3528     \param after CPU counter state after the experiment
3529     \return Number of SMIs (system manegement interrupts)
3530 */
3531 template <class CounterStateType>
getSMICount(const CounterStateType & before,const CounterStateType & after)3532 uint64 getSMICount(const CounterStateType & before, const CounterStateType & after)
3533 {
3534     return after.SMICount - before.SMICount;
3535 }
3536 
3537 /*! \brief Returns the number of occured custom core events
3538 
3539     Read number of events programmed with the \c CUSTOM_CORE_EVENTS
3540 
3541     \param eventCounterNr Event/counter number (value from 0 to 3)
3542     \param before CPU counter state before the experiment
3543     \param after CPU counter state after the experiment
3544     \return Number of bytes
3545 */
3546 template <class CounterStateType>
getNumberOfCustomEvents(int32 eventCounterNr,const CounterStateType & before,const CounterStateType & after)3547 uint64 getNumberOfCustomEvents(int32 eventCounterNr, const CounterStateType & before, const CounterStateType & after)
3548 {
3549     return after.Event[eventCounterNr] - before.Event[eventCounterNr];
3550 }
3551 
3552 /*! \brief Get estimation of QPI data traffic per incoming QPI link
3553 
3554     Returns an estimation of number of data bytes transferred to a socket over Intel(r) Quick Path Interconnect
3555 
3556     \param socketNr socket identifier
3557     \param linkNr linkNr
3558     \param before System CPU counter state before the experiment
3559     \param after System CPU counter state after the experiment
3560     \return Number of bytes
3561 */
getIncomingQPILinkBytes(uint32 socketNr,uint32 linkNr,const SystemCounterState & before,const SystemCounterState & after)3562 inline uint64 getIncomingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after)
3563 {
3564     if (!PCM::getInstance()->incomingQPITrafficMetricsAvailable()) return 0ULL;
3565     uint64 b = before.incomingQPIPackets[socketNr][linkNr];
3566     uint64 a = after.incomingQPIPackets[socketNr][linkNr];
3567     // prevent overflows due to counter dissynchronisation
3568     return (a > b) ? (64 * (a - b)) : 0;
3569 }
3570 
3571 /*! \brief Get data utilization of incoming QPI link (0..1)
3572 
3573     Returns an estimation of utilization of QPI link by data traffic transferred to a socket over Intel(r) Quick Path Interconnect
3574 
3575     \param socketNr socket identifier
3576     \param linkNr linkNr
3577     \param before System CPU counter state before the experiment
3578     \param after System CPU counter state after the experiment
3579     \return utilization (0..1)
3580 */
getIncomingQPILinkUtilization(uint32 socketNr,uint32 linkNr,const SystemCounterState & before,const SystemCounterState & after)3581 inline double getIncomingQPILinkUtilization(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after)
3582 {
3583     PCM * m = PCM::getInstance();
3584     if (!(m->qpiUtilizationMetricsAvailable())) return 0.;
3585 
3586     const double bytes = (double)getIncomingQPILinkBytes(socketNr, linkNr, before, after);
3587     const uint64 max_speed = m->getQPILinkSpeed(socketNr, linkNr);
3588     const double max_bytes = (double)(double(max_speed) * double(getInvariantTSC(before, after) / double(m->getNumCores())) / double(m->getNominalFrequency()));
3589     return bytes / max_bytes;
3590 }
3591 
3592 /*! \brief Get utilization of outgoing QPI link (0..1)
3593 
3594     Returns an estimation of utilization of QPI link by (data+nondata) traffic transferred from a socket over Intel(r) Quick Path Interconnect
3595 
3596     \param socketNr socket identifier
3597     \param linkNr linkNr
3598     \param before System CPU counter state before the experiment
3599     \param after System CPU counter state after the experiment
3600     \return utilization (0..1)
3601 */
getOutgoingQPILinkUtilization(uint32 socketNr,uint32 linkNr,const SystemCounterState & before,const SystemCounterState & after)3602 inline double getOutgoingQPILinkUtilization(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after)
3603 {
3604     PCM * m = PCM::getInstance();
3605 
3606     if (m->outgoingQPITrafficMetricsAvailable() == false) return 0.;
3607 
3608     if (m->hasBecktonUncore())
3609     {
3610         const uint64 b = before.outgoingQPIFlits[socketNr][linkNr]; // idle flits
3611         const uint64 a = after.outgoingQPIFlits[socketNr][linkNr];  // idle flits
3612         // prevent overflows due to counter dissynchronisation
3613         const double idle_flits = (double)((a > b) ? (a - b) : 0);
3614         const uint64 bTSC = before.uncoreTSC;
3615         const uint64 aTSC = after.uncoreTSC;
3616         const double tsc = (double)((aTSC > bTSC) ? (aTSC - bTSC) : 0);
3617         if (idle_flits >= tsc) return 0.; // prevent oveflows due to potential counter dissynchronization
3618 
3619         return (1. - (idle_flits / tsc));
3620     } else if (m->hasPCICFGUncore())
3621     {
3622         const uint64 b = before.outgoingQPIFlits[socketNr][linkNr]; // data + non-data flits or idle (null) flits
3623         const uint64 a = after.outgoingQPIFlits[socketNr][linkNr]; // data + non-data flits or idle (null) flits
3624         // prevent overflows due to counter dissynchronisation
3625         double flits = (double)((a > b) ? (a - b) : 0);
3626         const double max_flits = ((double(getInvariantTSC(before, after)) * double(m->getQPILinkSpeed(socketNr, linkNr)) / m->getBytesPerFlit()) / double(m->getNominalFrequency())) / double(m->getNumCores());
3627         if(m->hasUPI())
3628         {
3629             flits = flits/3.;
3630         }
3631         if (flits > max_flits) return 1.; // prevent oveflows due to potential counter dissynchronization
3632         return (flits / max_flits);
3633     }
3634 
3635     return 0;
3636 }
3637 
3638 /*! \brief Get estimation of QPI (data+nondata) traffic per outgoing QPI link
3639 
3640     Returns an estimation of number of data bytes transferred from a socket over Intel(r) Quick Path Interconnect
3641 
3642     \param socketNr socket identifier
    \param linkNr link identifier
3644     \param before System CPU counter state before the experiment
3645     \param after System CPU counter state after the experiment
3646     \return Number of bytes
3647 */
getOutgoingQPILinkBytes(uint32 socketNr,uint32 linkNr,const SystemCounterState & before,const SystemCounterState & after)3648 inline uint64 getOutgoingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after)
3649 {
3650     PCM * m = PCM::getInstance();
3651     if (!(m->outgoingQPITrafficMetricsAvailable())) return 0ULL;
3652 
3653     const double util = getOutgoingQPILinkUtilization(socketNr, linkNr, before, after);
3654     const double max_bytes = (double(m->getQPILinkSpeed(socketNr, linkNr)) * double(getInvariantTSC(before, after) / double(m->getNumCores())) / double(m->getNominalFrequency()));
3655 
3656     return (uint64)(max_bytes * util);
3657 }
3658 
3659 
3660 /*! \brief Get estimation of total QPI data traffic
3661 
3662     Returns an estimation of number of data bytes transferred to all sockets over all Intel(r) Quick Path Interconnect links
3663 
3664     \param before System CPU counter state before the experiment
3665     \param after System CPU counter state after the experiment
3666     \return Number of bytes
3667 */
getAllIncomingQPILinkBytes(const SystemCounterState & before,const SystemCounterState & after)3668 inline uint64 getAllIncomingQPILinkBytes(const SystemCounterState & before, const SystemCounterState & after)
3669 {
3670     PCM * m = PCM::getInstance();
3671     const uint32 ns = m->getNumSockets();
3672     const uint32 qpiLinks = (uint32)m->getQPILinksPerSocket();
3673     uint64 sum = 0;
3674 
3675     for (uint32 s = 0; s < ns; ++s)
3676         for (uint32 q = 0; q < qpiLinks; ++q)
3677             sum += getIncomingQPILinkBytes(s, q, before, after);
3678 
3679     return sum;
3680 }
3681 
3682 /*! \brief Get estimation of total QPI data+nondata traffic
3683 
3684     Returns an estimation of number of data and non-data bytes transferred from all sockets over all Intel(r) Quick Path Interconnect links
3685 
3686     \param before System CPU counter state before the experiment
3687     \param after System CPU counter state after the experiment
3688     \return Number of bytes
3689 */
getAllOutgoingQPILinkBytes(const SystemCounterState & before,const SystemCounterState & after)3690 inline uint64 getAllOutgoingQPILinkBytes(const SystemCounterState & before, const SystemCounterState & after)
3691 {
3692     PCM * m = PCM::getInstance();
3693     const uint32 ns = m->getNumSockets();
3694     const uint32 qpiLinks = (uint32)m->getQPILinksPerSocket();
3695     uint64 sum = 0;
3696 
3697     for (uint32 s = 0; s < ns; ++s)
3698         for (uint32 q = 0; q < qpiLinks; ++q)
3699             sum += getOutgoingQPILinkBytes(s, q, before, after);
3700 
3701     return sum;
3702 }
3703 
3704 
3705 /*! \brief Return current value of the counter of QPI data traffic per incoming QPI link
3706 
3707     Returns the number of incoming data bytes to a socket over Intel(r) Quick Path Interconnect
3708 
3709     \param socketNr socket identifier
    \param linkNr link identifier
3711     \param now Current System CPU counter state
3712     \return Number of bytes
3713 */
getIncomingQPILinkBytes(uint32 socketNr,uint32 linkNr,const SystemCounterState & now)3714 inline uint64 getIncomingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & now)
3715 {
3716     if (PCM::getInstance()->incomingQPITrafficMetricsAvailable())
3717         return 64 * now.incomingQPIPackets[socketNr][linkNr];
3718     return 0ULL;
3719 }
3720 
3721 /*! \brief Get estimation of total QPI data traffic for this socket
3722 
    Returns an estimation of the number of bytes transferred to this socket over all Intel(r) Quick Path Interconnect links on this socket

    \param socketNr socket identifier
    \param now Current System CPU counter state
3727     \return Number of bytes
3728 */
getSocketIncomingQPILinkBytes(uint32 socketNr,const SystemCounterState & now)3729 inline uint64 getSocketIncomingQPILinkBytes(uint32 socketNr, const SystemCounterState & now)
3730 {
3731     PCM * m = PCM::getInstance();
3732     const uint32 qpiLinks = (uint32)m->getQPILinksPerSocket();
3733     uint64 sum = 0;
3734 
3735     for (uint32 q = 0; q < qpiLinks; ++q)
3736         sum += getIncomingQPILinkBytes(socketNr, q, now);
3737 
3738     return sum;
3739 }
3740 
/*! \brief Get estimation of total QPI data traffic
3742 
3743     Returns an estimation of number of data bytes transferred to all sockets over all Intel(r) Quick Path Interconnect links
3744 
3745     \param now System CPU counter state
3746     \return Number of bytes
3747 */
getAllIncomingQPILinkBytes(const SystemCounterState & now)3748 inline uint64 getAllIncomingQPILinkBytes(const SystemCounterState & now)
3749 {
3750     PCM * m = PCM::getInstance();
3751     const uint32 ns = m->getNumSockets();
3752     uint64 sum = 0;
3753 
3754     for (uint32 s = 0; s < ns; ++s)
3755         sum += getSocketIncomingQPILinkBytes(s, now);
3756     return sum;
3757 }
3758 
3759 
3760 /*! \brief Get QPI data to Memory Controller traffic ratio
3761 
    Ideally for NUMA-optimized programs the ratio should be close to 0.
3763 
3764     \param before System CPU counter state before the experiment
3765     \param after System CPU counter state after the experiment
3766     \return Ratio
3767 */
3768 
getQPItoMCTrafficRatio(const SystemCounterState & before,const SystemCounterState & after)3769 inline double getQPItoMCTrafficRatio(const SystemCounterState & before, const SystemCounterState & after)
3770 {
3771     const uint64 totalQPI = getAllIncomingQPILinkBytes(before, after);
3772     uint64 memTraffic = getBytesReadFromMC(before, after) + getBytesWrittenToMC(before, after);
3773     if (PCM::getInstance()->PMMTrafficMetricsAvailable())
3774     {
3775         memTraffic += getBytesReadFromPMM(before, after) + getBytesWrittenToPMM(before, after);
3776     }
3777     return double(totalQPI) / double(memTraffic);
3778 }
3779 
/*! \brief Get local memory access ratio measured in home agent
3781 
3782     \param before System CPU counter state before the experiment
3783     \param after System CPU counter state after the experiment
3784     \return Ratio
3785 */
3786 template <class CounterStateType>
getLocalMemoryRequestRatio(const CounterStateType & before,const CounterStateType & after)3787 inline double getLocalMemoryRequestRatio(const CounterStateType & before, const CounterStateType & after)
3788 {
3789     if (PCM::getInstance()->localMemoryRequestRatioMetricAvailable() == false) return -1.;
3790     const auto all = after.UncHARequests - before.UncHARequests;
3791     const auto local = after.UncHALocalRequests - before.UncHALocalRequests;
3792     // std::cout << "PCM DEBUG "<< 64*all/1e6 << " " << 64*local/1e6 << "\n";
3793     return double(local)/double(all);
3794 }
3795 
3796 //! \brief Returns the raw count of events
3797 //! \param before counter state before the experiment
3798 //! \param after counter state after the experiment
3799 template <class CounterType>
getNumberOfEvents(const CounterType & before,const CounterType & after)3800 inline uint64 getNumberOfEvents(const CounterType & before, const CounterType & after)
3801 {
3802     return after.data - before.data;
3803 }
3804 //! \brief Returns average last level cache read+prefetch miss latency in ns
3805 
3806 template <class CounterStateType>
getLLCReadMissLatency(const CounterStateType & before,const CounterStateType & after)3807 inline double getLLCReadMissLatency(const CounterStateType & before, const CounterStateType & after)
3808 {
3809     if (PCM::getInstance()->LLCReadMissLatencyMetricsAvailable() == false) return -1.;
3810     const double occupancy = double(after.TOROccupancyIAMiss) - double(before.TOROccupancyIAMiss);
3811     const double inserts = double(after.TORInsertsIAMiss) - double(before.TORInsertsIAMiss);
3812     const double unc_clocks = double(after.UncClocks) - double(before.UncClocks);
3813     auto * m = PCM::getInstance();
3814     const double seconds = double(getInvariantTSC(before, after)) / double(m->getNumCores()/m->getNumSockets()) / double(m->getNominalFrequency());
3815     return 1e9*seconds*(occupancy/inserts)/unc_clocks;
3816 }
3817 
3818 template <class CounterStateType>
getAllSlots(const CounterStateType & before,const CounterStateType & after)3819 inline uint64 getAllSlots(const CounterStateType & before, const CounterStateType & after)
3820 {
3821     const int64 a = after.BackendBoundSlots - before.BackendBoundSlots;
3822     const int64 b = after.FrontendBoundSlots - before.FrontendBoundSlots;
3823     const int64 c = after.BadSpeculationSlots - before.BadSpeculationSlots;
3824     const int64 d = after.RetiringSlots - before.RetiringSlots;
3825     // std::cout << "before DEBUG: " << before.FrontendBoundSlots << " " << before.BadSpeculationSlots << " "<< before.BackendBoundSlots << " " << before.RetiringSlots << std::endl;
3826     // std::cout << "after DEBUG: " <<  after.FrontendBoundSlots << " " << after.BadSpeculationSlots << " " << after.BackendBoundSlots << " " << after.RetiringSlots << std::endl;
3827     assert(a >= 0);
3828     assert(b >= 0);
3829     assert(c >= 0);
3830     assert(d >= 0);
3831     return a + b + c + d;
3832 }
3833 
3834 template <class CounterStateType>
getAllSlotsRaw(const CounterStateType & before,const CounterStateType & after)3835 inline uint64 getAllSlotsRaw(const CounterStateType& before, const CounterStateType& after)
3836 {
3837     return after.AllSlotsRaw - before.AllSlotsRaw;
3838 }
3839 
3840 //! \brief Returns unutilized pipeline slots where no uop was delivered due to lack of back-end resources as range 0..1
3841 template <class CounterStateType>
getBackendBound(const CounterStateType & before,const CounterStateType & after)3842 inline double getBackendBound(const CounterStateType & before, const CounterStateType & after)
3843 {
3844 //    std::cout << "DEBUG: "<< after.BackendBoundSlots - before.BackendBoundSlots << " " << getAllSlots(before, after) << std::endl;
3845     if (PCM::getInstance()->isHWTMAL1Supported())
3846         return double(after.BackendBoundSlots - before.BackendBoundSlots)/double(getAllSlots(before, after));
3847     return 0.;
3848 }
3849 
3850 //! \brief Returns unutilized pipeline slots where Front-end did not deliver a uop while back-end is ready as range 0..1
3851 template <class CounterStateType>
getFrontendBound(const CounterStateType & before,const CounterStateType & after)3852 inline double getFrontendBound(const CounterStateType & before, const CounterStateType & after)
3853 {
3854 //    std::cout << "DEBUG: "<< after.FrontendBoundSlots - before.FrontendBoundSlots << " " << getAllSlots(before, after) << std::endl;
3855     if (PCM::getInstance()->isHWTMAL1Supported())
3856         return double(after.FrontendBoundSlots - before.FrontendBoundSlots)/double(getAllSlots(before, after));
3857     return 0.;
3858 }
3859 
3860 //! \brief Returns wasted pipeline slots due to incorrect speculation, covering whole penalty: Utilized by uops that do not retire, or Recovery Bubbles (unutilized slots) as range 0..1
3861 template <class CounterStateType>
getBadSpeculation(const CounterStateType & before,const CounterStateType & after)3862 inline double getBadSpeculation(const CounterStateType & before, const CounterStateType & after)
3863 {
3864 //    std::cout << "DEBUG: "<< after.BadSpeculationSlots - before.BadSpeculationSlots << " " << getAllSlots(before, after) << std::endl;
3865     if (PCM::getInstance()->isHWTMAL1Supported())
3866         return double(after.BadSpeculationSlots - before.BadSpeculationSlots)/double(getAllSlots(before, after));
3867     return 0.;
3868 }
3869 
3870 //! \brief Returns pipeline slots utilized by uops that eventually retire (commit)
3871 template <class CounterStateType>
getRetiring(const CounterStateType & before,const CounterStateType & after)3872 inline double getRetiring(const CounterStateType & before, const CounterStateType & after)
3873 {
3874 //    std::cout << "DEBUG: "<< after.RetiringSlots - before.RetiringSlots << " " << getAllSlots(before, after) << std::endl;
3875     if (PCM::getInstance()->isHWTMAL1Supported())
3876         return double(after.RetiringSlots - before.RetiringSlots)/double(getAllSlots(before, after));
3877     return 0.;
3878 }
3879 
3880 } // namespace pcm
3881 
3882 #endif
3883