1 /*
2 Copyright (c) 2009-2020, Intel Corporation
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
6
7 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
8 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
9 * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
10
11 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
12 */
13 // written by Roman Dementiev
14 // Thomas Willhalm
15
16 #ifndef CPUCOUNTERS_HEADER
17 #define CPUCOUNTERS_HEADER
18
19 /*! \file cpucounters.h
20 \brief Main CPU counters header
21
22 Include this header file if you want to access CPU counters (core and uncore - including memory controller chips and QPI)
23 */
24
25 #include "version.h"
26
27 #ifndef PCM_API
28 #define PCM_API
29 #endif
30
31 #undef PCM_HA_REQUESTS_READS_ONLY
32
33 #include "types.h"
34 #include "msr.h"
35 #include "pci.h"
36 #include "bw.h"
37 #include "width_extender.h"
38 #include "exceptions/unsupported_processor_exception.hpp"
39
40 #include <vector>
41 #include <array>
42 #include <limits>
43 #include <string>
44 #include <memory>
45 #include <map>
46 #include <unordered_map>
47 #include <string.h>
48 #include <assert.h>
49
50 #ifdef PCM_USE_PERF
51 #include <linux/perf_event.h>
52 #include <errno.h>
53 #define PCM_PERF_COUNT_HW_REF_CPU_CYCLES (9)
54 #endif
55
56 #ifndef _MSC_VER
57 #define NOMINMAX
58 #include <semaphore.h>
59 #include <sys/types.h>
60 #include <sys/stat.h>
61 #include <fcntl.h>
62 #include <sys/syscall.h>
63 #include <unistd.h>
64 #endif
65
66 #ifdef _MSC_VER
67 #if _MSC_VER>= 1600
68 #include <intrin.h>
69 #endif
70 #endif
71
72 #include "resctrl.h"
73
74 namespace pcm {
75
76 #ifdef _MSC_VER
77 void PCM_API restrictDriverAccess(LPCWSTR path);
78 #endif
79
// Forward declarations of types that are only referenced by pointer,
// reference or friendship in this part of the header.
class BasicCounterState;
class CoreCounterState;
class CoreTaskQueue;
class PCM;
class ServerUncoreCounterState;
class SocketCounterState;
class SystemCounterState;
class SystemRoot;
88
89 /*
90 CPU performance monitoring routines
91
92 A set of performance monitoring routines for recent Intel CPUs
93 */
94
95 struct PCM_API TopologyEntry // decribes a core
96 {
97 int32 os_id;
98 int32 thread_id;
99 int32 core_id;
100 int32 tile_id; // tile is a constalation of 1 or more cores sharing salem L2 cache. Unique for entire system
101 int32 socket;
102
TopologyEntryTopologyEntry103 TopologyEntry() : os_id(-1), thread_id (-1), core_id(-1), tile_id(-1), socket(-1) { }
104 };
105
106 class HWRegister
107 {
108 public:
109 virtual void operator = (uint64 val) = 0; // write operation
110 virtual operator uint64 () = 0; //read operation
~HWRegister()111 virtual ~HWRegister() {}
112 };
113
114 class PCICFGRegister64 : public HWRegister
115 {
116 std::shared_ptr<PciHandleType> handle;
117 size_t offset;
118 public:
PCICFGRegister64(const std::shared_ptr<PciHandleType> & handle_,size_t offset_)119 PCICFGRegister64(const std::shared_ptr<PciHandleType> & handle_, size_t offset_) :
120 handle(handle_),
121 offset(offset_)
122 {
123 }
124 void operator = (uint64 val) override
125 {
126 cvt_ds cvt;
127 cvt.ui64 = val;
128 handle->write32(offset, cvt.ui32.low);
129 handle->write32(offset + sizeof(uint32), cvt.ui32.high);
130 }
uint64()131 operator uint64 () override
132 {
133 uint64 result = 0;
134 handle->read64(offset, &result);
135 return result;
136 }
137 };
138
139 class PCICFGRegister32 : public HWRegister
140 {
141 std::shared_ptr<PciHandleType> handle;
142 size_t offset;
143 public:
PCICFGRegister32(const std::shared_ptr<PciHandleType> & handle_,size_t offset_)144 PCICFGRegister32(const std::shared_ptr<PciHandleType> & handle_, size_t offset_) :
145 handle(handle_),
146 offset(offset_)
147 {
148 }
149 void operator = (uint64 val) override
150 {
151 handle->write32(offset, (uint32)val);
152 }
uint64()153 operator uint64 () override
154 {
155 uint32 result = 0;
156 handle->read32(offset, &result);
157 return result;
158 }
159 };
160
161 class MMIORegister64 : public HWRegister
162 {
163 std::shared_ptr<MMIORange> handle;
164 size_t offset;
165 public:
MMIORegister64(const std::shared_ptr<MMIORange> & handle_,size_t offset_)166 MMIORegister64(const std::shared_ptr<MMIORange> & handle_, size_t offset_) :
167 handle(handle_),
168 offset(offset_)
169 {
170 }
171 void operator = (uint64 val) override
172 {
173 handle->write64(offset, val);
174 }
uint64()175 operator uint64 () override
176 {
177 return handle->read64(offset);
178 }
179 };
180
181 class MMIORegister32 : public HWRegister
182 {
183 std::shared_ptr<MMIORange> handle;
184 size_t offset;
185 public:
MMIORegister32(const std::shared_ptr<MMIORange> & handle_,size_t offset_)186 MMIORegister32(const std::shared_ptr<MMIORange> & handle_, size_t offset_) :
187 handle(handle_),
188 offset(offset_)
189 {
190 }
191 void operator = (uint64 val) override
192 {
193 handle->write32(offset, (uint32)val);
194 }
uint64()195 operator uint64 () override
196 {
197 return (uint64)handle->read32(offset);
198 }
199 };
200
201 class MSRRegister : public HWRegister
202 {
203 std::shared_ptr<SafeMsrHandle> handle;
204 size_t offset;
205 public:
MSRRegister(const std::shared_ptr<SafeMsrHandle> & handle_,size_t offset_)206 MSRRegister(const std::shared_ptr<SafeMsrHandle> & handle_, size_t offset_) :
207 handle(handle_),
208 offset(offset_)
209 {
210 }
211 void operator = (uint64 val) override
212 {
213 handle->write(offset, val);
214 }
uint64()215 operator uint64 () override
216 {
217 uint64 value = 0;
218 handle->read(offset, &value);
219 return value;
220 }
221 };
222
223 class CounterWidthExtenderRegister : public HWRegister
224 {
225 std::shared_ptr<CounterWidthExtender> handle;
226 public:
CounterWidthExtenderRegister(const std::shared_ptr<CounterWidthExtender> & handle_)227 CounterWidthExtenderRegister(const std::shared_ptr<CounterWidthExtender> & handle_) :
228 handle(handle_)
229 {
230 }
231 void operator = (uint64 val) override
232 {
233 if (val == 0)
234 {
235 handle->reset();
236 }
237 else
238 {
239 std::cerr << "ERROR: writing non-zero values to CounterWidthExtenderRegister is not supported\n";
240 throw std::exception();
241 }
242 }
uint64()243 operator uint64 () override
244 {
245 return handle->read();;
246 }
247 };
248
249 #undef PCM_UNCORE_PMON_BOX_CHECK_STATUS // debug only
250
251 class UncorePMU
252 {
253 typedef std::shared_ptr<HWRegister> HWRegisterPtr;
254 HWRegisterPtr unitControl;
255 public:
256 HWRegisterPtr counterControl[4];
257 HWRegisterPtr counterValue[4];
258 HWRegisterPtr fixedCounterControl;
259 HWRegisterPtr fixedCounterValue;
260 HWRegisterPtr filter[2];
261
262 UncorePMU(const HWRegisterPtr & unitControl_,
263 const HWRegisterPtr & counterControl0,
264 const HWRegisterPtr & counterControl1,
265 const HWRegisterPtr & counterControl2,
266 const HWRegisterPtr & counterControl3,
267 const HWRegisterPtr & counterValue0,
268 const HWRegisterPtr & counterValue1,
269 const HWRegisterPtr & counterValue2,
270 const HWRegisterPtr & counterValue3,
271 const HWRegisterPtr & fixedCounterControl_ = HWRegisterPtr(),
272 const HWRegisterPtr & fixedCounterValue_ = HWRegisterPtr(),
273 const HWRegisterPtr & filter0 = HWRegisterPtr(),
274 const HWRegisterPtr & filter1 = HWRegisterPtr()
275 ) :
unitControl(unitControl_)276 unitControl(unitControl_),
277 counterControl{ counterControl0, counterControl1, counterControl2, counterControl3 },
278 counterValue{ counterValue0, counterValue1, counterValue2, counterValue3 },
279 fixedCounterControl(fixedCounterControl_),
280 fixedCounterValue(fixedCounterValue_),
281 filter{ filter0 , filter1 }
282 {
283 }
UncorePMU()284 UncorePMU() {}
~UncorePMU()285 virtual ~UncorePMU() {}
valid()286 bool valid() const
287 {
288 return unitControl.get() != nullptr;
289 }
writeUnitControl(const uint32 value)290 void writeUnitControl(const uint32 value)
291 {
292 *unitControl = value;
293 }
294 void cleanup();
295 void freeze(const uint32 extra);
296 bool initFreeze(const uint32 extra, const char* xPICheckMsg = nullptr);
297 void unfreeze(const uint32 extra);
298 void resetUnfreeze(const uint32 extra);
299 };
300
//! \brief Metric sets accepted by ServerPCICFGUncore::programServerUncoreMemoryMetrics
enum ServerUncoreMemoryMetrics
{
    PartialWrites  = 0,
    Pmem           = 1,
    PmemMemoryMode = 2,
    PmemMixedMode  = 3
};
308
309 //! Object to access uncore counters in a socket/processor with microarchitecture codename SandyBridge-EP (Jaketown) or Ivytown-EP or Ivytown-EX
310 class ServerPCICFGUncore
311 {
312 friend class PCM;
313 int32 iMCbus,UPIbus,M2Mbus;
314 uint32 groupnr;
315 int32 cpu_model;
316 typedef std::vector<UncorePMU> UncorePMUVector;
317 UncorePMUVector imcPMUs;
318 UncorePMUVector edcPMUs;
319 UncorePMUVector xpiPMUs;
320 UncorePMUVector m3upiPMUs;
321 UncorePMUVector m2mPMUs;
322 UncorePMUVector haPMUs;
323 std::vector<UncorePMUVector*> allPMUs{ &imcPMUs, &edcPMUs, &xpiPMUs, &m3upiPMUs , &m2mPMUs, &haPMUs };
324 std::vector<uint64> qpi_speed;
325 std::vector<uint32> num_imc_channels; // number of memory channels in each memory controller
326 std::vector<std::pair<uint32, uint32> > XPIRegisterLocation; // (device, function)
327 std::vector<std::pair<uint32, uint32> > M3UPIRegisterLocation; // (device, function)
328 std::vector<std::vector< std::pair<uint32, uint32> > > MCRegisterLocation; // MCRegisterLocation[controller]: (device, function)
329 std::vector<std::pair<uint32, uint32> > EDCRegisterLocation; // EDCRegisterLocation: (device, function)
330 std::vector<std::pair<uint32, uint32> > M2MRegisterLocation; // M2MRegisterLocation: (device, function)
331 std::vector<std::pair<uint32, uint32> > HARegisterLocation; // HARegisterLocation: (device, function)
332
333 static std::vector<std::pair<uint32, uint32> > socket2iMCbus;
334 static std::vector<std::pair<uint32, uint32> > socket2UPIbus;
335 static std::vector<std::pair<uint32, uint32> > socket2M2Mbus;
336
337 ServerPCICFGUncore(); // forbidden
338 ServerPCICFGUncore(ServerPCICFGUncore &); // forbidden
339 ServerPCICFGUncore & operator = (const ServerPCICFGUncore &); // forbidden
340 PciHandleType * createIntelPerfMonDevice(uint32 groupnr, int32 bus, uint32 dev, uint32 func, bool checkVendor = false);
341 void programIMC(const uint32 * MCCntConfig);
342 void programEDC(const uint32 * EDCCntConfig);
343 void programM2M(const uint64 * M2MCntConfig);
344 void programM2M();
345 void programHA(const uint32 * config);
346 void programHA();
347 void programXPI(const uint32 * XPICntConfig);
348 void programM3UPI(const uint32* M3UPICntConfig);
349 typedef std::pair<size_t, std::vector<uint64 *> > MemTestParam;
350 void initMemTest(MemTestParam & param);
351 void doMemTest(const MemTestParam & param);
352 void cleanupMemTest(const MemTestParam & param);
353 void cleanupQPIHandles();
354 void cleanupPMUs();
355 void writeAllUnitControl(const uint32 value);
356 void initDirect(uint32 socket_, const PCM * pcm);
357 void initPerf(uint32 socket_, const PCM * pcm);
358 void initBuses(uint32 socket_, const PCM * pcm);
359 void initRegisterLocations(const PCM * pcm);
360 uint64 getPMUCounter(std::vector<UncorePMU> & pmu, const uint32 id, const uint32 counter);
361
362 public:
363 enum EventPosition {
364 READ=0,
365 WRITE=1,
366 READ_RANK_A=0,
367 WRITE_RANK_A=1,
368 READ_RANK_B=2,
369 WRITE_RANK_B=3,
370 PARTIAL=2,
371 PMM_READ=2,
372 PMM_WRITE=3,
373 PMM_MM_MISS_CLEAN=2,
374 PMM_MM_MISS_DIRTY=3,
375 NM_HIT=0, // NM : Near Memory (DRAM cache) in Memory Mode
376 M2M_CLOCKTICKS=1
377 };
378 //! \brief Initialize access data structures
379 //! \param socket_ socket id
380 //! \param pcm pointer to PCM instance
381 ServerPCICFGUncore(uint32 socket_, const PCM * pcm);
382 //! \brief Program performance counters (disables programming power counters)
383 void program();
384 //! \brief Get the number of integrated controller reads (in cache lines)
385 uint64 getImcReads();
386 //! \brief Get the number of integrated controller reads for given controller (in cache lines)
387 //! \param controller controller ID/number
388 uint64 getImcReadsForController(uint32 controller);
389 //! \brief Get the number of integrated controller reads for given channels (in cache lines)
390 //! \param beginChannel first channel in the range
391 //! \param endChannel last channel + 1: the range is [beginChannel, endChannel). endChannel is not included.
392 uint64 getImcReadsForChannels(uint32 beginChannel, uint32 endChannel);
393 //! \brief Get the number of integrated controller writes (in cache lines)
394 uint64 getImcWrites();
395 //! \brief Get the number of requests to home agent (BDX/HSX only)
396 uint64 getHALocalRequests();
397 //! \brief Get the number of local requests to home agent (BDX/HSX only)
398 uint64 getHARequests();
399
400 //! \brief Get the number of PMM memory reads (in cache lines)
401 uint64 getPMMReads();
402 //! \brief Get the number of PMM memory writes (in cache lines)
403 uint64 getPMMWrites();
404
405 //! \brief Get the number of cache lines read by EDC (embedded DRAM controller)
406 uint64 getEdcReads();
407 //! \brief Get the number of cache lines written by EDC (embedded DRAM controller)
408 uint64 getEdcWrites();
409
410 //! \brief Get the number of incoming data flits to the socket through a port
411 //! \param port QPI port id
412 uint64 getIncomingDataFlits(uint32 port);
413
414 //! \brief Get the number of outgoing data and non-data or idle flits (depending on the architecture) from the socket through a port
415 //! \param port QPI port id
416 uint64 getOutgoingFlits(uint32 port);
417
418 ~ServerPCICFGUncore();
419
420 //! \brief Program power counters (disables programming performance counters)
421 //! \param mc_profile memory controller measurement profile. See description of profiles in pcm-power.cpp
422 void program_power_metrics(int mc_profile);
423
424 //! \brief Program memory counters (disables programming performance counters)
425 //! \param rankA count DIMM rank1 statistics (disables memory channel monitoring)
426 //! \param rankB count DIMM rank2 statistics (disables memory channel monitoring)
427 //! \brief metrics metric set (see the ServerUncoreMemoryMetrics enum)
428 void programServerUncoreMemoryMetrics(const ServerUncoreMemoryMetrics & metrics, const int rankA = -1, const int rankB = -1);
429
430 //! \brief Get number of QPI LL clocks on a QPI port
431 //! \param port QPI port number
432 uint64 getQPIClocks(uint32 port);
433
434 //! \brief Get number cycles on a QPI port when the link was in a power saving half-lane mode
435 //! \param port QPI port number
436 uint64 getQPIL0pTxCycles(uint32 port);
437 //! \brief Get number cycles on a UPI port when the link was in a L0 mode (fully active)
438 //! \param port UPI port number
439 uint64 getUPIL0TxCycles(uint32 port);
440 //! \brief Get number cycles on a QPI port when the link was in a power saving shutdown mode
441 //! \param port QPI port number
442 uint64 getQPIL1Cycles(uint32 port);
443 //! \brief Get number DRAM channel cycles
444 //! \param channel channel number
445 uint64 getDRAMClocks(uint32 channel);
446 //! \brief Get number MCDRAM channel cycles
447 //! \param channel channel number
448 uint64 getMCDRAMClocks(uint32 channel);
449 //! \brief Direct read of memory controller PMU counter (counter meaning depends on the programming: power/performance/etc)
450 //! \param channel channel number
451 //! \param counter counter number
452 uint64 getMCCounter(uint32 channel, uint32 counter);
453 //! \brief Direct read of embedded DRAM memory controller PMU counter (counter meaning depends on the programming: power/performance/etc)
454 //! \param channel channel number
455 //! \param counter counter number
456 uint64 getEDCCounter(uint32 channel, uint32 counter);
457 //! \brief Direct read of QPI LL PMU counter (counter meaning depends on the programming: power/performance/etc)
458 //! \param port port number
459 //! \param counter counter number
460 uint64 getQPILLCounter(uint32 port, uint32 counter);
461 //! \brief Direct read of M3UPI PMU counter (counter meaning depends on the programming: power/performance/etc)
462 //! \param port port number
463 //! \param counter counter number
464 uint64 getM3UPICounter(uint32 port, uint32 counter);
465 //! \brief Direct read of M2M counter
466 //! \param box box ID/number
467 //! \param counter counter number
468 uint64 getM2MCounter(uint32 box, uint32 counter);
469
470 //! \brief Freezes event counting
471 void freezeCounters();
472 //! \brief Unfreezes event counting
473 void unfreezeCounters();
474
475 //! \brief Measures/computes the maximum theoretical QPI link bandwidth speed in GByte/seconds
476 uint64 computeQPISpeed(const uint32 ref_core, const int cpumodel);
477
478 //! \brief Enable correct counting of various LLC events (with memory access perf penalty)
479 void enableJKTWorkaround(bool enable);
480
481 //! \brief Returns the number of detected QPI ports
getNumQPIPorts()482 size_t getNumQPIPorts() const { return xpiPMUs.size(); }
483
484 //! \brief Returns the speed of the QPI link
getQPILinkSpeed(const uint32 linkNr)485 uint64 getQPILinkSpeed(const uint32 linkNr) const
486 {
487 return qpi_speed.empty() ? 0 : qpi_speed[linkNr];
488 }
489
490 //! \brief Print QPI Speeds
491 void reportQPISpeed() const;
492
493 //! \brief Returns the number of detected integrated memory controllers
getNumMC()494 uint32 getNumMC() const { return (uint32)num_imc_channels.size(); }
495
496 //! \brief Returns the total number of detected memory channels on all integrated memory controllers
getNumMCChannels()497 size_t getNumMCChannels() const { return (size_t)imcPMUs.size(); }
498
499 //! \brief Returns the total number of detected memory channels on given integrated memory controller
500 //! \param controller controller number
501 size_t getNumMCChannels(const uint32 controller) const;
502
503 //! \brief Returns the total number of detected memory channels on all embedded DRAM controllers (EDC)
getNumEDCChannels()504 size_t getNumEDCChannels() const { return edcPMUs.size(); }
505 };
506
507 class SimpleCounterState
508 {
509 template <class T>
510 friend uint64 getNumberOfEvents(const T & before, const T & after);
511 friend class PCM;
512 uint64 data;
513
514 public:
SimpleCounterState()515 SimpleCounterState() : data(0)
516 { }
~SimpleCounterState()517 virtual ~SimpleCounterState() { }
518 };
519
520 typedef SimpleCounterState PCIeCounterState;
521 typedef SimpleCounterState IIOCounterState;
522 typedef std::vector<uint64> eventGroup_t;
523
524 class PerfVirtualControlRegister;
525
526 /*!
527 \brief CPU Performance Monitor
528
529 This singleton object needs to be instantiated for each process
530 before accessing counting and measuring routines
531 */
532 class PCM_API PCM
533 {
534 friend class BasicCounterState;
535 friend class UncoreCounterState;
536 friend class Socket;
537 friend class ServerUncore;
538 friend class PerfVirtualControlRegister;
539 friend class Aggregator;
540 friend class ServerPCICFGUncore;
541 PCM(); // forbidden to call directly because it is a singleton
542 PCM(const PCM &) = delete;
543 PCM & operator = (const PCM &) = delete;
544
545 int32 cpu_family;
546 int32 cpu_model;
547 int32 cpu_stepping;
548 int64 cpu_microcode_level;
549 int32 max_cpuid;
550 int32 threads_per_core;
551 int32 num_cores;
552 int32 num_sockets;
553 int32 num_phys_cores_per_socket;
554 int32 num_online_cores;
555 int32 num_online_sockets;
556 uint32 core_gen_counter_num_max;
557 uint32 core_gen_counter_num_used;
558 uint32 core_gen_counter_width;
559 uint32 core_fixed_counter_num_max;
560 uint32 core_fixed_counter_num_used;
561 uint32 core_fixed_counter_width;
562 uint32 uncore_gen_counter_num_max;
563 uint32 uncore_gen_counter_num_used;
564 uint32 uncore_gen_counter_width;
565 uint32 uncore_fixed_counter_num_max;
566 uint32 uncore_fixed_counter_num_used;
567 uint32 uncore_fixed_counter_width;
568 uint32 perfmon_version;
569 int32 perfmon_config_anythread;
570 uint64 nominal_frequency;
571 uint64 max_qpi_speed; // in GBytes/second
572 uint32 L3ScalingFactor;
573 int32 pkgThermalSpecPower, pkgMinimumPower, pkgMaximumPower;
574
575 std::vector<TopologyEntry> topology;
576 SystemRoot* systemTopology;
577 std::string errorMessage;
578
579 static PCM * instance;
580 bool allow_multiple_instances;
581 bool programmed_pmu;
582 std::vector<std::shared_ptr<SafeMsrHandle> > MSR;
583 std::vector<std::shared_ptr<ServerPCICFGUncore> > server_pcicfg_uncore;
584 std::vector<UncorePMU> pcuPMUs;
585 std::vector<std::map<int32, UncorePMU> > iioPMUs;
586 std::vector<UncorePMU> uboxPMUs;
587 double joulesPerEnergyUnit;
588 std::vector<std::shared_ptr<CounterWidthExtender> > energy_status;
589 std::vector<std::shared_ptr<CounterWidthExtender> > dram_energy_status;
590 std::vector<std::vector<UncorePMU> > cboPMUs;
591
592 std::vector<std::shared_ptr<CounterWidthExtender> > memory_bw_local;
593 std::vector<std::shared_ptr<CounterWidthExtender> > memory_bw_total;
594 #ifdef __linux__
595 Resctrl resctrl;
596 #endif
597 bool useResctrl;
598
599 std::shared_ptr<FreeRunningBWCounters> clientBW;
600 std::shared_ptr<CounterWidthExtender> clientImcReads;
601 std::shared_ptr<CounterWidthExtender> clientImcWrites;
602 std::shared_ptr<CounterWidthExtender> clientIoRequests;
603
604 std::vector<std::shared_ptr<ServerBW> > serverBW;
605
606 bool disable_JKT_workaround;
607 bool blocked; // track if time-driven counter update is running or not: PCM is blocked
608
609 uint64 * coreCStateMsr; // MSR addresses of core C-state free-running counters
610 uint64 * pkgCStateMsr; // MSR addresses of package C-state free-running counters
611
612 std::vector<std::shared_ptr<CoreTaskQueue> > coreTaskQueues;
613
614 bool L2CacheHitRatioAvailable;
615 bool L3CacheHitRatioAvailable;
616 bool L3CacheMissesAvailable;
617 bool L2CacheMissesAvailable;
618 bool L2CacheHitsAvailable;
619 bool L3CacheHitsNoSnoopAvailable;
620 bool L3CacheHitsSnoopAvailable;
621 bool L3CacheHitsAvailable;
622
623 bool forceRTMAbortMode;
624
625 std::vector<uint64> FrontendBoundSlots, BadSpeculationSlots, BackendBoundSlots, RetiringSlots, AllSlotsRaw;
626 bool isFixedCounterSupported(unsigned c);
627 bool vm = false;
628 bool linux_arch_perfmon = false;
629
630 public:
631 enum { MAX_C_STATE = 10 }; // max C-state on Intel architecture
632
633 //! \brief Returns true if the specified core C-state residency metric is supported
isCoreCStateResidencySupported(int state)634 bool isCoreCStateResidencySupported(int state)
635 {
636 if (state == 0 || state == 1)
637 return true;
638
639 return (coreCStateMsr != NULL && state <= ((int)MAX_C_STATE) && coreCStateMsr[state] != 0);
640 }
641
642 //! \brief Returns true if the specified package C-state residency metric is supported
isPackageCStateResidencySupported(int state)643 bool isPackageCStateResidencySupported(int state)
644 {
645 if (state == 0)
646 {
647 return true;
648 }
649 return (pkgCStateMsr != NULL && state <= ((int)MAX_C_STATE) && pkgCStateMsr[state] != 0);
650 }
651
652 //! \brief Redirects output destination to provided file, instead of std::cout
653 void setOutput(const std::string filename);
654
655 //! \brief Restores output, closes output file if opened
656 void restoreOutput();
657
658 //! \brief Set Run State.
659 // Arguments:
660 // -- 1 - program is running
661 // -- 0 -pgram is sleeping
setRunState(int new_state)662 void setRunState(int new_state) { run_state = new_state; }
663
664 //! \brief Returns program's Run State.
665 // Results:
666 // -- 1 - program is running
667 // -- 0 -pgram is sleeping
getRunState(void)668 int getRunState(void) { return run_state; }
669
isBlocked(void)670 bool isBlocked(void) { return blocked; }
setBlocked(const bool new_blocked)671 void setBlocked(const bool new_blocked) { blocked = new_blocked; }
672
673 //! \brief Call it before program() to allow multiple running instances of PCM on the same system
allowMultipleInstances()674 void allowMultipleInstances()
675 {
676 allow_multiple_instances = true;
677 }
678
//! Mode of programming (parameter in the program() method)
enum ProgramMode {
    /*! Default choice of events, the additional parameter is not needed and ignored */
    DEFAULT_EVENTS = 0,
    /*! Custom set of core events specified in the parameter to the program method. The parameter must be a pointer to array of four \c CustomCoreEventDescription values */
    CUSTOM_CORE_EVENTS = 1,
    /*! Custom set of core events specified in the parameter to the program method. The parameter must be a pointer to a \c ExtendedCustomCoreEventDescription data structure */
    EXT_CUSTOM_CORE_EVENTS = 2,
    /*! Non-programmed mode */
    INVALID_MODE = 3
};
686
//! Return codes (e.g. for program(..) method)
enum ErrorCode {
    Success         = 0,
    MSRAccessDenied = 1,
    PMUBusy         = 2,
    UnknownError    = 3
};
694
//! Field identifiers; the first group belongs to the perfmon definition,
//! the second group does not.
enum PerfmonField {
    INVALID         = 0,  /* Use to parse invalid field */
    OPCODE          = 1,
    EVENT_SELECT    = 2,
    UMASK           = 3,
    RESET           = 4,
    EDGE_DET        = 5,
    IGNORED         = 6,
    OVERFLOW_ENABLE = 7,
    ENABLE          = 8,
    INVERT          = 9,
    THRESH          = 10,
    CH_MASK         = 11,
    FC_MASK         = 12,
    /* Below are not part of perfmon definition */
    H_EVENT_NAME    = 13,
    V_EVENT_NAME    = 14,
    MULTIPLIER      = 15,
    DIVIDER         = 16,
    COUNTER_INDEX   = 17
};
716
//! PCIe width modes; XFF is the value SimplePCIeDevInfo starts with
enum PCIeWidthMode {
    X1  = 0,
    X4  = 1,
    X8  = 2,
    X16 = 3,
    XFF = 4
};
724
// offsets/enumeration of IIO stacks
enum {
    IIO_CBDMA,       // 0, shared with DMI
    IIO_PCIe0,       // 1
    IIO_PCIe1,       // 2
    IIO_PCIe2,       // 3
    IIO_MCP0,        // 4
    IIO_MCP1,        // 5
    IIO_STACK_COUNT  // 6, number of stacks listed above
};
734
// Offsets/enumeration of IIO stacks for Skylake server.
enum SkylakeIIOStacks {
    SKX_IIO_CBDMA_DMI,   // 0
    SKX_IIO_PCIe0,       // 1
    SKX_IIO_PCIe1,       // 2
    SKX_IIO_PCIe2,       // 3
    SKX_IIO_MCP0,        // 4
    SKX_IIO_MCP1,        // 5
    SKX_IIO_STACK_COUNT  // 6, number of stacks listed above
};
745
// Offsets/enumeration of IIO stacks for IceLake server.
enum IcelakeIIOStacks {
    ICX_IIO_PCIe0,       // 0
    ICX_IIO_PCIe1,       // 1
    ICX_IIO_MCP0,        // 2
    ICX_IIO_PCIe2,       // 3
    ICX_IIO_PCIe3,       // 4
    ICX_IIO_CBDMA_DMI,   // 5
    ICX_IIO_STACK_COUNT  // 6, number of stacks listed above
};
756
// Offsets/enumeration of IIO stacks for Snowridge.
enum SnowridgeIIOStacks {
    SNR_IIO_QAT,         // 0
    SNR_IIO_CBDMA_DMI,   // 1
    SNR_IIO_NIS,         // 2
    SNR_IIO_HQM,         // 3
    SNR_IIO_PCIe0,       // 4
    SNR_IIO_STACK_COUNT  // 5, number of stacks listed above
};
766
767 struct SimplePCIeDevInfo
768 {
769 enum PCIeWidthMode width;
770 std::string pciDevName;
771 std::string busNumber;
772
SimplePCIeDevInfoSimplePCIeDevInfo773 SimplePCIeDevInfo() : width(XFF) { }
774 };
775
776 /*! \brief Custom Core event description
777
778 See "Intel 64 and IA-32 Architectures Software Developers Manual Volume 3B:
779 System Programming Guide, Part 2" for the concrete values of the data structure fields,
780 e.g. Appendix A.2 "Performance Monitoring Events for Intel(r) Core(tm) Processor Family
781 and Xeon Processor Family"
782 */
783 struct CustomCoreEventDescription
784 {
785 int32 event_number, umask_value;
786 };
787
788 /*! \brief Extended custom core event description
789
790 In contrast to CustomCoreEventDescription supports configuration of all fields.
791
792 See "Intel 64 and IA-32 Architectures Software Developers Manual Volume 3B:
793 System Programming Guide, Part 2" for the concrete values of the data structure fields,
794 e.g. Appendix A.2 "Performance Monitoring Events for Intel(r) Core(tm) Processor Family
795 and Xeon Processor Family"
796 */
797 struct ExtendedCustomCoreEventDescription
798 {
799 FixedEventControlRegister * fixedCfg; // if NULL, then default configuration performed for fixed counters
800 uint32 nGPCounters; // number of general purpose counters
801 EventSelectRegister * gpCounterCfg; // general purpose counters, if NULL, then default configuration performed for GP counters
802 uint64 OffcoreResponseMsrValue[2];
ExtendedCustomCoreEventDescriptionExtendedCustomCoreEventDescription803 ExtendedCustomCoreEventDescription() : fixedCfg(NULL), nGPCounters(0), gpCounterCfg(NULL)
804 {
805 OffcoreResponseMsrValue[0] = 0;
806 OffcoreResponseMsrValue[1] = 0;
807 }
808 };
809
810 struct CustomIIOEventDescription
811 {
812 /* We program the same counters to every IIO Stacks */
813 std::string eventNames[4];
814 IIOPMUCNTCTLRegister eventOpcodes[4];
815 int multiplier[4]; //Some IIO event requires transformation to get meaningful output (i.e. DWord to bytes)
816 int divider[4]; //We usually like to have some kind of divider (i.e. /10e6 )
817 };
818
819 private:
820 ProgramMode mode;
821 CustomCoreEventDescription coreEventDesc[PERF_MAX_CUSTOM_COUNTERS];
822
823 #ifdef _MSC_VER
824 HANDLE numInstancesSemaphore; // global semaphore that counts the number of PCM instances on the system
825 #else
826 // global semaphore that counts the number of PCM instances on the system
827 sem_t * numInstancesSemaphore;
828 #endif
829
830 std::vector<int32> socketRefCore;
831
832 bool canUsePerf;
833 #ifdef PCM_USE_PERF
834 std::vector<std::vector<int> > perfEventHandle;
835 void readPerfData(uint32 core, std::vector<uint64> & data);
836
837 enum {
838 PERF_INST_RETIRED_POS = 0,
839 PERF_CPU_CLK_UNHALTED_THREAD_POS = 1,
840 PERF_CPU_CLK_UNHALTED_REF_POS = 2,
841 PERF_GEN_EVENT_0_POS = 3,
842 PERF_GEN_EVENT_1_POS = 4,
843 PERF_GEN_EVENT_2_POS = 5,
844 PERF_GEN_EVENT_3_POS = 6,
845 PERF_TOPDOWN_SLOTS_POS = PERF_GEN_EVENT_0_POS + PERF_MAX_CUSTOM_COUNTERS,
846 PERF_TOPDOWN_FRONTEND_POS = PERF_TOPDOWN_SLOTS_POS + 1,
847 PERF_TOPDOWN_BADSPEC_POS = PERF_TOPDOWN_SLOTS_POS + 2,
848 PERF_TOPDOWN_BACKEND_POS = PERF_TOPDOWN_SLOTS_POS + 3,
849 PERF_TOPDOWN_RETIRING_POS = PERF_TOPDOWN_SLOTS_POS + 4
850 };
851
852 std::unordered_map<int, int> perfTopDownPos;
853
854 enum {
855 PERF_GROUP_LEADER_COUNTER = PERF_INST_RETIRED_POS,
856 PERF_TOPDOWN_GROUP_LEADER_COUNTER = PERF_TOPDOWN_SLOTS_POS
857 };
858 #endif
859 std::ofstream * outfile; // output file stream
860 std::streambuf * backup_ofile; // backup of original output = cout
861 int run_state; // either running (1) or sleeping (0)
862
863 bool needToRestoreNMIWatchdog;
864
865 std::vector<std::vector<EventSelectRegister> > lastProgrammedCustomCounters;
866 uint32 checkCustomCoreProgramming(std::shared_ptr<SafeMsrHandle> msr);
867 ErrorCode programCoreCounters(int core, const PCM::ProgramMode mode, const ExtendedCustomCoreEventDescription * pExtDesc,
868 std::vector<EventSelectRegister> & programmedCustomCounters);
869
870 bool PMUinUse();
871 void cleanupPMU(const bool silent = false);
872 void cleanupRDT(const bool silent = false);
873 bool decrementInstanceSemaphore(); // returns true if it was the last instance
874
875 #ifdef __APPLE__
876 // OSX does not have sem_getvalue, so we must get the number of instances by a different method
877 uint32 getNumInstances();
878 uint32 decrementNumInstances();
879 uint32 incrementNumInstances();
880 #endif
881
882
883 void computeQPISpeedBeckton(int core_nr);
884 void destroyMSR();
885 void computeNominalFrequency();
886 static bool isCPUModelSupported(const int model_);
887 std::string getSupportedUarchCodenames() const;
888 std::string getUnsupportedMessage() const;
889 bool detectModel();
890 bool checkModel();
891
892 void initCStateSupportTables();
893 bool discoverSystemTopology();
894 void printSystemTopology() const;
895 bool initMSR();
896 bool detectNominalFrequency();
897 void showSpecControlMSRs();
898 void initEnergyMonitoring();
899 void initUncoreObjects();
900 /*!
901 * \brief initializes each core with an RMID
902 *
903 * \returns nothing
904 */
905 void initRDT();
906 /*!
907 * \brief Initializes RDT
908 *
909 * Initializes RDT infrastructure through resctrl Linux driver or direct MSR programming.
910 * For the latter: initializes each core event MSR with an RMID for QOS event (L3 cache monitoring or memory bandwidth monitoring)
911 * \returns nothing
912 */
913 void initQOSevent(const uint64 event, const int32 core);
914 void programBecktonUncore(int core);
915 void programNehalemEPUncore(int core);
916 void enableJKTWorkaround(bool enable);
917 template <class CounterStateType>
918 void readAndAggregateMemoryBWCounters(const uint32 core, CounterStateType & counterState);
919 template <class CounterStateType>
920 void readAndAggregateUncoreMCCounters(const uint32 socket, CounterStateType & counterState);
921 template <class CounterStateType>
922 void readAndAggregateEnergyCounters(const uint32 socket, CounterStateType & counterState);
923 template <class CounterStateType>
924 void readPackageThermalHeadroom(const uint32 socket, CounterStateType & counterState);
925 template <class CounterStateType>
926 void readAndAggregatePackageCStateResidencies(std::shared_ptr<SafeMsrHandle> msr, CounterStateType & result);
927 void readQPICounters(SystemCounterState & counterState);
928 void reportQPISpeed() const;
929 void readCoreCounterConfig(const bool complainAboutMSR = false);
930 void readCPUMicrocodeLevel();
931
932 uint64 CX_MSR_PMON_CTRY(uint32 Cbo, uint32 Ctr) const;
933 uint64 CX_MSR_PMON_BOX_FILTER(uint32 Cbo) const;
934 uint64 CX_MSR_PMON_BOX_FILTER1(uint32 Cbo) const;
935 uint64 CX_MSR_PMON_CTLY(uint32 Cbo, uint32 Ctl) const;
936 uint64 CX_MSR_PMON_BOX_CTL(uint32 Cbo) const;
937 void programCboOpcodeFilter(const uint32 opc0, UncorePMU & pmu, const uint32 nc_, const uint32 opc1, const uint32 loc, const uint32 rem);
938 void initLLCReadMissLatencyEvents(uint64 * events, uint32 & opCode);
939 void initCHARequestEvents(uint64 * events);
940 void programCbo();
941 uint64 getCBOCounterState(const uint32 socket, const uint32 ctr_);
942 template <class Iterator>
program(UncorePMU & pmu,const Iterator & eventsBegin,const Iterator & eventsEnd,const uint32 extra)943 static void program(UncorePMU& pmu, const Iterator& eventsBegin, const Iterator& eventsEnd, const uint32 extra)
944 {
945 if (!eventsBegin) return;
946 Iterator curEvent = eventsBegin;
947 for (int c = 0; curEvent != eventsEnd; ++c, ++curEvent)
948 {
949 auto ctrl = pmu.counterControl[c];
950 if (ctrl.get() != nullptr)
951 {
952 *ctrl = MC_CH_PCI_PMON_CTL_EN;
953 *ctrl = MC_CH_PCI_PMON_CTL_EN | *curEvent;
954 }
955 }
956 if (extra)
957 {
958 pmu.resetUnfreeze(extra);
959 }
960 }
961 void programPCU(uint32 * events, const uint64 filter);
962 void programUBOX(const uint64* events);
963
964 void cleanupUncorePMUs(const bool silent = false);
965
isCLX()966 bool isCLX() const // Cascade Lake-SP
967 {
968 return (PCM::SKX == cpu_model) && (cpu_stepping > 4 && cpu_stepping < 8);
969 }
970
isCPX(int cpu_model_,int cpu_stepping_)971 static bool isCPX(int cpu_model_, int cpu_stepping_) // Cooper Lake
972 {
973 return (PCM::SKX == cpu_model_) && (cpu_stepping_ >= 10);
974 }
975
isCPX()976 bool isCPX() const
977 {
978 return isCPX(cpu_model, cpu_stepping);
979 }
980
981 void initUncorePMUsDirect();
982 void initUncorePMUsPerf();
983 bool isRDTDisabled() const;
984
985 public:
986 //! check if TMA level 1 metrics are supported
987 bool isHWTMAL1Supported() const;
988
//! \brief Named positions 0..3 for events
//! NOTE(review): presumably indices into a four-entry CBo/CHA event array - confirm against callers
enum EventPosition
{
    TOR_OCCUPANCY  = 0,
    TOR_INSERTS    = 1,
    REQUESTS_ALL   = 2,
    REQUESTS_LOCAL = 3
};
996 //! check if in secure boot mode
997 bool isSecureBoot() const;
998
999 //! true if Linux perf for uncore PMU programming should AND can be used internally
1000 bool useLinuxPerfForUncore() const;
1001
1002 /*!
1003 \brief The system, sockets, uncores, cores and threads are structured like a tree
1004
1005 \returns a reference to a const System object representing the root of the tree
1006 */
getSystemTopology()1007 SystemRoot const & getSystemTopology() const {
1008 return *systemTopology;
1009 }
1010
1011 /*!
1012 \brief checks if QOS monitoring support present
1013
1014 \returns true or false
1015 */
1016 bool QOSMetricAvailable() const;
1017 /*!
1018 \brief checks L3 cache support for QOS present
1019
1020 \returns true or false
1021 */
1022 bool L3QOSMetricAvailable() const;
1023 /*!
1024 \brief checks if L3 cache monitoring present
1025
1026 \returns true or false
1027 */
1028 bool L3CacheOccupancyMetricAvailable() const;
1029 /*!
1030 \brief checks if local memory bandwidth monitoring present
1031
1032 \returns true or false
1033 */
1034 bool CoreLocalMemoryBWMetricAvailable() const;
1035 /*!
1036 \brief checks if total memory bandwidth monitoring present
1037
1038 \returns true or false
1039 */
1040 bool CoreRemoteMemoryBWMetricAvailable() const;
1041 /*!
1042 * \brief returns the max number of RMID supported by socket
1043 *
1044 * \returns maximum number of RMID supported by socket
1045 */
1046 unsigned getMaxRMID() const;
1047
1048 //! \brief Returns the number of CBO or CHA units per socket
1049 uint32 getMaxNumOfCBoxes() const;
1050
1051 //! \brief Returns the number of IIO stacks per socket
1052 uint32 getMaxNumOfIIOStacks() const;
1053
1054 /*!
1055 \brief Returns PCM object
1056
1057 Returns PCM object. If the PCM object has not been created before, then
1058 an instance is created. PCM is a singleton.
1059
1060 \return Pointer to PCM object
1061 */
1062 static PCM * getInstance(); // the only way to get access
1063
1064 /*!
1065 \brief Checks the status of PCM object
1066
1067 Call this method to check if PCM gained access to model specific registers. The method is deprecated, see program error code instead.
1068
1069 \return true iff access to model specific registers works without problems
1070 */
1071 bool good(); // true if access to CPU counters works
1072
1073 /*! \brief Returns the error message
1074
1075 Call this when good() returns false, otherwise return an empty string
1076 */
getErrorMessage()1077 const std::string & getErrorMessage() const
1078 {
1079 return errorMessage;
1080 }
1081
1082 /*! \brief Programs performance counters
1083 \param mode_ mode of programming, see ProgramMode definition
1084 \param parameter_ optional parameter for some of programming modes
1085
1086 Call this method before you start using the performance counting routines.
1087
1088 \warning Using these routines with other tools that *program* Performance Monitoring
1089 Units (PMUs) on CPUs is not recommended because PMU can not be shared. Tools that are known to
1090 program PMUs: Intel(r) VTune(tm), Intel(r) Performance Tuning Utility (PTU). This code may make
1091 VTune or PTU measurements invalid. VTune or PTU measurement may make measurement with this code invalid. Please enable either usage of these routines or VTune/PTU/etc.
1092 */
1093 ErrorCode program(const ProgramMode mode_ = DEFAULT_EVENTS, const void * parameter_ = NULL, const bool silent = false); // program counters and start counting
1094
1095 /*! \brief Programs uncore latency counters on microarchitectures codename SandyBridge-EP and later Xeon uarch
1096 \param enable_pmm enables DDR/PMM. See possible profile values in pcm-latency.cpp example
1097
1098 Call this method before you start using the latency counter routines on microarchitecture codename SandyBridge-EP and later Xeon uarch
1099
1100 \warning After this call the memory and QPI bandwidth counters on microarchitecture codename SandyBridge-EP and later Xeon uarch will not work.
1101 \warning Using these routines with other tools that *program* Performance Monitoring
1102 Units (PMUs) on CPUs is not recommended because PMU can not be shared. Tools that are known to
1103 program PMUs: Intel(r) VTune(tm), Intel(r) Performance Tuning Utility (PTU). This code may make
1104 VTune or PTU measurements invalid. VTune or PTU measurement may make measurement with this code invalid. Please enable either usage of these routines or VTune/PTU/etc.
1105 */
1106 ErrorCode programServerUncoreLatencyMetrics(bool enable_pmm);
1107
1108 /*! \brief Programs uncore power/energy counters on microarchitectures codename SandyBridge-EP and later Xeon uarch
1109 \param mc_profile profile for integrated memory controller PMU. See possible profile values in pcm-power.cpp example
1110 \param pcu_profile profile for power control unit PMU. See possible profile values in pcm-power.cpp example
1111 \param freq_bands array of three integer values for core frequency band monitoring. See usage in pcm-power.cpp example
1112
1113 Call this method before you start using the power counter routines on microarchitecture codename SandyBridge-EP and later Xeon uarch
1114
1115 \warning After this call the memory and QPI bandwidth counters on microarchitecture codename SandyBridge-EP and later Xeon uarch will not work.
1116 \warning Using these routines with other tools that *program* Performance Monitoring
1117 Units (PMUs) on CPUs is not recommended because PMU can not be shared. Tools that are known to
1118 program PMUs: Intel(r) VTune(tm), Intel(r) Performance Tuning Utility (PTU). This code may make
1119 VTune or PTU measurements invalid. VTune or PTU measurement may make measurement with this code invalid. Please enable either usage of these routines or VTune/PTU/etc.
1120 */
1121 ErrorCode programServerUncorePowerMetrics(int mc_profile, int pcu_profile, int * freq_bands = NULL);
1122
1123 /*! \brief Program memory counters (disables programming performance counters)
1124 \param rankA count DIMM rank1 statistics (disables memory channel monitoring)
1125 \param rankB count DIMM rank2 statistics (disables memory channel monitoring)
1126 \param metrics metric set (see the ServerUncoreMemoryMetrics enum)
1127
1128 Call this method before you start using the memory counter routines on microarchitecture codename SandyBridge-EP and later Xeon uarch
1129
1130 \warning Using these routines with other tools that *program* Performance Monitoring
1131 Units (PMUs) on CPUs is not recommended because PMU can not be shared. Tools that are known to
1132 program PMUs: Intel(r) VTune(tm), Intel(r) Performance Tuning Utility (PTU). This code may make
1133 VTune or PTU measurements invalid. VTune or PTU measurement may make measurement with this code invalid. Please enable either usage of these routines or VTune/PTU/etc.
1134 */
1135 ErrorCode programServerUncoreMemoryMetrics(const ServerUncoreMemoryMetrics & metrics, int rankA = -1, int rankB = -1);
1136
1137 // vector of IDs. E.g. for core {raw event} or {raw event, offcore response1 msr value, } or {raw event, offcore response1 msr value, offcore response2}
1138 // or for cha/cbo {raw event, filter value}, etc
1139 // + user-supplied name
1140 typedef std::pair<std::array<uint64, 3>, std::string> RawEventConfig;
1141 struct RawPMUConfig
1142 {
1143 std::vector<RawEventConfig> programmable;
1144 std::vector<RawEventConfig> fixed;
1145 };
1146 typedef std::map<std::string, RawPMUConfig> RawPMUConfigs;
1147 ErrorCode program(const RawPMUConfigs& curPMUConfigs, const bool silent = false);
1148
1149 //! \brief Freezes uncore event counting (works only on microarchitecture codename SandyBridge-EP and IvyTown)
1150 void freezeServerUncoreCounters();
1151
1152 //! \brief Unfreezes uncore event counting (works only on microarchitecture codename SandyBridge-EP and IvyTown)
1153 void unfreezeServerUncoreCounters();
1154
1155 /*! \brief Reads the power/energy counter state of a socket (works only on microarchitecture codename SandyBridge-EP)
1156 \param socket socket id
1157 \return State of power counters in the socket
1158 */
1159 ServerUncoreCounterState getServerUncoreCounterState(uint32 socket);
1160
1161 /*! \brief Cleans up resources and stops performance counting
1162
1163 One needs to call this method when your program finishes or/and you are not going to use the
1164 performance counting routines anymore.
1165 */
1166 void cleanup(const bool silent = false);
1167
1168 /*! \brief Forces PMU reset
1169
1170 If there is no chance to free up PMU from other applications you might try to call this method at your own risk.
1171 */
1172 void resetPMU();
1173
1174 /*! \brief Reads all counter states (including system, sockets and cores)
1175
1176 \param systemState system counter state (return parameter)
1177 \param socketStates socket counter states (return parameter)
1178 \param coreStates core counter states (return parameter)
1179
1180 */
1181 void getAllCounterStates(SystemCounterState & systemState, std::vector<SocketCounterState> & socketStates, std::vector<CoreCounterState> & coreStates);
1182
1183 /*! \brief Reads uncore counter states (including system and sockets) but no core counters
1184
1185 \param systemState system counter state (return parameter)
1186 \param socketStates socket counter states (return parameter)
1187
1188 */
1189 void getUncoreCounterStates(SystemCounterState & systemState, std::vector<SocketCounterState> & socketStates);
1190
1191 /*! \brief Return true if the core is online
1192
1193 \param os_core_id OS core id
1194 */
1195 bool isCoreOnline(int32 os_core_id) const;
1196
1197 /*! \brief Return true if the socket is online
1198
1199 \param socket_id OS socket id
1200 */
1201 bool isSocketOnline(int32 socket_id) const;
1202
1203 /*! \brief Reads the counter state of the system
1204
1205 System consists of several sockets (CPUs).
1206 Socket has a CPU in it. Socket (CPU) consists of several (logical) cores.
1207
1208 \return State of counters in the entire system
1209 */
1210 SystemCounterState getSystemCounterState();
1211
1212 /*! \brief Reads the counter state of a socket
1213 \param socket socket id
1214 \return State of counters in the socket
1215 */
1216 SocketCounterState getSocketCounterState(uint32 socket);
1217
1218 /*! \brief Reads the counter state of a (logical) core
1219
1220 Be aware that during the measurement other threads may be scheduled on the same core by the operating system (this is called context-switching). The performance events caused by these threads will be counted as well.
1221
1222
1223 \param core core id
1224 \return State of counters in the core
1225 */
1226 CoreCounterState getCoreCounterState(uint32 core);
1227
1228 /*! \brief Reads number of logical cores in the system
1229 \return Number of logical cores in the system
1230 */
1231 uint32 getNumCores() const;
1232
1233 /*! \brief Reads number of online logical cores in the system
1234 \return Number of online logical cores in the system
1235 */
1236 uint32 getNumOnlineCores() const;
1237
1238 /*! \brief Reads number of sockets (CPUs) in the system
1239 \return Number of sockets in the system
1240 */
1241 uint32 getNumSockets() const;
1242
1243 /*! \brief Reads number of online sockets (CPUs) in the system
1244 \return Number of online sockets in the system
1245 */
1246 uint32 getNumOnlineSockets() const;
1247
1248 /*! \brief Reads how many hardware threads a physical core has
1249 "Hardware thread" is a logical core in a different terminology.
1250 If Intel(r) Hyperthreading(tm) is enabled then this function returns 2.
1251 \return Number of hardware threads per physical core
1252 */
1253 uint32 getThreadsPerCore() const;
1254
1255 /*! \brief Checks if SMT (HyperThreading) is enabled.
1256 \return true iff SMT (HyperThreading) is enabled.
1257 */
1258 bool getSMT() const; // returns true iff SMT ("Hyperthreading") is on
1259
1260 /*! \brief Reads the nominal core frequency
1261 \return Nominal frequency in Hz
1262 */
1263 uint64 getNominalFrequency() const; // in Hz
1264
1265 /*! \brief runs CPUID.0xF.0x01 to get the L3 up scaling factor to calculate L3 Occupancy
1266 * Scaling factor is returned in EBX register after running the CPU instruction
1267 * \return L3 up scaling factor
1268 */
1269 uint32 getL3ScalingFactor() const;
1270
1271 /*! \brief runs CPUID.0xB.0x01 to get maximum logical cores (including SMT) per socket.
1272 * max_lcores_per_socket is returned in EBX[15:0]. Compare this value with number of cores per socket
1273 * detected in the system to see if some cores are offlined
1274 * \return true iff max_lcores_per_socket == number of cores per socket detected
1275 */
1276 bool isSomeCoreOfflined();
1277
1278 /*! \brief Returns the maximum number of custom (general-purpose) core events supported by CPU
1279 */
1280 int32 getMaxCustomCoreEvents();
1281
1282 //! \brief Identifiers of supported CPU models
// Every enumerator carries an explicit numeric value, so declaration order
// does not affect the values. END_OF_MODEL_LIST is the last entry.
enum SupportedCPUModels
{
    NEHALEM_EP        = 26,
    NEHALEM           = 30,
    ATOM              = 28,
    ATOM_2            = 53,
    CENTERTON         = 54,
    BAYTRAIL          = 55,
    AVOTON            = 77,
    CHERRYTRAIL       = 76,
    APOLLO_LAKE       = 92,
    DENVERTON         = 95,
    SNOWRIDGE         = 134,
    CLARKDALE         = 37,
    WESTMERE_EP       = 44,
    NEHALEM_EX        = 46,
    WESTMERE_EX       = 47,
    SANDY_BRIDGE      = 42,
    JAKETOWN          = 45,
    IVY_BRIDGE        = 58,
    HASWELL           = 60,
    HASWELL_ULT       = 69,
    HASWELL_2         = 70,
    IVYTOWN           = 62,
    HASWELLX          = 63,
    BROADWELL         = 61,
    BROADWELL_XEON_E3 = 71,
    BDX_DE            = 86,
    SKL_UY            = 78,
    KBL               = 158,
    KBL_1             = 142,
    CML               = 166,
    CML_1             = 165,
    ICL               = 126,
    ICL_1             = 125,
    RKL               = 167,
    TGL               = 140,
    TGL_1             = 141,
    BDX               = 79,
    KNL               = 87,
    SKL               = 94,
    SKX               = 85,
    ICX_D             = 108,
    ICX               = 106,
    END_OF_MODEL_LIST = 0x0ffff
};
1329
// Expands to a run of case labels for use inside a switch over a CPU model.
// The expansion ends in a label, so the statement for the whole group must
// follow the macro at the point of use.
#define PCM_SKL_PATH_CASES  \
    case PCM::SKL_UY:       \
    case PCM::KBL:          \
    case PCM::KBL_1:        \
    case PCM::CML:          \
    case PCM::ICL:          \
    case PCM::RKL:          \
    case PCM::TGL:          \
    case PCM::SKL:
1339
1340 private:
useSKLPath()1341 bool useSKLPath() const
1342 {
1343 switch (cpu_model)
1344 {
1345 PCM_SKL_PATH_CASES
1346 return true;
1347 }
1348 return false;
1349 }
1350 public:
1351
1352 //! \brief Reads CPU model id
1353 //! \return CPU model ID
getCPUModel()1354 uint32 getCPUModel() const { return (uint32)cpu_model; }
1355
1356 //! \brief Reads CPU stepping id
1357 //! \return CPU stepping ID
getCPUStepping()1358 uint32 getCPUStepping() const { return (uint32)cpu_stepping; }
1359
1360 //! \brief Determines physical thread of given processor ID within a core
1361 //! \param os_id processor identifier
1362 //! \return physical thread identifier
getThreadId(uint32 os_id)1363 int32 getThreadId(uint32 os_id) const { return (int32)topology[os_id].thread_id; }
1364
1365 //! \brief Determines physical core of given processor ID within a socket
1366 //! \param os_id processor identifier
1367 //! \return physical core identifier
getCoreId(uint32 os_id)1368 int32 getCoreId(uint32 os_id) const { return (int32)topology[os_id].core_id; }
1369
1370 //! \brief Determines physical tile (cores sharing L2 cache) of given processor ID
1371 //! \param os_id processor identifier
1372 //! \return physical tile identifier
getTileId(uint32 os_id)1373 int32 getTileId(uint32 os_id) const { return (int32)topology[os_id].tile_id; }
1374
1375 //! \brief Determines socket of given core
1376 //! \param core_id core identifier
1377 //! \return socket identifier
getSocketId(uint32 core_id)1378 int32 getSocketId(uint32 core_id) const { return (int32)topology[core_id].socket; }
1379
1380 //! \brief Returns the number of Intel(r) Quick Path Interconnect(tm) links per socket
1381 //! \return number of QPI links per socket
getQPILinksPerSocket()1382 uint64 getQPILinksPerSocket() const
1383 {
1384 switch (cpu_model)
1385 {
1386 case NEHALEM_EP:
1387 case WESTMERE_EP:
1388 case CLARKDALE:
1389 if (num_sockets == 2)
1390 return 2;
1391 else
1392 return 1;
1393 case NEHALEM_EX:
1394 case WESTMERE_EX:
1395 return 4;
1396 case JAKETOWN:
1397 case IVYTOWN:
1398 case HASWELLX:
1399 case BDX_DE:
1400 case BDX:
1401 case SKX:
1402 case ICX:
1403 return (server_pcicfg_uncore.size() && server_pcicfg_uncore[0].get()) ? (server_pcicfg_uncore[0]->getNumQPIPorts()) : 0;
1404 }
1405 return 0;
1406 }
1407
1408 //! \brief Returns the number of detected integrated memory controllers per socket
getMCPerSocket()1409 uint32 getMCPerSocket() const
1410 {
1411 switch (cpu_model)
1412 {
1413 case NEHALEM_EP:
1414 case WESTMERE_EP:
1415 case CLARKDALE:
1416 return 1;
1417 case NEHALEM_EX:
1418 case WESTMERE_EX:
1419 return 2;
1420 case JAKETOWN:
1421 case IVYTOWN:
1422 case HASWELLX:
1423 case BDX_DE:
1424 case SKX:
1425 case ICX:
1426 case BDX:
1427 case KNL:
1428 return (server_pcicfg_uncore.size() && server_pcicfg_uncore[0].get()) ? (server_pcicfg_uncore[0]->getNumMC()) : 0;
1429 }
1430 return 0;
1431 }
1432
1433 //! \brief Returns the total number of detected memory channels on all integrated memory controllers per socket
getMCChannelsPerSocket()1434 size_t getMCChannelsPerSocket() const
1435 {
1436 switch (cpu_model)
1437 {
1438 case NEHALEM_EP:
1439 case WESTMERE_EP:
1440 case CLARKDALE:
1441 return 3;
1442 case NEHALEM_EX:
1443 case WESTMERE_EX:
1444 return 4;
1445 case JAKETOWN:
1446 case IVYTOWN:
1447 case HASWELLX:
1448 case BDX_DE:
1449 case SKX:
1450 case ICX:
1451 case BDX:
1452 case KNL:
1453 case SNOWRIDGE:
1454 return (server_pcicfg_uncore.size() && server_pcicfg_uncore[0].get()) ? (server_pcicfg_uncore[0]->getNumMCChannels()) : 0;
1455 }
1456 return 0;
1457 }
1458
1459 //! \brief Returns the number of detected memory channels on given integrated memory controllers
1460 //! \param socket socket
1461 //! \param controller controller
getMCChannels(uint32 socket,uint32 controller)1462 size_t getMCChannels(uint32 socket, uint32 controller) const
1463 {
1464 switch (cpu_model)
1465 {
1466 case NEHALEM_EP:
1467 case WESTMERE_EP:
1468 case CLARKDALE:
1469 return 3;
1470 case NEHALEM_EX:
1471 case WESTMERE_EX:
1472 return 4;
1473 case JAKETOWN:
1474 case IVYTOWN:
1475 case HASWELLX:
1476 case BDX_DE:
1477 case SKX:
1478 case ICX:
1479 case BDX:
1480 case KNL:
1481 case SNOWRIDGE:
1482 return (socket < server_pcicfg_uncore.size() && server_pcicfg_uncore[socket].get()) ? (server_pcicfg_uncore[socket]->getNumMCChannels(controller)) : 0;
1483 }
1484 return 0;
1485 }
1486
1487
1488 //! \brief Returns the total number of detected EDC channels per socket (non-zero only on KNL)
getEDCChannelsPerSocket()1489 size_t getEDCChannelsPerSocket() const
1490 {
1491 switch (cpu_model)
1492 {
1493 case KNL:
1494 return (server_pcicfg_uncore.size() && server_pcicfg_uncore[0].get()) ? (server_pcicfg_uncore[0]->getNumEDCChannels()) : 0;
1495 }
1496 return 0;
1497 }
1498
1499
1500 //! \brief Returns the max number of instructions per cycle
1501 //! \return max number of instructions per cycle
getMaxIPC()1502 uint32 getMaxIPC() const
1503 {
1504 if (ICL == cpu_model || TGL == cpu_model || RKL == cpu_model) return 5;
1505 switch (cpu_model)
1506 {
1507 case SNOWRIDGE:
1508 return 4;
1509 case DENVERTON:
1510 return 3;
1511 case NEHALEM_EP:
1512 case WESTMERE_EP:
1513 case NEHALEM_EX:
1514 case WESTMERE_EX:
1515 case CLARKDALE:
1516 case SANDY_BRIDGE:
1517 case JAKETOWN:
1518 case IVYTOWN:
1519 case IVY_BRIDGE:
1520 case HASWELL:
1521 case HASWELLX:
1522 case BROADWELL:
1523 case BDX_DE:
1524 case BDX:
1525 PCM_SKL_PATH_CASES
1526 case SKX:
1527 return 4;
1528 case KNL:
1529 return 2;
1530 case ICX:
1531 return 5;
1532 }
1533 if (isAtom())
1534 {
1535 return 2;
1536 }
1537 return 0;
1538 }
1539
1540 //! \brief Returns the frequency of Power Control Unit
getPCUFrequency()1541 uint64 getPCUFrequency() const
1542 {
1543 switch (cpu_model)
1544 {
1545 case JAKETOWN:
1546 case IVYTOWN:
1547 return 800000000ULL; // 800 MHz
1548 case HASWELLX:
1549 case BDX_DE:
1550 case BDX:
1551 case KNL:
1552 return 1000000000ULL; // 1 GHz
1553 case SKX:
1554 case ICX:
1555 case SNOWRIDGE:
1556 return 1100000000ULL; // 1.1 GHz
1557 }
1558 return 0;
1559 }
1560
1561 //! \brief Returns whether it is a server part
isServerCPU()1562 bool isServerCPU() const
1563 {
1564 switch (cpu_model)
1565 {
1566 case NEHALEM_EP:
1567 case NEHALEM_EX:
1568 case WESTMERE_EP:
1569 case WESTMERE_EX:
1570 case JAKETOWN:
1571 case IVYTOWN:
1572 case HASWELLX:
1573 case BDX:
1574 case BDX_DE:
1575 case SKX:
1576 case ICX:
1577 case SNOWRIDGE:
1578 case KNL:
1579 return true;
1580 default:
1581 return false;
1582 };
1583 }
1584
1585 //! \brief Returns whether it is a client part
isClientCPU()1586 bool isClientCPU() const
1587 {
1588 return !isServerCPU();
1589 }
1590 //! \brief Return TSC timer value in time units
1591 //! \param multiplier use 1 for seconds, 1000 for ms, 1000000 for mks, etc (default is 1000: ms)
1592 //! \param core core to read on-chip TSC value (default is 0)
1593 //! \return time counter value
1594 uint64 getTickCount(uint64 multiplier = 1000 /* ms */, uint32 core = 0);
1595
1596 //! \brief Return TSC timer value in time units using rdtscp instruction from current core
1597 //! \param multiplier use 1 for seconds, 1000 for ms, 1000000 for mks, etc (default is 1000: ms)
1598 //! \warning Processor support is required bit 27 of cpuid EDX must be set, for Windows, Visual Studio 2010 is required
1599 //! \return time counter value
1600 uint64 getTickCountRDTSCP(uint64 multiplier = 1000 /* ms */);
1601
1602 //! \brief Returns uncore clock ticks on specified socket
1603 uint64 getUncoreClocks(const uint32 socket_);
1604
1605 //! \brief Return QPI Link Speed in GBytes/second
1606 //! \warning Works only for Nehalem-EX (Xeon 7500) and Xeon E7 and E5 processors
1607 //! \return QPI Link Speed in GBytes/second
getQPILinkSpeed(uint32 socketNr,uint32 linkNr)1608 uint64 getQPILinkSpeed(uint32 socketNr, uint32 linkNr) const
1609 {
1610 return hasPCICFGUncore() ? server_pcicfg_uncore[socketNr]->getQPILinkSpeed(linkNr) : max_qpi_speed;
1611 }
1612
1613 //! \brief Returns how many joules are in an internal processor energy unit
getJoulesPerEnergyUnit()1614 double getJoulesPerEnergyUnit() const { return joulesPerEnergyUnit; }
1615
1616 //! \brief Returns thermal specification power of the package domain in Watt
getPackageThermalSpecPower()1617 int32 getPackageThermalSpecPower() const { return pkgThermalSpecPower; }
1618
1619 //! \brief Returns minimum power derived from electrical spec of the package domain in Watt
getPackageMinimumPower()1620 int32 getPackageMinimumPower() const { return pkgMinimumPower; }
1621
1622 //! \brief Returns maximum power derived from electrical spec of the package domain in Watt
getPackageMaximumPower()1623 int32 getPackageMaximumPower() const { return pkgMaximumPower; }
1624
1625 #ifndef NO_WINRING // In cases where loading the WinRing0 driver is not desirable as a fallback to MSR.sys, add -DNO_WINRING to compile command to remove ability to load driver
1626 //! \brief Loads and initializes Winring0 third party library for access to processor model specific and PCI configuration registers
1627 //! \return returns true in case of success
1628 static bool initWinRing0Lib();
1629 #endif // NO_WINRING
1630
disableJKTWorkaround()1631 inline void disableJKTWorkaround() { disable_JKT_workaround = true; }
1632
//! \brief Event codes for PCIe traffic monitoring
enum PCIeEventCode
{
    // PCIe read events (PCI devices reading from memory - application writes to disk/network/PCIe device)
    PCIeRdCur = 0x19E, // PCIe read current (full cache line)
    PCIeNSRd  = 0x1E4, // PCIe non-snoop read (full cache line)

    // PCIe write events (PCI devices writing to memory - application reads from disk/network/PCIe device)
    PCIeWiLF  = 0x194, // PCIe Write (non-allocating) (full cache line)
    PCIeItoM  = 0x19C, // PCIe Write (allocating) (full cache line)
    PCIeNSWr  = 0x1E5, // PCIe Non-snoop write (partial cache line)
    PCIeNSWrF = 0x1E6, // PCIe Non-snoop write (full cache line)

    // events shared by CPU and IO
    RFO       = 0x180, // Demand Data RFO; share the same code for CPU, use tid to filter PCIe only traffic
    CRd       = 0x181, // Demand Code Read
    DRd       = 0x182, // Demand Data Read
    PRd       = 0x187, // Partial Reads (UC) (MMIO Read)
    WiL       = 0x18F, // Write Invalidate Line - partial (MMIO write), PL: Not documented in HSX/IVT
    ItoM      = 0x1C8, // Request Invalidate Line; share the same code for CPU, use tid to filter PCIe only traffic

    // SKX-prefixed variants of the codes above
    SKX_RFO   = 0x200,
    SKX_CRd   = 0x201,
    SKX_DRd   = 0x202,
    SKX_PRd   = 0x207,
    SKX_WiL   = 0x20F,
    SKX_RdCur = 0x21E,
    SKX_ItoM  = 0x248,
};
1659
//! \brief Pipeline queue selector; values spelled out (they match the implicit numbering)
enum ChaPipelineQueue
{
    None = 0,
    IRQ  = 1,
    PRQ  = 2,
};
1666
//! \brief TID values paired with the RFO and ItoM events; both share the value 0x3E
enum CBoEventTid
{
    RFOtid  = 0x3E,
    ItoMtid = 0x3E,
};
1672
1673 //! \brief Program uncore PCIe monitoring event(s)
1674 //! \param eventGroup - events to program for the same run
1675 void programPCIeEventGroup(eventGroup_t &eventGroup);
1676 uint64 getPCIeCounterData(const uint32 socket_, const uint32 ctr_);
1677
1678 //! \brief Program CBO (or CHA on SKX+) counters
1679 //! \param events array with four raw event values
1680 //! \param opCode opcode match filter
1681 //! \param nc_ match non-coherent requests
1682 //! \param llc_lookup_tid_filter filter for LLC lookup event filter and TID filter (core and thread ID)
1683 //! \param loc match on local node target
1684 //! \param rem match on remote node target
1685 void programCbo(const uint64 * events, const uint32 opCode = 0, const uint32 nc_ = 0, const uint32 llc_lookup_tid_filter = 0, const uint32 loc = 1, const uint32 rem = 1);
1686
1687 //! \brief Program CBO (or CHA on SKX+) counters
1688 //! \param events array with four raw event values
1689 //! \param filter0 raw filter value
1690 //! \param filter1 raw filter1 value
1691 void programCboRaw(const uint64* events, const uint64 filter0, const uint64 filter1);
1692
1693 //! \brief Get the state of PCIe counter(s)
1694 //! \param socket_ socket of the PCIe controller
1695 //! \return State of PCIe counter(s)
1696 PCIeCounterState getPCIeCounterState(const uint32 socket_, const uint32 ctr_ = 0);
1697
1698 //! \brief Program uncore IIO events
1699 //! \param rawEvents events to program (raw format)
1700 //! \param IIOStack id of the IIO stack to program (-1 for all, if parameter omitted)
1701 void programIIOCounters(uint64 rawEvents[4], int IIOStack = -1);
1702
1703 //! \brief Get the state of IIO counter
1704 //! \param socket socket of the IIO stack
1705 //! \param IIOStack id of the IIO stack
1706 //! \return State of IIO counter
1707 IIOCounterState getIIOCounterState(int socket, int IIOStack, int counter);
1708
1709 //! \brief Get the states of the four IIO counters in bulk (faster than four single reads)
1710 //! \param socket socket of the IIO stack
1711 //! \param IIOStack id of the IIO stack
1712 //! \param result states of IIO counters (array of four IIOCounterState elements)
1713 void getIIOCounterStates(int socket, int IIOStack, IIOCounterState * result);
1714
1715 uint64 extractCoreGenCounterValue(uint64 val);
1716 uint64 extractCoreFixedCounterValue(uint64 val);
1717 uint64 extractUncoreGenCounterValue(uint64 val);
1718 uint64 extractUncoreFixedCounterValue(uint64 val);
1719 uint64 extractQOSMonitoring(uint64 val);
1720
1721 //! \brief Get a string describing the codename of the processor microarchitecture
1722 //! \param cpu_model_ cpu model (if no parameter provided the codename of the detected CPU is returned)
1723 const char * getUArchCodename(const int32 cpu_model_ = -1) const;
1724
1725 //! \brief Get Brand string of processor
1726 static std::string getCPUBrandString();
1727 std::string getCPUFamilyModelString();
1728
1729
//! \brief Enables "force all RTM transaction abort" mode also enabling 4+ programmable counters on Skylake generation processors
//! \param silent NOTE(review): presumably suppresses diagnostic output when true - confirm against the definition
void enableForceRTMAbortMode(const bool silent = false);

//! \brief queries status of "force all RTM transaction abort" mode
//! \return true if the mode is currently enabled
bool isForceRTMAbortModeEnabled() const;

//! \brief Disables "force all RTM transaction abort" mode restricting the number of programmable counters on Skylake generation processors to 3
//! \param silent NOTE(review): presumably suppresses diagnostic output when true - confirm against the definition
void disableForceRTMAbortMode(const bool silent = false);

//! \brief queries availability of "force all RTM transaction abort" mode
//! \return true if the mode is available
bool isForceRTMAbortModeAvailable() const;
1741
1742 //! \brief Get microcode level (returns -1 if retrieval not supported due to some restrictions)
getCPUMicrocodeLevel()1743 int64 getCPUMicrocodeLevel() const { return cpu_microcode_level; }
1744
1745 //! \brief returns true if CPU model is Atom-based
isAtom(const int32 cpu_model_)1746 static bool isAtom(const int32 cpu_model_)
1747 {
1748 return cpu_model_ == ATOM
1749 || cpu_model_ == ATOM_2
1750 || cpu_model_ == CENTERTON
1751 || cpu_model_ == BAYTRAIL
1752 || cpu_model_ == AVOTON
1753 || cpu_model_ == CHERRYTRAIL
1754 || cpu_model_ == APOLLO_LAKE
1755 || cpu_model_ == DENVERTON
1756 // || cpu_model_ == SNOWRIDGE do not use Atom code for SNOWRIDGE
1757 ;
1758 }
1759
1760 //! \brief returns true if CPU is Atom-based
isAtom()1761 bool isAtom() const
1762 {
1763 return isAtom(cpu_model);
1764 }
1765
packageEnergyMetricsAvailable()1766 bool packageEnergyMetricsAvailable() const
1767 {
1768 return (
1769 cpu_model == PCM::JAKETOWN
1770 || cpu_model == PCM::IVYTOWN
1771 || cpu_model == PCM::SANDY_BRIDGE
1772 || cpu_model == PCM::IVY_BRIDGE
1773 || cpu_model == PCM::HASWELL
1774 || cpu_model == PCM::AVOTON
1775 || cpu_model == PCM::CHERRYTRAIL
1776 || cpu_model == PCM::BAYTRAIL
1777 || cpu_model == PCM::APOLLO_LAKE
1778 || cpu_model == PCM::DENVERTON
1779 || cpu_model == PCM::SNOWRIDGE
1780 || cpu_model == PCM::HASWELLX
1781 || cpu_model == PCM::BROADWELL
1782 || cpu_model == PCM::BDX_DE
1783 || cpu_model == PCM::BDX
1784 || cpu_model == PCM::KNL
1785 || useSKLPath()
1786 || cpu_model == PCM::SKX
1787 || cpu_model == PCM::ICX
1788 );
1789 }
1790
dramEnergyMetricsAvailable()1791 bool dramEnergyMetricsAvailable() const
1792 {
1793 return (
1794 cpu_model == PCM::JAKETOWN
1795 || cpu_model == PCM::IVYTOWN
1796 || cpu_model == PCM::HASWELLX
1797 || cpu_model == PCM::BDX_DE
1798 || cpu_model == PCM::BDX
1799 || cpu_model == PCM::KNL
1800 || cpu_model == PCM::SKX
1801 || cpu_model == PCM::ICX
1802 );
1803 }
1804
packageThermalMetricsAvailable()1805 bool packageThermalMetricsAvailable() const
1806 {
1807 return packageEnergyMetricsAvailable();
1808 }
1809
outgoingQPITrafficMetricsAvailable()1810 bool outgoingQPITrafficMetricsAvailable() const
1811 {
1812 return getQPILinksPerSocket() > 0 &&
1813 (
1814 cpu_model == PCM::NEHALEM_EX
1815 || cpu_model == PCM::WESTMERE_EX
1816 || cpu_model == PCM::JAKETOWN
1817 || cpu_model == PCM::IVYTOWN
1818 || cpu_model == PCM::HASWELLX
1819 || cpu_model == PCM::BDX
1820 || cpu_model == PCM::SKX
1821 || cpu_model == PCM::ICX
1822 );
1823 }
1824
incomingQPITrafficMetricsAvailable()1825 bool incomingQPITrafficMetricsAvailable() const
1826 {
1827 return getQPILinksPerSocket() > 0 &&
1828 (
1829 cpu_model == PCM::NEHALEM_EX
1830 || cpu_model == PCM::WESTMERE_EX
1831 || cpu_model == PCM::JAKETOWN
1832 || cpu_model == PCM::IVYTOWN
1833 || (cpu_model == PCM::SKX && cpu_stepping > 1)
1834 || cpu_model == PCM::ICX
1835 );
1836 }
1837
localMemoryRequestRatioMetricAvailable()1838 bool localMemoryRequestRatioMetricAvailable() const
1839 {
1840 return cpu_model == PCM::HASWELLX
1841 || cpu_model == PCM::BDX
1842 || cpu_model == PCM::SKX
1843 || cpu_model == PCM::ICX
1844 ;
1845 }
1846
qpiUtilizationMetricsAvailable()1847 bool qpiUtilizationMetricsAvailable() const
1848 {
1849 return outgoingQPITrafficMetricsAvailable();
1850 }
1851
memoryTrafficMetricsAvailable()1852 bool memoryTrafficMetricsAvailable() const
1853 {
1854 return (!(isAtom() || cpu_model == PCM::CLARKDALE))
1855 ;
1856 }
1857
MCDRAMmemoryTrafficMetricsAvailable()1858 bool MCDRAMmemoryTrafficMetricsAvailable() const
1859 {
1860 return (cpu_model == PCM::KNL);
1861 }
1862
memoryIOTrafficMetricAvailable()1863 bool memoryIOTrafficMetricAvailable() const
1864 {
1865 if (cpu_model == TGL) return false;
1866 return (
1867 cpu_model == PCM::SANDY_BRIDGE
1868 || cpu_model == PCM::IVY_BRIDGE
1869 || cpu_model == PCM::HASWELL
1870 || cpu_model == PCM::BROADWELL
1871 || useSKLPath()
1872 );
1873 }
1874
IIOEventsAvailable()1875 bool IIOEventsAvailable() const
1876 {
1877 return (
1878 cpu_model == PCM::SKX
1879 || cpu_model == PCM::ICX
1880 || cpu_model == PCM::SNOWRIDGE
1881 );
1882 }
1883
LatencyMetricsAvailable()1884 bool LatencyMetricsAvailable() const
1885 {
1886 return (
1887 cpu_model == PCM::HASWELLX
1888 || cpu_model == PCM::BDX
1889 || cpu_model == PCM::SKX
1890 || cpu_model == PCM::ICX
1891 || useSKLPath()
1892 );
1893 }
1894
DDRLatencyMetricsAvailable()1895 bool DDRLatencyMetricsAvailable() const
1896 {
1897 return (
1898 cpu_model == PCM::SKX
1899 || cpu_model == PCM::ICX
1900 );
1901 }
1902
PMMTrafficMetricsAvailable()1903 bool PMMTrafficMetricsAvailable() const
1904 {
1905 return (
1906 isCLX()
1907 || isCPX()
1908 || cpu_model == PCM::ICX
1909 || cpu_model == PCM::SNOWRIDGE
1910 );
1911 }
1912
LLCReadMissLatencyMetricsAvailable()1913 bool LLCReadMissLatencyMetricsAvailable() const
1914 {
1915 return (
1916 HASWELLX == cpu_model
1917 || BDX_DE == cpu_model
1918 || BDX == cpu_model
1919 || isCLX()
1920 || isCPX()
1921 #ifdef PCM_ENABLE_LLCRDLAT_SKX_MP
1922 || SKX == cpu_model
1923 #else
1924 || ((SKX == cpu_model) && (num_sockets == 1))
1925 #endif
1926 || ICX == cpu_model
1927 || SNOWRIDGE == cpu_model
1928 );
1929 }
1930
hasBecktonUncore()1931 bool hasBecktonUncore() const
1932 {
1933 return (
1934 cpu_model == PCM::NEHALEM_EX
1935 || cpu_model == PCM::WESTMERE_EX
1936 );
1937 }
hasPCICFGUncore()1938 bool hasPCICFGUncore() const // has PCICFG uncore PMON
1939 {
1940 return (
1941 cpu_model == PCM::JAKETOWN
1942 || cpu_model == PCM::SNOWRIDGE
1943 || cpu_model == PCM::IVYTOWN
1944 || cpu_model == PCM::HASWELLX
1945 || cpu_model == PCM::BDX_DE
1946 || cpu_model == PCM::SKX
1947 || cpu_model == PCM::ICX
1948 || cpu_model == PCM::BDX
1949 || cpu_model == PCM::KNL
1950 );
1951 }
1952
isSkxCompatible()1953 bool isSkxCompatible() const
1954 {
1955 return (
1956 cpu_model == PCM::SKX
1957 );
1958 }
1959
hasUPI(const int32 cpu_model_)1960 static bool hasUPI(const int32 cpu_model_) // Intel(r) Ultra Path Interconnect
1961 {
1962 return (
1963 cpu_model_ == PCM::SKX
1964 || cpu_model_ == PCM::ICX
1965 );
1966 }
1967
hasUPI()1968 bool hasUPI() const
1969 {
1970 return hasUPI(cpu_model);
1971 }
1972
xPI()1973 const char * xPI() const
1974 {
1975 if (hasUPI())
1976 return "UPI";
1977
1978 return "QPI";
1979 }
1980
hasCHA()1981 bool hasCHA() const
1982 {
1983 return (
1984 cpu_model == PCM::SKX
1985 || cpu_model == PCM::ICX
1986 );
1987 }
1988
//! \brief Reports whether HLE is supported
//! NOTE(review): HLE presumably stands for Hardware Lock Elision; detection logic lives in the out-of-line definition - confirm
bool supportsHLE() const;
//! \brief Reports whether RTM is supported
//! NOTE(review): RTM presumably stands for Restricted Transactional Memory; detection logic lives in the out-of-line definition - confirm
bool supportsRTM() const;
1991
useSkylakeEvents()1992 bool useSkylakeEvents() const
1993 {
1994 return useSKLPath()
1995 || PCM::SKX == cpu_model
1996 || PCM::ICX == cpu_model
1997 ;
1998 }
1999
hasClientMCCounters()2000 bool hasClientMCCounters() const
2001 {
2002 return cpu_model == SANDY_BRIDGE
2003 || cpu_model == IVY_BRIDGE
2004 || cpu_model == HASWELL
2005 || cpu_model == BROADWELL
2006 || useSKLPath()
2007 ;
2008 }
2009
getBytesPerFlit(int32 cpu_model_)2010 static double getBytesPerFlit(int32 cpu_model_)
2011 {
2012 if (hasUPI(cpu_model_))
2013 {
2014 // 172 bits per UPI flit
2015 return 172./8.;
2016 }
2017 // 8 bytes per QPI flit
2018 return 8.;
2019 }
2020
getBytesPerFlit()2021 double getBytesPerFlit() const
2022 {
2023 return getBytesPerFlit(cpu_model);
2024 }
2025
getDataBytesPerFlit(int32 cpu_model_)2026 static double getDataBytesPerFlit(int32 cpu_model_)
2027 {
2028 if (hasUPI(cpu_model_))
2029 {
2030 // 9 UPI flits to transfer 64 bytes
2031 return 64./9.;
2032 }
2033 // 8 bytes per QPI flit
2034 return 8.;
2035 }
2036
getDataBytesPerFlit()2037 double getDataBytesPerFlit() const
2038 {
2039 return getDataBytesPerFlit(cpu_model);
2040 }
2041
getFlitsPerLinkCycle(int32 cpu_model_)2042 static double getFlitsPerLinkCycle(int32 cpu_model_)
2043 {
2044 if (hasUPI(cpu_model_))
2045 {
2046 // 5 UPI flits sent every 6 link cycles
2047 return 5./6.;
2048 }
2049 return 2.;
2050 }
2051
getBytesPerLinkCycle(int32 cpu_model_)2052 static double getBytesPerLinkCycle(int32 cpu_model_)
2053 {
2054 return getBytesPerFlit(cpu_model_) * getFlitsPerLinkCycle(cpu_model_);
2055 }
2056
getBytesPerLinkCycle()2057 double getBytesPerLinkCycle() const
2058 {
2059 return getBytesPerLinkCycle(cpu_model);
2060 }
2061
//! \brief Get the number of link transfers per link cycle
//! \return the constant 8
static double getLinkTransfersPerLinkCycle()
{
    const double transfers_per_cycle = 8.;
    return transfers_per_cycle;
}
2066
getBytesPerLinkTransfer()2067 double getBytesPerLinkTransfer() const
2068 {
2069 return getBytesPerLinkCycle() / getLinkTransfersPerLinkCycle();
2070 }
2071
//! \brief Setup ExtendedCustomCoreEventDescription object to read offcore (numa) counters for each processor type
//! \param conf conf object to setup offcore MSR values (passed by non-const reference and filled in by this call)
//! The method is const: it does not modify the PCM object itself.
void setupCustomCoreEventsForNuma(PCM::ExtendedCustomCoreEventDescription& conf) const;
2075
2076 #define PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(m) bool is##m() const { return m; }
2077
2078 PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L2CacheHitRatioAvailable)
PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheHitRatioAvailable)2079 PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheHitRatioAvailable)
2080 PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheMissesAvailable)
2081 PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L2CacheMissesAvailable)
2082 PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L2CacheHitsAvailable)
2083 PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheHitsNoSnoopAvailable)
2084 PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheHitsSnoopAvailable)
2085 PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheHitsAvailable)
2086
2087 #undef PCM_GEN_METRIC_AVAILABLE_FUNCTION
2088
2089 bool isActiveRelativeFrequencyAvailable() const
2090 {
2091 return !isAtom();
2092 }
2093
//! \brief Destructor (declared here, defined out of line)
~PCM();
2095 };
2096
2097 //! \brief Basic core counter state
2098 //!
2099 //! Intended only for derivation, but not for the direct use
2100 class BasicCounterState
2101 {
2102 friend class PCM;
2103 friend class JSONPrinter;
2104 template <class CounterStateType>
2105 friend double getExecUsage(const CounterStateType & before, const CounterStateType & after);
2106 template <class CounterStateType>
2107 friend double getIPC(const CounterStateType & before, const CounterStateType & after);
2108 template <class CounterStateType>
2109 friend double getAverageFrequency(const CounterStateType & before, const CounterStateType & after);
2110 template <class CounterStateType>
2111 friend double getActiveAverageFrequency(const CounterStateType & before, const CounterStateType & after);
2112 template <class CounterStateType>
2113 friend double getRelativeFrequency(const CounterStateType & before, const CounterStateType & after);
2114 template <class CounterStateType>
2115 friend double getActiveRelativeFrequency(const CounterStateType & before, const CounterStateType & after);
2116 template <class CounterStateType>
2117 friend double getL2CacheHitRatio(const CounterStateType & before, const CounterStateType & after);
2118 template <class CounterStateType>
2119 friend double getL3CacheHitRatio(const CounterStateType & before, const CounterStateType & after);
2120 template <class CounterStateType>
2121 friend uint64 getL3CacheMisses(const CounterStateType & before, const CounterStateType & after);
2122 template <class CounterStateType>
2123 friend uint64 getL2CacheMisses(const CounterStateType & before, const CounterStateType & after);
2124 template <class CounterStateType>
2125 friend uint64 getL2CacheHits(const CounterStateType & before, const CounterStateType & after);
2126 template <class CounterStateType>
2127 friend uint64 getL3CacheHitsNoSnoop(const CounterStateType & before, const CounterStateType & after);
2128 template <class CounterStateType>
2129 friend uint64 getL3CacheHitsSnoop(const CounterStateType & before, const CounterStateType & after);
2130 template <class CounterStateType>
2131 friend uint64 getL3CacheHits(const CounterStateType & before, const CounterStateType & after);
2132 template <class CounterStateType>
2133 friend uint64 getL3CacheOccupancy(const CounterStateType & now);
2134 template <class CounterStateType>
2135 friend uint64 getLocalMemoryBW(const CounterStateType & before, const CounterStateType & after);
2136 template <class CounterStateType>
2137 friend uint64 getRemoteMemoryBW(const CounterStateType & before, const CounterStateType & after);
2138 template <class CounterStateType>
2139 friend uint64 getCycles(const CounterStateType & before, const CounterStateType & after);
2140 template <class CounterStateType>
2141 friend uint64 getInstructionsRetired(const CounterStateType & before, const CounterStateType & after);
2142 template <class CounterStateType>
2143 friend uint64 getCycles(const CounterStateType & now);
2144 template <class CounterStateType>
2145 friend uint64 getInstructionsRetired(const CounterStateType & now);
2146 template <class CounterStateType>
2147 friend uint64 getNumberOfCustomEvents(int32 eventCounterNr, const CounterStateType & before, const CounterStateType & after);
2148 template <class CounterStateType>
2149 friend uint64 getInvariantTSC(const CounterStateType & before, const CounterStateType & after);
2150 template <class CounterStateType>
2151 friend uint64 getRefCycles(const CounterStateType & before, const CounterStateType & after);
2152 template <class CounterStateType>
2153 friend double getCoreCStateResidency(int state, const CounterStateType & before, const CounterStateType & after);
2154 template <class CounterStateType>
2155 friend uint64 getCoreCStateResidency(int state, const CounterStateType& now);
2156 template <class CounterStateType>
2157 friend uint64 getSMICount(const CounterStateType & before, const CounterStateType & after);
2158 template <class CounterStateType>
2159 friend uint64 getAllSlotsRaw(const CounterStateType& before, const CounterStateType& after);
2160 template <class CounterStateType>
2161 friend uint64 getAllSlots(const CounterStateType & before, const CounterStateType & after);
2162 template <class CounterStateType>
2163 friend double getBackendBound(const CounterStateType & before, const CounterStateType & after);
2164 template <class CounterStateType>
2165 friend double getFrontendBound(const CounterStateType & before, const CounterStateType & after);
2166 template <class CounterStateType>
2167 friend double getBadSpeculation(const CounterStateType & before, const CounterStateType & after);
2168 template <class CounterStateType>
2169 friend double getRetiring(const CounterStateType & before, const CounterStateType & after);
2170
2171 protected:
2172 checked_uint64 InstRetiredAny;
2173 checked_uint64 CpuClkUnhaltedThread;
2174 checked_uint64 CpuClkUnhaltedRef;
2175 checked_uint64 Event[PERF_MAX_CUSTOM_COUNTERS];
2176 enum
2177 {
2178 L3MissPos = 0,
2179 ArchLLCMissPos = 0,
2180 L3UnsharedHitPos = 1,
2181 ArchLLCRefPos = 1,
2182 SKLL3HitPos = 1,
2183 L2HitMPos = 2,
2184 SKLL2MissPos = 2,
2185 L2HitPos = 3
2186 };
2187 uint64 InvariantTSC; // invariant time stamp counter
2188 uint64 CStateResidency[PCM::MAX_C_STATE + 1];
2189 int32 ThermalHeadroom;
2190 uint64 L3Occupancy;
2191 uint64 MemoryBWLocal;
2192 uint64 MemoryBWTotal;
2193 uint64 SMICount;
2194 uint64 FrontendBoundSlots, BadSpeculationSlots, BackendBoundSlots, RetiringSlots, AllSlotsRaw;
2195
2196 public:
BasicCounterState()2197 BasicCounterState() :
2198 InvariantTSC(0),
2199 ThermalHeadroom(PCM_INVALID_THERMAL_HEADROOM),
2200 L3Occupancy(0),
2201 MemoryBWLocal(0),
2202 MemoryBWTotal(0),
2203 SMICount(0),
2204 FrontendBoundSlots(0),
2205 BadSpeculationSlots(0),
2206 BackendBoundSlots(0),
2207 RetiringSlots(0),
2208 AllSlotsRaw(0)
2209 {
2210 memset(CStateResidency, 0, sizeof(CStateResidency));
2211 }
~BasicCounterState()2212 virtual ~BasicCounterState() { }
2213
2214 BasicCounterState( const BasicCounterState& ) = default;
2215 BasicCounterState( BasicCounterState&& ) = default;
2216 BasicCounterState & operator = ( BasicCounterState&& ) = default;
2217
2218 BasicCounterState & operator += (const BasicCounterState & o)
2219 {
2220 InstRetiredAny += o.InstRetiredAny;
2221 CpuClkUnhaltedThread += o.CpuClkUnhaltedThread;
2222 CpuClkUnhaltedRef += o.CpuClkUnhaltedRef;
2223 for (int i = 0; i < PERF_MAX_CUSTOM_COUNTERS; ++i)
2224 {
2225 Event[i] += o.Event[i];
2226 }
2227 InvariantTSC += o.InvariantTSC;
2228 for (int i = 0; i <= (int)PCM::MAX_C_STATE; ++i)
2229 CStateResidency[i] += o.CStateResidency[i];
2230 // ThermalHeadroom is not accumulative
2231 L3Occupancy += o.L3Occupancy;
2232 MemoryBWLocal += o.MemoryBWLocal;
2233 MemoryBWTotal += o.MemoryBWTotal;
2234 SMICount += o.SMICount;
2235 // std::cout << "before PCM debug aggregate "<< FrontendBoundSlots << " " << BadSpeculationSlots << " " << BackendBoundSlots << " " <<RetiringSlots << std::endl;
2236 BasicCounterState old = *this;
2237 FrontendBoundSlots += o.FrontendBoundSlots;
2238 BadSpeculationSlots += o.BadSpeculationSlots;
2239 BackendBoundSlots += o.BackendBoundSlots;
2240 RetiringSlots += o.RetiringSlots;
2241 AllSlotsRaw += o.AllSlotsRaw;
2242 //std::cout << "after PCM debug aggregate "<< FrontendBoundSlots << " " << BadSpeculationSlots << " " << BackendBoundSlots << " " <<RetiringSlots << std::endl;
2243 assert(FrontendBoundSlots >= old.FrontendBoundSlots);
2244 assert(BadSpeculationSlots >= old.BadSpeculationSlots);
2245 assert(BackendBoundSlots >= old.BackendBoundSlots);
2246 assert(RetiringSlots >= old.RetiringSlots);
2247 return *this;
2248 }
2249
2250 void readAndAggregate(std::shared_ptr<SafeMsrHandle>);
2251 void readAndAggregateTSC(std::shared_ptr<SafeMsrHandle>);
2252
2253 //! Returns current thermal headroom below TjMax
getThermalHeadroom()2254 int32 getThermalHeadroom() const { return ThermalHeadroom; }
2255 };
2256
RDTSC()2257 inline uint64 RDTSC()
2258 {
2259 uint64 result = 0;
2260 #ifdef _MSC_VER
2261 // Windows
2262 #if _MSC_VER>= 1600
2263 result = static_cast<uint64>(__rdtsc());
2264 #endif
2265 #else
2266 // Linux
2267 uint32 high = 0, low = 0;
2268 asm volatile("rdtsc" : "=a" (low), "=d" (high));
2269 result = low + (uint64(high)<<32ULL);
2270 #endif
2271 return result;
2272
2273 }
2274
RDTSCP()2275 inline uint64 RDTSCP()
2276 {
2277 uint64 result = 0;
2278 #ifdef _MSC_VER
2279 // Windows
2280 #if _MSC_VER>= 1600
2281 unsigned int Aux;
2282 result = __rdtscp(&Aux);
2283 #endif
2284 #else
2285 // Linux and OS X
2286 uint32 high = 0, low = 0;
2287 asm volatile (
2288 "rdtscp\n\t"
2289 "mov %%edx, %0\n\t"
2290 "mov %%eax, %1\n\t":
2291 "=r" (high), "=r" (low) :: "%rax", "%rcx", "%rdx");
2292 result = low + (uint64(high)<<32ULL);
2293 #endif
2294 return result;
2295 }
2296
2297 template <class CounterStateType>
getThermalHeadroom(const CounterStateType &,const CounterStateType & after)2298 int32 getThermalHeadroom(const CounterStateType & /* before */, const CounterStateType & after)
2299 {
2300 return after.getThermalHeadroom();
2301 }
2302
2303 /*! \brief Returns the ratio of QPI cycles in power saving half-lane mode
2304 \param port QPI port number
2305 \param before CPU counter state before the experiment
2306 \param after CPU counter state after the experiment
2307 \return 0..1 - ratio of QPI cycles in power saving half-lane mode
2308 */
2309 template <class CounterStateType>
getNormalizedQPIL0pTxCycles(uint32 port,const CounterStateType & before,const CounterStateType & after)2310 double getNormalizedQPIL0pTxCycles(uint32 port, const CounterStateType & before, const CounterStateType & after)
2311 {
2312 return double(getQPIL0pTxCycles(port, before, after)) / double(getQPIClocks(port, before, after));
2313 }
2314
2315 /*! \brief Returns the ratio of QPI cycles in power saving shutdown mode
2316 \param port QPI port number
2317 \param before CPU counter state before the experiment
2318 \param after CPU counter state after the experiment
2319 \return 0..1 - ratio of QPI cycles in power saving shutdown mode
2320 */
2321 template <class CounterStateType>
getNormalizedQPIL1Cycles(uint32 port,const CounterStateType & before,const CounterStateType & after)2322 double getNormalizedQPIL1Cycles(uint32 port, const CounterStateType & before, const CounterStateType & after)
2323 {
2324 return double(getQPIL1Cycles(port, before, after)) / double(getQPIClocks(port, before, after));
2325 }
2326
2327 /*! \brief Returns DRAM clock ticks
2328 \param channel DRAM channel number
2329 \param before CPU counter state before the experiment
2330 \param after CPU counter state after the experiment
2331 */
2332 template <class CounterStateType>
getDRAMClocks(uint32 channel,const CounterStateType & before,const CounterStateType & after)2333 uint64 getDRAMClocks(uint32 channel, const CounterStateType & before, const CounterStateType & after)
2334 {
2335 const auto clk = after.DRAMClocks[channel] - before.DRAMClocks[channel];
2336 const auto cpu_model = PCM::getInstance()->getCPUModel();
2337 if (cpu_model == PCM::ICX || cpu_model == PCM::SNOWRIDGE)
2338 {
2339 return 2 * clk;
2340 }
2341 return clk;
2342 }
2343
2344 /*! \brief Returns MCDRAM clock ticks
2345 \param channel MCDRAM channel number
2346 \param before CPU counter state before the experiment
2347 \param after CPU counter state after the experiment
2348 */
2349 template <class CounterStateType>
getMCDRAMClocks(uint32 channel,const CounterStateType & before,const CounterStateType & after)2350 uint64 getMCDRAMClocks(uint32 channel, const CounterStateType & before, const CounterStateType & after)
2351 {
2352 return after.MCDRAMClocks[channel] - before.MCDRAMClocks[channel];
2353 }
2354
2355
2356 /*! \brief Direct read of memory controller PMU counter (counter meaning depends on the programming: power/performance/etc)
2357 \param counter counter number
2358 \param channel channel number
2359 \param before CPU counter state before the experiment
2360 \param after CPU counter state after the experiment
2361 */
2362 template <class CounterStateType>
getMCCounter(uint32 channel,uint32 counter,const CounterStateType & before,const CounterStateType & after)2363 uint64 getMCCounter(uint32 channel, uint32 counter, const CounterStateType & before, const CounterStateType & after)
2364 {
2365 return after.MCCounter[channel][counter] - before.MCCounter[channel][counter];
2366 }
2367
2368 /*! \brief Direct read of M3UPI PMU counter (counter meaning depends on the programming: power/performance/etc)
2369 \param counter counter number
2370 \param port UPI port number
2371 \param before CPU counter state before the experiment
2372 \param after CPU counter state after the experiment
2373 */
2374 template <class CounterStateType>
getM3UPICounter(uint32 port,uint32 counter,const CounterStateType & before,const CounterStateType & after)2375 uint64 getM3UPICounter(uint32 port, uint32 counter, const CounterStateType& before, const CounterStateType& after)
2376 {
2377 return after.M3UPICounter[port][counter] - before.M3UPICounter[port][counter];
2378 }
2379
2380 /*! \brief Direct read of CHA or CBO PMU counter (counter meaning depends on the programming: power/performance/etc)
2381 \param counter counter number
2382 \param cbo cbo or cha number
2383 \param before CPU counter state before the experiment
2384 \param after CPU counter state after the experiment
2385 */
2386 template <class CounterStateType>
getCBOCounter(uint32 cbo,uint32 counter,const CounterStateType & before,const CounterStateType & after)2387 uint64 getCBOCounter(uint32 cbo, uint32 counter, const CounterStateType& before, const CounterStateType& after)
2388 {
2389 return after.CBOCounter[cbo][counter] - before.CBOCounter[cbo][counter];
2390 }
2391
2392 /*! \brief Direct read of UBOX PMU counter (counter meaning depends on the programming: power/performance/etc)
2393 \param counter counter number
2394 \param cbo cbo or cha number
2395 \param before CPU counter state before the experiment
2396 \param after CPU counter state after the experiment
2397 */
2398 template <class CounterStateType>
getUBOXCounter(uint32 counter,const CounterStateType & before,const CounterStateType & after)2399 uint64 getUBOXCounter(uint32 counter, const CounterStateType& before, const CounterStateType& after)
2400 {
2401 return after.UBOXCounter[counter] - before.UBOXCounter[counter];
2402 }
2403
2404 /*! \brief Direct read of IIO PMU counter (counter meaning depends on the programming: power/performance/etc)
2405 \param counter counter number
2406 \param cbo IIO stack number
2407 \param before CPU counter state before the experiment
2408 \param after CPU counter state after the experiment
2409 */
2410 template <class CounterStateType>
getIIOCounter(uint32 stack,uint32 counter,const CounterStateType & before,const CounterStateType & after)2411 uint64 getIIOCounter(uint32 stack, uint32 counter, const CounterStateType& before, const CounterStateType& after)
2412 {
2413 return after.IIOCounter[stack][counter] - before.IIOCounter[stack][counter];
2414 }
2415
2416 /*! \brief Direct read of UPI or QPI PMU counter (counter meaning depends on the programming: power/performance/etc)
2417 \param counter counter number
2418 \param port UPI/QPI port number
2419 \param before CPU counter state before the experiment
2420 \param after CPU counter state after the experiment
2421 */
2422 template <class CounterStateType>
getXPICounter(uint32 port,uint32 counter,const CounterStateType & before,const CounterStateType & after)2423 uint64 getXPICounter(uint32 port, uint32 counter, const CounterStateType& before, const CounterStateType& after)
2424 {
2425 return after.xPICounter[port][counter] - before.xPICounter[port][counter];
2426 }
2427
2428 /*! \brief Direct read of Memory2Mesh controller PMU counter (counter meaning depends on the programming: power/performance/etc)
2429 \param counter counter number
2430 \param controller controller number
2431 \param before CPU counter state before the experiment
2432 \param after CPU counter state after the experiment
2433 */
2434 template <class CounterStateType>
getM2MCounter(uint32 controller,uint32 counter,const CounterStateType & before,const CounterStateType & after)2435 uint64 getM2MCounter(uint32 controller, uint32 counter, const CounterStateType & before, const CounterStateType & after)
2436 {
2437 return after.M2MCounter[controller][counter] - before.M2MCounter[controller][counter];
2438 }
2439
2440
2441 /*! \brief Direct read of embedded DRAM memory controller counter (counter meaning depends on the programming: power/performance/etc)
2442 \param counter counter number
2443 \param channel channel number
2444 \param before CPU counter state before the experiment
2445 \param after CPU counter state after the experiment
2446 */
2447 template <class CounterStateType>
getEDCCounter(uint32 channel,uint32 counter,const CounterStateType & before,const CounterStateType & after)2448 uint64 getEDCCounter(uint32 channel, uint32 counter, const CounterStateType & before, const CounterStateType & after)
2449 {
2450 if (PCM::getInstance()->MCDRAMmemoryTrafficMetricsAvailable())
2451 return after.EDCCounter[channel][counter] - before.EDCCounter[channel][counter];
2452 return 0ULL;
2453 }
2454
2455 /*! \brief Direct read of power control unit PMU counter (counter meaning depends on the programming: power/performance/etc)
2456 \param counter counter number
2457 \param before CPU counter state before the experiment
2458 \param after CPU counter state after the experiment
2459 */
2460 template <class CounterStateType>
getPCUCounter(uint32 counter,const CounterStateType & before,const CounterStateType & after)2461 uint64 getPCUCounter(uint32 counter, const CounterStateType & before, const CounterStateType & after)
2462 {
2463 return after.PCUCounter[counter] - before.PCUCounter[counter];
2464 }
2465
2466 /*! \brief Returns clock ticks of power control unit
2467 \param before CPU counter state before the experiment
2468 \param after CPU counter state after the experiment
2469 */
2470 template <class CounterStateType>
getPCUClocks(const CounterStateType & before,const CounterStateType & after)2471 uint64 getPCUClocks(const CounterStateType & before, const CounterStateType & after)
2472 {
2473 return getPCUCounter(0, before, after);
2474 }
2475
2476 /*! \brief Returns energy consumed by processor, excluding DRAM (measured in internal units)
2477 \param before CPU counter state before the experiment
2478 \param after CPU counter state after the experiment
2479 */
2480 template <class CounterStateType>
getConsumedEnergy(const CounterStateType & before,const CounterStateType & after)2481 uint64 getConsumedEnergy(const CounterStateType & before, const CounterStateType & after)
2482 {
2483 return after.PackageEnergyStatus - before.PackageEnergyStatus;
2484 }
2485
2486 /*! \brief Returns energy consumed by DRAM (measured in internal units)
2487 \param before CPU counter state before the experiment
2488 \param after CPU counter state after the experiment
2489 */
2490 template <class CounterStateType>
getDRAMConsumedEnergy(const CounterStateType & before,const CounterStateType & after)2491 uint64 getDRAMConsumedEnergy(const CounterStateType & before, const CounterStateType & after)
2492 {
2493 return after.DRAMEnergyStatus - before.DRAMEnergyStatus;
2494 }
2495
2496
2497 /*! \brief Returns free running counter if it exists, -1 otherwise
2498 * \param counter name of the counter
2499 * \param before CPU counter state before the experiment
2500 * \param after CPU counter state after the experiment
2501 */
2502 template <class CounterStateType>
getFreeRunningCounter(const typename CounterStateType::FreeRunningCounterID & counter,const CounterStateType & before,const CounterStateType & after)2503 int64 getFreeRunningCounter(const typename CounterStateType::FreeRunningCounterID & counter, const CounterStateType & before, const CounterStateType & after)
2504 {
2505 const auto beforeIt = before.freeRunningCounter.find(counter);
2506 const auto afterIt = after.freeRunningCounter.find(counter);
2507 if (beforeIt != before.freeRunningCounter.end() &&
2508 afterIt != after.freeRunningCounter.end())
2509 {
2510 return afterIt->second - beforeIt->second;
2511 }
2512 return -1;
2513 }
2514
2515
2516 /*! \brief Returns uncore clock ticks
2517 \param before CPU counter state before the experiment
2518 \param after CPU counter state after the experiment
2519 */
2520 template <class CounterStateType>
getUncoreClocks(const CounterStateType & before,const CounterStateType & after)2521 uint64 getUncoreClocks(const CounterStateType& before, const CounterStateType& after)
2522 {
2523 return after.UncClocks - before.UncClocks;
2524 }
2525
2526 /*! \brief Returns Joules consumed by processor (excluding DRAM)
2527 \param before CPU counter state before the experiment
2528 \param after CPU counter state after the experiment
2529 */
2530 template <class CounterStateType>
getConsumedJoules(const CounterStateType & before,const CounterStateType & after)2531 double getConsumedJoules(const CounterStateType & before, const CounterStateType & after)
2532 {
2533 PCM * m = PCM::getInstance();
2534 if (!m) return -1.;
2535
2536 return double(getConsumedEnergy(before, after)) * m->getJoulesPerEnergyUnit();
2537 }
2538
2539 /*! \brief Returns Joules consumed by DRAM
2540 \param before CPU counter state before the experiment
2541 \param after CPU counter state after the experiment
2542 */
2543 template <class CounterStateType>
getDRAMConsumedJoules(const CounterStateType & before,const CounterStateType & after)2544 double getDRAMConsumedJoules(const CounterStateType & before, const CounterStateType & after)
2545 {
2546 PCM * m = PCM::getInstance();
2547 if (!m) return -1.;
2548 double dram_joules_per_energy_unit = 0.;
2549 const auto cpu_model = m->getCPUModel();
2550
2551 if (PCM::HASWELLX == cpu_model
2552 || PCM::BDX_DE == cpu_model
2553 || PCM::BDX == cpu_model
2554 || PCM::SKX == cpu_model
2555 || PCM::ICX == cpu_model
2556 || PCM::KNL == cpu_model
2557 ) {
2558 /* as described in sections 5.3.2 (DRAM_POWER_INFO) and 5.3.3 (DRAM_ENERGY_STATUS) of
2559 * Volume 2 (Registers) of
2560 * Intel Xeon E5-1600 v3 and Intel Xeon E5-2600 v3 (Haswell-EP) Datasheet (Ref 330784-001, Sept.2014)
2561 * ENERGY_UNIT for DRAM domain is fixed to 15.3 uJ for server HSX, BDW and KNL processors.
2562 */
2563 dram_joules_per_energy_unit = 0.0000153;
2564 } else {
2565 /* for all other processors (including Haswell client/mobile SKUs) the ENERGY_UNIT for DRAM domain
2566 * should be read from PACKAGE_POWER_SKU register (usually value around ~61uJ)
2567 */
2568 dram_joules_per_energy_unit = m->getJoulesPerEnergyUnit();
2569 }
2570 return double(getDRAMConsumedEnergy(before, after)) * dram_joules_per_energy_unit;
2571 }
2572
2573 //! \brief Basic uncore counter state
2574 //!
2575 //! Intended only for derivation, but not for the direct use
2576 class UncoreCounterState
2577 {
2578 friend class PCM;
2579 friend class JSONPrinter;
2580 template <class CounterStateType>
2581 friend uint64 getBytesReadFromMC(const CounterStateType & before, const CounterStateType & after);
2582 template <class CounterStateType>
2583 friend uint64 getBytesWrittenToMC(const CounterStateType & before, const CounterStateType & after);
2584 template <class CounterStateType>
2585 friend uint64 getBytesReadFromPMM(const CounterStateType & before, const CounterStateType & after);
2586 template <class CounterStateType>
2587 friend uint64 getBytesWrittenToPMM(const CounterStateType & before, const CounterStateType & after);
2588 template <class CounterStateType>
2589 friend uint64 getBytesReadFromEDC(const CounterStateType & before, const CounterStateType & after);
2590 template <class CounterStateType>
2591 friend uint64 getBytesWrittenToEDC(const CounterStateType & before, const CounterStateType & after);
2592 template <class CounterStateType>
2593 friend uint64 getIORequestBytesFromMC(const CounterStateType & before, const CounterStateType & after);
2594 template <class CounterStateType>
2595 friend uint64 getConsumedEnergy(const CounterStateType & before, const CounterStateType & after);
2596 template <class CounterStateType>
2597 friend uint64 getDRAMConsumedEnergy(const CounterStateType & before, const CounterStateType & after);
2598 template <class CounterStateType>
2599 friend uint64 getUncoreClocks(const CounterStateType& before, const CounterStateType& after);
2600 template <class CounterStateType>
2601 friend double getPackageCStateResidency(int state, const CounterStateType & before, const CounterStateType & after);
2602 template <class CounterStateType>
2603 friend uint64 getPackageCStateResidency(int state, const CounterStateType& now);
2604 template <class CounterStateType>
2605 friend double getLLCReadMissLatency(const CounterStateType & before, const CounterStateType & after);
2606 template <class CounterStateType>
2607 friend double getLocalMemoryRequestRatio(const CounterStateType & before, const CounterStateType & after);
2608
2609 protected:
2610 uint64 UncMCFullWrites;
2611 uint64 UncMCNormalReads;
2612 uint64 UncHARequests;
2613 uint64 UncHALocalRequests;
2614 uint64 UncPMMWrites;
2615 uint64 UncPMMReads;
2616 uint64 UncEDCFullWrites;
2617 uint64 UncEDCNormalReads;
2618 uint64 UncMCIORequests;
2619 uint64 PackageEnergyStatus;
2620 uint64 DRAMEnergyStatus;
2621 uint64 TOROccupancyIAMiss;
2622 uint64 TORInsertsIAMiss;
2623 uint64 UncClocks;
2624 uint64 CStateResidency[PCM::MAX_C_STATE + 1];
2625 void readAndAggregate(std::shared_ptr<SafeMsrHandle>);
2626
2627 public:
UncoreCounterState()2628 UncoreCounterState() :
2629 UncMCFullWrites(0),
2630 UncMCNormalReads(0),
2631 UncHARequests(0),
2632 UncHALocalRequests(0),
2633 UncPMMWrites(0),
2634 UncPMMReads(0),
2635 UncEDCFullWrites(0),
2636 UncEDCNormalReads(0),
2637 UncMCIORequests(0),
2638 PackageEnergyStatus(0),
2639 DRAMEnergyStatus(0),
2640 TOROccupancyIAMiss(0),
2641 TORInsertsIAMiss(0),
2642 UncClocks(0)
2643 {
2644 memset(CStateResidency, 0, sizeof(CStateResidency));
2645 }
~UncoreCounterState()2646 virtual ~UncoreCounterState() { }
2647
2648 UncoreCounterState( const UncoreCounterState& ) = default;
2649 UncoreCounterState( UncoreCounterState&& ) = default;
2650 UncoreCounterState & operator = ( UncoreCounterState&& ) = default;
2651
2652 UncoreCounterState & operator += (const UncoreCounterState & o)
2653 {
2654 UncMCFullWrites += o.UncMCFullWrites;
2655 UncMCNormalReads += o.UncMCNormalReads;
2656 UncHARequests += o.UncHARequests;
2657 UncHALocalRequests += o.UncHALocalRequests;
2658 UncPMMReads += o.UncPMMReads;
2659 UncPMMWrites += o.UncPMMWrites;
2660 UncEDCFullWrites += o.UncEDCFullWrites;
2661 UncEDCNormalReads += o.UncEDCNormalReads;
2662 UncMCIORequests += o.UncMCIORequests;
2663 PackageEnergyStatus += o.PackageEnergyStatus;
2664 DRAMEnergyStatus += o.DRAMEnergyStatus;
2665 TOROccupancyIAMiss += o.TOROccupancyIAMiss;
2666 TORInsertsIAMiss += o.TORInsertsIAMiss;
2667 UncClocks += o.UncClocks;
2668 for (int i = 0; i <= (int)PCM::MAX_C_STATE; ++i)
2669 CStateResidency[i] += o.CStateResidency[i];
2670 return *this;
2671 }
2672 };
2673
2674
2675 //! \brief Server uncore power counter state
2676 //!
2677 class ServerUncoreCounterState : public UncoreCounterState
2678 {
2679 public:
2680 enum {
2681 maxControllers = 4,
2682 maxChannels = 12,
2683 maxXPILinks = 6,
2684 maxCBOs = 128,
2685 maxIIOStacks = 16,
2686 maxCounters = 4
2687 };
2688 enum EventPosition
2689 {
2690 xPI_TxL0P_POWER_CYCLES = 0,
2691 xPI_L1_POWER_CYCLES = 2,
2692 xPI_CLOCKTICKS = 3
2693 };
2694 enum FreeRunningCounterID
2695 {
2696 ImcReads,
2697 ImcWrites,
2698 PMMReads,
2699 PMMWrites
2700 };
2701 private:
2702 std::array<std::array<uint64, maxCounters>, maxXPILinks> xPICounter;
2703 std::array<std::array<uint64, maxCounters>, maxXPILinks> M3UPICounter;
2704 std::array<std::array<uint64, maxCounters>, maxCBOs> CBOCounter;
2705 std::array<std::array<uint64, maxCounters>, maxIIOStacks> IIOCounter;
2706 std::array<uint64, maxCounters> UBOXCounter;
2707 std::array<uint64, maxChannels> DRAMClocks;
2708 std::array<uint64, maxChannels> MCDRAMClocks;
2709 std::array<std::array<uint64, maxCounters>, maxChannels> MCCounter; // channel X counter
2710 std::array<std::array<uint64, maxCounters>, maxControllers> M2MCounter; // M2M/iMC boxes x counter
2711 std::array<std::array<uint64, maxCounters>, maxChannels> EDCCounter; // EDC controller X counter
2712 std::array<uint64, maxCounters> PCUCounter;
2713 std::unordered_map<int, uint64> freeRunningCounter;
2714 int32 PackageThermalHeadroom;
2715 uint64 InvariantTSC; // invariant time stamp counter
2716 friend class PCM;
2717 template <class CounterStateType>
2718 friend uint64 getDRAMClocks(uint32 channel, const CounterStateType & before, const CounterStateType & after);
2719 template <class CounterStateType>
2720 friend uint64 getMCDRAMClocks(uint32 channel, const CounterStateType & before, const CounterStateType & after);
2721 template <class CounterStateType>
2722 friend uint64 getMCCounter(uint32 channel, uint32 counter, const CounterStateType & before, const CounterStateType & after);
2723 template <class CounterStateType>
2724 friend uint64 getM3UPICounter(uint32 port, uint32 counter, const CounterStateType& before, const CounterStateType& after);
2725 template <class CounterStateType>
2726 friend uint64 getCBOCounter(uint32 cbo, uint32 counter, const CounterStateType& before, const CounterStateType& after);
2727 template <class CounterStateType>
2728 friend uint64 getUBOXCounter(uint32 counter, const CounterStateType& before, const CounterStateType& after);
2729 template <class CounterStateType>
2730 friend uint64 getIIOCounter(uint32 stack, uint32 counter, const CounterStateType& before, const CounterStateType& after);
2731 template <class CounterStateType>
2732 friend uint64 getXPICounter(uint32 port, uint32 counter, const CounterStateType& before, const CounterStateType& after);
2733 template <class CounterStateType>
2734 friend uint64 getM2MCounter(uint32 controller, uint32 counter, const CounterStateType & before, const CounterStateType & after);
2735 template <class CounterStateType>
2736 friend uint64 getEDCCounter(uint32 channel, uint32 counter, const CounterStateType & before, const CounterStateType & after);
2737 template <class CounterStateType>
2738 friend uint64 getPCUCounter(uint32 counter, const CounterStateType & before, const CounterStateType & after);
2739 template <class CounterStateType>
2740 friend uint64 getConsumedEnergy(const CounterStateType & before, const CounterStateType & after);
2741 template <class CounterStateType>
2742 friend uint64 getDRAMConsumedEnergy(const CounterStateType & before, const CounterStateType & after);
2743 template <class CounterStateType>
2744 friend uint64 getInvariantTSC(const CounterStateType & before, const CounterStateType & after);
2745 template <class CounterStateType>
2746 friend int64 getFreeRunningCounter(const typename CounterStateType::FreeRunningCounterID &, const CounterStateType & before, const CounterStateType & after);
2747
2748 public:
2749 //! Returns current thermal headroom below TjMax
getPackageThermalHeadroom()2750 int32 getPackageThermalHeadroom() const { return PackageThermalHeadroom; }
ServerUncoreCounterState()2751 ServerUncoreCounterState() :
2752 xPICounter{{}},
2753 M3UPICounter{{}},
2754 CBOCounter{{}},
2755 IIOCounter{{}},
2756 UBOXCounter{{}},
2757 DRAMClocks{{}},
2758 MCDRAMClocks{{}},
2759 MCCounter{{}},
2760 M2MCounter{{}},
2761 EDCCounter{{}},
2762 PCUCounter{{}},
2763 PackageThermalHeadroom(0),
2764 InvariantTSC(0)
2765 {
2766 }
2767 };
2768
2769 /*! \brief Returns QPI LL clock ticks
2770 \param port QPI port number
2771 \param before CPU counter state before the experiment
2772 \param after CPU counter state after the experiment
2773 */
2774 template <class CounterStateType>
getQPIClocks(uint32 port,const CounterStateType & before,const CounterStateType & after)2775 uint64 getQPIClocks(uint32 port, const CounterStateType& before, const CounterStateType& after)
2776 {
2777 return getXPICounter(port, ServerUncoreCounterState::EventPosition::xPI_CLOCKTICKS, before, after);
2778 }
2779
2780 /*! \brief Returns the number of QPI cycles in power saving half-lane mode
2781 \param port QPI port number
2782 \param before CPU counter state before the experiment
2783 \param after CPU counter state after the experiment
2784 */
2785 template <class CounterStateType>
getQPIL0pTxCycles(uint32 port,const CounterStateType & before,const CounterStateType & after)2786 uint64 getQPIL0pTxCycles(uint32 port, const CounterStateType& before, const CounterStateType& after)
2787 {
2788 return getXPICounter(port, ServerUncoreCounterState::EventPosition::xPI_TxL0P_POWER_CYCLES, before, after);
2789 }
2790
2791 /*! \brief Returns the number of QPI cycles in power saving shutdown mode
2792 \param port QPI port number
2793 \param before CPU counter state before the experiment
2794 \param after CPU counter state after the experiment
2795 */
2796 template <class CounterStateType>
getQPIL1Cycles(uint32 port,const CounterStateType & before,const CounterStateType & after)2797 uint64 getQPIL1Cycles(uint32 port, const CounterStateType& before, const CounterStateType& after)
2798 {
2799 return getXPICounter(port, ServerUncoreCounterState::EventPosition::xPI_L1_POWER_CYCLES, before, after);
2800 }
2801
2802 //! \brief (Logical) core-wide counter state
2803 class CoreCounterState : public BasicCounterState
2804 {
2805 friend class PCM;
2806
2807 public:
2808 CoreCounterState() = default;
2809 CoreCounterState( const CoreCounterState& ) = default;
2810 CoreCounterState( CoreCounterState&& ) = default;
2811 CoreCounterState & operator= ( CoreCounterState&& ) = default;
2812 };
2813
2814 //! \brief Socket-wide counter state
2815 class SocketCounterState : public BasicCounterState, public UncoreCounterState
2816 {
2817 friend class PCM;
2818
2819 protected:
readAndAggregate(std::shared_ptr<SafeMsrHandle> handle)2820 void readAndAggregate(std::shared_ptr<SafeMsrHandle> handle)
2821 {
2822 BasicCounterState::readAndAggregate(handle);
2823 UncoreCounterState::readAndAggregate(handle);
2824 }
2825
2826 public:
2827 SocketCounterState& operator += ( const BasicCounterState& ccs )
2828 {
2829 BasicCounterState::operator += ( ccs );
2830
2831 return *this;
2832 }
2833
2834 SocketCounterState& operator += ( const UncoreCounterState& ucs )
2835 {
2836 UncoreCounterState::operator += ( ucs );
2837
2838 return *this;
2839 }
2840
2841 SocketCounterState() = default;
2842 SocketCounterState( const SocketCounterState& ) = default;
2843 SocketCounterState( SocketCounterState&& ) = default;
2844 SocketCounterState & operator = ( SocketCounterState&& ) = default;
2845
2846 SocketCounterState & operator = ( UncoreCounterState&& ucs ) {
2847 UncoreCounterState::operator = ( std::move(ucs) );
2848 return *this;
2849 }
2850 };
2851
2852 //! \brief System-wide counter state
2853 class SystemCounterState : public SocketCounterState
2854 {
2855 friend class PCM;
2856
2857 std::vector<std::vector<uint64> > incomingQPIPackets; // each 64 byte
2858 std::vector<std::vector<uint64> > outgoingQPIFlits; // idle or data/non-data flits depending on the architecture
2859 std::vector<std::vector<uint64> > TxL0Cycles;
2860 uint64 uncoreTSC;
2861
2862 protected:
readAndAggregate(std::shared_ptr<SafeMsrHandle> handle)2863 void readAndAggregate(std::shared_ptr<SafeMsrHandle> handle)
2864 {
2865 BasicCounterState::readAndAggregate(handle);
2866 UncoreCounterState::readAndAggregate(handle);
2867 }
2868
2869 public:
2870 friend uint64 getIncomingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after);
2871 friend uint64 getIncomingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & now);
2872 friend double getOutgoingQPILinkUtilization(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after);
2873 friend uint64 getOutgoingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after);
2874 friend uint64 getOutgoingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & now);
2875
SystemCounterState()2876 SystemCounterState() :
2877 uncoreTSC(0)
2878 {
2879 PCM * m = PCM::getInstance();
2880 incomingQPIPackets.resize(m->getNumSockets(),
2881 std::vector<uint64>((uint32)m->getQPILinksPerSocket(), 0));
2882 outgoingQPIFlits.resize(m->getNumSockets(),
2883 std::vector<uint64>((uint32)m->getQPILinksPerSocket(), 0));
2884 TxL0Cycles.resize(m->getNumSockets(),
2885 std::vector<uint64>((uint32)m->getQPILinksPerSocket(), 0));
2886 }
2887
2888 SystemCounterState( const SystemCounterState& ) = default;
2889 SystemCounterState( SystemCounterState&& ) = default;
2890 SystemCounterState & operator = ( SystemCounterState&& ) = default;
2891
2892 SystemCounterState & operator += ( const SocketCounterState& scs )
2893 {
2894 BasicCounterState::operator += ( scs );
2895 UncoreCounterState::operator += ( scs );
2896
2897 return *this;
2898 }
2899
2900 SystemCounterState & operator += ( const UncoreCounterState& ucs )
2901 {
2902 UncoreCounterState::operator += ( ucs );
2903
2904 return *this;
2905 }
2906 };
2907
/*! \brief Reads the counter state of the system

    Helper function. Uses PCM object to access counters.

    System consists of several sockets (CPUs).
    Socket has a CPU in it. Socket (CPU) consists of several (logical) cores.

    Only the declaration appears here; the implementation is not part of this section.

    \return State of counters in the entire system (returned by value)
*/
PCM_API SystemCounterState getSystemCounterState();
2918
/*! \brief Reads the counter state of a socket

    Helper function. Uses PCM object to access counters.

    Only the declaration appears here; the implementation is not part of this section.

    \param socket socket id
    \return State of counters in the socket (returned by value)
*/
PCM_API SocketCounterState getSocketCounterState(uint32 socket);
2927
/*! \brief Reads the counter state of a (logical) core

    Helper function. Uses PCM object to access counters.

    Only the declaration appears here; the implementation is not part of this section.

    \param core core id
    \return State of counters in the core (returned by value)
*/
PCM_API CoreCounterState getCoreCounterState(uint32 core);
2936
2937
2938 /*! \brief Computes average number of retired instructions per core cycle (IPC)
2939
2940 \param before CPU counter state before the experiment
2941 \param after CPU counter state after the experiment
2942 \return IPC
2943 */
2944 template <class CounterStateType>
getIPC(const CounterStateType & before,const CounterStateType & after)2945 double getIPC(const CounterStateType & before, const CounterStateType & after) // instructions per cycle
2946 {
2947 int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
2948 if (clocks != 0)
2949 return double(after.InstRetiredAny - before.InstRetiredAny) / double(clocks);
2950 return -1;
2951 }
2952
2953
2954 /*! \brief Computes the number of retired instructions
2955
2956 \param before CPU counter state before the experiment
2957 \param after CPU counter state after the experiment
2958 \return number of retired instructions
2959 */
2960 template <class CounterStateType>
getInstructionsRetired(const CounterStateType & before,const CounterStateType & after)2961 uint64 getInstructionsRetired(const CounterStateType & before, const CounterStateType & after) // instructions
2962 {
2963 return after.InstRetiredAny - before.InstRetiredAny;
2964 }
2965
2966 /*! \brief Computes average number of retired instructions per time intervall
2967
2968 \param before CPU counter state before the experiment
2969 \param after CPU counter state after the experiment
2970 \return usage
2971 */
2972 template <class CounterStateType>
getExecUsage(const CounterStateType & before,const CounterStateType & after)2973 double getExecUsage(const CounterStateType & before, const CounterStateType & after) // usage
2974 {
2975 int64 timer_clocks = after.InvariantTSC - before.InvariantTSC;
2976 if (timer_clocks != 0)
2977 return double(after.InstRetiredAny - before.InstRetiredAny) / double(timer_clocks);
2978 return -1;
2979 }
2980
2981 /*! \brief Computes the number of retired instructions
2982
2983 \param now Current CPU counter state
2984 \return number of retired instructions
2985 */
2986 template <class CounterStateType>
getInstructionsRetired(const CounterStateType & now)2987 uint64 getInstructionsRetired(const CounterStateType & now) // instructions
2988 {
2989 return now.InstRetiredAny.getRawData_NoOverflowProtection();
2990 }
2991
2992 /*! \brief Computes the number core clock cycles when signal on a specific core is running (not halted)
2993
2994 Returns number of used cycles (halted cyles are not counted).
2995 The counter does not advance in the following conditions:
2996 - an ACPI C-state is other than C0 for normal operation
2997 - HLT
2998 - STPCLK+ pin is asserted
2999 - being throttled by TM1
3000 - during the frequency switching phase of a performance state transition
3001
3002 The performance counter for this event counts across performance state
3003 transitions using different core clock frequencies
3004
3005 \param before CPU counter state before the experiment
3006 \param after CPU counter state after the experiment
3007 \return number core clock cycles
3008 */
3009 template <class CounterStateType>
getCycles(const CounterStateType & before,const CounterStateType & after)3010 uint64 getCycles(const CounterStateType & before, const CounterStateType & after) // clocks
3011 {
3012 return after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
3013 }
3014
3015 /*! \brief Computes the number of reference clock cycles while clock signal on the core is running
3016
3017 The reference clock operates at a fixed frequency, irrespective of core
3018 frequency changes due to performance state transitions. See Intel(r) Software
3019 Developer's Manual for more details
3020
3021 \param before CPU counter state before the experiment
3022 \param after CPU counter state after the experiment
3023 \return number core clock cycles
3024 */
3025 template <class CounterStateType>
getRefCycles(const CounterStateType & before,const CounterStateType & after)3026 uint64 getRefCycles(const CounterStateType & before, const CounterStateType & after) // clocks
3027 {
3028 return after.CpuClkUnhaltedRef - before.CpuClkUnhaltedRef;
3029 }
3030
3031 /*! \brief Computes the number executed core clock cycles
3032
3033 Returns number of used cycles (halted cyles are not counted).
3034
3035 \param now Current CPU counter state
3036 \return number core clock cycles
3037 */
3038 template <class CounterStateType>
getCycles(const CounterStateType & now)3039 uint64 getCycles(const CounterStateType & now) // clocks
3040 {
3041 return now.CpuClkUnhaltedThread.getRawData_NoOverflowProtection();
3042 }
3043
3044 /*! \brief Computes average number of retired instructions per core cycle for the entire system combining instruction counts from logical cores to corresponding physical cores
3045
3046 Use this metric to evaluate IPC improvement between SMT(Hyperthreading) on and SMT off.
3047
3048 \param before CPU counter state before the experiment
3049 \param after CPU counter state after the experiment
3050 \return IPC
3051 */
3052 template <class CounterStateType>
getCoreIPC(const CounterStateType & before,const CounterStateType & after)3053 inline double getCoreIPC(const CounterStateType & before, const CounterStateType & after) // instructions per cycle
3054 {
3055 double ipc = getIPC(before, after);
3056 PCM * m = PCM::getInstance();
3057 if (ipc >= 0. && m && (m->getNumCores() == m->getNumOnlineCores()))
3058 return ipc * double(m->getThreadsPerCore());
3059 return -1;
3060 }
3061
3062 /*! \brief Computes average number of retired instructions per time intervall for the entire system combining instruction counts from logical cores to corresponding physical cores
3063
3064 Use this metric to evaluate cores utilization improvement between SMT(Hyperthreading) on and SMT off.
3065
3066 \param before CPU counter state before the experiment
3067 \param after CPU counter state after the experiment
3068 \return usage
3069 */
3070 template <class CounterStateType>
getTotalExecUsage(const CounterStateType & before,const CounterStateType & after)3071 inline double getTotalExecUsage(const CounterStateType & before, const CounterStateType & after) // usage
3072 {
3073 double usage = getExecUsage(before, after);
3074 PCM * m = PCM::getInstance();
3075 if (usage >= 0. && m && (m->getNumCores() == m->getNumOnlineCores()))
3076 return usage * double(m->getThreadsPerCore());
3077 return -1;
3078 }
3079
3080 /*! \brief Computes average core frequency also taking Intel Turbo Boost technology into account
3081
3082 \param before CPU counter state before the experiment
3083 \param after CPU counter state after the experiment
3084 \return frequency in Hz
3085 */
3086 template <class CounterStateType>
getAverageFrequency(const CounterStateType & before,const CounterStateType & after)3087 double getAverageFrequency(const CounterStateType & before, const CounterStateType & after) // in Hz
3088 {
3089 int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
3090 int64 timer_clocks = after.InvariantTSC - before.InvariantTSC;
3091 PCM * m = PCM::getInstance();
3092 if (timer_clocks != 0 && m)
3093 return double(m->getNominalFrequency()) * double(clocks) / double(timer_clocks);
3094 return -1;
3095 }
3096
3097 /*! \brief Computes average core frequency when not in powersaving C0-state (also taking Intel Turbo Boost technology into account)
3098
3099 \param before CPU counter state before the experiment
3100 \param after CPU counter state after the experiment
3101 \return frequency in Hz
3102 */
3103 template <class CounterStateType>
getActiveAverageFrequency(const CounterStateType & before,const CounterStateType & after)3104 double getActiveAverageFrequency(const CounterStateType & before, const CounterStateType & after) // in Hz
3105 {
3106 int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
3107 int64 ref_clocks = after.CpuClkUnhaltedRef - before.CpuClkUnhaltedRef;
3108 PCM * m = PCM::getInstance();
3109 if (ref_clocks != 0 && m)
3110 return double(m->getNominalFrequency()) * double(clocks) / double(ref_clocks);
3111 return -1;
3112 }
3113
3114 /*! \brief Computes average core frequency also taking Intel Turbo Boost technology into account
3115
3116 \param before CPU counter state before the experiment
3117 \param after CPU counter state after the experiment
3118 \return Fraction of nominal frequency
3119 */
3120 template <class CounterStateType>
getRelativeFrequency(const CounterStateType & before,const CounterStateType & after)3121 double getRelativeFrequency(const CounterStateType & before, const CounterStateType & after) // fraction of nominal frequency
3122 {
3123 int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
3124 int64 timer_clocks = after.InvariantTSC - before.InvariantTSC;
3125 if (timer_clocks != 0)
3126 return double(clocks) / double(timer_clocks);
3127 return -1;
3128 }
3129
3130 /*! \brief Computes average core frequency when not in powersaving C0-state (also taking Intel Turbo Boost technology into account)
3131
3132 \param before CPU counter state before the experiment
3133 \param after CPU counter state after the experiment
3134 \return Fraction of nominal frequency (if >1.0 then Turbo was working during the measurement)
3135 */
3136 template <class CounterStateType>
getActiveRelativeFrequency(const CounterStateType & before,const CounterStateType & after)3137 double getActiveRelativeFrequency(const CounterStateType & before, const CounterStateType & after) // fraction of nominal frequency
3138 {
3139 if (!PCM::getInstance()->isActiveRelativeFrequencyAvailable()) return -1.;
3140 int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
3141 int64 ref_clocks = after.CpuClkUnhaltedRef - before.CpuClkUnhaltedRef;
3142 if (ref_clocks != 0)
3143 return double(clocks) / double(ref_clocks);
3144 return -1;
3145 }
3146
3147 /*! \brief Computes L2 cache hit ratio
3148
3149 \param before CPU counter state before the experiment
3150 \param after CPU counter state after the experiment
3151 \warning Works only in the DEFAULT_EVENTS programming mode (see program() method)
3152 \return value between 0 and 1
3153 */
3154 template <class CounterStateType>
getL2CacheHitRatio(const CounterStateType & before,const CounterStateType & after)3155 double getL2CacheHitRatio(const CounterStateType& before, const CounterStateType& after) // 0.0 - 1.0
3156 {
3157 if (!PCM::getInstance()->isL2CacheHitRatioAvailable()) return 0;
3158 const auto hits = getL2CacheHits(before, after);
3159 const auto misses = getL2CacheMisses(before, after);
3160 return double(hits) / double(hits + misses);
3161 }
3162
3163 /*! \brief Computes L3 cache hit ratio
3164
3165 \param before CPU counter state before the experiment
3166 \param after CPU counter state after the experiment
3167 \warning Works only in the DEFAULT_EVENTS programming mode (see program() method)
3168 \return value between 0 and 1
3169 */
3170 template <class CounterStateType>
getL3CacheHitRatio(const CounterStateType & before,const CounterStateType & after)3171 double getL3CacheHitRatio(const CounterStateType& before, const CounterStateType& after) // 0.0 - 1.0
3172 {
3173 if (!PCM::getInstance()->isL3CacheHitRatioAvailable()) return 0;
3174 const auto hits = getL3CacheHits(before, after);
3175 const auto misses = getL3CacheMisses(before, after);
3176 return double(hits) / double(hits + misses);
3177 }
3178
3179 /*! \brief Computes number of L3 cache misses
3180
3181 \param before CPU counter state before the experiment
3182 \param after CPU counter state after the experiment
3183 \warning Works only in the DEFAULT_EVENTS programming mode (see program() method)
3184 \return number of misses
3185 */
3186 template <class CounterStateType>
getL3CacheMisses(const CounterStateType & before,const CounterStateType & after)3187 uint64 getL3CacheMisses(const CounterStateType & before, const CounterStateType & after)
3188 {
3189 if (!PCM::getInstance()->isL3CacheMissesAvailable()) return 0;
3190 return after.Event[BasicCounterState::L3MissPos] - before.Event[BasicCounterState::L3MissPos];
3191 }
3192
3193 /*! \brief Computes number of L2 cache misses
3194
3195 \param before CPU counter state before the experiment
3196 \param after CPU counter state after the experiment
3197 \warning Works only in the DEFAULT_EVENTS programming mode (see program() method)
3198 \return number of misses
3199 */
3200 template <class CounterStateType>
getL2CacheMisses(const CounterStateType & before,const CounterStateType & after)3201 uint64 getL2CacheMisses(const CounterStateType & before, const CounterStateType & after)
3202 {
3203 auto pcm = PCM::getInstance();
3204 if (pcm->isL2CacheMissesAvailable() == false) return 0ULL;
3205 const auto cpu_model = pcm->getCPUModel();
3206 if (pcm->useSkylakeEvents() || cpu_model == PCM::SNOWRIDGE) {
3207 return after.Event[BasicCounterState::SKLL2MissPos] - before.Event[BasicCounterState::SKLL2MissPos];
3208 }
3209 if (pcm->isAtom() || cpu_model == PCM::KNL)
3210 {
3211 return after.Event[BasicCounterState::ArchLLCMissPos] - before.Event[BasicCounterState::ArchLLCMissPos];
3212 }
3213 uint64 L3Miss = after.Event[BasicCounterState::L3MissPos] - before.Event[BasicCounterState::L3MissPos];
3214 uint64 L3UnsharedHit = after.Event[BasicCounterState::L3UnsharedHitPos] - before.Event[BasicCounterState::L3UnsharedHitPos];
3215 uint64 L2HitM = after.Event[BasicCounterState::L2HitMPos] - before.Event[BasicCounterState::L2HitMPos];
3216 return L2HitM + L3UnsharedHit + L3Miss;
3217 }
3218
3219 /*! \brief Computes number of L2 cache hits
3220
3221 \param before CPU counter state before the experiment
3222 \param after CPU counter state after the experiment
3223 \warning Works only in the DEFAULT_EVENTS programming mode (see program() method)
3224 \return number of hits
3225 */
3226 template <class CounterStateType>
getL2CacheHits(const CounterStateType & before,const CounterStateType & after)3227 uint64 getL2CacheHits(const CounterStateType & before, const CounterStateType & after)
3228 {
3229 auto pcm = PCM::getInstance();
3230 if (pcm->isL2CacheHitsAvailable() == false) return 0ULL;
3231 if (pcm->isAtom() || pcm->getCPUModel() == PCM::KNL)
3232 {
3233 uint64 L2Miss = after.Event[BasicCounterState::ArchLLCMissPos] - before.Event[BasicCounterState::ArchLLCMissPos];
3234 uint64 L2Ref = after.Event[BasicCounterState::ArchLLCRefPos] - before.Event[BasicCounterState::ArchLLCRefPos];
3235 return L2Ref - L2Miss;
3236 }
3237 return after.Event[BasicCounterState::L2HitPos] - before.Event[BasicCounterState::L2HitPos];
3238 }
3239
3240 /*! \brief Computes L3 Cache Occupancy
3241
3242 */
3243 template <class CounterStateType>
getL3CacheOccupancy(const CounterStateType & now)3244 uint64 getL3CacheOccupancy(const CounterStateType & now)
3245 {
3246 if (PCM::getInstance()->L3CacheOccupancyMetricAvailable() == false) return 0ULL;
3247 return now.L3Occupancy;
3248 }
3249 /*! \brief Computes Local Memory Bandwidth
3250
3251 */
3252 template <class CounterStateType>
getLocalMemoryBW(const CounterStateType & before,const CounterStateType & after)3253 uint64 getLocalMemoryBW(const CounterStateType & before, const CounterStateType & after)
3254 {
3255 if (PCM::getInstance()->CoreLocalMemoryBWMetricAvailable() == false) return 0ULL;
3256 return after.MemoryBWLocal - before.MemoryBWLocal;
3257 }
3258
3259 /*! \brief Computes Remote Memory Bandwidth
3260
3261 */
3262 template <class CounterStateType>
getRemoteMemoryBW(const CounterStateType & before,const CounterStateType & after)3263 uint64 getRemoteMemoryBW(const CounterStateType & before, const CounterStateType & after)
3264 {
3265 if (PCM::getInstance()->CoreRemoteMemoryBWMetricAvailable() == false) return 0ULL;
3266 const uint64 total = after.MemoryBWTotal - before.MemoryBWTotal;
3267 const uint64 local = getLocalMemoryBW(before, after);
3268 if (total > local)
3269 return total - local;
3270
3271 return 0;
3272 }
3273
3274 /*! \brief Computes number of L3 cache hits where no snooping in sibling L2 caches had to be done
3275
3276 \param before CPU counter state before the experiment
3277 \param after CPU counter state after the experiment
3278 \warning Works only in the DEFAULT_EVENTS programming mode (see program() method)
3279 \return number of hits
3280 */
3281 template <class CounterStateType>
getL3CacheHitsNoSnoop(const CounterStateType & before,const CounterStateType & after)3282 uint64 getL3CacheHitsNoSnoop(const CounterStateType & before, const CounterStateType & after)
3283 {
3284 if (!PCM::getInstance()->isL3CacheHitsNoSnoopAvailable()) return 0;
3285 return after.Event[BasicCounterState::L3UnsharedHitPos] - before.Event[BasicCounterState::L3UnsharedHitPos];
3286 }
3287
3288 /*! \brief Computes number of L3 cache hits where snooping in sibling L2 caches had to be done
3289
3290 \param before CPU counter state before the experiment
3291 \param after CPU counter state after the experiment
3292 \warning Works only in the DEFAULT_EVENTS programming mode (see program() method)
3293 \return number of hits
3294 */
3295 template <class CounterStateType>
getL3CacheHitsSnoop(const CounterStateType & before,const CounterStateType & after)3296 uint64 getL3CacheHitsSnoop(const CounterStateType & before, const CounterStateType & after)
3297 {
3298 auto pcm = PCM::getInstance();
3299 if (!pcm->isL3CacheHitsSnoopAvailable()) return 0;
3300 const auto cpu_model = pcm->getCPUModel();
3301 if (cpu_model == PCM::SNOWRIDGE)
3302 {
3303 const int64 misses = getL3CacheMisses(before, after);
3304 const int64 refs = after.Event[BasicCounterState::ArchLLCRefPos] - before.Event[BasicCounterState::ArchLLCRefPos];
3305 const int64 hits = refs - misses;
3306 return (hits > 0)? hits : 0;
3307 }
3308 if (pcm->useSkylakeEvents()) {
3309 return after.Event[BasicCounterState::SKLL3HitPos] - before.Event[BasicCounterState::SKLL3HitPos];
3310 }
3311 return after.Event[BasicCounterState::L2HitMPos] - before.Event[BasicCounterState::L2HitMPos];
3312 }
3313
3314
3315 /*! \brief Computes total number of L3 cache hits
3316
3317 \param before CPU counter state before the experiment
3318 \param after CPU counter state after the experiment
3319 \warning Works only in the DEFAULT_EVENTS programming mode (see program() method)
3320 \return number of hits
3321 */
3322 template <class CounterStateType>
getL3CacheHits(const CounterStateType & before,const CounterStateType & after)3323 uint64 getL3CacheHits(const CounterStateType & before, const CounterStateType & after)
3324 {
3325 if (!PCM::getInstance()->isL3CacheHitsAvailable()) return 0;
3326 return getL3CacheHitsSnoop(before, after) + getL3CacheHitsNoSnoop(before, after);
3327 }
3328
3329 /*! \brief Computes number of invariant time stamp counter ticks
3330
3331 This counter counts irrespectively of C-, P- or T-states
3332
3333 \param before CPU counter state before the experiment
3334 \param after CPU counter state after the experiment
3335 \return number of time stamp counter ticks
3336 */
3337 template <class CounterStateType>
getInvariantTSC(const CounterStateType & before,const CounterStateType & after)3338 uint64 getInvariantTSC(const CounterStateType & before, const CounterStateType & after)
3339 {
3340 return after.InvariantTSC - before.InvariantTSC;
3341 }
3342
3343 /*! \brief Computes residency in the core C-state
3344
3345 \param state C-state
3346 \param before CPU counter state before the experiment
3347 \param after CPU counter state after the experiment
3348 \return residence ratio (0..1): 0 - 0%, 1.0 - 100%
3349 */
3350 template <class CounterStateType>
getCoreCStateResidency(int state,const CounterStateType & before,const CounterStateType & after)3351 inline double getCoreCStateResidency(int state, const CounterStateType & before, const CounterStateType & after)
3352 {
3353 const double tsc = double(getInvariantTSC(before, after));
3354
3355 if (state == 0) return double(getRefCycles(before, after)) / tsc;
3356
3357 if (state == 1)
3358 {
3359 PCM * m = PCM::getInstance();
3360 double result = 1.0 - double(getRefCycles(before, after)) / tsc; // 1.0 - cC0
3361 for (int i = 2; i <= PCM::MAX_C_STATE; ++i)
3362 if (m->isCoreCStateResidencySupported(state))
3363 result -= (after.BasicCounterState::CStateResidency[i] - before.BasicCounterState::CStateResidency[i]) / tsc;
3364
3365 if (result < 0.) result = 0.; // fix counter dissynchronization
3366 else if (result > 1.) result = 1.; // fix counter dissynchronization
3367
3368 return result;
3369 }
3370 return (after.BasicCounterState::CStateResidency[state] - before.BasicCounterState::CStateResidency[state]) / tsc;
3371 }
3372
3373 /*! \brief Reads raw residency counter for the core C-state
3374
3375 \param state C-state #
3376 \param now CPU counter state
3377 \return raw residency value
3378 */
3379 template <class CounterStateType>
getCoreCStateResidency(int state,const CounterStateType & now)3380 inline uint64 getCoreCStateResidency(int state, const CounterStateType& now)
3381 {
3382 if (state == 0) return now.CpuClkUnhaltedRef.getRawData_NoOverflowProtection();
3383
3384 return now.BasicCounterState::CStateResidency[state];
3385 }
3386
3387 /*! \brief Computes residency in the package C-state
3388
3389 \param state C-state
3390 \param before CPU counter state before the experiment
3391 \param after CPU counter state after the experiment
3392 \return residence ratio (0..1): 0 - 0%, 1.0 - 100%
3393 */
3394 template <class CounterStateType>
getPackageCStateResidency(int state,const CounterStateType & before,const CounterStateType & after)3395 inline double getPackageCStateResidency(int state, const CounterStateType & before, const CounterStateType & after)
3396 {
3397 const double tsc = double(getInvariantTSC(before, after));
3398 if (state == 0)
3399 {
3400 PCM * m = PCM::getInstance();
3401 double result = 1.0;
3402 for (int i = 1; i <= PCM::MAX_C_STATE; ++i)
3403 if (m->isPackageCStateResidencySupported(state))
3404 result -= (after.UncoreCounterState::CStateResidency[i] - before.UncoreCounterState::CStateResidency[i]) / tsc;
3405
3406 if (result < 0.) result = 0.; // fix counter dissynchronization
3407 else if (result > 1.) result = 1.; // fix counter dissynchronization
3408
3409 return result;
3410 }
3411 return double(after.UncoreCounterState::CStateResidency[state] - before.UncoreCounterState::CStateResidency[state]) / tsc;
3412 }
3413
3414 /*! \brief Reads raw residency counter for the package C-state
3415
3416 \param state C-state #
3417 \param now CPU counter state
3418 \return raw residency value
3419 */
3420 template <class CounterStateType>
getPackageCStateResidency(int state,const CounterStateType & now)3421 inline uint64 getPackageCStateResidency(int state, const CounterStateType& now)
3422 {
3423 return now.UncoreCounterState::CStateResidency[state];
3424 }
3425
3426 /*! \brief Computes number of bytes read from DRAM memory controllers
3427
3428 \param before CPU counter state before the experiment
3429 \param after CPU counter state after the experiment
3430 \return Number of bytes
3431 */
3432 template <class CounterStateType>
getBytesReadFromMC(const CounterStateType & before,const CounterStateType & after)3433 uint64 getBytesReadFromMC(const CounterStateType & before, const CounterStateType & after)
3434 {
3435 if (PCM::getInstance()->memoryTrafficMetricsAvailable())
3436 return (after.UncMCNormalReads - before.UncMCNormalReads) * 64;
3437 return 0ULL;
3438 }
3439
3440 /*! \brief Computes number of bytes written to DRAM memory controllers
3441
3442 \param before CPU counter state before the experiment
3443 \param after CPU counter state after the experiment
3444 \return Number of bytes
3445 */
3446 template <class CounterStateType>
getBytesWrittenToMC(const CounterStateType & before,const CounterStateType & after)3447 uint64 getBytesWrittenToMC(const CounterStateType & before, const CounterStateType & after)
3448 {
3449 if (PCM::getInstance()->memoryTrafficMetricsAvailable())
3450 return (after.UncMCFullWrites - before.UncMCFullWrites) * 64;
3451 return 0ULL;
3452 }
3453
3454 /*! \brief Computes number of bytes read from PMM memory
3455
3456 \param before CPU counter state before the experiment
3457 \param after CPU counter state after the experiment
3458 \return Number of bytes
3459 */
3460 template <class CounterStateType>
getBytesReadFromPMM(const CounterStateType & before,const CounterStateType & after)3461 uint64 getBytesReadFromPMM(const CounterStateType & before, const CounterStateType & after)
3462 {
3463 if (PCM::getInstance()->PMMTrafficMetricsAvailable())
3464 return (after.UncPMMReads - before.UncPMMReads) * 64;
3465 return 0ULL;
3466 }
3467
3468 /*! \brief Computes number of bytes written to PMM memory
3469
3470 \param before CPU counter state before the experiment
3471 \param after CPU counter state after the experiment
3472 \return Number of bytes
3473 */
3474 template <class CounterStateType>
getBytesWrittenToPMM(const CounterStateType & before,const CounterStateType & after)3475 uint64 getBytesWrittenToPMM(const CounterStateType & before, const CounterStateType & after)
3476 {
3477 if (PCM::getInstance()->PMMTrafficMetricsAvailable())
3478 return (after.UncPMMWrites - before.UncPMMWrites) * 64;
3479 return 0ULL;
3480 }
3481
3482 /*! \brief Computes number of bytes read from MCDRAM memory controllers
3483
3484 \param before CPU counter state before the experiment
3485 \param after CPU counter state after the experiment
3486 \return Number of bytes
3487 */
3488 template <class CounterStateType>
getBytesReadFromEDC(const CounterStateType & before,const CounterStateType & after)3489 uint64 getBytesReadFromEDC(const CounterStateType & before, const CounterStateType & after)
3490 {
3491 if (PCM::getInstance()->MCDRAMmemoryTrafficMetricsAvailable())
3492 return (after.UncEDCNormalReads - before.UncEDCNormalReads) * 64;
3493 return 0ULL;
3494 }
3495
3496 /*! \brief Computes number of bytes written to MCDRAM memory controllers
3497
3498 \param before CPU counter state before the experiment
3499 \param after CPU counter state after the experiment
3500 \return Number of bytes
3501 */
3502 template <class CounterStateType>
getBytesWrittenToEDC(const CounterStateType & before,const CounterStateType & after)3503 uint64 getBytesWrittenToEDC(const CounterStateType & before, const CounterStateType & after)
3504 {
3505 if (PCM::getInstance()->MCDRAMmemoryTrafficMetricsAvailable())
3506 return (after.UncEDCFullWrites - before.UncEDCFullWrites) * 64;
3507 return 0ULL;
3508 }
3509
3510
3511 /*! \brief Computes number of bytes of read/write requests from all IO sources
3512
3513 \param before CPU counter state before the experiment
3514 \param after CPU counter state after the experiment
3515 \return Number of bytes
3516 */
3517 template <class CounterStateType>
getIORequestBytesFromMC(const CounterStateType & before,const CounterStateType & after)3518 uint64 getIORequestBytesFromMC(const CounterStateType & before, const CounterStateType & after)
3519 {
3520 if (PCM::getInstance()->memoryIOTrafficMetricAvailable())
3521 return (after.UncMCIORequests - before.UncMCIORequests) * 64;
3522 return 0ULL;
3523 }
3524
3525 /*! \brief Returns the number of occured system management interrupts
3526
3527 \param before CPU counter state before the experiment
3528 \param after CPU counter state after the experiment
3529 \return Number of SMIs (system manegement interrupts)
3530 */
3531 template <class CounterStateType>
getSMICount(const CounterStateType & before,const CounterStateType & after)3532 uint64 getSMICount(const CounterStateType & before, const CounterStateType & after)
3533 {
3534 return after.SMICount - before.SMICount;
3535 }
3536
3537 /*! \brief Returns the number of occured custom core events
3538
3539 Read number of events programmed with the \c CUSTOM_CORE_EVENTS
3540
3541 \param eventCounterNr Event/counter number (value from 0 to 3)
3542 \param before CPU counter state before the experiment
3543 \param after CPU counter state after the experiment
3544 \return Number of bytes
3545 */
3546 template <class CounterStateType>
getNumberOfCustomEvents(int32 eventCounterNr,const CounterStateType & before,const CounterStateType & after)3547 uint64 getNumberOfCustomEvents(int32 eventCounterNr, const CounterStateType & before, const CounterStateType & after)
3548 {
3549 return after.Event[eventCounterNr] - before.Event[eventCounterNr];
3550 }
3551
3552 /*! \brief Get estimation of QPI data traffic per incoming QPI link
3553
3554 Returns an estimation of number of data bytes transferred to a socket over Intel(r) Quick Path Interconnect
3555
3556 \param socketNr socket identifier
3557 \param linkNr linkNr
3558 \param before System CPU counter state before the experiment
3559 \param after System CPU counter state after the experiment
3560 \return Number of bytes
3561 */
getIncomingQPILinkBytes(uint32 socketNr,uint32 linkNr,const SystemCounterState & before,const SystemCounterState & after)3562 inline uint64 getIncomingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after)
3563 {
3564 if (!PCM::getInstance()->incomingQPITrafficMetricsAvailable()) return 0ULL;
3565 uint64 b = before.incomingQPIPackets[socketNr][linkNr];
3566 uint64 a = after.incomingQPIPackets[socketNr][linkNr];
3567 // prevent overflows due to counter dissynchronisation
3568 return (a > b) ? (64 * (a - b)) : 0;
3569 }
3570
3571 /*! \brief Get data utilization of incoming QPI link (0..1)
3572
3573 Returns an estimation of utilization of QPI link by data traffic transferred to a socket over Intel(r) Quick Path Interconnect
3574
3575 \param socketNr socket identifier
3576 \param linkNr linkNr
3577 \param before System CPU counter state before the experiment
3578 \param after System CPU counter state after the experiment
3579 \return utilization (0..1)
3580 */
getIncomingQPILinkUtilization(uint32 socketNr,uint32 linkNr,const SystemCounterState & before,const SystemCounterState & after)3581 inline double getIncomingQPILinkUtilization(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after)
3582 {
3583 PCM * m = PCM::getInstance();
3584 if (!(m->qpiUtilizationMetricsAvailable())) return 0.;
3585
3586 const double bytes = (double)getIncomingQPILinkBytes(socketNr, linkNr, before, after);
3587 const uint64 max_speed = m->getQPILinkSpeed(socketNr, linkNr);
3588 const double max_bytes = (double)(double(max_speed) * double(getInvariantTSC(before, after) / double(m->getNumCores())) / double(m->getNominalFrequency()));
3589 return bytes / max_bytes;
3590 }
3591
3592 /*! \brief Get utilization of outgoing QPI link (0..1)
3593
3594 Returns an estimation of utilization of QPI link by (data+nondata) traffic transferred from a socket over Intel(r) Quick Path Interconnect
3595
3596 \param socketNr socket identifier
3597 \param linkNr linkNr
3598 \param before System CPU counter state before the experiment
3599 \param after System CPU counter state after the experiment
3600 \return utilization (0..1)
3601 */
getOutgoingQPILinkUtilization(uint32 socketNr,uint32 linkNr,const SystemCounterState & before,const SystemCounterState & after)3602 inline double getOutgoingQPILinkUtilization(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after)
3603 {
3604 PCM * m = PCM::getInstance();
3605
3606 if (m->outgoingQPITrafficMetricsAvailable() == false) return 0.;
3607
3608 if (m->hasBecktonUncore())
3609 {
3610 const uint64 b = before.outgoingQPIFlits[socketNr][linkNr]; // idle flits
3611 const uint64 a = after.outgoingQPIFlits[socketNr][linkNr]; // idle flits
3612 // prevent overflows due to counter dissynchronisation
3613 const double idle_flits = (double)((a > b) ? (a - b) : 0);
3614 const uint64 bTSC = before.uncoreTSC;
3615 const uint64 aTSC = after.uncoreTSC;
3616 const double tsc = (double)((aTSC > bTSC) ? (aTSC - bTSC) : 0);
3617 if (idle_flits >= tsc) return 0.; // prevent oveflows due to potential counter dissynchronization
3618
3619 return (1. - (idle_flits / tsc));
3620 } else if (m->hasPCICFGUncore())
3621 {
3622 const uint64 b = before.outgoingQPIFlits[socketNr][linkNr]; // data + non-data flits or idle (null) flits
3623 const uint64 a = after.outgoingQPIFlits[socketNr][linkNr]; // data + non-data flits or idle (null) flits
3624 // prevent overflows due to counter dissynchronisation
3625 double flits = (double)((a > b) ? (a - b) : 0);
3626 const double max_flits = ((double(getInvariantTSC(before, after)) * double(m->getQPILinkSpeed(socketNr, linkNr)) / m->getBytesPerFlit()) / double(m->getNominalFrequency())) / double(m->getNumCores());
3627 if(m->hasUPI())
3628 {
3629 flits = flits/3.;
3630 }
3631 if (flits > max_flits) return 1.; // prevent oveflows due to potential counter dissynchronization
3632 return (flits / max_flits);
3633 }
3634
3635 return 0;
3636 }
3637
3638 /*! \brief Get estimation of QPI (data+nondata) traffic per outgoing QPI link
3639
3640 Returns an estimation of number of data bytes transferred from a socket over Intel(r) Quick Path Interconnect
3641
3642 \param socketNr socket identifier
3643 \param linkNr linkNr
3644 \param before System CPU counter state before the experiment
3645 \param after System CPU counter state after the experiment
3646 \return Number of bytes
3647 */
getOutgoingQPILinkBytes(uint32 socketNr,uint32 linkNr,const SystemCounterState & before,const SystemCounterState & after)3648 inline uint64 getOutgoingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after)
3649 {
3650 PCM * m = PCM::getInstance();
3651 if (!(m->outgoingQPITrafficMetricsAvailable())) return 0ULL;
3652
3653 const double util = getOutgoingQPILinkUtilization(socketNr, linkNr, before, after);
3654 const double max_bytes = (double(m->getQPILinkSpeed(socketNr, linkNr)) * double(getInvariantTSC(before, after) / double(m->getNumCores())) / double(m->getNominalFrequency()));
3655
3656 return (uint64)(max_bytes * util);
3657 }
3658
3659
3660 /*! \brief Get estimation of total QPI data traffic
3661
3662 Returns an estimation of number of data bytes transferred to all sockets over all Intel(r) Quick Path Interconnect links
3663
3664 \param before System CPU counter state before the experiment
3665 \param after System CPU counter state after the experiment
3666 \return Number of bytes
3667 */
getAllIncomingQPILinkBytes(const SystemCounterState & before,const SystemCounterState & after)3668 inline uint64 getAllIncomingQPILinkBytes(const SystemCounterState & before, const SystemCounterState & after)
3669 {
3670 PCM * m = PCM::getInstance();
3671 const uint32 ns = m->getNumSockets();
3672 const uint32 qpiLinks = (uint32)m->getQPILinksPerSocket();
3673 uint64 sum = 0;
3674
3675 for (uint32 s = 0; s < ns; ++s)
3676 for (uint32 q = 0; q < qpiLinks; ++q)
3677 sum += getIncomingQPILinkBytes(s, q, before, after);
3678
3679 return sum;
3680 }
3681
3682 /*! \brief Get estimation of total QPI data+nondata traffic
3683
3684 Returns an estimation of number of data and non-data bytes transferred from all sockets over all Intel(r) Quick Path Interconnect links
3685
3686 \param before System CPU counter state before the experiment
3687 \param after System CPU counter state after the experiment
3688 \return Number of bytes
3689 */
getAllOutgoingQPILinkBytes(const SystemCounterState & before,const SystemCounterState & after)3690 inline uint64 getAllOutgoingQPILinkBytes(const SystemCounterState & before, const SystemCounterState & after)
3691 {
3692 PCM * m = PCM::getInstance();
3693 const uint32 ns = m->getNumSockets();
3694 const uint32 qpiLinks = (uint32)m->getQPILinksPerSocket();
3695 uint64 sum = 0;
3696
3697 for (uint32 s = 0; s < ns; ++s)
3698 for (uint32 q = 0; q < qpiLinks; ++q)
3699 sum += getOutgoingQPILinkBytes(s, q, before, after);
3700
3701 return sum;
3702 }
3703
3704
3705 /*! \brief Return current value of the counter of QPI data traffic per incoming QPI link
3706
3707 Returns the number of incoming data bytes to a socket over Intel(r) Quick Path Interconnect
3708
3709 \param socketNr socket identifier
3710 \param linkNr linkNr
3711 \param now Current System CPU counter state
3712 \return Number of bytes
3713 */
getIncomingQPILinkBytes(uint32 socketNr,uint32 linkNr,const SystemCounterState & now)3714 inline uint64 getIncomingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & now)
3715 {
3716 if (PCM::getInstance()->incomingQPITrafficMetricsAvailable())
3717 return 64 * now.incomingQPIPackets[socketNr][linkNr];
3718 return 0ULL;
3719 }
3720
3721 /*! \brief Get estimation of total QPI data traffic for this socket
3722
3723 Returns an estimation of number of bytes transferred to this sockets over all Intel(r) Quick Path Interconnect links on this socket
3724
3725 \param before System CPU counter state before the experiment
3726 \param after System CPU counter state after the experiment
3727 \return Number of bytes
3728 */
getSocketIncomingQPILinkBytes(uint32 socketNr,const SystemCounterState & now)3729 inline uint64 getSocketIncomingQPILinkBytes(uint32 socketNr, const SystemCounterState & now)
3730 {
3731 PCM * m = PCM::getInstance();
3732 const uint32 qpiLinks = (uint32)m->getQPILinksPerSocket();
3733 uint64 sum = 0;
3734
3735 for (uint32 q = 0; q < qpiLinks; ++q)
3736 sum += getIncomingQPILinkBytes(socketNr, q, now);
3737
3738 return sum;
3739 }
3740
3741 /*! \brief Get estimation of Socket QPI data traffic
3742
3743 Returns an estimation of number of data bytes transferred to all sockets over all Intel(r) Quick Path Interconnect links
3744
3745 \param now System CPU counter state
3746 \return Number of bytes
3747 */
getAllIncomingQPILinkBytes(const SystemCounterState & now)3748 inline uint64 getAllIncomingQPILinkBytes(const SystemCounterState & now)
3749 {
3750 PCM * m = PCM::getInstance();
3751 const uint32 ns = m->getNumSockets();
3752 uint64 sum = 0;
3753
3754 for (uint32 s = 0; s < ns; ++s)
3755 sum += getSocketIncomingQPILinkBytes(s, now);
3756 return sum;
3757 }
3758
3759
3760 /*! \brief Get QPI data to Memory Controller traffic ratio
3761
3762 Ideally for NUMA-optmized programs the ratio should be close to 0.
3763
3764 \param before System CPU counter state before the experiment
3765 \param after System CPU counter state after the experiment
3766 \return Ratio
3767 */
3768
getQPItoMCTrafficRatio(const SystemCounterState & before,const SystemCounterState & after)3769 inline double getQPItoMCTrafficRatio(const SystemCounterState & before, const SystemCounterState & after)
3770 {
3771 const uint64 totalQPI = getAllIncomingQPILinkBytes(before, after);
3772 uint64 memTraffic = getBytesReadFromMC(before, after) + getBytesWrittenToMC(before, after);
3773 if (PCM::getInstance()->PMMTrafficMetricsAvailable())
3774 {
3775 memTraffic += getBytesReadFromPMM(before, after) + getBytesWrittenToPMM(before, after);
3776 }
3777 return double(totalQPI) / double(memTraffic);
3778 }
3779
3780 /*! \brief Get local memory access ration measured in home agent
3781
3782 \param before System CPU counter state before the experiment
3783 \param after System CPU counter state after the experiment
3784 \return Ratio
3785 */
3786 template <class CounterStateType>
getLocalMemoryRequestRatio(const CounterStateType & before,const CounterStateType & after)3787 inline double getLocalMemoryRequestRatio(const CounterStateType & before, const CounterStateType & after)
3788 {
3789 if (PCM::getInstance()->localMemoryRequestRatioMetricAvailable() == false) return -1.;
3790 const auto all = after.UncHARequests - before.UncHARequests;
3791 const auto local = after.UncHALocalRequests - before.UncHALocalRequests;
3792 // std::cout << "PCM DEBUG "<< 64*all/1e6 << " " << 64*local/1e6 << "\n";
3793 return double(local)/double(all);
3794 }
3795
3796 //! \brief Returns the raw count of events
3797 //! \param before counter state before the experiment
3798 //! \param after counter state after the experiment
3799 template <class CounterType>
getNumberOfEvents(const CounterType & before,const CounterType & after)3800 inline uint64 getNumberOfEvents(const CounterType & before, const CounterType & after)
3801 {
3802 return after.data - before.data;
3803 }
3804 //! \brief Returns average last level cache read+prefetch miss latency in ns
3805
3806 template <class CounterStateType>
getLLCReadMissLatency(const CounterStateType & before,const CounterStateType & after)3807 inline double getLLCReadMissLatency(const CounterStateType & before, const CounterStateType & after)
3808 {
3809 if (PCM::getInstance()->LLCReadMissLatencyMetricsAvailable() == false) return -1.;
3810 const double occupancy = double(after.TOROccupancyIAMiss) - double(before.TOROccupancyIAMiss);
3811 const double inserts = double(after.TORInsertsIAMiss) - double(before.TORInsertsIAMiss);
3812 const double unc_clocks = double(after.UncClocks) - double(before.UncClocks);
3813 auto * m = PCM::getInstance();
3814 const double seconds = double(getInvariantTSC(before, after)) / double(m->getNumCores()/m->getNumSockets()) / double(m->getNominalFrequency());
3815 return 1e9*seconds*(occupancy/inserts)/unc_clocks;
3816 }
3817
3818 template <class CounterStateType>
getAllSlots(const CounterStateType & before,const CounterStateType & after)3819 inline uint64 getAllSlots(const CounterStateType & before, const CounterStateType & after)
3820 {
3821 const int64 a = after.BackendBoundSlots - before.BackendBoundSlots;
3822 const int64 b = after.FrontendBoundSlots - before.FrontendBoundSlots;
3823 const int64 c = after.BadSpeculationSlots - before.BadSpeculationSlots;
3824 const int64 d = after.RetiringSlots - before.RetiringSlots;
3825 // std::cout << "before DEBUG: " << before.FrontendBoundSlots << " " << before.BadSpeculationSlots << " "<< before.BackendBoundSlots << " " << before.RetiringSlots << std::endl;
3826 // std::cout << "after DEBUG: " << after.FrontendBoundSlots << " " << after.BadSpeculationSlots << " " << after.BackendBoundSlots << " " << after.RetiringSlots << std::endl;
3827 assert(a >= 0);
3828 assert(b >= 0);
3829 assert(c >= 0);
3830 assert(d >= 0);
3831 return a + b + c + d;
3832 }
3833
3834 template <class CounterStateType>
getAllSlotsRaw(const CounterStateType & before,const CounterStateType & after)3835 inline uint64 getAllSlotsRaw(const CounterStateType& before, const CounterStateType& after)
3836 {
3837 return after.AllSlotsRaw - before.AllSlotsRaw;
3838 }
3839
3840 //! \brief Returns unutilized pipeline slots where no uop was delivered due to lack of back-end resources as range 0..1
3841 template <class CounterStateType>
getBackendBound(const CounterStateType & before,const CounterStateType & after)3842 inline double getBackendBound(const CounterStateType & before, const CounterStateType & after)
3843 {
3844 // std::cout << "DEBUG: "<< after.BackendBoundSlots - before.BackendBoundSlots << " " << getAllSlots(before, after) << std::endl;
3845 if (PCM::getInstance()->isHWTMAL1Supported())
3846 return double(after.BackendBoundSlots - before.BackendBoundSlots)/double(getAllSlots(before, after));
3847 return 0.;
3848 }
3849
3850 //! \brief Returns unutilized pipeline slots where Front-end did not deliver a uop while back-end is ready as range 0..1
3851 template <class CounterStateType>
getFrontendBound(const CounterStateType & before,const CounterStateType & after)3852 inline double getFrontendBound(const CounterStateType & before, const CounterStateType & after)
3853 {
3854 // std::cout << "DEBUG: "<< after.FrontendBoundSlots - before.FrontendBoundSlots << " " << getAllSlots(before, after) << std::endl;
3855 if (PCM::getInstance()->isHWTMAL1Supported())
3856 return double(after.FrontendBoundSlots - before.FrontendBoundSlots)/double(getAllSlots(before, after));
3857 return 0.;
3858 }
3859
3860 //! \brief Returns wasted pipeline slots due to incorrect speculation, covering whole penalty: Utilized by uops that do not retire, or Recovery Bubbles (unutilized slots) as range 0..1
3861 template <class CounterStateType>
getBadSpeculation(const CounterStateType & before,const CounterStateType & after)3862 inline double getBadSpeculation(const CounterStateType & before, const CounterStateType & after)
3863 {
3864 // std::cout << "DEBUG: "<< after.BadSpeculationSlots - before.BadSpeculationSlots << " " << getAllSlots(before, after) << std::endl;
3865 if (PCM::getInstance()->isHWTMAL1Supported())
3866 return double(after.BadSpeculationSlots - before.BadSpeculationSlots)/double(getAllSlots(before, after));
3867 return 0.;
3868 }
3869
3870 //! \brief Returns pipeline slots utilized by uops that eventually retire (commit)
3871 template <class CounterStateType>
getRetiring(const CounterStateType & before,const CounterStateType & after)3872 inline double getRetiring(const CounterStateType & before, const CounterStateType & after)
3873 {
3874 // std::cout << "DEBUG: "<< after.RetiringSlots - before.RetiringSlots << " " << getAllSlots(before, after) << std::endl;
3875 if (PCM::getInstance()->isHWTMAL1Supported())
3876 return double(after.RetiringSlots - before.RetiringSlots)/double(getAllSlots(before, after));
3877 return 0.;
3878 }
3879
3880 } // namespace pcm
3881
3882 #endif
3883