1 // Copyright (c) the JPEG XL Project Authors. All rights reserved. 2 // 3 // Use of this source code is governed by a BSD-style 4 // license that can be found in the LICENSE file. 5 6 #ifndef LIB_PROFILER_PROFILER_H_ 7 #define LIB_PROFILER_PROFILER_H_ 8 9 // High precision, low overhead time measurements. Returns exact call counts and 10 // total elapsed time for user-defined 'zones' (code regions, i.e. C++ scopes). 11 // 12 // Usage: instrument regions of interest: { PROFILER_ZONE("name"); /*code*/ } or 13 // void FuncToMeasure() { PROFILER_FUNC; /*code*/ }. 14 // After all threads have exited any zones, invoke PROFILER_PRINT_RESULTS() to 15 // print call counts and average durations [CPU cycles] to stdout, sorted in 16 // descending order of total duration. 17 18 // If zero, this file has no effect and no measurements will be recorded. 19 #ifndef PROFILER_ENABLED 20 #define PROFILER_ENABLED 0 21 #endif 22 #if PROFILER_ENABLED 23 24 #include <stddef.h> 25 #include <stdint.h> 26 27 #include <hwy/aligned_allocator.h> 28 #include <hwy/base.h> 29 30 #include "lib/profiler/tsc_timer.h" 31 32 #if HWY_COMPILER_MSVC 33 #define PROFILER_PUBLIC 34 #else 35 #define PROFILER_PUBLIC __attribute__((visibility("default"))) 36 #endif 37 38 namespace profiler { 39 40 // Represents zone entry/exit events. POD. 41 #pragma pack(push, 1) 42 struct Packet { 43 // Computing a hash or string table is likely too expensive, and offsets 44 // from other libraries' string literals can be too large to combine them and 45 // a full-resolution timestamp into 64 bits. 46 uint64_t timestamp; 47 const char* name; // nullptr for exit packets 48 #if UINTPTR_MAX <= 0xFFFFFFFFu 49 uint32_t padding; 50 #endif 51 }; 52 #pragma pack(pop) 53 static_assert(sizeof(Packet) == 16, "Wrong Packet size"); 54 55 class Results; // pImpl 56 57 // Per-thread packet storage, dynamically allocated and aligned. 58 class ThreadSpecific { 59 static constexpr size_t kBufferCapacity = 64 / sizeof(Packet); 60 61 public: 62 PROFILER_PUBLIC explicit ThreadSpecific(); 63 PROFILER_PUBLIC ~ThreadSpecific(); 64 65 // Depends on Zone => defined out of line. 66 PROFILER_PUBLIC void ComputeOverhead(); 67 WriteEntry(const char * name)68 HWY_INLINE void WriteEntry(const char* name) { Write(name, TicksBefore()); } WriteExit()69 HWY_INLINE void WriteExit() { Write(nullptr, TicksAfter()); } 70 71 PROFILER_PUBLIC void AnalyzeRemainingPackets(); 72 73 // Accessors instead of public member for well-defined data layout. SetNext(ThreadSpecific * next)74 void SetNext(ThreadSpecific* next) { next_ = next; } GetNext()75 ThreadSpecific* GetNext() const { return next_; } 76 GetResults()77 Results& GetResults() { return *results_; } 78 79 private: 80 PROFILER_PUBLIC void FlushBuffer(); 81 82 // Write packet to buffer/storage, emptying them as needed. Write(const char * name,const uint64_t timestamp)83 void Write(const char* name, const uint64_t timestamp) { 84 if (buffer_size_ == kBufferCapacity) { // Full 85 FlushBuffer(); 86 } 87 buffer_[buffer_size_].name = name; 88 buffer_[buffer_size_].timestamp = timestamp; 89 ++buffer_size_; 90 } 91 92 // Write-combining buffer to avoid cache pollution. Must be the first 93 // non-static member to ensure cache-line alignment. 94 Packet buffer_[kBufferCapacity]; 95 size_t buffer_size_ = 0; 96 97 // Contiguous storage for zone enter/exit packets. 98 const size_t max_packets_; 99 hwy::AlignedFreeUniquePtr<Packet[]> packets_; 100 size_t num_packets_; 101 102 // Linked list of all threads. 103 ThreadSpecific* next_ = nullptr; // Owned, never released. 104 105 hwy::AlignedUniquePtr<Results> results_; 106 }; 107 108 // RAII zone enter/exit recorder constructed by PROFILER_ZONE; also 109 // responsible for initializing ThreadSpecific. 110 class Zone { 111 public: Zone(const char * name)112 HWY_NOINLINE explicit Zone(const char* name) { 113 HWY_FENCE; 114 ThreadSpecific* HWY_RESTRICT thread_specific = GetThreadSpecific(); 115 if (HWY_UNLIKELY(thread_specific == nullptr)) { 116 thread_specific = InitThreadSpecific(); 117 } 118 119 thread_specific->WriteEntry(name); 120 } 121 ~Zone()122 HWY_NOINLINE ~Zone() { GetThreadSpecific()->WriteExit(); } 123 124 // Call exactly once after all threads have exited all zones. 125 PROFILER_PUBLIC static void PrintResults(); 126 127 private: 128 // Returns reference to the thread's ThreadSpecific pointer (initially null). 129 // Function-local static avoids needing a separate definition. GetThreadSpecific()130 static ThreadSpecific*& GetThreadSpecific() { 131 static thread_local ThreadSpecific* thread_specific; 132 return thread_specific; 133 } 134 135 // Non time-critical. 136 PROFILER_PUBLIC ThreadSpecific* InitThreadSpecific(); 137 }; 138 139 // Creates a zone starting from here until the end of the current scope. 140 // Timestamps will be recorded when entering and exiting the zone. 141 // To ensure the name pointer remains valid, we require it to be a string 142 // literal (by merging with ""). We also compare strings by address. 143 #define PROFILER_ZONE(name) \ 144 HWY_FENCE; \ 145 const ::profiler::Zone zone("" name); \ 146 HWY_FENCE 147 148 // Creates a zone for an entire function (when placed at its beginning). 149 // Shorter/more convenient than ZONE. 150 #define PROFILER_FUNC \ 151 HWY_FENCE; \ 152 const ::profiler::Zone zone(__func__); \ 153 HWY_FENCE 154 155 #define PROFILER_PRINT_RESULTS ::profiler::Zone::PrintResults 156 157 } // namespace profiler 158 159 #else // !PROFILER_ENABLED 160 #define PROFILER_ZONE(name) 161 #define PROFILER_FUNC 162 #define PROFILER_PRINT_RESULTS() 163 #endif 164 165 #endif // LIB_PROFILER_PROFILER_H_ 166