1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
2 //
3 // Use of this source code is governed by a BSD-style
4 // license that can be found in the LICENSE file.
5 
6 #ifndef LIB_PROFILER_PROFILER_H_
7 #define LIB_PROFILER_PROFILER_H_
8 
9 // High precision, low overhead time measurements. Returns exact call counts and
10 // total elapsed time for user-defined 'zones' (code regions, i.e. C++ scopes).
11 //
12 // Usage: instrument regions of interest: { PROFILER_ZONE("name"); /*code*/ } or
13 // void FuncToMeasure() { PROFILER_FUNC; /*code*/ }.
14 // After all threads have exited any zones, invoke PROFILER_PRINT_RESULTS() to
15 // print call counts and average durations [CPU cycles] to stdout, sorted in
16 // descending order of total duration.
17 
18 // If zero, this file has no effect and no measurements will be recorded.
19 #ifndef PROFILER_ENABLED
20 #define PROFILER_ENABLED 0
21 #endif
22 #if PROFILER_ENABLED
23 
24 #include <stddef.h>
25 #include <stdint.h>
26 
27 #include <hwy/aligned_allocator.h>
28 #include <hwy/base.h>
29 
30 #include "lib/profiler/tsc_timer.h"
31 
// Annotation placed on the profiler functions that are defined out of line.
// Expands to the default-visibility attribute on non-MSVC compilers and to
// nothing on MSVC.
#if HWY_COMPILER_MSVC
#define PROFILER_PUBLIC
#else
#define PROFILER_PUBLIC __attribute__((visibility("default")))
#endif
37 
38 namespace profiler {
39 
// Represents zone entry/exit events. POD.
// Packed so that one packet is exactly 16 bytes (verified by the
// static_assert below), regardless of pointer width.
#pragma pack(push, 1)
struct Packet {
  // Computing a hash or string table is likely too expensive, and offsets
  // from other libraries' string literals can be too large to combine them and
  // a full-resolution timestamp into 64 bits.
  // Hence the timestamp and the name pointer are stored side by side.
  uint64_t timestamp;
  const char* name;  // nullptr for exit packets
#if UINTPTR_MAX <= 0xFFFFFFFFu
  // Pointers are at most 32 bits wide on this target; pad the struct so that
  // it still occupies 16 bytes.
  uint32_t padding;
#endif
};
#pragma pack(pop)
static_assert(sizeof(Packet) == 16, "Wrong Packet size");
54 
55 class Results;  // pImpl
56 
57 // Per-thread packet storage, dynamically allocated and aligned.
58 class ThreadSpecific {
59   static constexpr size_t kBufferCapacity = 64 / sizeof(Packet);
60 
61  public:
62   PROFILER_PUBLIC explicit ThreadSpecific();
63   PROFILER_PUBLIC ~ThreadSpecific();
64 
65   // Depends on Zone => defined out of line.
66   PROFILER_PUBLIC void ComputeOverhead();
67 
WriteEntry(const char * name)68   HWY_INLINE void WriteEntry(const char* name) { Write(name, TicksBefore()); }
WriteExit()69   HWY_INLINE void WriteExit() { Write(nullptr, TicksAfter()); }
70 
71   PROFILER_PUBLIC void AnalyzeRemainingPackets();
72 
73   // Accessors instead of public member for well-defined data layout.
SetNext(ThreadSpecific * next)74   void SetNext(ThreadSpecific* next) { next_ = next; }
GetNext()75   ThreadSpecific* GetNext() const { return next_; }
76 
GetResults()77   Results& GetResults() { return *results_; }
78 
79  private:
80   PROFILER_PUBLIC void FlushBuffer();
81 
82   // Write packet to buffer/storage, emptying them as needed.
Write(const char * name,const uint64_t timestamp)83   void Write(const char* name, const uint64_t timestamp) {
84     if (buffer_size_ == kBufferCapacity) {  // Full
85       FlushBuffer();
86     }
87     buffer_[buffer_size_].name = name;
88     buffer_[buffer_size_].timestamp = timestamp;
89     ++buffer_size_;
90   }
91 
92   // Write-combining buffer to avoid cache pollution. Must be the first
93   // non-static member to ensure cache-line alignment.
94   Packet buffer_[kBufferCapacity];
95   size_t buffer_size_ = 0;
96 
97   // Contiguous storage for zone enter/exit packets.
98   const size_t max_packets_;
99   hwy::AlignedFreeUniquePtr<Packet[]> packets_;
100   size_t num_packets_;
101 
102   // Linked list of all threads.
103   ThreadSpecific* next_ = nullptr;  // Owned, never released.
104 
105   hwy::AlignedUniquePtr<Results> results_;
106 };
107 
108 // RAII zone enter/exit recorder constructed by PROFILER_ZONE; also
109 // responsible for initializing ThreadSpecific.
110 class Zone {
111  public:
Zone(const char * name)112   HWY_NOINLINE explicit Zone(const char* name) {
113     HWY_FENCE;
114     ThreadSpecific* HWY_RESTRICT thread_specific = GetThreadSpecific();
115     if (HWY_UNLIKELY(thread_specific == nullptr)) {
116       thread_specific = InitThreadSpecific();
117     }
118 
119     thread_specific->WriteEntry(name);
120   }
121 
~Zone()122   HWY_NOINLINE ~Zone() { GetThreadSpecific()->WriteExit(); }
123 
124   // Call exactly once after all threads have exited all zones.
125   PROFILER_PUBLIC static void PrintResults();
126 
127  private:
128   // Returns reference to the thread's ThreadSpecific pointer (initially null).
129   // Function-local static avoids needing a separate definition.
GetThreadSpecific()130   static ThreadSpecific*& GetThreadSpecific() {
131     static thread_local ThreadSpecific* thread_specific;
132     return thread_specific;
133   }
134 
135   // Non time-critical.
136   PROFILER_PUBLIC ThreadSpecific* InitThreadSpecific();
137 };
138 
// Creates a zone starting from here until the end of the current scope.
// Timestamps will be recorded when entering and exiting the zone.
// To ensure the name pointer remains valid, we require it to be a string
// literal (by merging with ""). We also compare strings by address.
// The expansion declares a local variable called `zone`, so a scope can
// directly contain at most one PROFILER_ZONE or PROFILER_FUNC.
#define PROFILER_ZONE(name)             \
  HWY_FENCE;                            \
  const ::profiler::Zone zone("" name); \
  HWY_FENCE
147 
// Creates a zone for an entire function (when placed at its beginning).
// Shorter/more convenient than PROFILER_ZONE: the zone name is taken from
// __func__. Like PROFILER_ZONE, the expansion declares a local variable
// called `zone`.
#define PROFILER_FUNC                    \
  HWY_FENCE;                             \
  const ::profiler::Zone zone(__func__); \
  HWY_FENCE
154 
// Expands to the name of Zone::PrintResults; write PROFILER_PRINT_RESULTS()
// to call it. Always use the parentheses: the variant defined when
// PROFILER_ENABLED is zero is a function-like macro.
#define PROFILER_PRINT_RESULTS ::profiler::Zone::PrintResults
156 
157 }  // namespace profiler
158 
159 #else  // !PROFILER_ENABLED
// Profiling is disabled: all three macros expand to nothing, so instrumented
// code still compiles but records no measurements.
#define PROFILER_ZONE(name)
#define PROFILER_FUNC
#define PROFILER_PRINT_RESULTS()
163 #endif
164 
165 #endif  // LIB_PROFILER_PROFILER_H_
166