// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "lib/jxl/base/profiler.h"

#if PROFILER_ENABLED

#include <stdio.h>
#include <stdlib.h>
#include <string.h>  // memcpy

#include <algorithm>  // sort
#include <atomic>
#include <cinttypes>  // PRIu64
#include <hwy/cache_control.h>
#include <new>

#include "lib/jxl/base/robust_statistics.h"  // HalfSampleMode

// Optionally use SIMD in StreamCacheLine if available.
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "lib/profiler/profiler.cc"
#include <hwy/foreach_target.h>
#include <hwy/highway.h>

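// Usage sketch (for orientation only; the authoritative macros live in
// profiler.h): each PROFILER_ZONE records an entry packet when constructed
// and an exit packet when it goes out of scope, e.g.
//
//   void Encode() {
//     PROFILER_ZONE("Encode");
//     // ... work attributed to this zone ...
//   }
//   // After all threads have exited their zones:
//   // profiler::Zone::PrintResults();
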
HWY_BEFORE_NAMESPACE();
namespace profiler {
namespace HWY_NAMESPACE {

// Overwrites `to` without loading it into cache (avoids the usual
// read-for-ownership). Copies 64 bytes from/to naturally aligned addresses.
void StreamCacheLine(const Packet* HWY_RESTRICT from,
                     Packet* HWY_RESTRICT to) {
#if HWY_TARGET == HWY_SCALAR
  hwy::CopyBytes<64>(from, to);
#else
  const HWY_CAPPED(uint64_t, 2) d;
  HWY_FENCE;
  const uint64_t* HWY_RESTRICT from64 = reinterpret_cast<const uint64_t*>(from);
  const auto v0 = Load(d, from64 + 0);
  const auto v1 = Load(d, from64 + 2);
  const auto v2 = Load(d, from64 + 4);
  const auto v3 = Load(d, from64 + 6);
  // Fences prevent the compiler from reordering loads/stores, which may
  // interfere with write-combining.
  HWY_FENCE;
  uint64_t* HWY_RESTRICT to64 = reinterpret_cast<uint64_t*>(to);
  Stream(v0, d, to64 + 0);
  Stream(v1, d, to64 + 2);
  Stream(v2, d, to64 + 4);
  Stream(v3, d, to64 + 6);
  HWY_FENCE;
#endif
}
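
// Sizing note: with HWY_CAPPED(uint64_t, 2), each Load/Stream pair above
// moves at most two 8-byte lanes (16 bytes); the four pairs thus cover the
// full 64-byte cache line on targets with >= 128-bit vectors, matching the
// scalar CopyBytes<64> path.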

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace profiler
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace profiler {

HWY_EXPORT(StreamCacheLine);

namespace {

// How many mebibytes to allocate (if PROFILER_ENABLED) per thread that
// enters at least one zone. Once this buffer is full, the thread will analyze
// packets (two per zone), which introduces observer overhead.
#ifndef PROFILER_THREAD_STORAGE
#define PROFILER_THREAD_STORAGE 32ULL
#endif

#define PROFILER_PRINT_OVERHEAD 0

// Upper bounds for fixed-size data structures (guarded via HWY_ASSERT):
constexpr size_t kMaxDepth = 64;   // Maximum nesting of zones.
constexpr size_t kMaxZones = 256;  // Total number of zones.

// Stack of active (entered but not exited) zones. POD, uninitialized.
// Used to deduct child duration from the parent's self time.
struct ActiveZone {
  const char* name;
  uint64_t entry_timestamp;
  uint64_t child_total;
};

// Totals for all Zones with the same name. POD, must be zero-initialized.
struct ZoneTotals {
  uint64_t total_duration;
  const char* name;
  uint64_t num_calls;
};

template <typename T>
inline T ClampedSubtract(const T minuend, const T subtrahend) {
  if (subtrahend > minuend) {
    return 0;
  }
  return minuend - subtrahend;
}
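
// Example: for unsigned T, plain subtraction would wrap around, e.g.
// uint64_t(3) - uint64_t(5) is a huge value, whereas
// ClampedSubtract<uint64_t>(3, 5) == 0. Used below so that overhead
// estimates can never drive a self-duration negative.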

}  // namespace

// Per-thread call graph (stack) and ZoneTotals for each zone.
class Results {
 public:
  Results() {
    // Zero-initialize all accumulators (avoids a check for num_zones_ == 0).
    memset(zones_, 0, sizeof(zones_));
  }

  // Used for computing overhead when this thread encounters its first Zone.
  // This has no observable effect apart from increasing "analyze_elapsed_".
  uint64_t ZoneDuration(const Packet* packets) {
    HWY_ASSERT(depth_ == 0);
    HWY_ASSERT(num_zones_ == 0);
    AnalyzePackets(packets, 2);
    const uint64_t duration = zones_[0].total_duration;
    zones_[0].num_calls = 0;
    zones_[0].total_duration = 0;
    HWY_ASSERT(depth_ == 0);
    num_zones_ = 0;
    return duration;
  }

  void SetSelfOverhead(const uint64_t self_overhead) {
    self_overhead_ = self_overhead;
  }

  void SetChildOverhead(const uint64_t child_overhead) {
    child_overhead_ = child_overhead;
  }

  // Draw all required information from the packets, which can be discarded
  // afterwards. Called whenever this thread's storage is full.
  void AnalyzePackets(const Packet* HWY_RESTRICT packets,
                      const size_t num_packets) {
    // Ensures prior weakly-ordered streaming stores are globally visible.
    hwy::StoreFence();

    const uint64_t t0 = TicksBefore();

    for (size_t i = 0; i < num_packets; ++i) {
      const uint64_t timestamp = packets[i].timestamp;
      // Entering a zone
      if (packets[i].name != nullptr) {
        HWY_ASSERT(depth_ < kMaxDepth);
        zone_stack_[depth_].name = packets[i].name;
        zone_stack_[depth_].entry_timestamp = timestamp;
        zone_stack_[depth_].child_total = 0;
        ++depth_;
        continue;
      }

      HWY_ASSERT(depth_ != 0);
      const ActiveZone& active = zone_stack_[depth_ - 1];
      const uint64_t duration = timestamp - active.entry_timestamp;
      const uint64_t self_duration = ClampedSubtract(
          duration, self_overhead_ + child_overhead_ + active.child_total);

      UpdateOrAdd(active.name, 1, self_duration);
      --depth_;

      // "Deduct" the nested time from its parent's self_duration.
      if (depth_ != 0) {
        zone_stack_[depth_ - 1].child_total += duration + child_overhead_;
      }
    }

    const uint64_t t1 = TicksAfter();
    analyze_elapsed_ += t1 - t0;
  }
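
  // Note on the packet layout consumed above: entering a zone writes a
  // packet with name != nullptr, exiting writes one with name == nullptr,
  // so a zone "A" nesting a zone "B" yields (illustrative timestamps):
  //   {A, t0} {B, t1} {nullptr, t2} {nullptr, t3}
  // B's duration is t2 - t1; A's self-duration is (t3 - t0) minus B's
  // duration and the estimated overheads.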

  // Incorporates results from another thread. Call after all threads have
  // exited any zones.
  void Assimilate(const Results& other) {
    const uint64_t t0 = TicksBefore();
    HWY_ASSERT(depth_ == 0);
    HWY_ASSERT(other.depth_ == 0);

    for (size_t i = 0; i < other.num_zones_; ++i) {
      const ZoneTotals& zone = other.zones_[i];
      UpdateOrAdd(zone.name, zone.num_calls, zone.total_duration);
    }
    const uint64_t t1 = TicksAfter();
    analyze_elapsed_ += t1 - t0 + other.analyze_elapsed_;
  }

  // Single-threaded.
  void Print() {
    const uint64_t t0 = TicksBefore();
    MergeDuplicates();

    // Sort by decreasing total (self) cost.
    std::sort(zones_, zones_ + num_zones_,
              [](const ZoneTotals& r1, const ZoneTotals& r2) {
                return r1.total_duration > r2.total_duration;
              });

    uint64_t total_visible_duration = 0;
    for (size_t i = 0; i < num_zones_; ++i) {
      const ZoneTotals& r = zones_[i];
      if (r.name[0] != '@') {
        total_visible_duration += r.total_duration;
        printf("%-40s: %10" PRIu64 " x %15" PRIu64 " = %15" PRIu64 "\n",
               r.name, r.num_calls, r.total_duration / r.num_calls,
               r.total_duration);
      }
    }

    const uint64_t t1 = TicksAfter();
    analyze_elapsed_ += t1 - t0;
    printf("Total clocks during analysis: %" PRIu64 "\n", analyze_elapsed_);
    printf("Total clocks measured: %" PRIu64 "\n", total_visible_duration);
  }

  // Single-threaded. Clears all results as if no zones had been recorded.
  void Reset() {
    analyze_elapsed_ = 0;
    HWY_ASSERT(depth_ == 0);
    num_zones_ = 0;
    memset(zone_stack_, 0, sizeof(zone_stack_));
    memset(zones_, 0, sizeof(zones_));
  }

 private:
  // Updates ZoneTotals of the same name, or inserts a new one if this thread
  // has not yet seen that name. Uses a self-organizing list data structure,
  // which avoids dynamic memory allocations and is faster than unordered_map.
  void UpdateOrAdd(const char* name, const uint64_t num_calls,
                   const uint64_t duration) {
    // Special case for first zone: (maybe) update, without swapping.
    if (zones_[0].name == name) {
      zones_[0].total_duration += duration;
      zones_[0].num_calls += num_calls;
      return;
    }

    // Look for a zone with the same name.
    for (size_t i = 1; i < num_zones_; ++i) {
      if (zones_[i].name == name) {
        zones_[i].total_duration += duration;
        zones_[i].num_calls += num_calls;
        // Swap with predecessor (more conservative than move-to-front,
        // but at least as successful).
        std::swap(zones_[i - 1], zones_[i]);
        return;
      }
    }

    // Not found; create a new ZoneTotals.
    HWY_ASSERT(num_zones_ < kMaxZones);
    ZoneTotals* HWY_RESTRICT zone = zones_ + num_zones_;
    zone->name = name;
    zone->num_calls = num_calls;
    zone->total_duration = duration;
    ++num_zones_;
  }
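
  // The swap-with-predecessor ("transpose") heuristic above slowly bubbles
  // frequently updated zones toward the front of zones_, keeping the linear
  // scan short for hot zones, while a single burst of lookups cannot drag a
  // rarely used zone all the way to index 0 the way move-to-front would.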

  // Each instantiation of a function template seems to get its own copy of
  // __func__ and GCC doesn't merge them. An N^2 search for duplicates is
  // acceptable because we only expect a few dozen zones.
  void MergeDuplicates() {
    for (size_t i = 0; i < num_zones_; ++i) {
      // Add any subsequent duplicates to num_calls and total_duration.
      for (size_t j = i + 1; j < num_zones_;) {
        if (!strcmp(zones_[i].name, zones_[j].name)) {
          zones_[i].num_calls += zones_[j].num_calls;
          zones_[i].total_duration += zones_[j].total_duration;
          // Fill hole with last item.
          zones_[j] = zones_[--num_zones_];
        } else {  // Name differed, try next ZoneTotals.
          ++j;
        }
      }
    }
  }

  uint64_t analyze_elapsed_ = 0;
  uint64_t self_overhead_ = 0;
  uint64_t child_overhead_ = 0;

  size_t depth_ = 0;      // Number of active zones <= kMaxDepth.
  size_t num_zones_ = 0;  // Number of unique zones <= kMaxZones.

  // After other members to avoid large pointer offsets.
  alignas(64) ActiveZone zone_stack_[kMaxDepth];  // Last = newest
  alignas(64) ZoneTotals zones_[kMaxZones];       // Self-organizing list
};

ThreadSpecific::ThreadSpecific()
    : max_packets_(PROFILER_THREAD_STORAGE << 16),  // MiB / sizeof(Packet)
      packets_(hwy::AllocateAligned<Packet>(max_packets_)),
      num_packets_(0),
      results_(hwy::MakeUniqueAligned<Results>()) {}
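
// Sizing: PROFILER_THREAD_STORAGE is in MiB, so the packet count is
// (MiB << 20) / sizeof(Packet); the << 16 above assumes the 16-byte Packet
// defined in profiler.h (e.g. 32 MiB -> 2097152 packets).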

ThreadSpecific::~ThreadSpecific() {}

void ThreadSpecific::FlushBuffer() {
  if (num_packets_ + kBufferCapacity > max_packets_) {
    results_->AnalyzePackets(packets_.get(), num_packets_);
    num_packets_ = 0;
  }
  // This buffering halves observer overhead and decreases the overall
  // runtime by about 3%.
  HWY_DYNAMIC_DISPATCH(StreamCacheLine)
  (buffer_, packets_.get() + num_packets_);
  num_packets_ += kBufferCapacity;
  buffer_size_ = 0;
}
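
// Write path: zone entry/exit packets first land in the small buffer_; once
// it holds a full cache line's worth (kBufferCapacity packets), FlushBuffer
// streams it into the large per-thread packets_ array via the non-temporal
// stores in StreamCacheLine, so profiling avoids evicting the application's
// working set from cache.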

void ThreadSpecific::AnalyzeRemainingPackets() {
  // Storage full => empty it.
  if (num_packets_ + buffer_size_ > max_packets_) {
    results_->AnalyzePackets(packets_.get(), num_packets_);
    num_packets_ = 0;
  }

  // Move buffer to storage.
  memcpy(packets_.get() + num_packets_, buffer_, buffer_size_ * sizeof(Packet));
  num_packets_ += buffer_size_;
  buffer_size_ = 0;

  results_->AnalyzePackets(packets_.get(), num_packets_);
  num_packets_ = 0;
}

void ThreadSpecific::ComputeOverhead() {
  // Delay after capturing timestamps before/after the actual zone runs. Even
  // with frequency throttling disabled, this has a multimodal distribution,
  // including 32, 34, 48, 52, 59, 62.
  uint64_t self_overhead;
  {
    const size_t kNumSamples = 32;
    uint32_t samples[kNumSamples];
    for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
      const size_t kNumDurations = 1024;
      uint32_t durations[kNumDurations];

      for (size_t idx_duration = 0; idx_duration < kNumDurations;
           ++idx_duration) {
        {  // Scope ensures the dummy zone is exited immediately.
          PROFILER_ZONE("Dummy Zone (never shown)");
        }
        const uint64_t duration = results_->ZoneDuration(buffer_);
        buffer_size_ = 0;
        durations[idx_duration] = static_cast<uint32_t>(duration);
        HWY_ASSERT(num_packets_ == 0);
      }
      jxl::CountingSort(durations, durations + kNumDurations);
      samples[idx_sample] = jxl::HalfSampleMode()(durations, kNumDurations);
    }
    // Median.
    jxl::CountingSort(samples, samples + kNumSamples);
    self_overhead = samples[kNumSamples / 2];
#if PROFILER_PRINT_OVERHEAD
    printf("Overhead: %" PRIu64 "\n", self_overhead);
#endif
    results_->SetSelfOverhead(self_overhead);
  }

  // Delay before capturing start timestamp / after end timestamp.
  const size_t kNumSamples = 32;
  uint32_t samples[kNumSamples];
  for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
    const size_t kNumDurations = 16;
    uint32_t durations[kNumDurations];
    for (size_t idx_duration = 0; idx_duration < kNumDurations;
         ++idx_duration) {
      const size_t kReps = 10000;
      // Analysis time should not be included => must fit within buffer.
      HWY_ASSERT(kReps * 2 < max_packets_);
      hwy::StoreFence();
      const uint64_t t0 = TicksBefore();
      for (size_t i = 0; i < kReps; ++i) {
        PROFILER_ZONE("Dummy");
      }
      hwy::StoreFence();
      const uint64_t t1 = TicksAfter();
      HWY_ASSERT(num_packets_ + buffer_size_ == kReps * 2);
      buffer_size_ = 0;
      num_packets_ = 0;
      const uint64_t avg_duration = (t1 - t0 + kReps / 2) / kReps;
      durations[idx_duration] =
          static_cast<uint32_t>(ClampedSubtract(avg_duration, self_overhead));
    }
    jxl::CountingSort(durations, durations + kNumDurations);
    samples[idx_sample] = jxl::HalfSampleMode()(durations, kNumDurations);
  }
  jxl::CountingSort(samples, samples + kNumSamples);
  const uint64_t child_overhead = samples[9 * kNumSamples / 10];
#if PROFILER_PRINT_OVERHEAD
  printf("Child overhead: %" PRIu64 "\n", child_overhead);
#endif
  results_->SetChildOverhead(child_overhead);
}
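
// Estimation notes: the timing samples above are noisy and multimodal, so
// each batch is summarized with HalfSampleMode (a robust mode estimator)
// rather than a mean. self_overhead takes the median of those summaries;
// child_overhead takes the 90th percentile, presumably erring toward
// over-subtraction so that nested zones do not inflate their parents'
// self-times.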

namespace {

// Could be a static member of Zone, but that would expose <atomic> in header.
std::atomic<ThreadSpecific*>& GetHead() {
  static std::atomic<ThreadSpecific*> head_{nullptr};  // Owning
  return head_;
}

}  // namespace

// Thread-safe.
ThreadSpecific* Zone::InitThreadSpecific() {
  ThreadSpecific* thread_specific =
      hwy::MakeUniqueAligned<ThreadSpecific>().release();

  // Insert into unordered list
  std::atomic<ThreadSpecific*>& head = GetHead();
  ThreadSpecific* old_head = head.load(std::memory_order_relaxed);
  thread_specific->SetNext(old_head);
  while (!head.compare_exchange_weak(old_head, thread_specific,
                                     std::memory_order_release,
                                     std::memory_order_relaxed)) {
    thread_specific->SetNext(old_head);
    // TODO(janwas): pause
  }

  // ComputeOverhead also creates a Zone, so this needs to be set before that
  // to prevent infinite recursion.
  GetThreadSpecific() = thread_specific;

  thread_specific->ComputeOverhead();
  return thread_specific;
}
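
// Note on the CAS loop above: compare_exchange_weak reloads old_head on
// failure, so SetNext must be called again before retrying; the release
// ordering on success publishes the fully constructed ThreadSpecific to any
// thread that later loads the head.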

// Single-threaded.
/*static*/ void Zone::PrintResults() {
  ThreadSpecific* head = GetHead().load(std::memory_order_relaxed);
  ThreadSpecific* p = head;
  while (p) {
    p->AnalyzeRemainingPackets();

    // Combine all threads into a single Result.
    if (p != head) {
      head->GetResults().Assimilate(p->GetResults());
      p->GetResults().Reset();
    }

    p = p->GetNext();
  }

  if (head != nullptr) {
    head->GetResults().Print();
    head->GetResults().Reset();
  }
}

}  // namespace profiler

#endif  // HWY_ONCE
#endif  // PROFILER_ENABLED