// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "lib/jxl/base/profiler.h"

#if PROFILER_ENABLED

#include <stdio.h>
#include <stdlib.h>
#include <string.h>  // memcpy

#include <algorithm>  // sort
#include <atomic>
#include <cinttypes>  // PRIu64
#include <hwy/cache_control.h>
#include <new>

#include "lib/jxl/base/robust_statistics.h"  // HalfSampleMode

// Optionally use SIMD in StreamCacheLine if available.
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "lib/profiler/profiler.cc"
#include <hwy/foreach_target.h>
#include <hwy/highway.h>

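// Illustrative usage sketch (documentation only, not part of the
// implementation; the macros are declared in profiler.h and the function
// name below is hypothetical):
//
//   void Encode() {
//     PROFILER_ZONE("Encode");  // records an entry/exit packet pair
//     ...                       // work attributed to this zone
//   }
//   // After all threads have exited their zones:
//   profiler::Zone::PrintResults();
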
HWY_BEFORE_NAMESPACE();
namespace profiler {
namespace HWY_NAMESPACE {

// Overwrites `to` without loading it into cache (read-for-ownership).
// Copies 64 bytes from/to naturally aligned addresses.
void StreamCacheLine(const Packet* HWY_RESTRICT from, Packet* HWY_RESTRICT to) {
#if HWY_TARGET == HWY_SCALAR
  hwy::CopyBytes<64>(from, to);
#else
  const HWY_CAPPED(uint64_t, 2) d;
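  // Four 2-lane uint64_t vectors (4 x 16 bytes) cover the 64-byte line.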
  HWY_FENCE;
  const uint64_t* HWY_RESTRICT from64 = reinterpret_cast<const uint64_t*>(from);
  const auto v0 = Load(d, from64 + 0);
  const auto v1 = Load(d, from64 + 2);
  const auto v2 = Load(d, from64 + 4);
  const auto v3 = Load(d, from64 + 6);
  // Fences prevent the compiler from reordering loads/stores, which may
  // interfere with write-combining.
  HWY_FENCE;
  uint64_t* HWY_RESTRICT to64 = reinterpret_cast<uint64_t*>(to);
  Stream(v0, d, to64 + 0);
  Stream(v1, d, to64 + 2);
  Stream(v2, d, to64 + 4);
  Stream(v3, d, to64 + 6);
  HWY_FENCE;
#endif
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace profiler
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace profiler {

HWY_EXPORT(StreamCacheLine);

namespace {

// How many mebibytes to allocate (if PROFILER_ENABLED) per thread that
// enters at least one zone. Once this buffer is full, the thread will analyze
// packets (two per zone), which introduces observer overhead.
#ifndef PROFILER_THREAD_STORAGE
#define PROFILER_THREAD_STORAGE 32ULL
#endif

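// Set to 1 to print the overheads measured in ComputeOverhead().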
#define PROFILER_PRINT_OVERHEAD 0

// Upper bounds for fixed-size data structures (guarded via HWY_ASSERT):
constexpr size_t kMaxDepth = 64;   // Maximum nesting of zones.
constexpr size_t kMaxZones = 256;  // Total number of zones.

// Stack of active (entered but not exited) zones. POD, uninitialized.
// Used to deduct child duration from the parent's self time.
struct ActiveZone {
  const char* name;
  uint64_t entry_timestamp;
  uint64_t child_total;
};

// Totals for all Zones with the same name. POD, must be zero-initialized.
struct ZoneTotals {
  uint64_t total_duration;
  const char* name;
  uint64_t num_calls;
};

template <typename T>
inline T ClampedSubtract(const T minuend, const T subtrahend) {
  if (subtrahend > minuend) {
    return 0;
  }
  return minuend - subtrahend;
}

}  // namespace

// Per-thread call graph (stack) and ZoneTotals for each zone.
class Results {
 public:
  Results() {
    // Zero-initialize all accumulators (avoids a check for num_zones_ == 0).
    memset(zones_, 0, sizeof(zones_));
  }

  // Used for computing overhead when this thread encounters its first Zone.
  // This has no observable effect apart from increasing "analyze_elapsed_".
  uint64_t ZoneDuration(const Packet* packets) {
    HWY_ASSERT(depth_ == 0);
    HWY_ASSERT(num_zones_ == 0);
    AnalyzePackets(packets, 2);
    const uint64_t duration = zones_[0].total_duration;
    zones_[0].num_calls = 0;
    zones_[0].total_duration = 0;
    HWY_ASSERT(depth_ == 0);
    num_zones_ = 0;
    return duration;
  }

  void SetSelfOverhead(const uint64_t self_overhead) {
    self_overhead_ = self_overhead;
  }

  void SetChildOverhead(const uint64_t child_overhead) {
    child_overhead_ = child_overhead;
  }

  // Draw all required information from the packets, which can be discarded
  // afterwards. Called whenever this thread's storage is full.
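  // For example, the packet sequence [enter A, enter B, exit B, exit A]
  // charges B's duration (plus child_overhead_) to A's child_total, so only
  // A's remaining self time is accumulated into A's ZoneTotals.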
  void AnalyzePackets(const Packet* HWY_RESTRICT packets,
                      const size_t num_packets) {
    // Ensures prior weakly-ordered streaming stores are globally visible.
    hwy::StoreFence();

    const uint64_t t0 = TicksBefore();

    for (size_t i = 0; i < num_packets; ++i) {
      const uint64_t timestamp = packets[i].timestamp;
      // Entering a zone
      if (packets[i].name != nullptr) {
        HWY_ASSERT(depth_ < kMaxDepth);
        zone_stack_[depth_].name = packets[i].name;
        zone_stack_[depth_].entry_timestamp = timestamp;
        zone_stack_[depth_].child_total = 0;
        ++depth_;
        continue;
      }

      HWY_ASSERT(depth_ != 0);
      const ActiveZone& active = zone_stack_[depth_ - 1];
      const uint64_t duration = timestamp - active.entry_timestamp;
      const uint64_t self_duration = ClampedSubtract(
          duration, self_overhead_ + child_overhead_ + active.child_total);

      UpdateOrAdd(active.name, 1, self_duration);
      --depth_;

      // "Deduct" the nested time from its parent's self_duration.
      if (depth_ != 0) {
        zone_stack_[depth_ - 1].child_total += duration + child_overhead_;
      }
    }

    const uint64_t t1 = TicksAfter();
    analyze_elapsed_ += t1 - t0;
  }

  // Incorporates results from another thread. Call after all threads have
  // exited any zones.
  void Assimilate(const Results& other) {
    const uint64_t t0 = TicksBefore();
    HWY_ASSERT(depth_ == 0);
    HWY_ASSERT(other.depth_ == 0);

    for (size_t i = 0; i < other.num_zones_; ++i) {
      const ZoneTotals& zone = other.zones_[i];
      UpdateOrAdd(zone.name, zone.num_calls, zone.total_duration);
    }
    const uint64_t t1 = TicksAfter();
    analyze_elapsed_ += t1 - t0 + other.analyze_elapsed_;
  }

  // Single-threaded.
  void Print() {
    const uint64_t t0 = TicksBefore();
    MergeDuplicates();

    // Sort by decreasing total (self) cost.
    std::sort(zones_, zones_ + num_zones_,
              [](const ZoneTotals& r1, const ZoneTotals& r2) {
                return r1.total_duration > r2.total_duration;
              });

    uint64_t total_visible_duration = 0;
    for (size_t i = 0; i < num_zones_; ++i) {
      const ZoneTotals& r = zones_[i];
      if (r.name[0] != '@') {
        total_visible_duration += r.total_duration;
        printf("%-40s: %10" PRIu64 " x %15" PRIu64 "= %15" PRIu64 "\n", r.name,
               r.num_calls, r.total_duration / r.num_calls, r.total_duration);
      }
    }

    const uint64_t t1 = TicksAfter();
    analyze_elapsed_ += t1 - t0;
    printf("Total clocks during analysis: %" PRIu64 "\n", analyze_elapsed_);
    printf("Total clocks measured: %" PRIu64 "\n", total_visible_duration);
  }

  // Single-threaded. Clears all results as if no zones had been recorded.
  void Reset() {
    analyze_elapsed_ = 0;
    HWY_ASSERT(depth_ == 0);
    num_zones_ = 0;
    memset(zone_stack_, 0, sizeof(zone_stack_));
    memset(zones_, 0, sizeof(zones_));
  }

 private:
  // Updates ZoneTotals of the same name, or inserts a new one if this thread
  // has not yet seen that name. Uses a self-organizing list data structure,
  // which avoids dynamic memory allocations and is faster than unordered_map.
  void UpdateOrAdd(const char* name, const uint64_t num_calls,
                   const uint64_t duration) {
    // Special case for first zone: (maybe) update, without swapping.
    if (zones_[0].name == name) {
      zones_[0].total_duration += duration;
      zones_[0].num_calls += num_calls;
      return;
    }

    // Look for a zone with the same name.
    for (size_t i = 1; i < num_zones_; ++i) {
      if (zones_[i].name == name) {
        zones_[i].total_duration += duration;
        zones_[i].num_calls += num_calls;
        // Swap with predecessor (more conservative than move to front,
        // but at least as successful).
        std::swap(zones_[i - 1], zones_[i]);
        return;
      }
    }

    // Not found; create a new ZoneTotals.
    HWY_ASSERT(num_zones_ < kMaxZones);
    ZoneTotals* HWY_RESTRICT zone = zones_ + num_zones_;
    zone->name = name;
    zone->num_calls = num_calls;
    zone->total_duration = duration;
    ++num_zones_;
  }

  // Each instantiation of a function template seems to get its own copy of
  // __func__ and GCC doesn't merge them. An N^2 search for duplicates is
  // acceptable because we only expect a few dozen zones.
  void MergeDuplicates() {
    for (size_t i = 0; i < num_zones_; ++i) {
      // Add any subsequent duplicates to num_calls and total_duration.
      for (size_t j = i + 1; j < num_zones_;) {
        if (!strcmp(zones_[i].name, zones_[j].name)) {
          zones_[i].num_calls += zones_[j].num_calls;
          zones_[i].total_duration += zones_[j].total_duration;
          // Fill hole with last item.
          zones_[j] = zones_[--num_zones_];
        } else {  // Name differed, try next ZoneTotals.
          ++j;
        }
      }
    }
  }

  uint64_t analyze_elapsed_ = 0;
  uint64_t self_overhead_ = 0;
  uint64_t child_overhead_ = 0;

  size_t depth_ = 0;      // Number of active zones <= kMaxDepth.
  size_t num_zones_ = 0;  // Number of unique zones <= kMaxZones.

  // After other members to avoid large pointer offsets.
  alignas(64) ActiveZone zone_stack_[kMaxDepth];  // Last = newest
  alignas(64) ZoneTotals zones_[kMaxZones];       // Self-organizing list
};

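// Note: the shift converts MiB to a packet count; it assumes
// sizeof(Packet) == 16, i.e. (MiB << 20) / 16 == MiB << 16.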
ThreadSpecific::ThreadSpecific()
    : max_packets_(PROFILER_THREAD_STORAGE << 16),  // MiB / sizeof(Packet)
      packets_(hwy::AllocateAligned<Packet>(max_packets_)),
      num_packets_(0),
      results_(hwy::MakeUniqueAligned<Results>()) {}

ThreadSpecific::~ThreadSpecific() {}

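// Called when buffer_ is full (kBufferCapacity packets, presumably one
// 64-byte cache line given StreamCacheLine); appends it to the main packets_
// storage using non-temporal stores.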
void ThreadSpecific::FlushBuffer() {
  if (num_packets_ + kBufferCapacity > max_packets_) {
    results_->AnalyzePackets(packets_.get(), num_packets_);
    num_packets_ = 0;
  }
  // This buffering halves observer overhead and decreases the overall
  // runtime by about 3%.
  HWY_DYNAMIC_DISPATCH(StreamCacheLine)
  (buffer_, packets_.get() + num_packets_);
  num_packets_ += kBufferCapacity;
  buffer_size_ = 0;
}

void ThreadSpecific::AnalyzeRemainingPackets() {
  // Storage full => empty it.
  if (num_packets_ + buffer_size_ > max_packets_) {
    results_->AnalyzePackets(packets_.get(), num_packets_);
    num_packets_ = 0;
  }

  // Move buffer to storage
  memcpy(packets_.get() + num_packets_, buffer_, buffer_size_ * sizeof(Packet));
  num_packets_ += buffer_size_;
  buffer_size_ = 0;

  results_->AnalyzePackets(packets_.get(), num_packets_);
  num_packets_ = 0;
}

void ThreadSpecific::ComputeOverhead() {
  // Delay after capturing timestamps before/after the actual zone runs. Even
  // with frequency throttling disabled, this has a multimodal distribution,
  // including 32, 34, 48, 52, 59, 62.
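  // In other words: the cycles charged to a zone even when its body is empty,
  // measured below by timing an empty PROFILER_ZONE.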
  uint64_t self_overhead;
  {
    const size_t kNumSamples = 32;
    uint32_t samples[kNumSamples];
    for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
      const size_t kNumDurations = 1024;
      uint32_t durations[kNumDurations];

      for (size_t idx_duration = 0; idx_duration < kNumDurations;
           ++idx_duration) {
        {  //
          PROFILER_ZONE("Dummy Zone (never shown)");
        }
        const uint64_t duration = results_->ZoneDuration(buffer_);
        buffer_size_ = 0;
        durations[idx_duration] = static_cast<uint32_t>(duration);
        HWY_ASSERT(num_packets_ == 0);
      }
      jxl::CountingSort(durations, durations + kNumDurations);
      samples[idx_sample] = jxl::HalfSampleMode()(durations, kNumDurations);
    }
    // Median.
    jxl::CountingSort(samples, samples + kNumSamples);
    self_overhead = samples[kNumSamples / 2];
#if PROFILER_PRINT_OVERHEAD
    printf("Overhead: %" PRIu64 "\n", self_overhead);
#endif
    results_->SetSelfOverhead(self_overhead);
  }

  // Delay before capturing start timestamp / after end timestamp.
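  // That is, the cost a zone imposes on its enclosing (parent) zone, which
  // AnalyzePackets adds to the parent's child_total.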
  const size_t kNumSamples = 32;
  uint32_t samples[kNumSamples];
  for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
    const size_t kNumDurations = 16;
    uint32_t durations[kNumDurations];
    for (size_t idx_duration = 0; idx_duration < kNumDurations;
         ++idx_duration) {
      const size_t kReps = 10000;
      // Analysis time should not be included => must fit within storage.
      HWY_ASSERT(kReps * 2 < max_packets_);
      hwy::StoreFence();
      const uint64_t t0 = TicksBefore();
      for (size_t i = 0; i < kReps; ++i) {
        PROFILER_ZONE("Dummy");
      }
      hwy::StoreFence();
      const uint64_t t1 = TicksAfter();
      HWY_ASSERT(num_packets_ + buffer_size_ == kReps * 2);
      buffer_size_ = 0;
      num_packets_ = 0;
      const uint64_t avg_duration = (t1 - t0 + kReps / 2) / kReps;
      durations[idx_duration] =
          static_cast<uint32_t>(ClampedSubtract(avg_duration, self_overhead));
    }
    jxl::CountingSort(durations, durations + kNumDurations);
    samples[idx_sample] = jxl::HalfSampleMode()(durations, kNumDurations);
  }
  jxl::CountingSort(samples, samples + kNumSamples);
  const uint64_t child_overhead = samples[9 * kNumSamples / 10];
#if PROFILER_PRINT_OVERHEAD
  printf("Child overhead: %" PRIu64 "\n", child_overhead);
#endif
  results_->SetChildOverhead(child_overhead);
}

namespace {

// Could be a static member of Zone, but that would expose <atomic> in header.
std::atomic<ThreadSpecific*>& GetHead() {
  static std::atomic<ThreadSpecific*> head_{nullptr};  // Owning
  return head_;
}

}  // namespace

// Thread-safe.
ThreadSpecific* Zone::InitThreadSpecific() {
  ThreadSpecific* thread_specific =
      hwy::MakeUniqueAligned<ThreadSpecific>().release();

  // Insert into unordered list
  std::atomic<ThreadSpecific*>& head = GetHead();
  ThreadSpecific* old_head = head.load(std::memory_order_relaxed);
  thread_specific->SetNext(old_head);
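  // On failure, compare_exchange_weak reloads the current head into old_head,
  // so re-link the new node before retrying.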
  while (!head.compare_exchange_weak(old_head, thread_specific,
                                     std::memory_order_release,
                                     std::memory_order_relaxed)) {
    thread_specific->SetNext(old_head);
    // TODO(janwas): pause
  }

  // ComputeOverhead also creates a Zone, so this needs to be set before that
  // to prevent infinite recursion.
  GetThreadSpecific() = thread_specific;

  thread_specific->ComputeOverhead();
  return thread_specific;
}

// Single-threaded.
/*static*/ void Zone::PrintResults() {
  ThreadSpecific* head = GetHead().load(std::memory_order_relaxed);
  ThreadSpecific* p = head;
  while (p) {
    p->AnalyzeRemainingPackets();

    // Combine all threads into a single Result.
    if (p != head) {
      head->GetResults().Assimilate(p->GetResults());
      p->GetResults().Reset();
    }

    p = p->GetNext();
  }

  if (head != nullptr) {
    head->GetResults().Print();
    head->GetResults().Reset();
  }
}

}  // namespace profiler

#endif  // HWY_ONCE
#endif  // PROFILER_ENABLED