1 // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // profiler.h: a simple sampling profiler that's always just one #include away!
16 //
17 // Overview
18 // ========
19 //
20 // This profiler only samples a pseudo-stack, not the actual call stack.
21 // The code to be profiled needs to be instrumented with
22 // pseudo-stack "labels", see ScopedProfilingLabel.
23 // Using pseudo-stacks allows this profiler to be very simple, low-overhead,
24 // portable, and independent of compilation details such as function inlining
25 // and frame pointers. The granularity of instrumentation can be freely chosen,
26 // and it is possible to get some annotate-like detail, i.e. detail within one
27 // function without splitting it into multiple functions.
28 //
29 // This profiler should remain small and simple; its key feature is to fit in
30 // a single header file so that there should never be a reason to refrain
31 // from profiling. More complex and feature-rich alternatives are
32 // readily available. This one offers a strict superset of its
33 // functionality: https://github.com/bgirard/GeckoProfiler, including
34 // intertwining pseudostacks with real call stacks, more annotation options,
35 // and advanced visualization.
36 //
37 // Usage
38 // =====
39 //
40 // 0. Enable profiling by defining GEMMLOWP_PROFILING. When profiling is
41 //    not enabled, profiling instrumentation from instrumentation.h
42 //    (ScopedProfilingLabel, RegisterCurrentThreadForProfiling)
43 //    is still defined but does nothing. On the other hand,
44 //    when profiling is not enabled, it is an error to #include the
45 //    present file.
46 //
47 // 1. Each thread can opt in to profiling by calling
48 //    RegisterCurrentThreadForProfiling() defined in instrumentation.h.
49 //    This can be done at any time, before or during profiling.
50 //    No sample will be collected from a thread until
51 //    it has called RegisterCurrentThreadForProfiling().
52 //
53 // 2. Instrument your code to be profiled with ScopedProfilingLabel,
54 //    which is a RAII helper defined in instrumentation.h. The identifier
55 //    names (some_label, etc) do not matter; what will show up
56 //    in the profile is the string passed to the constructor, which
57 //    must be a literal string. See the full example below.
58 //
59 //    Note: the overhead of ScopedProfilingLabel is zero when not
60 //    enabling profiling (when not defining GEMMLOWP_PROFILING).
61 //
62 // 3. Use the profiler.h interface to control profiling. There are two
63 //    functions: StartProfiling() and FinishProfiling(). They must be
64 //    called on the same thread. FinishProfiling() prints the profile
65 //    on stdout.
66 //
67 // Full example
68 // ============
69 /*
70     #define GEMMLOWP_PROFILING
71     #include "profiling/instrumentation.h"
72     using namespace gemmlowp;
73 
74     const int iters = 100000000;
75     volatile int i;
76 
77     void Bar() {
78       ScopedProfilingLabel label("Bar");
79       for (i = 0; i < iters; i++) {}
80     }
81 
82     void Foo() {
83       ScopedProfilingLabel label("Foo");
84       for (i = 0; i < iters; i++) {}
85       Bar();
86     }
87 
88     void Init() {
89       RegisterCurrentThreadForProfiling();
90     }
91 
92     #include "profiling/profiler.h"
93 
94     int main() {
95       Init();
96       StartProfiling();
97       Foo();
98       FinishProfiling();
99     }
100 *
101 * Output:
102 *
103     gemmlowp profile (1 threads, 304 samples)
104     100.00% Foo
105         51.32% other
106         48.68% Bar
107     0.00% other (outside of any label)
108 */
109 //
110 // Interpreting results
111 // ====================
112 //
113 //  Each node shows the absolute percentage, among all the samples,
114 //  of the number of samples that recorded the given pseudo-stack.
115 //  The percentages are *NOT* relative to the parent node. In addition
116 //  to your own labels, you will also see 'other' nodes that collect
117 //  the remainder of samples under the parent node that didn't fall into
118 //  any of the labelled child nodes. Example:
119 //
120 //  20% Foo
121 //      12% Bar
122 //      6% Xyz
123 //      2% other
124 //
125 //  This means that 20% of all labels were under Foo, of which 12%/20%==60%
126 //  were under Bar, 6%/20%==30% were under Xyz, and 2%/20%==10% were not
127 //  under either Bar or Xyz.
128 //
129 //  Typically, one wants to keep adding ScopedProfilingLabel's until
130 //  the 'other' nodes show low percentages.
131 //
132 // Interpreting results with multiple threads
133 // ==========================================
134 //
135 // At each sample, each thread registered for profiling gets sampled once.
136 // So if there is one "main thread" spending its time in MainFunc() and
137 // 4 "worker threads" spending time in WorkerFunc(), then 80% (=4/5) of the
138 // samples will be in WorkerFunc, so the profile will look like this:
139 //
140 // 80% WorkerFunc
141 // 20% MainFunc
142 
143 #ifndef GEMMLOWP_PROFILING_PROFILER_H_
144 #define GEMMLOWP_PROFILING_PROFILER_H_
145 
146 #ifndef GEMMLOWP_PROFILING
147 #error Profiling is not enabled!
148 #endif
149 
150 #include <vector>
151 
152 #include "instrumentation.h"
153 
154 namespace gemmlowp {
155 
156 // A tree view of a profile.
157 class ProfileTreeView {
158   struct Node {
159     std::vector<Node*> children;
160     const char* label;
161     std::size_t weight;
NodeNode162     Node() : label(nullptr), weight(0) {}
~NodeNode163     ~Node() {
164       for (auto child : children) {
165         delete child;
166       }
167     }
168   };
169 
CompareNodes(Node * n1,Node * n2)170   static bool CompareNodes(Node* n1, Node* n2) {
171     return n1->weight > n2->weight;
172   }
173 
174   Node root_;
175 
PrintNode(const Node * node,int level)176   void PrintNode(const Node* node, int level) const {
177     if (level) {
178       for (int i = 1; i < level; i++) {
179         printf("    ");
180       }
181       printf("%.2f%% %s\n", 100.0f * node->weight / root_.weight, node->label);
182     }
183     for (auto child : node->children) {
184       PrintNode(child, level + 1);
185     }
186   }
187 
AddStackToNode(const ProfilingStack & stack,Node * node,std::size_t level)188   static void AddStackToNode(const ProfilingStack& stack, Node* node,
189                              std::size_t level) {
190     node->weight++;
191     if (stack.size == level) {
192       return;
193     }
194     Node* child_to_add_to = nullptr;
195     for (auto child : node->children) {
196       if (child->label == stack.labels[level]) {
197         child_to_add_to = child;
198         break;
199       }
200     }
201     if (!child_to_add_to) {
202       child_to_add_to = new Node;
203       child_to_add_to->label = stack.labels[level];
204       node->children.push_back(child_to_add_to);
205     }
206     AddStackToNode(stack, child_to_add_to, level + 1);
207     return;
208   }
209 
AddStack(const ProfilingStack & stack)210   void AddStack(const ProfilingStack& stack) {
211     AddStackToNode(stack, &root_, 0);
212   }
213 
AddOtherChildrenToNode(Node * node)214   void AddOtherChildrenToNode(Node* node) {
215     std::size_t top_level_children_weight = 0;
216     for (auto c : node->children) {
217       AddOtherChildrenToNode(c);
218       top_level_children_weight += c->weight;
219     }
220     if (top_level_children_weight) {
221       Node* other_child = new Node;
222       other_child->label =
223           node == &root_ ? "other (outside of any label)" : "other";
224       other_child->weight = node->weight - top_level_children_weight;
225       node->children.push_back(other_child);
226     }
227   }
228 
AddOtherNodes()229   void AddOtherNodes() { AddOtherChildrenToNode(&root_); }
230 
SortNode(Node * node)231   void SortNode(Node* node) {
232     std::sort(node->children.begin(), node->children.end(), CompareNodes);
233     for (auto child : node->children) {
234       SortNode(child);
235     }
236   }
237 
Sort()238   void Sort() { SortNode(&root_); }
239 
240  public:
ProfileTreeView(const std::vector<ProfilingStack> & stacks)241   explicit ProfileTreeView(const std::vector<ProfilingStack>& stacks) {
242     for (auto stack : stacks) {
243       AddStack(stack);
244     }
245     AddOtherNodes();
246     Sort();
247   }
248 
Print()249   void Print() const {
250     printf("\n");
251     printf("gemmlowp profile (%d threads, %d samples)\n",
252            static_cast<int>(ThreadsUnderProfiling().size()),
253            static_cast<int>(root_.weight));
254     PrintNode(&root_, 0);
255     printf("\n");
256   }
257 };
258 
259 // This function is the only place that determines our sampling frequency.
WaitOneProfilerTick()260 inline void WaitOneProfilerTick() {
261   static const int millisecond = 1000000;
262 
263 #if defined __arm__ || defined __aarch64__
264   // Reduced sampling frequency on mobile devices helps limit time and memory
265   // overhead there.
266   static const int interval = 10 * millisecond;
267 #else
268   static const int interval = 1 * millisecond;
269 #endif
270 
271   timespec ts;
272   ts.tv_sec = 0;
273   ts.tv_nsec = interval;
274   nanosleep(&ts, nullptr);
275 }
276 
277 // This is how we track whether we've already started profiling,
278 // to guard against misuse of the API.
IsProfiling()279 inline bool& IsProfiling() {
280   static bool b;
281   return b;
282 }
283 
284 // This is how we tell the profiler thread to finish.
ProfilerThreadShouldFinish()285 inline bool& ProfilerThreadShouldFinish() {
286   static bool b;
287   return b;
288 }
289 
290 // The profiler thread. See ProfilerThreadFunc.
ProfilerThread()291 inline pthread_t& ProfilerThread() {
292   static pthread_t t;
293   return t;
294 }
295 
296 // Records a stack from a running thread.
297 // The tricky part is that we're not interrupting the thread.
298 // This is OK because we're looking at a pseudo-stack of labels,
299 // not at the real thread stack, and if the pseudo-stack changes
300 // while we're recording it, we are OK with getting either the
301 // old or the new stack. Note that ProfilingStack::Pop
302 // only decrements the size, and doesn't null the popped label,
303 // so if we're concurrently recording it, it shouldn't change
304 // under our feet until another label is pushed, at which point
305 // we are OK with getting either this new label or the old one.
306 // In the end, the key atomicity property that we are relying on
307 // here is that pointers are changed atomically, and the labels
308 // are pointers (to literal strings).
RecordStack(ThreadInfo * thread,ProfilingStack * dst)309 inline void RecordStack(ThreadInfo* thread, ProfilingStack* dst) {
310   ScopedLock sl(thread->stack.lock);
311   assert(!dst->size);
312   while (dst->size < thread->stack.size) {
313     dst->labels[dst->size] = thread->stack.labels[dst->size];
314     dst->size++;
315   }
316 }
317 
318 // The profiler thread's entry point.
319 // Note that a separate thread is to be started each time we call
320 // StartProfiling(), and finishes when we call FinishProfiling().
321 // So here we only need to handle the recording and reporting of
322 // a single profile.
ProfilerThreadFunc(void *)323 inline void* ProfilerThreadFunc(void*) {
324   assert(ProfilerThread() == pthread_self());
325 
326   // Since we only handle one profile per profiler thread, the
327   // profile data (the array of recorded stacks) can be a local variable here.
328   std::vector<ProfilingStack> stacks;
329 
330   while (!ProfilerThreadShouldFinish()) {
331     WaitOneProfilerTick();
332     {
333       ScopedLock sl(GlobalMutexes::Profiler());
334       for (auto t : ThreadsUnderProfiling()) {
335         ProfilingStack s;
336         RecordStack(t, &s);
337         stacks.push_back(s);
338       }
339     }
340   }
341 
342   // Profiling is finished and we now report the results.
343   ProfileTreeView(stacks).Print();
344 
345   return nullptr;
346 }
347 
348 // Starts recording samples.
StartProfiling()349 inline void StartProfiling() {
350   ScopedLock sl(GlobalMutexes::Profiler());
351   ReleaseBuildAssertion(!IsProfiling(), "We're already profiling!");
352   IsProfiling() = true;
353   ProfilerThreadShouldFinish() = false;
354   pthread_create(&ProfilerThread(), nullptr, ProfilerThreadFunc, nullptr);
355 }
356 
357 // Stops recording samples, and prints a profile tree-view on stdout.
FinishProfiling()358 inline void FinishProfiling() {
359   {
360     ScopedLock sl(GlobalMutexes::Profiler());
361     ReleaseBuildAssertion(IsProfiling(), "We weren't profiling!");
362     // The ProfilerThreadShouldFinish() mechanism here is really naive and bad,
363     // as the scary comments below should make clear.
364     // Should we use a condition variable?
365     ProfilerThreadShouldFinish() = true;
366   }  // must release the lock here to avoid deadlock with profiler thread.
367   pthread_join(ProfilerThread(), nullptr);
368   IsProfiling() = false;  // yikes, this should be guarded by the lock!
369 }
370 
371 }  // namespace gemmlowp
372 
373 #endif  // GEMMLOWP_PROFILING_PROFILER_H_
374