1 #ifndef KMP_STATS_H
2 #define KMP_STATS_H
3 
4 /** @file kmp_stats.h
5  * Functions for collecting statistics.
6  */
7 
8 //===----------------------------------------------------------------------===//
9 //
10 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
11 // See https://llvm.org/LICENSE.txt for license information.
12 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "kmp_config.h"
17 #include "kmp_debug.h"
18 
19 #if KMP_STATS_ENABLED
20 /* Statistics accumulator.
21    Accumulates number of samples and computes min, max, mean, standard deviation
22    on the fly.
23 
24    Online variance calculation algorithm from
25    http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
26  */
27 
28 #include "kmp_stats_timing.h"
29 #include <limits>
30 #include <math.h>
31 #include <new> // placement new
32 #include <stdint.h>
33 #include <string>
34 #include <vector>
35 
36 /* Enable developer statistics here if you want them. They are more detailed
37    than is useful for application characterisation and are intended for the
38    runtime library developer. */
39 #define KMP_DEVELOPER_STATS 0
40 
41 /* Enable/Disable histogram output */
42 #define KMP_STATS_HIST 0
43 
44 /*!
45  * @ingroup STATS_GATHERING
46  * \brief flags to describe the statistic (timer or counter)
47  *
48  */
49 enum stats_flags_e {
50   noTotal = 1 << 0, //!< do not show a TOTAL_aggregation for this statistic
51   onlyInMaster = 1 << 1, //!< statistic is valid only for master
52   noUnits = 1 << 2, //!< statistic doesn't need units printed next to it
53   notInMaster = 1 << 3, //!< statistic is valid only for non-master threads
54   logEvent = 1 << 4 //!< statistic can be logged on the event timeline when
55   //! KMP_STATS_EVENTS is on (valid only for timers)
56 };
57 
58 /*!
59  * @ingroup STATS_GATHERING
60  * \brief the states which a thread can be in
61  *
62  */
63 enum stats_state_e {
64   IDLE,
65   SERIAL_REGION,
66   FORK_JOIN_BARRIER,
67   PLAIN_BARRIER,
68   TASKWAIT,
69   TASKYIELD,
70   TASKGROUP,
71   IMPLICIT_TASK,
72   EXPLICIT_TASK,
73   TEAMS_REGION
74 };
75 
76 /*!
77  * \brief Add new counters under KMP_FOREACH_COUNTER() macro in kmp_stats.h
78  *
79  * @param macro a user defined macro that takes three arguments -
80  * macro(COUNTER_NAME, flags, arg)
81  * @param arg a user defined argument to send to the user defined macro
82  *
83  * \details A counter counts the occurrence of some event. Each thread
84  * accumulates its own count, at the end of execution the counts are aggregated
85  * treating each thread as a separate measurement. (Unless onlyInMaster is set,
86  * in which case there's only a single measurement). The min,mean,max are
87  * therefore the values for the threads. Adding the counter here and then
88  * putting a KMP_BLOCK_COUNTER(name) at the point you want to count is all you
89  * need to do. All of the tables and printing is generated from this macro.
90  * Format is "macro(name, flags, arg)"
91  *
92  * @ingroup STATS_GATHERING
93  */
94 // clang-format off
95 #define KMP_FOREACH_COUNTER(macro, arg)                                        \
96   macro(OMP_PARALLEL,stats_flags_e::onlyInMaster|stats_flags_e::noTotal,arg)   \
97   macro(OMP_NESTED_PARALLEL, 0, arg)                                           \
98   macro(OMP_LOOP_STATIC, 0, arg)                                               \
99   macro(OMP_LOOP_STATIC_STEAL, 0, arg)                                         \
100   macro(OMP_LOOP_DYNAMIC, 0, arg)                                              \
101   macro(OMP_DISTRIBUTE, 0, arg)                                                \
102   macro(OMP_BARRIER, 0, arg)                                                   \
103   macro(OMP_CRITICAL, 0, arg)                                                  \
104   macro(OMP_SINGLE, 0, arg)                                                    \
105   macro(OMP_MASTER, 0, arg)                                                    \
106   macro(OMP_TEAMS, 0, arg)                                                     \
107   macro(OMP_set_lock, 0, arg)                                                  \
108   macro(OMP_test_lock, 0, arg)                                                 \
109   macro(REDUCE_wait, 0, arg)                                                   \
110   macro(REDUCE_nowait, 0, arg)                                                 \
111   macro(OMP_TASKYIELD, 0, arg)                                                 \
112   macro(OMP_TASKLOOP, 0, arg)                                                  \
113   macro(TASK_executed, 0, arg)                                                 \
114   macro(TASK_cancelled, 0, arg)                                                \
115   macro(TASK_stolen, 0, arg)
116 // clang-format on
117 
118 /*!
119  * \brief Add new timers under KMP_FOREACH_TIMER() macro in kmp_stats.h
120  *
121  * @param macro a user defined macro that takes three arguments -
122  * macro(TIMER_NAME, flags, arg)
123  * @param arg a user defined argument to send to the user defined macro
124  *
125  * \details A timer collects multiple samples of some count in each thread and
126  * then finally aggregates all of the samples from all of the threads. For most
127  * timers the printing code also provides an aggregation over the thread totals.
128  * These are printed as TOTAL_foo. The count is normally a time (in ticks),
129  * hence the name "timer". (But can be any value, so we use this for "number of
130  * arguments passed to fork" as well). For timers the threads are not
131  * significant, it's the individual observations that count, so the statistics
132  * are at that level. Format is "macro(name, flags, arg)"
133  *
134  * @ingroup STATS_GATHERING2
135  */
136 // clang-format off
137 #define KMP_FOREACH_TIMER(macro, arg)                                          \
138   macro (OMP_worker_thread_life, stats_flags_e::logEvent, arg)                 \
139   macro (OMP_parallel, stats_flags_e::logEvent, arg)                           \
140   macro (OMP_parallel_overhead, stats_flags_e::logEvent, arg)                  \
141   macro (OMP_teams, stats_flags_e::logEvent, arg)                              \
142   macro (OMP_teams_overhead, stats_flags_e::logEvent, arg)                     \
143   macro (OMP_loop_static, 0, arg)                                              \
144   macro (OMP_loop_static_scheduling, 0, arg)                                   \
145   macro (OMP_loop_dynamic, 0, arg)                                             \
146   macro (OMP_loop_dynamic_scheduling, 0, arg)                                  \
147   macro (OMP_distribute, 0, arg)                                               \
148   macro (OMP_distribute_scheduling, 0, arg)                                    \
149   macro (OMP_critical, 0, arg)                                                 \
150   macro (OMP_critical_wait, 0, arg)                                            \
151   macro (OMP_single, 0, arg)                                                   \
152   macro (OMP_master, 0, arg)                                                   \
153   macro (OMP_task_immediate, 0, arg)                                           \
154   macro (OMP_task_taskwait, 0, arg)                                            \
155   macro (OMP_task_taskyield, 0, arg)                                           \
156   macro (OMP_task_taskgroup, 0, arg)                                           \
157   macro (OMP_task_join_bar, 0, arg)                                            \
158   macro (OMP_task_plain_bar, 0, arg)                                           \
159   macro (OMP_taskloop_scheduling, 0, arg)                                      \
160   macro (OMP_plain_barrier, stats_flags_e::logEvent, arg)                      \
161   macro (OMP_idle, stats_flags_e::logEvent, arg)                               \
162   macro (OMP_fork_barrier, stats_flags_e::logEvent, arg)                       \
163   macro (OMP_join_barrier, stats_flags_e::logEvent, arg)                       \
164   macro (OMP_serial, stats_flags_e::logEvent, arg)                             \
165   macro (OMP_set_numthreads, stats_flags_e::noUnits | stats_flags_e::noTotal,  \
166          arg)                                                                  \
167   macro (OMP_PARALLEL_args, stats_flags_e::noUnits | stats_flags_e::noTotal,   \
168          arg)                                                                  \
169   macro (OMP_loop_static_iterations,                                           \
170          stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
171   macro (OMP_loop_static_total_iterations,                                     \
172          stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
173   macro (OMP_loop_dynamic_iterations,                                          \
174          stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
175   macro (OMP_loop_dynamic_total_iterations,                                    \
176          stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
177   macro (OMP_distribute_iterations,                                            \
178          stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
179   KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
180 // clang-format on
181 
182 // OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either
183 //                           initializing OpenMP or being created by a master)
184 //                           until the thread is destroyed
185 // OMP_parallel           -- Time thread spends executing work directly
186 //                           within a #pragma omp parallel
187 // OMP_parallel_overhead  -- Time thread spends setting up a parallel region
188 // OMP_loop_static        -- Time thread spends executing loop iterations from
189 //                           a statically scheduled loop
190 // OMP_loop_static_scheduling -- Time thread spends scheduling loop iterations
191 //                               from a statically scheduled loop
192 // OMP_loop_dynamic       -- Time thread spends executing loop iterations from
193 //                           a dynamically scheduled loop
194 // OMP_loop_dynamic_scheduling -- Time thread spends scheduling loop iterations
195 //                                from a dynamically scheduled loop
196 // OMP_critical           -- Time thread spends executing critical section
197 // OMP_critical_wait      -- Time thread spends waiting to enter
198 //                           a critical section
199 // OMP_single             -- Time spent executing a "single" region
200 // OMP_master             -- Time spent executing a "master" region
201 // OMP_task_immediate     -- Time spent executing non-deferred tasks
202 // OMP_task_taskwait      -- Time spent executing tasks inside a taskwait
203 //                           construct
204 // OMP_task_taskyield     -- Time spent executing tasks inside a taskyield
205 //                           construct
206 // OMP_task_taskgroup     -- Time spent executing tasks inside a taskygroup
207 //                           construct
208 // OMP_task_join_bar      -- Time spent executing tasks inside a join barrier
209 // OMP_task_plain_bar     -- Time spent executing tasks inside a barrier
210 //                           construct
211 // OMP_taskloop_scheduling -- Time spent scheduling tasks inside a taskloop
212 //                            construct
213 // OMP_plain_barrier      -- Time spent in a #pragma omp barrier construct or
214 //                           inside implicit barrier at end of worksharing
215 //                           construct
216 // OMP_idle               -- Time worker threads spend waiting for next
217 //                           parallel region
218 // OMP_fork_barrier       -- Time spent in a the fork barrier surrounding a
219 //                           parallel region
220 // OMP_join_barrier       -- Time spent in a the join barrier surrounding a
221 //                           parallel region
222 // OMP_serial             -- Time thread zero spends executing serial code
223 // OMP_set_numthreads     -- Values passed to omp_set_num_threads
224 // OMP_PARALLEL_args      -- Number of arguments passed to a parallel region
225 // OMP_loop_static_iterations -- Number of iterations thread is assigned for
226 //                               statically scheduled loops
227 // OMP_loop_dynamic_iterations -- Number of iterations thread is assigned for
228 //                                dynamically scheduled loops
229 
230 #if (KMP_DEVELOPER_STATS)
231 // Timers which are of interest to runtime library developers, not end users.
232 // These have to be explicitly enabled in addition to the other stats.
233 
234 // KMP_fork_barrier       -- time in __kmp_fork_barrier
235 // KMP_join_barrier       -- time in __kmp_join_barrier
236 // KMP_barrier            -- time in __kmp_barrier
237 // KMP_end_split_barrier  -- time in __kmp_end_split_barrier
238 // KMP_setup_icv_copy     -- time in __kmp_setup_icv_copy
239 // KMP_icv_copy           -- start/stop timer for any ICV copying
240 // KMP_linear_gather      -- time in __kmp_linear_barrier_gather
241 // KMP_linear_release     -- time in __kmp_linear_barrier_release
242 // KMP_tree_gather        -- time in __kmp_tree_barrier_gather
243 // KMP_tree_release       -- time in __kmp_tree_barrier_release
244 // KMP_hyper_gather       -- time in __kmp_hyper_barrier_gather
245 // KMP_hyper_release      -- time in __kmp_hyper_barrier_release
246 // clang-format off
247 #define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)                                \
248   macro(KMP_fork_call, 0, arg)                                                 \
249   macro(KMP_join_call, 0, arg)                                                 \
250   macro(KMP_end_split_barrier, 0, arg)                                         \
251   macro(KMP_hier_gather, 0, arg)                                               \
252   macro(KMP_hier_release, 0, arg)                                              \
253   macro(KMP_hyper_gather, 0, arg)                                              \
254   macro(KMP_hyper_release, 0, arg)                                             \
255   macro(KMP_linear_gather, 0, arg)                                             \
256   macro(KMP_linear_release, 0, arg)                                            \
257   macro(KMP_tree_gather, 0, arg)                                               \
258   macro(KMP_tree_release, 0, arg)                                              \
259   macro(USER_resume, 0, arg)                                                   \
260   macro(USER_suspend, 0, arg)                                                  \
261   macro(USER_mwait, 0, arg)                                                    \
262   macro(KMP_allocate_team, 0, arg)                                             \
263   macro(KMP_setup_icv_copy, 0, arg)                                            \
264   macro(USER_icv_copy, 0, arg)                                                 \
265   macro (FOR_static_steal_stolen,                                              \
266          stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
267   macro (FOR_static_steal_chunks,                                              \
268          stats_flags_e::noUnits | stats_flags_e::noTotal, arg)
269 #else
270 #define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
271 #endif
272 // clang-format on
273 
274 /*!
275  * \brief Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro.
276  *
277  * @param macro a user defined macro that takes three arguments -
278  * macro(TIMER_NAME, flags, arg)
279  * @param arg a user defined argument to send to the user defined macro
280  *
281  * \warning YOU MUST HAVE THE SAME NAMED TIMER UNDER KMP_FOREACH_TIMER() OR ELSE
282  * BAD THINGS WILL HAPPEN!
283  *
284  * \details Explicit timers are ones where we need to allocate a timer itself
285  * (as well as the accumulated timing statistics). We allocate these on a
286  * per-thread basis, and explicitly start and stop them. Block timers just
287  * allocate the timer itself on the stack, and use the destructor to notice
288  * block exit; they don't need to be defined here. The name here should be the
289  * same as that of a timer above.
290  *
291  * @ingroup STATS_GATHERING
292 */
293 #define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) KMP_FOREACH_TIMER(macro, arg)
294 
295 #define ENUMERATE(name, ignore, prefix) prefix##name,
296 enum timer_e { KMP_FOREACH_TIMER(ENUMERATE, TIMER_) TIMER_LAST };
297 
298 enum explicit_timer_e {
299   KMP_FOREACH_EXPLICIT_TIMER(ENUMERATE, EXPLICIT_TIMER_) EXPLICIT_TIMER_LAST
300 };
301 
302 enum counter_e { KMP_FOREACH_COUNTER(ENUMERATE, COUNTER_) COUNTER_LAST };
303 #undef ENUMERATE
304 
305 /*
306  * A logarithmic histogram. It accumulates the number of values in each power of
307  * ten bin.  So 1<=x<10, 10<=x<100, ...
308  * Mostly useful where we have some big outliers and want to see information
309  * about them.
310  */
311 class logHistogram {
312   enum {
313     numBins = 31, /* Number of powers of 10. If this changes you need to change
314                    * the initializer for binMax */
315 
316     /*
317      * If you want to use this to analyse values that may be less than 1, (for
318      * instance times in s), then the logOffset gives you negative powers.
319      * In our case here, we're just looking at times in ticks, or counts, so we
320      * can never see values with magnitude < 1 (other than zero), so we can set
321      * it to 0.  As above change the initializer if you change this.
322      */
323     logOffset = 0
324   };
325   uint32_t KMP_ALIGN_CACHE zeroCount;
326   struct {
327     uint32_t count;
328     double total;
329   } bins[numBins];
330 
331   static double binMax[numBins];
332 
333 #ifdef KMP_DEBUG
334   uint64_t _total;
335 
336   void check() const {
337     uint64_t t = zeroCount;
338     for (int i = 0; i < numBins; i++)
339       t += bins[i].count;
340     KMP_DEBUG_ASSERT(t == _total);
341   }
342 #else
343   void check() const {}
344 #endif
345 
346 public:
347   logHistogram() { reset(); }
348 
349   logHistogram(logHistogram const &o) {
350     for (int i = 0; i < numBins; i++)
351       bins[i] = o.bins[i];
352 #ifdef KMP_DEBUG
353     _total = o._total;
354 #endif
355   }
356 
357   void reset() {
358     zeroCount = 0;
359     for (int i = 0; i < numBins; i++) {
360       bins[i].count = 0;
361       bins[i].total = 0;
362     }
363 
364 #ifdef KMP_DEBUG
365     _total = 0;
366 #endif
367   }
368   uint32_t count(int b) const { return bins[b + logOffset].count; }
369   double total(int b) const { return bins[b + logOffset].total; }
370   static uint32_t findBin(double sample);
371 
372   logHistogram &operator+=(logHistogram const &o) {
373     zeroCount += o.zeroCount;
374     for (int i = 0; i < numBins; i++) {
375       bins[i].count += o.bins[i].count;
376       bins[i].total += o.bins[i].total;
377     }
378 #ifdef KMP_DEBUG
379     _total += o._total;
380     check();
381 #endif
382 
383     return *this;
384   }
385 
386   void addSample(double sample);
387   int minBin() const;
388   int maxBin() const;
389 
390   std::string format(char) const;
391 };
392 
393 class statistic {
394   double KMP_ALIGN_CACHE minVal;
395   double maxVal;
396   double meanVal;
397   double m2;
398   uint64_t sampleCount;
399   double offset;
400   bool collectingHist;
401   logHistogram hist;
402 
403 public:
404   statistic(bool doHist = bool(KMP_STATS_HIST)) {
405     reset();
406     collectingHist = doHist;
407   }
408   statistic(statistic const &o)
409       : minVal(o.minVal), maxVal(o.maxVal), meanVal(o.meanVal), m2(o.m2),
410         sampleCount(o.sampleCount), offset(o.offset),
411         collectingHist(o.collectingHist), hist(o.hist) {}
412   statistic(double minv, double maxv, double meanv, uint64_t sc, double sd)
413       : minVal(minv), maxVal(maxv), meanVal(meanv), m2(sd * sd * sc),
414         sampleCount(sc), offset(0.0), collectingHist(false) {}
415   bool haveHist() const { return collectingHist; }
416   double getMin() const { return minVal; }
417   double getMean() const { return meanVal; }
418   double getMax() const { return maxVal; }
419   uint64_t getCount() const { return sampleCount; }
420   double getSD() const { return sqrt(m2 / sampleCount); }
421   double getTotal() const { return sampleCount * meanVal; }
422   logHistogram const *getHist() const { return &hist; }
423   void setOffset(double d) { offset = d; }
424 
425   void reset() {
426     minVal = (std::numeric_limits<double>::max)();
427     maxVal = -minVal;
428     meanVal = 0.0;
429     m2 = 0.0;
430     sampleCount = 0;
431     offset = 0.0;
432     hist.reset();
433   }
434   void addSample(double sample);
435   void scale(double factor);
436   void scaleDown(double f) { scale(1. / f); }
437   void forceCount(uint64_t count) { sampleCount = count; }
438   statistic &operator+=(statistic const &other);
439 
440   std::string format(char unit, bool total = false) const;
441   std::string formatHist(char unit) const { return hist.format(unit); }
442 };
443 
444 struct statInfo {
445   const char *name;
446   uint32_t flags;
447 };
448 
449 class timeStat : public statistic {
450   static statInfo timerInfo[];
451 
452 public:
453   timeStat() : statistic() {}
454   static const char *name(timer_e e) { return timerInfo[e].name; }
455   static bool noTotal(timer_e e) {
456     return timerInfo[e].flags & stats_flags_e::noTotal;
457   }
458   static bool masterOnly(timer_e e) {
459     return timerInfo[e].flags & stats_flags_e::onlyInMaster;
460   }
461   static bool workerOnly(timer_e e) {
462     return timerInfo[e].flags & stats_flags_e::notInMaster;
463   }
464   static bool noUnits(timer_e e) {
465     return timerInfo[e].flags & stats_flags_e::noUnits;
466   }
467   static bool logEvent(timer_e e) {
468     return timerInfo[e].flags & stats_flags_e::logEvent;
469   }
470   static void clearEventFlags() {
471     for (int i = 0; i < TIMER_LAST; i++) {
472       timerInfo[i].flags &= (~(stats_flags_e::logEvent));
473     }
474   }
475 };
476 
477 // Where we need explicitly to start and end the timer, this version can be used
478 // Since these timers normally aren't nicely scoped, so don't have a good place
479 // to live on the stack of the thread, they're more work to use.
480 class explicitTimer {
481   timeStat *stat;
482   timer_e timerEnumValue;
483   tsc_tick_count startTime;
484   tsc_tick_count pauseStartTime;
485   tsc_tick_count::tsc_interval_t totalPauseTime;
486 
487 public:
488   explicitTimer(timeStat *s, timer_e te)
489       : stat(s), timerEnumValue(te), startTime(), pauseStartTime(0),
490         totalPauseTime() {}
491 
492   // void setStat(timeStat *s) { stat = s; }
493   void start(tsc_tick_count tick);
494   void pause(tsc_tick_count tick) { pauseStartTime = tick; }
495   void resume(tsc_tick_count tick) {
496     totalPauseTime += (tick - pauseStartTime);
497   }
498   void stop(tsc_tick_count tick, kmp_stats_list *stats_ptr = nullptr);
499   void reset() {
500     startTime = 0;
501     pauseStartTime = 0;
502     totalPauseTime = 0;
503   }
504   timer_e get_type() const { return timerEnumValue; }
505 };
506 
507 // Where you need to partition a threads clock ticks into separate states
508 // e.g., a partitionedTimers class with two timers of EXECUTING_TASK, and
509 // DOING_NOTHING would render these conditions:
510 // time(EXECUTING_TASK) + time(DOING_NOTHING) = total time thread is alive
511 // No clock tick in the EXECUTING_TASK is a member of DOING_NOTHING and vice
512 // versa
513 class partitionedTimers {
514 private:
515   std::vector<explicitTimer> timer_stack;
516 
517 public:
518   partitionedTimers();
519   void init(explicitTimer timer);
520   void exchange(explicitTimer timer);
521   void push(explicitTimer timer);
522   void pop();
523   void windup();
524 };
525 
526 // Special wrapper around the partitioned timers to aid timing code blocks
527 // It avoids the need to have an explicit end, leaving the scope suffices.
528 class blockPartitionedTimer {
529   partitionedTimers *part_timers;
530 
531 public:
532   blockPartitionedTimer(partitionedTimers *pt, explicitTimer timer)
533       : part_timers(pt) {
534     part_timers->push(timer);
535   }
536   ~blockPartitionedTimer() { part_timers->pop(); }
537 };
538 
539 // Special wrapper around the thread state to aid in keeping state in code
540 // blocks It avoids the need to have an explicit end, leaving the scope
541 // suffices.
542 class blockThreadState {
543   stats_state_e *state_pointer;
544   stats_state_e old_state;
545 
546 public:
547   blockThreadState(stats_state_e *thread_state_pointer, stats_state_e new_state)
548       : state_pointer(thread_state_pointer), old_state(*thread_state_pointer) {
549     *state_pointer = new_state;
550   }
551   ~blockThreadState() { *state_pointer = old_state; }
552 };
553 
554 // If all you want is a count, then you can use this...
555 // The individual per-thread counts will be aggregated into a statistic at
556 // program exit.
557 class counter {
558   uint64_t value;
559   static const statInfo counterInfo[];
560 
561 public:
562   counter() : value(0) {}
563   void increment() { value++; }
564   uint64_t getValue() const { return value; }
565   void reset() { value = 0; }
566   static const char *name(counter_e e) { return counterInfo[e].name; }
567   static bool masterOnly(counter_e e) {
568     return counterInfo[e].flags & stats_flags_e::onlyInMaster;
569   }
570 };
571 
572 /* ****************************************************************
573     Class to implement an event
574 
575     There are four components to an event: start time, stop time
576     nest_level, and timer_name.
577     The start and stop time should be obvious (recorded in clock ticks).
578     The nest_level relates to the bar width in the timeline graph.
579     The timer_name is used to determine which timer event triggered this event.
580 
581     the interface to this class is through four read-only operations:
582     1) getStart()     -- returns the start time as 64 bit integer
583     2) getStop()      -- returns the stop time as 64 bit integer
584     3) getNestLevel() -- returns the nest level of the event
585     4) getTimerName() -- returns the timer name that triggered event
586 
587     *MORE ON NEST_LEVEL*
588     The nest level is used in the bar graph that represents the timeline.
589     Its main purpose is for showing how events are nested inside eachother.
590     For example, say events, A, B, and C are recorded.  If the timeline
591     looks like this:
592 
593 Begin -------------------------------------------------------------> Time
594          |    |          |        |          |              |
595          A    B          C        C          B              A
596        start start     start     end        end            end
597 
598        Then A, B, C will have a nest level of 1, 2, 3 respectively.
599        These values are then used to calculate the barwidth so you can
600        see that inside A, B has occurred, and inside B, C has occurred.
601        Currently, this is shown with A's bar width being larger than B's
602        bar width, and B's bar width being larger than C's bar width.
603 
604 **************************************************************** */
605 class kmp_stats_event {
606   uint64_t start;
607   uint64_t stop;
608   int nest_level;
609   timer_e timer_name;
610 
611 public:
612   kmp_stats_event()
613       : start(0), stop(0), nest_level(0), timer_name(TIMER_LAST) {}
614   kmp_stats_event(uint64_t strt, uint64_t stp, int nst, timer_e nme)
615       : start(strt), stop(stp), nest_level(nst), timer_name(nme) {}
616   inline uint64_t getStart() const { return start; }
617   inline uint64_t getStop() const { return stop; }
618   inline int getNestLevel() const { return nest_level; }
619   inline timer_e getTimerName() const { return timer_name; }
620 };
621 
622 /* ****************************************************************
623     Class to implement a dynamically expandable array of events
624 
625     ---------------------------------------------------------
626     | event 1 | event 2 | event 3 | event 4 | ... | event N |
627     ---------------------------------------------------------
628 
629     An event is pushed onto the back of this array at every
630     explicitTimer->stop() call.  The event records the thread #,
631     start time, stop time, and nest level related to the bar width.
632 
633     The event vector starts at size INIT_SIZE and grows (doubles in size)
634     if needed.  An implication of this behavior is that log(N)
635     reallocations are needed (where N is number of events).  If you want
636     to avoid reallocations, then set INIT_SIZE to a large value.
637 
638     the interface to this class is through six operations:
639     1) reset() -- sets the internal_size back to 0 but does not deallocate any
640        memory
641     2) size()  -- returns the number of valid elements in the vector
642     3) push_back(start, stop, nest, timer_name) -- pushes an event onto
643        the back of the array
644     4) deallocate() -- frees all memory associated with the vector
645     5) sort() -- sorts the vector by start time
646     6) operator[index] or at(index) -- returns event reference at that index
647 **************************************************************** */
648 class kmp_stats_event_vector {
649   kmp_stats_event *events;
650   int internal_size;
651   int allocated_size;
652   static const int INIT_SIZE = 1024;
653 
654 public:
655   kmp_stats_event_vector() {
656     events =
657         (kmp_stats_event *)__kmp_allocate(sizeof(kmp_stats_event) * INIT_SIZE);
658     internal_size = 0;
659     allocated_size = INIT_SIZE;
660   }
661   ~kmp_stats_event_vector() {}
662   inline void reset() { internal_size = 0; }
663   inline int size() const { return internal_size; }
664   void push_back(uint64_t start_time, uint64_t stop_time, int nest_level,
665                  timer_e name) {
666     int i;
667     if (internal_size == allocated_size) {
668       kmp_stats_event *tmp = (kmp_stats_event *)__kmp_allocate(
669           sizeof(kmp_stats_event) * allocated_size * 2);
670       for (i = 0; i < internal_size; i++)
671         tmp[i] = events[i];
672       __kmp_free(events);
673       events = tmp;
674       allocated_size *= 2;
675     }
676     events[internal_size] =
677         kmp_stats_event(start_time, stop_time, nest_level, name);
678     internal_size++;
679     return;
680   }
681   void deallocate();
682   void sort();
683   const kmp_stats_event &operator[](int index) const { return events[index]; }
684   kmp_stats_event &operator[](int index) { return events[index]; }
685   const kmp_stats_event &at(int index) const { return events[index]; }
686   kmp_stats_event &at(int index) { return events[index]; }
687 };
688 
689 /* ****************************************************************
690     Class to implement a doubly-linked, circular, statistics list
691 
692     |---| ---> |---| ---> |---| ---> |---| ---> ... next
693     |   |      |   |      |   |      |   |
694     |---| <--- |---| <--- |---| <--- |---| <--- ... prev
695     Sentinel   first      second     third
696     Node       node       node       node
697 
698     The Sentinel Node is the user handle on the list.
699     The first node corresponds to thread 0's statistics.
700     The second node corresponds to thread 1's statistics and so on...
701 
702     Each node has a _timers, _counters, and _explicitTimers array to hold that
703     thread's statistics. The _explicitTimers point to the correct _timer and
704     update its statistics at every stop() call. The explicitTimers' pointers are
705     set up in the constructor. Each node also has an event vector to hold that
706     thread's timing events. The event vector expands as necessary and records
707     the start-stop times for each timer.
708 
709     The nestLevel variable is for plotting events and is related
710     to the bar width in the timeline graph.
711 
712     Every thread will have a thread local pointer to its node in
713     the list.  The sentinel node is used by the master thread to
714     store "dummy" statistics before __kmp_create_worker() is called.
715 **************************************************************** */
716 class kmp_stats_list {
717   int gtid;
718   timeStat _timers[TIMER_LAST + 1];
719   counter _counters[COUNTER_LAST + 1];
720   explicitTimer thread_life_timer;
721   partitionedTimers _partitionedTimers;
722   int _nestLevel; // one per thread
723   kmp_stats_event_vector _event_vector;
724   kmp_stats_list *next;
725   kmp_stats_list *prev;
726   stats_state_e state;
727   int thread_is_idle_flag;
728 
729 public:
730   kmp_stats_list()
731       : thread_life_timer(&_timers[TIMER_OMP_worker_thread_life],
732                           TIMER_OMP_worker_thread_life),
733         _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE),
734         thread_is_idle_flag(0) {}
735   ~kmp_stats_list() {}
736   inline timeStat *getTimer(timer_e idx) { return &_timers[idx]; }
737   inline counter *getCounter(counter_e idx) { return &_counters[idx]; }
738   inline partitionedTimers *getPartitionedTimers() {
739     return &_partitionedTimers;
740   }
741   inline timeStat *getTimers() { return _timers; }
742   inline counter *getCounters() { return _counters; }
743   inline kmp_stats_event_vector &getEventVector() { return _event_vector; }
744   inline void startLife() { thread_life_timer.start(tsc_tick_count::now()); }
745   inline void endLife() { thread_life_timer.stop(tsc_tick_count::now(), this); }
746   inline void resetEventVector() { _event_vector.reset(); }
747   inline void incrementNestValue() { _nestLevel++; }
748   inline int getNestValue() { return _nestLevel; }
749   inline void decrementNestValue() { _nestLevel--; }
750   inline int getGtid() const { return gtid; }
751   inline void setGtid(int newgtid) { gtid = newgtid; }
752   inline void setState(stats_state_e newstate) { state = newstate; }
753   inline stats_state_e getState() const { return state; }
754   inline stats_state_e *getStatePointer() { return &state; }
755   inline bool isIdle() { return thread_is_idle_flag == 1; }
756   inline void setIdleFlag() { thread_is_idle_flag = 1; }
757   inline void resetIdleFlag() { thread_is_idle_flag = 0; }
758   kmp_stats_list *push_back(int gtid); // returns newly created list node
759   inline void push_event(uint64_t start_time, uint64_t stop_time,
760                          int nest_level, timer_e name) {
761     _event_vector.push_back(start_time, stop_time, nest_level, name);
762   }
763   void deallocate();
764   class iterator;
765   kmp_stats_list::iterator begin();
766   kmp_stats_list::iterator end();
767   int size();
768   class iterator {
769     kmp_stats_list *ptr;
770     friend kmp_stats_list::iterator kmp_stats_list::begin();
771     friend kmp_stats_list::iterator kmp_stats_list::end();
772 
773   public:
774     iterator();
775     ~iterator();
776     iterator operator++();
777     iterator operator++(int dummy);
778     iterator operator--();
779     iterator operator--(int dummy);
780     bool operator!=(const iterator &rhs);
781     bool operator==(const iterator &rhs);
782     kmp_stats_list *operator*() const; // dereference operator
783   };
784 };
785 
786 /* ****************************************************************
787    Class to encapsulate all output functions and the environment variables
788 
789    This module holds filenames for various outputs (normal stats, events, plot
790    file), as well as coloring information for the plot file.
791 
792    The filenames and flags variables are read from environment variables.
793    These are read once by the constructor of the global variable
794    __kmp_stats_output which calls init().
795 
796    During this init() call, event flags for the timeStat::timerInfo[] global
797    array are cleared if KMP_STATS_EVENTS is not true (on, 1, yes).
798 
799    The only interface function that is public is outputStats(heading).  This
800    function should print out everything it needs to, either to files or stderr,
801    depending on the environment variables described below
802 
803    ENVIRONMENT VARIABLES:
804    KMP_STATS_FILE -- if set, all statistics (not events) will be printed to this
805                      file, otherwise, print to stderr
806    KMP_STATS_THREADS -- if set to "on", then will print per thread statistics to
807                         either KMP_STATS_FILE or stderr
808    KMP_STATS_PLOT_FILE -- if set, print the ploticus plot file to this filename,
809                           otherwise, the plot file is sent to "events.plt"
810    KMP_STATS_EVENTS -- if set to "on", then log events, otherwise, don't log
811                        events
812    KMP_STATS_EVENTS_FILE -- if set, all events are outputted to this file,
813                             otherwise, output is sent to "events.dat"
814 **************************************************************** */
815 class kmp_stats_output_module {
816 
817 public:
818   struct rgb_color {
819     float r;
820     float g;
821     float b;
822   };
823 
824 private:
825   std::string outputFileName;
826   static const char *eventsFileName;
827   static const char *plotFileName;
828   static int printPerThreadFlag;
829   static int printPerThreadEventsFlag;
830   static const rgb_color globalColorArray[];
831   static rgb_color timerColorInfo[];
832 
833   void init();
834   static void setupEventColors();
835   static void printPloticusFile();
836   static void printHeaderInfo(FILE *statsOut);
837   static void printTimerStats(FILE *statsOut, statistic const *theStats,
838                               statistic const *totalStats);
839   static void printCounterStats(FILE *statsOut, statistic const *theStats);
840   static void printCounters(FILE *statsOut, counter const *theCounters);
841   static void printEvents(FILE *eventsOut, kmp_stats_event_vector *theEvents,
842                           int gtid);
843   static rgb_color getEventColor(timer_e e) { return timerColorInfo[e]; }
844   static void windupExplicitTimers();
845   bool eventPrintingEnabled() const { return printPerThreadEventsFlag; }
846 
847 public:
848   kmp_stats_output_module() { init(); }
849   void outputStats(const char *heading);
850 };
851 
852 #ifdef __cplusplus
853 extern "C" {
854 #endif
855 void __kmp_stats_init();
856 void __kmp_stats_fini();
857 void __kmp_reset_stats();
858 void __kmp_output_stats(const char *);
859 void __kmp_accumulate_stats_at_exit(void);
860 // thread local pointer to stats node within list
861 extern KMP_THREAD_LOCAL kmp_stats_list *__kmp_stats_thread_ptr;
862 // head to stats list.
863 extern kmp_stats_list *__kmp_stats_list;
864 // lock for __kmp_stats_list
865 extern kmp_tas_lock_t __kmp_stats_lock;
866 // reference start time
867 extern tsc_tick_count __kmp_stats_start_time;
868 // interface to output
869 extern kmp_stats_output_module __kmp_stats_output;
870 
871 #ifdef __cplusplus
872 }
873 #endif
874 
875 // Simple, standard interfaces that drop out completely if stats aren't enabled
876 
877 /*!
878  * \brief Adds value to specified timer (name).
879  *
880  * @param name timer name as specified under the KMP_FOREACH_TIMER() macro
881  * @param value double precision sample value to add to statistics for the timer
882  *
883  * \details Use KMP_COUNT_VALUE(name, value) macro to add a particular value to
884  * a timer statistics.
885  *
886  * @ingroup STATS_GATHERING
887 */
888 #define KMP_COUNT_VALUE(name, value)                                           \
889   __kmp_stats_thread_ptr->getTimer(TIMER_##name)->addSample((double)value)
890 
891 /*!
892  * \brief Increments specified counter (name).
893  *
894  * @param name counter name as specified under the KMP_FOREACH_COUNTER() macro
895  *
896  * \details Use KMP_COUNT_BLOCK(name, value) macro to increment a statistics
897  * counter for the executing thread.
898  *
899  * @ingroup STATS_GATHERING
900 */
901 #define KMP_COUNT_BLOCK(name)                                                  \
902   __kmp_stats_thread_ptr->getCounter(COUNTER_##name)->increment()
903 
904 /*!
905  * \brief Outputs the current thread statistics and reset them.
906  *
907  * @param heading_string heading put above the final stats output
908  *
909  * \details Explicitly stops all timers and outputs all stats. Environment
910  * variable, `OMPTB_STATSFILE=filename`, can be used to output the stats to a
911  * filename instead of stderr. Environment variable,
912  * `OMPTB_STATSTHREADS=true|undefined`, can be used to output thread specific
913  * stats. For now the `OMPTB_STATSTHREADS` environment variable can either be
914  * defined with any value, which will print out thread specific stats, or it can
915  * be undefined (not specified in the environment) and thread specific stats
916  * won't be printed. It should be noted that all statistics are reset when this
917  * macro is called.
918  *
919  * @ingroup STATS_GATHERING
920 */
921 #define KMP_OUTPUT_STATS(heading_string) __kmp_output_stats(heading_string)
922 
923 /*!
924  * \brief Initializes the partitioned timers to begin with name.
925  *
926  * @param name timer which you want this thread to begin with
927  *
928  * @ingroup STATS_GATHERING
929 */
930 #define KMP_INIT_PARTITIONED_TIMERS(name)                                      \
931   __kmp_stats_thread_ptr->getPartitionedTimers()->init(explicitTimer(          \
932       __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
933 
934 #define KMP_TIME_PARTITIONED_BLOCK(name)                                       \
935   blockPartitionedTimer __PBLOCKTIME__(                                        \
936       __kmp_stats_thread_ptr->getPartitionedTimers(),                          \
937       explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name),            \
938                     TIMER_##name))
939 
940 #define KMP_PUSH_PARTITIONED_TIMER(name)                                       \
941   __kmp_stats_thread_ptr->getPartitionedTimers()->push(explicitTimer(          \
942       __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
943 
944 #define KMP_POP_PARTITIONED_TIMER()                                            \
945   __kmp_stats_thread_ptr->getPartitionedTimers()->pop()
946 
947 #define KMP_EXCHANGE_PARTITIONED_TIMER(name)                                   \
948   __kmp_stats_thread_ptr->getPartitionedTimers()->exchange(explicitTimer(      \
949       __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
950 
951 #define KMP_SET_THREAD_STATE(state_name)                                       \
952   __kmp_stats_thread_ptr->setState(state_name)
953 
954 #define KMP_GET_THREAD_STATE() __kmp_stats_thread_ptr->getState()
955 
956 #define KMP_SET_THREAD_STATE_BLOCK(state_name)                                 \
957   blockThreadState __BTHREADSTATE__(__kmp_stats_thread_ptr->getStatePointer(), \
958                                     state_name)
959 
960 /*!
961  * \brief resets all stats (counters to 0, timers to 0 elapsed ticks)
962  *
963  * \details Reset all stats for all threads.
964  *
965  * @ingroup STATS_GATHERING
966 */
967 #define KMP_RESET_STATS() __kmp_reset_stats()
968 
969 #if (KMP_DEVELOPER_STATS)
970 #define KMP_COUNT_DEVELOPER_VALUE(n, v) KMP_COUNT_VALUE(n, v)
971 #define KMP_COUNT_DEVELOPER_BLOCK(n) KMP_COUNT_BLOCK(n)
972 #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) KMP_TIME_PARTITIONED_BLOCK(n)
973 #define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) KMP_PUSH_PARTITIONED_TIMER(n)
974 #define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) KMP_POP_PARTITIONED_TIMER(n)
975 #define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n)                            \
976   KMP_EXCHANGE_PARTITIONED_TIMER(n)
977 #else
978 // Null definitions
979 #define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
980 #define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
981 #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
982 #define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
983 #define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
984 #define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
985 #endif
986 
987 #else // KMP_STATS_ENABLED
988 
989 // Null definitions
990 #define KMP_COUNT_VALUE(n, v) ((void)0)
991 #define KMP_COUNT_BLOCK(n) ((void)0)
992 
993 #define KMP_OUTPUT_STATS(heading_string) ((void)0)
994 #define KMP_RESET_STATS() ((void)0)
995 
996 #define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
997 #define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
998 #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
999 #define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
1000 #define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
1001 #define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
1002 #define KMP_INIT_PARTITIONED_TIMERS(name) ((void)0)
1003 #define KMP_TIME_PARTITIONED_BLOCK(name) ((void)0)
1004 #define KMP_PUSH_PARTITIONED_TIMER(name) ((void)0)
1005 #define KMP_POP_PARTITIONED_TIMER() ((void)0)
1006 #define KMP_SET_THREAD_STATE(state_name) ((void)0)
1007 #define KMP_GET_THREAD_STATE() ((void)0)
1008 #define KMP_SET_THREAD_STATE_BLOCK(state_name) ((void)0)
1009 #endif // KMP_STATS_ENABLED
1010 
1011 #endif // KMP_STATS_H
1012