1 #ifndef KMP_STATS_H
2 #define KMP_STATS_H
3 
4 /** @file kmp_stats.h
5  * Functions for collecting statistics.
6  */
7 
8 //===----------------------------------------------------------------------===//
9 //
10 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
11 // See https://llvm.org/LICENSE.txt for license information.
12 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "kmp_config.h"
17 #include "kmp_debug.h"
18 
19 #if KMP_STATS_ENABLED
20 /* Statistics accumulator.
21    Accumulates number of samples and computes min, max, mean, standard deviation
22    on the fly.
23 
24    Online variance calculation algorithm from
25    http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
26  */
27 
28 #include "kmp_stats_timing.h"
29 #include <limits>
30 #include <math.h>
31 #include <new> // placement new
32 #include <stdint.h>
33 #include <string>
34 #include <vector>
35 
36 /* Enable developer statistics here if you want them. They are more detailed
37    than is useful for application characterisation and are intended for the
38    runtime library developer. */
39 #define KMP_DEVELOPER_STATS 0
40 
41 /* Enable/Disable histogram output */
42 #define KMP_STATS_HIST 0
43 
44 /*!
45  * @ingroup STATS_GATHERING
46  * \brief flags to describe the statistic (timer or counter)
47  *
48  */
49 enum stats_flags_e {
50   noTotal = 1 << 0, //!< do not show a TOTAL_aggregation for this statistic
51   onlyInMaster = 1 << 1, //!< statistic is valid only for primary thread
52   noUnits = 1 << 2, //!< statistic doesn't need units printed next to it
53   notInMaster = 1 << 3, //!< statistic is valid only for non-primary threads
54   logEvent = 1 << 4 //!< statistic can be logged on the event timeline when
55   //! KMP_STATS_EVENTS is on (valid only for timers)
56 };
57 
58 /*!
59  * @ingroup STATS_GATHERING
60  * \brief the states which a thread can be in
61  *
62  */
63 enum stats_state_e {
64   IDLE,
65   SERIAL_REGION,
66   FORK_JOIN_BARRIER,
67   PLAIN_BARRIER,
68   TASKWAIT,
69   TASKYIELD,
70   TASKGROUP,
71   IMPLICIT_TASK,
72   EXPLICIT_TASK,
73   TEAMS_REGION
74 };
75 
76 /*!
77  * \brief Add new counters under KMP_FOREACH_COUNTER() macro in kmp_stats.h
78  *
79  * @param macro a user defined macro that takes three arguments -
80  * macro(COUNTER_NAME, flags, arg)
81  * @param arg a user defined argument to send to the user defined macro
82  *
83  * \details A counter counts the occurrence of some event. Each thread
84  * accumulates its own count, at the end of execution the counts are aggregated
85  * treating each thread as a separate measurement. (Unless onlyInMaster is set,
86  * in which case there's only a single measurement). The min,mean,max are
87  * therefore the values for the threads. Adding the counter here and then
88  * putting a KMP_BLOCK_COUNTER(name) at the point you want to count is all you
89  * need to do. All of the tables and printing is generated from this macro.
90  * Format is "macro(name, flags, arg)"
91  *
92  * @ingroup STATS_GATHERING
93  */
94 // clang-format off
95 #define KMP_FOREACH_COUNTER(macro, arg)                                        \
96   macro(OMP_PARALLEL,stats_flags_e::onlyInMaster|stats_flags_e::noTotal,arg)   \
97   macro(OMP_NESTED_PARALLEL, 0, arg)                                           \
98   macro(OMP_LOOP_STATIC, 0, arg)                                               \
99   macro(OMP_LOOP_STATIC_STEAL, 0, arg)                                         \
100   macro(OMP_LOOP_DYNAMIC, 0, arg)                                              \
101   macro(OMP_DISTRIBUTE, 0, arg)                                                \
102   macro(OMP_BARRIER, 0, arg)                                                   \
103   macro(OMP_CRITICAL, 0, arg)                                                  \
104   macro(OMP_SINGLE, 0, arg)                                                    \
105   macro(OMP_SECTIONS, 0, arg)                                                  \
106   macro(OMP_MASTER, 0, arg)                                                    \
107   macro(OMP_MASKED, 0, arg)                                                    \
108   macro(OMP_TEAMS, 0, arg)                                                     \
109   macro(OMP_set_lock, 0, arg)                                                  \
110   macro(OMP_test_lock, 0, arg)                                                 \
111   macro(REDUCE_wait, 0, arg)                                                   \
112   macro(REDUCE_nowait, 0, arg)                                                 \
113   macro(OMP_TASKYIELD, 0, arg)                                                 \
114   macro(OMP_TASKLOOP, 0, arg)                                                  \
115   macro(TASK_executed, 0, arg)                                                 \
116   macro(TASK_cancelled, 0, arg)                                                \
117   macro(TASK_stolen, 0, arg)
118 // clang-format on
119 
120 /*!
121  * \brief Add new timers under KMP_FOREACH_TIMER() macro in kmp_stats.h
122  *
123  * @param macro a user defined macro that takes three arguments -
124  * macro(TIMER_NAME, flags, arg)
125  * @param arg a user defined argument to send to the user defined macro
126  *
127  * \details A timer collects multiple samples of some count in each thread and
128  * then finally aggregates all of the samples from all of the threads. For most
129  * timers the printing code also provides an aggregation over the thread totals.
130  * These are printed as TOTAL_foo. The count is normally a time (in ticks),
131  * hence the name "timer". (But can be any value, so we use this for "number of
132  * arguments passed to fork" as well). For timers the threads are not
133  * significant, it's the individual observations that count, so the statistics
134  * are at that level. Format is "macro(name, flags, arg)"
135  *
136  * @ingroup STATS_GATHERING2
137  */
138 // clang-format off
139 #define KMP_FOREACH_TIMER(macro, arg)                                          \
140   macro (OMP_worker_thread_life, stats_flags_e::logEvent, arg)                 \
141   macro (OMP_parallel, stats_flags_e::logEvent, arg)                           \
142   macro (OMP_parallel_overhead, stats_flags_e::logEvent, arg)                  \
143   macro (OMP_teams, stats_flags_e::logEvent, arg)                              \
144   macro (OMP_teams_overhead, stats_flags_e::logEvent, arg)                     \
145   macro (OMP_loop_static, 0, arg)                                              \
146   macro (OMP_loop_static_scheduling, 0, arg)                                   \
147   macro (OMP_loop_dynamic, 0, arg)                                             \
148   macro (OMP_loop_dynamic_scheduling, 0, arg)                                  \
149   macro (OMP_distribute, 0, arg)                                               \
150   macro (OMP_distribute_scheduling, 0, arg)                                    \
151   macro (OMP_critical, 0, arg)                                                 \
152   macro (OMP_critical_wait, 0, arg)                                            \
153   macro (OMP_single, 0, arg)                                                   \
154   macro (OMP_sections, 0, arg)                                                 \
155   macro (OMP_sections_overhead, 0, arg)                                        \
156   macro (OMP_master, 0, arg)                                                   \
157   macro (OMP_masked, 0, arg)                                                   \
158   macro (OMP_task_immediate, 0, arg)                                           \
159   macro (OMP_task_taskwait, 0, arg)                                            \
160   macro (OMP_task_taskyield, 0, arg)                                           \
161   macro (OMP_task_taskgroup, 0, arg)                                           \
162   macro (OMP_task_join_bar, 0, arg)                                            \
163   macro (OMP_task_plain_bar, 0, arg)                                           \
164   macro (OMP_taskloop_scheduling, 0, arg)                                      \
165   macro (OMP_plain_barrier, stats_flags_e::logEvent, arg)                      \
166   macro (OMP_idle, stats_flags_e::logEvent, arg)                               \
167   macro (OMP_fork_barrier, stats_flags_e::logEvent, arg)                       \
168   macro (OMP_join_barrier, stats_flags_e::logEvent, arg)                       \
169   macro (OMP_serial, stats_flags_e::logEvent, arg)                             \
170   macro (OMP_set_numthreads, stats_flags_e::noUnits | stats_flags_e::noTotal,  \
171          arg)                                                                  \
172   macro (OMP_PARALLEL_args, stats_flags_e::noUnits | stats_flags_e::noTotal,   \
173          arg)                                                                  \
174   macro (OMP_loop_static_iterations,                                           \
175          stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
176   macro (OMP_loop_static_total_iterations,                                     \
177          stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
178   macro (OMP_loop_dynamic_iterations,                                          \
179          stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
180   macro (OMP_loop_dynamic_total_iterations,                                    \
181          stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
182   macro (OMP_distribute_iterations,                                            \
183          stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
184   KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
185 // clang-format on
186 
187 // OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either
188 //                           initializing OpenMP or being created by a primary
189 //                           thread) until the thread is destroyed
190 // OMP_parallel           -- Time thread spends executing work directly
191 //                           within a #pragma omp parallel
192 // OMP_parallel_overhead  -- Time thread spends setting up a parallel region
193 // OMP_loop_static        -- Time thread spends executing loop iterations from
194 //                           a statically scheduled loop
195 // OMP_loop_static_scheduling -- Time thread spends scheduling loop iterations
196 //                               from a statically scheduled loop
197 // OMP_loop_dynamic       -- Time thread spends executing loop iterations from
198 //                           a dynamically scheduled loop
199 // OMP_loop_dynamic_scheduling -- Time thread spends scheduling loop iterations
200 //                                from a dynamically scheduled loop
201 // OMP_critical           -- Time thread spends executing critical section
202 // OMP_critical_wait      -- Time thread spends waiting to enter
203 //                           a critical section
204 // OMP_single             -- Time spent executing a "single" region
205 // OMP_master             -- Time spent executing a "master" region
206 // OMP_masked             -- Time spent executing a "masked" region
207 // OMP_task_immediate     -- Time spent executing non-deferred tasks
208 // OMP_task_taskwait      -- Time spent executing tasks inside a taskwait
209 //                           construct
210 // OMP_task_taskyield     -- Time spent executing tasks inside a taskyield
211 //                           construct
212 // OMP_task_taskgroup     -- Time spent executing tasks inside a taskygroup
213 //                           construct
214 // OMP_task_join_bar      -- Time spent executing tasks inside a join barrier
215 // OMP_task_plain_bar     -- Time spent executing tasks inside a barrier
216 //                           construct
217 // OMP_taskloop_scheduling -- Time spent scheduling tasks inside a taskloop
218 //                            construct
219 // OMP_plain_barrier      -- Time spent in a #pragma omp barrier construct or
220 //                           inside implicit barrier at end of worksharing
221 //                           construct
222 // OMP_idle               -- Time worker threads spend waiting for next
223 //                           parallel region
224 // OMP_fork_barrier       -- Time spent in a the fork barrier surrounding a
225 //                           parallel region
226 // OMP_join_barrier       -- Time spent in a the join barrier surrounding a
227 //                           parallel region
228 // OMP_serial             -- Time thread zero spends executing serial code
229 // OMP_set_numthreads     -- Values passed to omp_set_num_threads
230 // OMP_PARALLEL_args      -- Number of arguments passed to a parallel region
231 // OMP_loop_static_iterations -- Number of iterations thread is assigned for
232 //                               statically scheduled loops
233 // OMP_loop_dynamic_iterations -- Number of iterations thread is assigned for
234 //                                dynamically scheduled loops
235 
236 #if (KMP_DEVELOPER_STATS)
237 // Timers which are of interest to runtime library developers, not end users.
238 // These have to be explicitly enabled in addition to the other stats.
239 
240 // KMP_fork_barrier       -- time in __kmp_fork_barrier
241 // KMP_join_barrier       -- time in __kmp_join_barrier
242 // KMP_barrier            -- time in __kmp_barrier
243 // KMP_end_split_barrier  -- time in __kmp_end_split_barrier
244 // KMP_setup_icv_copy     -- time in __kmp_setup_icv_copy
245 // KMP_icv_copy           -- start/stop timer for any ICV copying
246 // KMP_linear_gather      -- time in __kmp_linear_barrier_gather
247 // KMP_linear_release     -- time in __kmp_linear_barrier_release
248 // KMP_tree_gather        -- time in __kmp_tree_barrier_gather
249 // KMP_tree_release       -- time in __kmp_tree_barrier_release
250 // KMP_hyper_gather       -- time in __kmp_hyper_barrier_gather
251 // KMP_hyper_release      -- time in __kmp_hyper_barrier_release
252 // KMP_dist_gather       -- time in __kmp_dist_barrier_gather
253 // KMP_dist_release      -- time in __kmp_dist_barrier_release
254 // clang-format off
255 #define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)                                \
256   macro(KMP_fork_call, 0, arg)                                                 \
257   macro(KMP_join_call, 0, arg)                                                 \
258   macro(KMP_end_split_barrier, 0, arg)                                         \
259   macro(KMP_hier_gather, 0, arg)                                               \
260   macro(KMP_hier_release, 0, arg)                                              \
261   macro(KMP_hyper_gather, 0, arg)                                              \
262   macro(KMP_hyper_release, 0, arg)                                             \
263   macro(KMP_dist_gather, 0, arg)                                              \
264   macro(KMP_dist_release, 0, arg)                                             \
265   macro(KMP_linear_gather, 0, arg)                                             \
266   macro(KMP_linear_release, 0, arg)                                            \
267   macro(KMP_tree_gather, 0, arg)                                               \
268   macro(KMP_tree_release, 0, arg)                                              \
269   macro(USER_resume, 0, arg)                                                   \
270   macro(USER_suspend, 0, arg)                                                  \
271   macro(USER_mwait, 0, arg)                                                    \
272   macro(KMP_allocate_team, 0, arg)                                             \
273   macro(KMP_setup_icv_copy, 0, arg)                                            \
274   macro(USER_icv_copy, 0, arg)                                                 \
275   macro (FOR_static_steal_stolen,                                              \
276          stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
277   macro (FOR_static_steal_chunks,                                              \
278          stats_flags_e::noUnits | stats_flags_e::noTotal, arg)
279 #else
280 #define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
281 #endif
282 // clang-format on
283 
284 /*!
285  * \brief Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro.
286  *
287  * @param macro a user defined macro that takes three arguments -
288  * macro(TIMER_NAME, flags, arg)
289  * @param arg a user defined argument to send to the user defined macro
290  *
291  * \warning YOU MUST HAVE THE SAME NAMED TIMER UNDER KMP_FOREACH_TIMER() OR ELSE
292  * BAD THINGS WILL HAPPEN!
293  *
294  * \details Explicit timers are ones where we need to allocate a timer itself
295  * (as well as the accumulated timing statistics). We allocate these on a
296  * per-thread basis, and explicitly start and stop them. Block timers just
297  * allocate the timer itself on the stack, and use the destructor to notice
298  * block exit; they don't need to be defined here. The name here should be the
299  * same as that of a timer above.
300  *
301  * @ingroup STATS_GATHERING
302  */
303 #define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) KMP_FOREACH_TIMER(macro, arg)
304 
305 #define ENUMERATE(name, ignore, prefix) prefix##name,
306 enum timer_e { KMP_FOREACH_TIMER(ENUMERATE, TIMER_) TIMER_LAST };
307 
308 enum explicit_timer_e {
309   KMP_FOREACH_EXPLICIT_TIMER(ENUMERATE, EXPLICIT_TIMER_) EXPLICIT_TIMER_LAST
310 };
311 
312 enum counter_e { KMP_FOREACH_COUNTER(ENUMERATE, COUNTER_) COUNTER_LAST };
313 #undef ENUMERATE
314 
315 /*
316  * A logarithmic histogram. It accumulates the number of values in each power of
317  * ten bin.  So 1<=x<10, 10<=x<100, ...
318  * Mostly useful where we have some big outliers and want to see information
319  * about them.
320  */
321 class logHistogram {
322   enum {
323     numBins = 31, /* Number of powers of 10. If this changes you need to change
324                    * the initializer for binMax */
325 
326     /*
327      * If you want to use this to analyse values that may be less than 1, (for
328      * instance times in s), then the logOffset gives you negative powers.
329      * In our case here, we're just looking at times in ticks, or counts, so we
330      * can never see values with magnitude < 1 (other than zero), so we can set
331      * it to 0.  As above change the initializer if you change this.
332      */
333     logOffset = 0
334   };
335   uint32_t KMP_ALIGN_CACHE zeroCount;
336   struct {
337     uint32_t count;
338     double total;
339   } bins[numBins];
340 
341   static double binMax[numBins];
342 
343 #ifdef KMP_DEBUG
344   uint64_t _total;
345 
346   void check() const {
347     uint64_t t = zeroCount;
348     for (int i = 0; i < numBins; i++)
349       t += bins[i].count;
350     KMP_DEBUG_ASSERT(t == _total);
351   }
352 #else
353   void check() const {}
354 #endif
355 
356 public:
357   logHistogram() { reset(); }
358 
359   logHistogram(logHistogram const &o) {
360     for (int i = 0; i < numBins; i++)
361       bins[i] = o.bins[i];
362 #ifdef KMP_DEBUG
363     _total = o._total;
364 #endif
365   }
366 
367   void reset() {
368     zeroCount = 0;
369     for (int i = 0; i < numBins; i++) {
370       bins[i].count = 0;
371       bins[i].total = 0;
372     }
373 
374 #ifdef KMP_DEBUG
375     _total = 0;
376 #endif
377   }
378   uint32_t count(int b) const { return bins[b + logOffset].count; }
379   double total(int b) const { return bins[b + logOffset].total; }
380   static uint32_t findBin(double sample);
381 
382   logHistogram &operator+=(logHistogram const &o) {
383     zeroCount += o.zeroCount;
384     for (int i = 0; i < numBins; i++) {
385       bins[i].count += o.bins[i].count;
386       bins[i].total += o.bins[i].total;
387     }
388 #ifdef KMP_DEBUG
389     _total += o._total;
390     check();
391 #endif
392 
393     return *this;
394   }
395 
396   void addSample(double sample);
397   int minBin() const;
398   int maxBin() const;
399 
400   std::string format(char) const;
401 };
402 
403 class statistic {
404   double KMP_ALIGN_CACHE minVal;
405   double maxVal;
406   double meanVal;
407   double m2;
408   uint64_t sampleCount;
409   double offset;
410   bool collectingHist;
411   logHistogram hist;
412 
413 public:
414   statistic(bool doHist = bool(KMP_STATS_HIST)) {
415     reset();
416     collectingHist = doHist;
417   }
418   statistic(statistic const &o)
419       : minVal(o.minVal), maxVal(o.maxVal), meanVal(o.meanVal), m2(o.m2),
420         sampleCount(o.sampleCount), offset(o.offset),
421         collectingHist(o.collectingHist), hist(o.hist) {}
422   statistic(double minv, double maxv, double meanv, uint64_t sc, double sd)
423       : minVal(minv), maxVal(maxv), meanVal(meanv), m2(sd * sd * sc),
424         sampleCount(sc), offset(0.0), collectingHist(false) {}
425   bool haveHist() const { return collectingHist; }
426   double getMin() const { return minVal; }
427   double getMean() const { return meanVal; }
428   double getMax() const { return maxVal; }
429   uint64_t getCount() const { return sampleCount; }
430   double getSD() const { return sqrt(m2 / sampleCount); }
431   double getTotal() const { return sampleCount * meanVal; }
432   logHistogram const *getHist() const { return &hist; }
433   void setOffset(double d) { offset = d; }
434 
435   void reset() {
436     minVal = (std::numeric_limits<double>::max)();
437     maxVal = -minVal;
438     meanVal = 0.0;
439     m2 = 0.0;
440     sampleCount = 0;
441     offset = 0.0;
442     hist.reset();
443   }
444   void addSample(double sample);
445   void scale(double factor);
446   void scaleDown(double f) { scale(1. / f); }
447   void forceCount(uint64_t count) { sampleCount = count; }
448   statistic &operator+=(statistic const &other);
449 
450   std::string format(char unit, bool total = false) const;
451   std::string formatHist(char unit) const { return hist.format(unit); }
452 };
453 
454 struct statInfo {
455   const char *name;
456   uint32_t flags;
457 };
458 
459 class timeStat : public statistic {
460   static statInfo timerInfo[];
461 
462 public:
463   timeStat() : statistic() {}
464   static const char *name(timer_e e) { return timerInfo[e].name; }
465   static bool noTotal(timer_e e) {
466     return timerInfo[e].flags & stats_flags_e::noTotal;
467   }
468   static bool masterOnly(timer_e e) {
469     return timerInfo[e].flags & stats_flags_e::onlyInMaster;
470   }
471   static bool workerOnly(timer_e e) {
472     return timerInfo[e].flags & stats_flags_e::notInMaster;
473   }
474   static bool noUnits(timer_e e) {
475     return timerInfo[e].flags & stats_flags_e::noUnits;
476   }
477   static bool logEvent(timer_e e) {
478     return timerInfo[e].flags & stats_flags_e::logEvent;
479   }
480   static void clearEventFlags() {
481     for (int i = 0; i < TIMER_LAST; i++) {
482       timerInfo[i].flags &= (~(stats_flags_e::logEvent));
483     }
484   }
485 };
486 
487 // Where we need explicitly to start and end the timer, this version can be used
488 // Since these timers normally aren't nicely scoped, so don't have a good place
489 // to live on the stack of the thread, they're more work to use.
490 class explicitTimer {
491   timeStat *stat;
492   timer_e timerEnumValue;
493   tsc_tick_count startTime;
494   tsc_tick_count pauseStartTime;
495   tsc_tick_count::tsc_interval_t totalPauseTime;
496 
497 public:
498   explicitTimer(timeStat *s, timer_e te)
499       : stat(s), timerEnumValue(te), startTime(), pauseStartTime(0),
500         totalPauseTime() {}
501 
502   // void setStat(timeStat *s) { stat = s; }
503   void start(tsc_tick_count tick);
504   void pause(tsc_tick_count tick) { pauseStartTime = tick; }
505   void resume(tsc_tick_count tick) {
506     totalPauseTime += (tick - pauseStartTime);
507   }
508   void stop(tsc_tick_count tick, kmp_stats_list *stats_ptr = nullptr);
509   void reset() {
510     startTime = 0;
511     pauseStartTime = 0;
512     totalPauseTime = 0;
513   }
514   timer_e get_type() const { return timerEnumValue; }
515 };
516 
517 // Where you need to partition a threads clock ticks into separate states
518 // e.g., a partitionedTimers class with two timers of EXECUTING_TASK, and
519 // DOING_NOTHING would render these conditions:
520 // time(EXECUTING_TASK) + time(DOING_NOTHING) = total time thread is alive
521 // No clock tick in the EXECUTING_TASK is a member of DOING_NOTHING and vice
522 // versa
523 class partitionedTimers {
524 private:
525   std::vector<explicitTimer> timer_stack;
526 
527 public:
528   partitionedTimers();
529   void init(explicitTimer timer);
530   void exchange(explicitTimer timer);
531   void push(explicitTimer timer);
532   void pop();
533   void windup();
534 };
535 
536 // Special wrapper around the partitioned timers to aid timing code blocks
537 // It avoids the need to have an explicit end, leaving the scope suffices.
538 class blockPartitionedTimer {
539   partitionedTimers *part_timers;
540 
541 public:
542   blockPartitionedTimer(partitionedTimers *pt, explicitTimer timer)
543       : part_timers(pt) {
544     part_timers->push(timer);
545   }
546   ~blockPartitionedTimer() { part_timers->pop(); }
547 };
548 
549 // Special wrapper around the thread state to aid in keeping state in code
550 // blocks It avoids the need to have an explicit end, leaving the scope
551 // suffices.
552 class blockThreadState {
553   stats_state_e *state_pointer;
554   stats_state_e old_state;
555 
556 public:
557   blockThreadState(stats_state_e *thread_state_pointer, stats_state_e new_state)
558       : state_pointer(thread_state_pointer), old_state(*thread_state_pointer) {
559     *state_pointer = new_state;
560   }
561   ~blockThreadState() { *state_pointer = old_state; }
562 };
563 
564 // If all you want is a count, then you can use this...
565 // The individual per-thread counts will be aggregated into a statistic at
566 // program exit.
567 class counter {
568   uint64_t value;
569   static const statInfo counterInfo[];
570 
571 public:
572   counter() : value(0) {}
573   void increment() { value++; }
574   uint64_t getValue() const { return value; }
575   void reset() { value = 0; }
576   static const char *name(counter_e e) { return counterInfo[e].name; }
577   static bool masterOnly(counter_e e) {
578     return counterInfo[e].flags & stats_flags_e::onlyInMaster;
579   }
580 };
581 
582 /* ****************************************************************
583     Class to implement an event
584 
585     There are four components to an event: start time, stop time
586     nest_level, and timer_name.
587     The start and stop time should be obvious (recorded in clock ticks).
588     The nest_level relates to the bar width in the timeline graph.
589     The timer_name is used to determine which timer event triggered this event.
590 
591     the interface to this class is through four read-only operations:
592     1) getStart()     -- returns the start time as 64 bit integer
593     2) getStop()      -- returns the stop time as 64 bit integer
594     3) getNestLevel() -- returns the nest level of the event
595     4) getTimerName() -- returns the timer name that triggered event
596 
597     *MORE ON NEST_LEVEL*
598     The nest level is used in the bar graph that represents the timeline.
599     Its main purpose is for showing how events are nested inside eachother.
600     For example, say events, A, B, and C are recorded.  If the timeline
601     looks like this:
602 
603 Begin -------------------------------------------------------------> Time
604          |    |          |        |          |              |
605          A    B          C        C          B              A
606        start start     start     end        end            end
607 
608        Then A, B, C will have a nest level of 1, 2, 3 respectively.
609        These values are then used to calculate the barwidth so you can
610        see that inside A, B has occurred, and inside B, C has occurred.
611        Currently, this is shown with A's bar width being larger than B's
612        bar width, and B's bar width being larger than C's bar width.
613 
614 **************************************************************** */
615 class kmp_stats_event {
616   uint64_t start;
617   uint64_t stop;
618   int nest_level;
619   timer_e timer_name;
620 
621 public:
622   kmp_stats_event()
623       : start(0), stop(0), nest_level(0), timer_name(TIMER_LAST) {}
624   kmp_stats_event(uint64_t strt, uint64_t stp, int nst, timer_e nme)
625       : start(strt), stop(stp), nest_level(nst), timer_name(nme) {}
626   inline uint64_t getStart() const { return start; }
627   inline uint64_t getStop() const { return stop; }
628   inline int getNestLevel() const { return nest_level; }
629   inline timer_e getTimerName() const { return timer_name; }
630 };
631 
632 /* ****************************************************************
633     Class to implement a dynamically expandable array of events
634 
635     ---------------------------------------------------------
636     | event 1 | event 2 | event 3 | event 4 | ... | event N |
637     ---------------------------------------------------------
638 
639     An event is pushed onto the back of this array at every
640     explicitTimer->stop() call.  The event records the thread #,
641     start time, stop time, and nest level related to the bar width.
642 
643     The event vector starts at size INIT_SIZE and grows (doubles in size)
644     if needed.  An implication of this behavior is that log(N)
645     reallocations are needed (where N is number of events).  If you want
646     to avoid reallocations, then set INIT_SIZE to a large value.
647 
648     the interface to this class is through six operations:
649     1) reset() -- sets the internal_size back to 0 but does not deallocate any
650        memory
651     2) size()  -- returns the number of valid elements in the vector
652     3) push_back(start, stop, nest, timer_name) -- pushes an event onto
653        the back of the array
654     4) deallocate() -- frees all memory associated with the vector
655     5) sort() -- sorts the vector by start time
656     6) operator[index] or at(index) -- returns event reference at that index
657 **************************************************************** */
658 class kmp_stats_event_vector {
659   kmp_stats_event *events;
660   int internal_size;
661   int allocated_size;
662   static const int INIT_SIZE = 1024;
663 
664 public:
665   kmp_stats_event_vector() {
666     events =
667         (kmp_stats_event *)__kmp_allocate(sizeof(kmp_stats_event) * INIT_SIZE);
668     internal_size = 0;
669     allocated_size = INIT_SIZE;
670   }
671   ~kmp_stats_event_vector() {}
672   inline void reset() { internal_size = 0; }
673   inline int size() const { return internal_size; }
674   void push_back(uint64_t start_time, uint64_t stop_time, int nest_level,
675                  timer_e name) {
676     int i;
677     if (internal_size == allocated_size) {
678       kmp_stats_event *tmp = (kmp_stats_event *)__kmp_allocate(
679           sizeof(kmp_stats_event) * allocated_size * 2);
680       for (i = 0; i < internal_size; i++)
681         tmp[i] = events[i];
682       __kmp_free(events);
683       events = tmp;
684       allocated_size *= 2;
685     }
686     events[internal_size] =
687         kmp_stats_event(start_time, stop_time, nest_level, name);
688     internal_size++;
689     return;
690   }
691   void deallocate();
692   void sort();
693   const kmp_stats_event &operator[](int index) const { return events[index]; }
694   kmp_stats_event &operator[](int index) { return events[index]; }
695   const kmp_stats_event &at(int index) const { return events[index]; }
696   kmp_stats_event &at(int index) { return events[index]; }
697 };
698 
699 /* ****************************************************************
700     Class to implement a doubly-linked, circular, statistics list
701 
702     |---| ---> |---| ---> |---| ---> |---| ---> ... next
703     |   |      |   |      |   |      |   |
704     |---| <--- |---| <--- |---| <--- |---| <--- ... prev
705     Sentinel   first      second     third
706     Node       node       node       node
707 
708     The Sentinel Node is the user handle on the list.
709     The first node corresponds to thread 0's statistics.
710     The second node corresponds to thread 1's statistics and so on...
711 
712     Each node has a _timers, _counters, and _explicitTimers array to hold that
713     thread's statistics. The _explicitTimers point to the correct _timer and
714     update its statistics at every stop() call. The explicitTimers' pointers are
715     set up in the constructor. Each node also has an event vector to hold that
716     thread's timing events. The event vector expands as necessary and records
717     the start-stop times for each timer.
718 
719     The nestLevel variable is for plotting events and is related
720     to the bar width in the timeline graph.
721 
722     Every thread will have a thread local pointer to its node in
723     the list.  The sentinel node is used by the primary thread to
724     store "dummy" statistics before __kmp_create_worker() is called.
725 **************************************************************** */
726 class kmp_stats_list {
727   int gtid;
728   timeStat _timers[TIMER_LAST + 1];
729   counter _counters[COUNTER_LAST + 1];
730   explicitTimer thread_life_timer;
731   partitionedTimers _partitionedTimers;
732   int _nestLevel; // one per thread
733   kmp_stats_event_vector _event_vector;
734   kmp_stats_list *next;
735   kmp_stats_list *prev;
736   stats_state_e state;
737   int thread_is_idle_flag;
738 
739 public:
740   kmp_stats_list()
741       : thread_life_timer(&_timers[TIMER_OMP_worker_thread_life],
742                           TIMER_OMP_worker_thread_life),
743         _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE),
744         thread_is_idle_flag(0) {}
745   ~kmp_stats_list() {}
746   inline timeStat *getTimer(timer_e idx) { return &_timers[idx]; }
747   inline counter *getCounter(counter_e idx) { return &_counters[idx]; }
748   inline partitionedTimers *getPartitionedTimers() {
749     return &_partitionedTimers;
750   }
751   inline timeStat *getTimers() { return _timers; }
752   inline counter *getCounters() { return _counters; }
753   inline kmp_stats_event_vector &getEventVector() { return _event_vector; }
754   inline void startLife() { thread_life_timer.start(tsc_tick_count::now()); }
755   inline void endLife() { thread_life_timer.stop(tsc_tick_count::now(), this); }
756   inline void resetEventVector() { _event_vector.reset(); }
757   inline void incrementNestValue() { _nestLevel++; }
758   inline int getNestValue() { return _nestLevel; }
759   inline void decrementNestValue() { _nestLevel--; }
760   inline int getGtid() const { return gtid; }
761   inline void setGtid(int newgtid) { gtid = newgtid; }
762   inline void setState(stats_state_e newstate) { state = newstate; }
763   inline stats_state_e getState() const { return state; }
764   inline stats_state_e *getStatePointer() { return &state; }
765   inline bool isIdle() { return thread_is_idle_flag == 1; }
766   inline void setIdleFlag() { thread_is_idle_flag = 1; }
767   inline void resetIdleFlag() { thread_is_idle_flag = 0; }
768   kmp_stats_list *push_back(int gtid); // returns newly created list node
769   inline void push_event(uint64_t start_time, uint64_t stop_time,
770                          int nest_level, timer_e name) {
771     _event_vector.push_back(start_time, stop_time, nest_level, name);
772   }
773   void deallocate();
774   class iterator;
775   kmp_stats_list::iterator begin();
776   kmp_stats_list::iterator end();
777   int size();
778   class iterator {
779     kmp_stats_list *ptr;
780     friend kmp_stats_list::iterator kmp_stats_list::begin();
781     friend kmp_stats_list::iterator kmp_stats_list::end();
782 
783   public:
784     iterator();
785     ~iterator();
786     iterator operator++();
787     iterator operator++(int dummy);
788     iterator operator--();
789     iterator operator--(int dummy);
790     bool operator!=(const iterator &rhs);
791     bool operator==(const iterator &rhs);
792     kmp_stats_list *operator*() const; // dereference operator
793   };
794 };
795 
796 /* ****************************************************************
797    Class to encapsulate all output functions and the environment variables
798 
799    This module holds filenames for various outputs (normal stats, events, plot
800    file), as well as coloring information for the plot file.
801 
802    The filenames and flags variables are read from environment variables.
803    These are read once by the constructor of the global variable
804    __kmp_stats_output which calls init().
805 
806    During this init() call, event flags for the timeStat::timerInfo[] global
807    array are cleared if KMP_STATS_EVENTS is not true (on, 1, yes).
808 
809    The only interface function that is public is outputStats(heading).  This
810    function should print out everything it needs to, either to files or stderr,
811    depending on the environment variables described below
812 
813    ENVIRONMENT VARIABLES:
814    KMP_STATS_FILE -- if set, all statistics (not events) will be printed to this
815                      file, otherwise, print to stderr
816    KMP_STATS_THREADS -- if set to "on", then will print per thread statistics to
817                         either KMP_STATS_FILE or stderr
818    KMP_STATS_PLOT_FILE -- if set, print the ploticus plot file to this filename,
819                           otherwise, the plot file is sent to "events.plt"
820    KMP_STATS_EVENTS -- if set to "on", then log events, otherwise, don't log
821                        events
822    KMP_STATS_EVENTS_FILE -- if set, all events are outputted to this file,
823                             otherwise, output is sent to "events.dat"
824 **************************************************************** */
825 class kmp_stats_output_module {
826 
827 public:
828   struct rgb_color {
829     float r;
830     float g;
831     float b;
832   };
833 
834 private:
835   std::string outputFileName;
836   static const char *eventsFileName;
837   static const char *plotFileName;
838   static int printPerThreadFlag;
839   static int printPerThreadEventsFlag;
840   static const rgb_color globalColorArray[];
841   static rgb_color timerColorInfo[];
842 
843   void init();
844   static void setupEventColors();
845   static void printPloticusFile();
846   static void printHeaderInfo(FILE *statsOut);
847   static void printTimerStats(FILE *statsOut, statistic const *theStats,
848                               statistic const *totalStats);
849   static void printCounterStats(FILE *statsOut, statistic const *theStats);
850   static void printCounters(FILE *statsOut, counter const *theCounters);
851   static void printEvents(FILE *eventsOut, kmp_stats_event_vector *theEvents,
852                           int gtid);
853   static rgb_color getEventColor(timer_e e) { return timerColorInfo[e]; }
854   static void windupExplicitTimers();
855   bool eventPrintingEnabled() const { return printPerThreadEventsFlag; }
856 
857 public:
858   kmp_stats_output_module() { init(); }
859   void outputStats(const char *heading);
860 };
861 
862 #ifdef __cplusplus
863 extern "C" {
864 #endif
865 void __kmp_stats_init();
866 void __kmp_stats_fini();
867 void __kmp_reset_stats();
868 void __kmp_output_stats(const char *);
869 void __kmp_accumulate_stats_at_exit(void);
870 // thread local pointer to stats node within list
871 extern KMP_THREAD_LOCAL kmp_stats_list *__kmp_stats_thread_ptr;
872 // head to stats list.
873 extern kmp_stats_list *__kmp_stats_list;
874 // lock for __kmp_stats_list
875 extern kmp_tas_lock_t __kmp_stats_lock;
876 // reference start time
877 extern tsc_tick_count __kmp_stats_start_time;
878 // interface to output
879 extern kmp_stats_output_module __kmp_stats_output;
880 
881 #ifdef __cplusplus
882 }
883 #endif
884 
885 // Simple, standard interfaces that drop out completely if stats aren't enabled
886 
887 /*!
888  * \brief Adds value to specified timer (name).
889  *
890  * @param name timer name as specified under the KMP_FOREACH_TIMER() macro
891  * @param value double precision sample value to add to statistics for the timer
892  *
893  * \details Use KMP_COUNT_VALUE(name, value) macro to add a particular value to
894  * a timer statistics.
895  *
896  * @ingroup STATS_GATHERING
897  */
898 #define KMP_COUNT_VALUE(name, value)                                           \
899   __kmp_stats_thread_ptr->getTimer(TIMER_##name)->addSample((double)value)
900 
901 /*!
902  * \brief Increments specified counter (name).
903  *
904  * @param name counter name as specified under the KMP_FOREACH_COUNTER() macro
905  *
906  * \details Use KMP_COUNT_BLOCK(name, value) macro to increment a statistics
907  * counter for the executing thread.
908  *
909  * @ingroup STATS_GATHERING
910  */
911 #define KMP_COUNT_BLOCK(name)                                                  \
912   __kmp_stats_thread_ptr->getCounter(COUNTER_##name)->increment()
913 
914 /*!
915  * \brief Outputs the current thread statistics and reset them.
916  *
917  * @param heading_string heading put above the final stats output
918  *
919  * \details Explicitly stops all timers and outputs all stats. Environment
920  * variable, `OMPTB_STATSFILE=filename`, can be used to output the stats to a
921  * filename instead of stderr. Environment variable,
922  * `OMPTB_STATSTHREADS=true|undefined`, can be used to output thread specific
923  * stats. For now the `OMPTB_STATSTHREADS` environment variable can either be
924  * defined with any value, which will print out thread specific stats, or it can
925  * be undefined (not specified in the environment) and thread specific stats
926  * won't be printed. It should be noted that all statistics are reset when this
927  * macro is called.
928  *
929  * @ingroup STATS_GATHERING
930  */
931 #define KMP_OUTPUT_STATS(heading_string) __kmp_output_stats(heading_string)
932 
933 /*!
934  * \brief Initializes the partitioned timers to begin with name.
935  *
936  * @param name timer which you want this thread to begin with
937  *
938  * @ingroup STATS_GATHERING
939  */
940 #define KMP_INIT_PARTITIONED_TIMERS(name)                                      \
941   __kmp_stats_thread_ptr->getPartitionedTimers()->init(explicitTimer(          \
942       __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
943 
944 #define KMP_TIME_PARTITIONED_BLOCK(name)                                       \
945   blockPartitionedTimer __PBLOCKTIME__(                                        \
946       __kmp_stats_thread_ptr->getPartitionedTimers(),                          \
947       explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name),            \
948                     TIMER_##name))
949 
950 #define KMP_PUSH_PARTITIONED_TIMER(name)                                       \
951   __kmp_stats_thread_ptr->getPartitionedTimers()->push(explicitTimer(          \
952       __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
953 
954 #define KMP_POP_PARTITIONED_TIMER()                                            \
955   __kmp_stats_thread_ptr->getPartitionedTimers()->pop()
956 
957 #define KMP_EXCHANGE_PARTITIONED_TIMER(name)                                   \
958   __kmp_stats_thread_ptr->getPartitionedTimers()->exchange(explicitTimer(      \
959       __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
960 
961 #define KMP_SET_THREAD_STATE(state_name)                                       \
962   __kmp_stats_thread_ptr->setState(state_name)
963 
964 #define KMP_GET_THREAD_STATE() __kmp_stats_thread_ptr->getState()
965 
966 #define KMP_SET_THREAD_STATE_BLOCK(state_name)                                 \
967   blockThreadState __BTHREADSTATE__(__kmp_stats_thread_ptr->getStatePointer(), \
968                                     state_name)
969 
970 /*!
971  * \brief resets all stats (counters to 0, timers to 0 elapsed ticks)
972  *
973  * \details Reset all stats for all threads.
974  *
975  * @ingroup STATS_GATHERING
976  */
977 #define KMP_RESET_STATS() __kmp_reset_stats()
978 
979 #if (KMP_DEVELOPER_STATS)
980 #define KMP_COUNT_DEVELOPER_VALUE(n, v) KMP_COUNT_VALUE(n, v)
981 #define KMP_COUNT_DEVELOPER_BLOCK(n) KMP_COUNT_BLOCK(n)
982 #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) KMP_TIME_PARTITIONED_BLOCK(n)
983 #define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) KMP_PUSH_PARTITIONED_TIMER(n)
984 #define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) KMP_POP_PARTITIONED_TIMER(n)
985 #define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n)                            \
986   KMP_EXCHANGE_PARTITIONED_TIMER(n)
987 #else
988 // Null definitions
989 #define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
990 #define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
991 #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
992 #define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
993 #define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
994 #define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
995 #endif
996 
997 #else // KMP_STATS_ENABLED
998 
999 // Null definitions
1000 #define KMP_COUNT_VALUE(n, v) ((void)0)
1001 #define KMP_COUNT_BLOCK(n) ((void)0)
1002 
1003 #define KMP_OUTPUT_STATS(heading_string) ((void)0)
1004 #define KMP_RESET_STATS() ((void)0)
1005 
1006 #define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
1007 #define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
1008 #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
1009 #define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
1010 #define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
1011 #define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
1012 #define KMP_INIT_PARTITIONED_TIMERS(name) ((void)0)
1013 #define KMP_TIME_PARTITIONED_BLOCK(name) ((void)0)
1014 #define KMP_PUSH_PARTITIONED_TIMER(name) ((void)0)
1015 #define KMP_POP_PARTITIONED_TIMER() ((void)0)
1016 #define KMP_SET_THREAD_STATE(state_name) ((void)0)
1017 #define KMP_GET_THREAD_STATE() ((void)0)
1018 #define KMP_SET_THREAD_STATE_BLOCK(state_name) ((void)0)
1019 #endif // KMP_STATS_ENABLED
1020 
1021 #endif // KMP_STATS_H
1022