1 #ifndef KMP_STATS_H
2 #define KMP_STATS_H
3 
4 /** @file kmp_stats.h
5  * Functions for collecting statistics.
6  */
7 
8 //===----------------------------------------------------------------------===//
9 //
10 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
11 // See https://llvm.org/LICENSE.txt for license information.
12 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "kmp_config.h"
17 #include "kmp_debug.h"
18 
19 #if KMP_STATS_ENABLED
20 /* Statistics accumulator.
21    Accumulates number of samples and computes min, max, mean, standard deviation
22    on the fly.
23 
24    Online variance calculation algorithm from
25    http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
26  */
27 
28 #include "kmp_stats_timing.h"
29 #include <limits>
30 #include <math.h>
31 #include <new> // placement new
32 #include <stdint.h>
33 #include <string>
34 #include <vector>
35 
36 /* Enable developer statistics here if you want them. They are more detailed
37    than is useful for application characterisation and are intended for the
38    runtime library developer. */
39 #define KMP_DEVELOPER_STATS 0
40 
41 /* Enable/Disable histogram output */
42 #define KMP_STATS_HIST 0
43 
44 /*!
45  * @ingroup STATS_GATHERING
46  * \brief flags to describe the statistic (timer or counter)
47  *
48  */
49 enum stats_flags_e {
50   noTotal = 1 << 0, //!< do not show a TOTAL_aggregation for this statistic
51   onlyInMaster = 1 << 1, //!< statistic is valid only for primary thread
52   noUnits = 1 << 2, //!< statistic doesn't need units printed next to it
53   notInMaster = 1 << 3, //!< statistic is valid only for non-primary threads
54   logEvent = 1 << 4 //!< statistic can be logged on the event timeline when
55   //! KMP_STATS_EVENTS is on (valid only for timers)
56 };
57 
58 /*!
59  * @ingroup STATS_GATHERING
60  * \brief the states which a thread can be in
61  *
62  */
63 enum stats_state_e {
64   IDLE,
65   SERIAL_REGION,
66   FORK_JOIN_BARRIER,
67   PLAIN_BARRIER,
68   TASKWAIT,
69   TASKYIELD,
70   TASKGROUP,
71   IMPLICIT_TASK,
72   EXPLICIT_TASK,
73   TEAMS_REGION
74 };
75 
76 /*!
77  * \brief Add new counters under KMP_FOREACH_COUNTER() macro in kmp_stats.h
78  *
79  * @param macro a user defined macro that takes three arguments -
80  * macro(COUNTER_NAME, flags, arg)
81  * @param arg a user defined argument to send to the user defined macro
82  *
83  * \details A counter counts the occurrence of some event. Each thread
84  * accumulates its own count, at the end of execution the counts are aggregated
85  * treating each thread as a separate measurement. (Unless onlyInMaster is set,
86  * in which case there's only a single measurement). The min,mean,max are
87  * therefore the values for the threads. Adding the counter here and then
88  * putting a KMP_BLOCK_COUNTER(name) at the point you want to count is all you
89  * need to do. All of the tables and printing is generated from this macro.
90  * Format is "macro(name, flags, arg)"
91  *
92  * @ingroup STATS_GATHERING
93  */
94 // clang-format off
95 #define KMP_FOREACH_COUNTER(macro, arg)                                        \
96   macro(OMP_PARALLEL,stats_flags_e::onlyInMaster|stats_flags_e::noTotal,arg)   \
97   macro(OMP_NESTED_PARALLEL, 0, arg)                                           \
98   macro(OMP_LOOP_STATIC, 0, arg)                                               \
99   macro(OMP_LOOP_STATIC_STEAL, 0, arg)                                         \
100   macro(OMP_LOOP_DYNAMIC, 0, arg)                                              \
101   macro(OMP_DISTRIBUTE, 0, arg)                                                \
102   macro(OMP_BARRIER, 0, arg)                                                   \
103   macro(OMP_CRITICAL, 0, arg)                                                  \
104   macro(OMP_SINGLE, 0, arg)                                                    \
105   macro(OMP_MASTER, 0, arg)                                                    \
106   macro(OMP_MASKED, 0, arg)                                                    \
107   macro(OMP_TEAMS, 0, arg)                                                     \
108   macro(OMP_set_lock, 0, arg)                                                  \
109   macro(OMP_test_lock, 0, arg)                                                 \
110   macro(REDUCE_wait, 0, arg)                                                   \
111   macro(REDUCE_nowait, 0, arg)                                                 \
112   macro(OMP_TASKYIELD, 0, arg)                                                 \
113   macro(OMP_TASKLOOP, 0, arg)                                                  \
114   macro(TASK_executed, 0, arg)                                                 \
115   macro(TASK_cancelled, 0, arg)                                                \
116   macro(TASK_stolen, 0, arg)
117 // clang-format on
118 
119 /*!
120  * \brief Add new timers under KMP_FOREACH_TIMER() macro in kmp_stats.h
121  *
122  * @param macro a user defined macro that takes three arguments -
123  * macro(TIMER_NAME, flags, arg)
124  * @param arg a user defined argument to send to the user defined macro
125  *
126  * \details A timer collects multiple samples of some count in each thread and
127  * then finally aggregates all of the samples from all of the threads. For most
128  * timers the printing code also provides an aggregation over the thread totals.
129  * These are printed as TOTAL_foo. The count is normally a time (in ticks),
130  * hence the name "timer". (But can be any value, so we use this for "number of
131  * arguments passed to fork" as well). For timers the threads are not
132  * significant, it's the individual observations that count, so the statistics
133  * are at that level. Format is "macro(name, flags, arg)"
134  *
135  * @ingroup STATS_GATHERING2
136  */
137 // clang-format off
138 #define KMP_FOREACH_TIMER(macro, arg)                                          \
139   macro (OMP_worker_thread_life, stats_flags_e::logEvent, arg)                 \
140   macro (OMP_parallel, stats_flags_e::logEvent, arg)                           \
141   macro (OMP_parallel_overhead, stats_flags_e::logEvent, arg)                  \
142   macro (OMP_teams, stats_flags_e::logEvent, arg)                              \
143   macro (OMP_teams_overhead, stats_flags_e::logEvent, arg)                     \
144   macro (OMP_loop_static, 0, arg)                                              \
145   macro (OMP_loop_static_scheduling, 0, arg)                                   \
146   macro (OMP_loop_dynamic, 0, arg)                                             \
147   macro (OMP_loop_dynamic_scheduling, 0, arg)                                  \
148   macro (OMP_distribute, 0, arg)                                               \
149   macro (OMP_distribute_scheduling, 0, arg)                                    \
150   macro (OMP_critical, 0, arg)                                                 \
151   macro (OMP_critical_wait, 0, arg)                                            \
152   macro (OMP_single, 0, arg)                                                   \
153   macro (OMP_master, 0, arg)                                                   \
154   macro (OMP_masked, 0, arg)                                                   \
155   macro (OMP_task_immediate, 0, arg)                                           \
156   macro (OMP_task_taskwait, 0, arg)                                            \
157   macro (OMP_task_taskyield, 0, arg)                                           \
158   macro (OMP_task_taskgroup, 0, arg)                                           \
159   macro (OMP_task_join_bar, 0, arg)                                            \
160   macro (OMP_task_plain_bar, 0, arg)                                           \
161   macro (OMP_taskloop_scheduling, 0, arg)                                      \
162   macro (OMP_plain_barrier, stats_flags_e::logEvent, arg)                      \
163   macro (OMP_idle, stats_flags_e::logEvent, arg)                               \
164   macro (OMP_fork_barrier, stats_flags_e::logEvent, arg)                       \
165   macro (OMP_join_barrier, stats_flags_e::logEvent, arg)                       \
166   macro (OMP_serial, stats_flags_e::logEvent, arg)                             \
167   macro (OMP_set_numthreads, stats_flags_e::noUnits | stats_flags_e::noTotal,  \
168          arg)                                                                  \
169   macro (OMP_PARALLEL_args, stats_flags_e::noUnits | stats_flags_e::noTotal,   \
170          arg)                                                                  \
171   macro (OMP_loop_static_iterations,                                           \
172          stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
173   macro (OMP_loop_static_total_iterations,                                     \
174          stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
175   macro (OMP_loop_dynamic_iterations,                                          \
176          stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
177   macro (OMP_loop_dynamic_total_iterations,                                    \
178          stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
179   macro (OMP_distribute_iterations,                                            \
180          stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
181   KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
182 // clang-format on
183 
184 // OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either
185 //                           initializing OpenMP or being created by a primary
186 //                           thread) until the thread is destroyed
187 // OMP_parallel           -- Time thread spends executing work directly
188 //                           within a #pragma omp parallel
189 // OMP_parallel_overhead  -- Time thread spends setting up a parallel region
190 // OMP_loop_static        -- Time thread spends executing loop iterations from
191 //                           a statically scheduled loop
192 // OMP_loop_static_scheduling -- Time thread spends scheduling loop iterations
193 //                               from a statically scheduled loop
194 // OMP_loop_dynamic       -- Time thread spends executing loop iterations from
195 //                           a dynamically scheduled loop
196 // OMP_loop_dynamic_scheduling -- Time thread spends scheduling loop iterations
197 //                                from a dynamically scheduled loop
198 // OMP_critical           -- Time thread spends executing critical section
199 // OMP_critical_wait      -- Time thread spends waiting to enter
200 //                           a critical section
201 // OMP_single             -- Time spent executing a "single" region
202 // OMP_master             -- Time spent executing a "master" region
203 // OMP_masked             -- Time spent executing a "masked" region
204 // OMP_task_immediate     -- Time spent executing non-deferred tasks
205 // OMP_task_taskwait      -- Time spent executing tasks inside a taskwait
206 //                           construct
207 // OMP_task_taskyield     -- Time spent executing tasks inside a taskyield
208 //                           construct
209 // OMP_task_taskgroup     -- Time spent executing tasks inside a taskygroup
210 //                           construct
211 // OMP_task_join_bar      -- Time spent executing tasks inside a join barrier
212 // OMP_task_plain_bar     -- Time spent executing tasks inside a barrier
213 //                           construct
214 // OMP_taskloop_scheduling -- Time spent scheduling tasks inside a taskloop
215 //                            construct
216 // OMP_plain_barrier      -- Time spent in a #pragma omp barrier construct or
217 //                           inside implicit barrier at end of worksharing
218 //                           construct
219 // OMP_idle               -- Time worker threads spend waiting for next
220 //                           parallel region
221 // OMP_fork_barrier       -- Time spent in a the fork barrier surrounding a
222 //                           parallel region
223 // OMP_join_barrier       -- Time spent in a the join barrier surrounding a
224 //                           parallel region
225 // OMP_serial             -- Time thread zero spends executing serial code
226 // OMP_set_numthreads     -- Values passed to omp_set_num_threads
227 // OMP_PARALLEL_args      -- Number of arguments passed to a parallel region
228 // OMP_loop_static_iterations -- Number of iterations thread is assigned for
229 //                               statically scheduled loops
230 // OMP_loop_dynamic_iterations -- Number of iterations thread is assigned for
231 //                                dynamically scheduled loops
232 
233 #if (KMP_DEVELOPER_STATS)
234 // Timers which are of interest to runtime library developers, not end users.
235 // These have to be explicitly enabled in addition to the other stats.
236 
237 // KMP_fork_barrier       -- time in __kmp_fork_barrier
238 // KMP_join_barrier       -- time in __kmp_join_barrier
239 // KMP_barrier            -- time in __kmp_barrier
240 // KMP_end_split_barrier  -- time in __kmp_end_split_barrier
241 // KMP_setup_icv_copy     -- time in __kmp_setup_icv_copy
242 // KMP_icv_copy           -- start/stop timer for any ICV copying
243 // KMP_linear_gather      -- time in __kmp_linear_barrier_gather
244 // KMP_linear_release     -- time in __kmp_linear_barrier_release
245 // KMP_tree_gather        -- time in __kmp_tree_barrier_gather
246 // KMP_tree_release       -- time in __kmp_tree_barrier_release
247 // KMP_hyper_gather       -- time in __kmp_hyper_barrier_gather
248 // KMP_hyper_release      -- time in __kmp_hyper_barrier_release
249 // KMP_dist_gather       -- time in __kmp_dist_barrier_gather
250 // KMP_dist_release      -- time in __kmp_dist_barrier_release
251 // clang-format off
252 #define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)                                \
253   macro(KMP_fork_call, 0, arg)                                                 \
254   macro(KMP_join_call, 0, arg)                                                 \
255   macro(KMP_end_split_barrier, 0, arg)                                         \
256   macro(KMP_hier_gather, 0, arg)                                               \
257   macro(KMP_hier_release, 0, arg)                                              \
258   macro(KMP_hyper_gather, 0, arg)                                              \
259   macro(KMP_hyper_release, 0, arg)                                             \
260   macro(KMP_dist_gather, 0, arg)                                              \
261   macro(KMP_dist_release, 0, arg)                                             \
262   macro(KMP_linear_gather, 0, arg)                                             \
263   macro(KMP_linear_release, 0, arg)                                            \
264   macro(KMP_tree_gather, 0, arg)                                               \
265   macro(KMP_tree_release, 0, arg)                                              \
266   macro(USER_resume, 0, arg)                                                   \
267   macro(USER_suspend, 0, arg)                                                  \
268   macro(USER_mwait, 0, arg)                                                    \
269   macro(KMP_allocate_team, 0, arg)                                             \
270   macro(KMP_setup_icv_copy, 0, arg)                                            \
271   macro(USER_icv_copy, 0, arg)                                                 \
272   macro (FOR_static_steal_stolen,                                              \
273          stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
274   macro (FOR_static_steal_chunks,                                              \
275          stats_flags_e::noUnits | stats_flags_e::noTotal, arg)
276 #else
277 #define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
278 #endif
279 // clang-format on
280 
281 /*!
282  * \brief Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro.
283  *
284  * @param macro a user defined macro that takes three arguments -
285  * macro(TIMER_NAME, flags, arg)
286  * @param arg a user defined argument to send to the user defined macro
287  *
288  * \warning YOU MUST HAVE THE SAME NAMED TIMER UNDER KMP_FOREACH_TIMER() OR ELSE
289  * BAD THINGS WILL HAPPEN!
290  *
291  * \details Explicit timers are ones where we need to allocate a timer itself
292  * (as well as the accumulated timing statistics). We allocate these on a
293  * per-thread basis, and explicitly start and stop them. Block timers just
294  * allocate the timer itself on the stack, and use the destructor to notice
295  * block exit; they don't need to be defined here. The name here should be the
296  * same as that of a timer above.
297  *
298  * @ingroup STATS_GATHERING
299  */
300 #define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) KMP_FOREACH_TIMER(macro, arg)
301 
302 #define ENUMERATE(name, ignore, prefix) prefix##name,
303 enum timer_e { KMP_FOREACH_TIMER(ENUMERATE, TIMER_) TIMER_LAST };
304 
305 enum explicit_timer_e {
306   KMP_FOREACH_EXPLICIT_TIMER(ENUMERATE, EXPLICIT_TIMER_) EXPLICIT_TIMER_LAST
307 };
308 
309 enum counter_e { KMP_FOREACH_COUNTER(ENUMERATE, COUNTER_) COUNTER_LAST };
310 #undef ENUMERATE
311 
312 /*
313  * A logarithmic histogram. It accumulates the number of values in each power of
314  * ten bin.  So 1<=x<10, 10<=x<100, ...
315  * Mostly useful where we have some big outliers and want to see information
316  * about them.
317  */
318 class logHistogram {
319   enum {
320     numBins = 31, /* Number of powers of 10. If this changes you need to change
321                    * the initializer for binMax */
322 
323     /*
324      * If you want to use this to analyse values that may be less than 1, (for
325      * instance times in s), then the logOffset gives you negative powers.
326      * In our case here, we're just looking at times in ticks, or counts, so we
327      * can never see values with magnitude < 1 (other than zero), so we can set
328      * it to 0.  As above change the initializer if you change this.
329      */
330     logOffset = 0
331   };
332   uint32_t KMP_ALIGN_CACHE zeroCount;
333   struct {
334     uint32_t count;
335     double total;
336   } bins[numBins];
337 
338   static double binMax[numBins];
339 
340 #ifdef KMP_DEBUG
341   uint64_t _total;
342 
343   void check() const {
344     uint64_t t = zeroCount;
345     for (int i = 0; i < numBins; i++)
346       t += bins[i].count;
347     KMP_DEBUG_ASSERT(t == _total);
348   }
349 #else
350   void check() const {}
351 #endif
352 
353 public:
354   logHistogram() { reset(); }
355 
356   logHistogram(logHistogram const &o) {
357     for (int i = 0; i < numBins; i++)
358       bins[i] = o.bins[i];
359 #ifdef KMP_DEBUG
360     _total = o._total;
361 #endif
362   }
363 
364   void reset() {
365     zeroCount = 0;
366     for (int i = 0; i < numBins; i++) {
367       bins[i].count = 0;
368       bins[i].total = 0;
369     }
370 
371 #ifdef KMP_DEBUG
372     _total = 0;
373 #endif
374   }
375   uint32_t count(int b) const { return bins[b + logOffset].count; }
376   double total(int b) const { return bins[b + logOffset].total; }
377   static uint32_t findBin(double sample);
378 
379   logHistogram &operator+=(logHistogram const &o) {
380     zeroCount += o.zeroCount;
381     for (int i = 0; i < numBins; i++) {
382       bins[i].count += o.bins[i].count;
383       bins[i].total += o.bins[i].total;
384     }
385 #ifdef KMP_DEBUG
386     _total += o._total;
387     check();
388 #endif
389 
390     return *this;
391   }
392 
393   void addSample(double sample);
394   int minBin() const;
395   int maxBin() const;
396 
397   std::string format(char) const;
398 };
399 
400 class statistic {
401   double KMP_ALIGN_CACHE minVal;
402   double maxVal;
403   double meanVal;
404   double m2;
405   uint64_t sampleCount;
406   double offset;
407   bool collectingHist;
408   logHistogram hist;
409 
410 public:
411   statistic(bool doHist = bool(KMP_STATS_HIST)) {
412     reset();
413     collectingHist = doHist;
414   }
415   statistic(statistic const &o)
416       : minVal(o.minVal), maxVal(o.maxVal), meanVal(o.meanVal), m2(o.m2),
417         sampleCount(o.sampleCount), offset(o.offset),
418         collectingHist(o.collectingHist), hist(o.hist) {}
419   statistic(double minv, double maxv, double meanv, uint64_t sc, double sd)
420       : minVal(minv), maxVal(maxv), meanVal(meanv), m2(sd * sd * sc),
421         sampleCount(sc), offset(0.0), collectingHist(false) {}
422   bool haveHist() const { return collectingHist; }
423   double getMin() const { return minVal; }
424   double getMean() const { return meanVal; }
425   double getMax() const { return maxVal; }
426   uint64_t getCount() const { return sampleCount; }
427   double getSD() const { return sqrt(m2 / sampleCount); }
428   double getTotal() const { return sampleCount * meanVal; }
429   logHistogram const *getHist() const { return &hist; }
430   void setOffset(double d) { offset = d; }
431 
432   void reset() {
433     minVal = (std::numeric_limits<double>::max)();
434     maxVal = -minVal;
435     meanVal = 0.0;
436     m2 = 0.0;
437     sampleCount = 0;
438     offset = 0.0;
439     hist.reset();
440   }
441   void addSample(double sample);
442   void scale(double factor);
443   void scaleDown(double f) { scale(1. / f); }
444   void forceCount(uint64_t count) { sampleCount = count; }
445   statistic &operator+=(statistic const &other);
446 
447   std::string format(char unit, bool total = false) const;
448   std::string formatHist(char unit) const { return hist.format(unit); }
449 };
450 
451 struct statInfo {
452   const char *name;
453   uint32_t flags;
454 };
455 
456 class timeStat : public statistic {
457   static statInfo timerInfo[];
458 
459 public:
460   timeStat() : statistic() {}
461   static const char *name(timer_e e) { return timerInfo[e].name; }
462   static bool noTotal(timer_e e) {
463     return timerInfo[e].flags & stats_flags_e::noTotal;
464   }
465   static bool masterOnly(timer_e e) {
466     return timerInfo[e].flags & stats_flags_e::onlyInMaster;
467   }
468   static bool workerOnly(timer_e e) {
469     return timerInfo[e].flags & stats_flags_e::notInMaster;
470   }
471   static bool noUnits(timer_e e) {
472     return timerInfo[e].flags & stats_flags_e::noUnits;
473   }
474   static bool logEvent(timer_e e) {
475     return timerInfo[e].flags & stats_flags_e::logEvent;
476   }
477   static void clearEventFlags() {
478     for (int i = 0; i < TIMER_LAST; i++) {
479       timerInfo[i].flags &= (~(stats_flags_e::logEvent));
480     }
481   }
482 };
483 
484 // Where we need explicitly to start and end the timer, this version can be used
485 // Since these timers normally aren't nicely scoped, so don't have a good place
486 // to live on the stack of the thread, they're more work to use.
487 class explicitTimer {
488   timeStat *stat;
489   timer_e timerEnumValue;
490   tsc_tick_count startTime;
491   tsc_tick_count pauseStartTime;
492   tsc_tick_count::tsc_interval_t totalPauseTime;
493 
494 public:
495   explicitTimer(timeStat *s, timer_e te)
496       : stat(s), timerEnumValue(te), startTime(), pauseStartTime(0),
497         totalPauseTime() {}
498 
499   // void setStat(timeStat *s) { stat = s; }
500   void start(tsc_tick_count tick);
501   void pause(tsc_tick_count tick) { pauseStartTime = tick; }
502   void resume(tsc_tick_count tick) {
503     totalPauseTime += (tick - pauseStartTime);
504   }
505   void stop(tsc_tick_count tick, kmp_stats_list *stats_ptr = nullptr);
506   void reset() {
507     startTime = 0;
508     pauseStartTime = 0;
509     totalPauseTime = 0;
510   }
511   timer_e get_type() const { return timerEnumValue; }
512 };
513 
514 // Where you need to partition a threads clock ticks into separate states
515 // e.g., a partitionedTimers class with two timers of EXECUTING_TASK, and
516 // DOING_NOTHING would render these conditions:
517 // time(EXECUTING_TASK) + time(DOING_NOTHING) = total time thread is alive
518 // No clock tick in the EXECUTING_TASK is a member of DOING_NOTHING and vice
519 // versa
520 class partitionedTimers {
521 private:
522   std::vector<explicitTimer> timer_stack;
523 
524 public:
525   partitionedTimers();
526   void init(explicitTimer timer);
527   void exchange(explicitTimer timer);
528   void push(explicitTimer timer);
529   void pop();
530   void windup();
531 };
532 
533 // Special wrapper around the partitioned timers to aid timing code blocks
534 // It avoids the need to have an explicit end, leaving the scope suffices.
535 class blockPartitionedTimer {
536   partitionedTimers *part_timers;
537 
538 public:
539   blockPartitionedTimer(partitionedTimers *pt, explicitTimer timer)
540       : part_timers(pt) {
541     part_timers->push(timer);
542   }
543   ~blockPartitionedTimer() { part_timers->pop(); }
544 };
545 
546 // Special wrapper around the thread state to aid in keeping state in code
547 // blocks It avoids the need to have an explicit end, leaving the scope
548 // suffices.
549 class blockThreadState {
550   stats_state_e *state_pointer;
551   stats_state_e old_state;
552 
553 public:
554   blockThreadState(stats_state_e *thread_state_pointer, stats_state_e new_state)
555       : state_pointer(thread_state_pointer), old_state(*thread_state_pointer) {
556     *state_pointer = new_state;
557   }
558   ~blockThreadState() { *state_pointer = old_state; }
559 };
560 
561 // If all you want is a count, then you can use this...
562 // The individual per-thread counts will be aggregated into a statistic at
563 // program exit.
564 class counter {
565   uint64_t value;
566   static const statInfo counterInfo[];
567 
568 public:
569   counter() : value(0) {}
570   void increment() { value++; }
571   uint64_t getValue() const { return value; }
572   void reset() { value = 0; }
573   static const char *name(counter_e e) { return counterInfo[e].name; }
574   static bool masterOnly(counter_e e) {
575     return counterInfo[e].flags & stats_flags_e::onlyInMaster;
576   }
577 };
578 
579 /* ****************************************************************
580     Class to implement an event
581 
582     There are four components to an event: start time, stop time
583     nest_level, and timer_name.
584     The start and stop time should be obvious (recorded in clock ticks).
585     The nest_level relates to the bar width in the timeline graph.
586     The timer_name is used to determine which timer event triggered this event.
587 
588     the interface to this class is through four read-only operations:
589     1) getStart()     -- returns the start time as 64 bit integer
590     2) getStop()      -- returns the stop time as 64 bit integer
591     3) getNestLevel() -- returns the nest level of the event
592     4) getTimerName() -- returns the timer name that triggered event
593 
594     *MORE ON NEST_LEVEL*
595     The nest level is used in the bar graph that represents the timeline.
596     Its main purpose is for showing how events are nested inside eachother.
597     For example, say events, A, B, and C are recorded.  If the timeline
598     looks like this:
599 
600 Begin -------------------------------------------------------------> Time
601          |    |          |        |          |              |
602          A    B          C        C          B              A
603        start start     start     end        end            end
604 
605        Then A, B, C will have a nest level of 1, 2, 3 respectively.
606        These values are then used to calculate the barwidth so you can
607        see that inside A, B has occurred, and inside B, C has occurred.
608        Currently, this is shown with A's bar width being larger than B's
609        bar width, and B's bar width being larger than C's bar width.
610 
611 **************************************************************** */
612 class kmp_stats_event {
613   uint64_t start;
614   uint64_t stop;
615   int nest_level;
616   timer_e timer_name;
617 
618 public:
619   kmp_stats_event()
620       : start(0), stop(0), nest_level(0), timer_name(TIMER_LAST) {}
621   kmp_stats_event(uint64_t strt, uint64_t stp, int nst, timer_e nme)
622       : start(strt), stop(stp), nest_level(nst), timer_name(nme) {}
623   inline uint64_t getStart() const { return start; }
624   inline uint64_t getStop() const { return stop; }
625   inline int getNestLevel() const { return nest_level; }
626   inline timer_e getTimerName() const { return timer_name; }
627 };
628 
629 /* ****************************************************************
630     Class to implement a dynamically expandable array of events
631 
632     ---------------------------------------------------------
633     | event 1 | event 2 | event 3 | event 4 | ... | event N |
634     ---------------------------------------------------------
635 
636     An event is pushed onto the back of this array at every
637     explicitTimer->stop() call.  The event records the thread #,
638     start time, stop time, and nest level related to the bar width.
639 
640     The event vector starts at size INIT_SIZE and grows (doubles in size)
641     if needed.  An implication of this behavior is that log(N)
642     reallocations are needed (where N is number of events).  If you want
643     to avoid reallocations, then set INIT_SIZE to a large value.
644 
645     the interface to this class is through six operations:
646     1) reset() -- sets the internal_size back to 0 but does not deallocate any
647        memory
648     2) size()  -- returns the number of valid elements in the vector
649     3) push_back(start, stop, nest, timer_name) -- pushes an event onto
650        the back of the array
651     4) deallocate() -- frees all memory associated with the vector
652     5) sort() -- sorts the vector by start time
653     6) operator[index] or at(index) -- returns event reference at that index
654 **************************************************************** */
655 class kmp_stats_event_vector {
656   kmp_stats_event *events;
657   int internal_size;
658   int allocated_size;
659   static const int INIT_SIZE = 1024;
660 
661 public:
662   kmp_stats_event_vector() {
663     events =
664         (kmp_stats_event *)__kmp_allocate(sizeof(kmp_stats_event) * INIT_SIZE);
665     internal_size = 0;
666     allocated_size = INIT_SIZE;
667   }
668   ~kmp_stats_event_vector() {}
669   inline void reset() { internal_size = 0; }
670   inline int size() const { return internal_size; }
671   void push_back(uint64_t start_time, uint64_t stop_time, int nest_level,
672                  timer_e name) {
673     int i;
674     if (internal_size == allocated_size) {
675       kmp_stats_event *tmp = (kmp_stats_event *)__kmp_allocate(
676           sizeof(kmp_stats_event) * allocated_size * 2);
677       for (i = 0; i < internal_size; i++)
678         tmp[i] = events[i];
679       __kmp_free(events);
680       events = tmp;
681       allocated_size *= 2;
682     }
683     events[internal_size] =
684         kmp_stats_event(start_time, stop_time, nest_level, name);
685     internal_size++;
686     return;
687   }
688   void deallocate();
689   void sort();
690   const kmp_stats_event &operator[](int index) const { return events[index]; }
691   kmp_stats_event &operator[](int index) { return events[index]; }
692   const kmp_stats_event &at(int index) const { return events[index]; }
693   kmp_stats_event &at(int index) { return events[index]; }
694 };
695 
696 /* ****************************************************************
697     Class to implement a doubly-linked, circular, statistics list
698 
699     |---| ---> |---| ---> |---| ---> |---| ---> ... next
700     |   |      |   |      |   |      |   |
701     |---| <--- |---| <--- |---| <--- |---| <--- ... prev
702     Sentinel   first      second     third
703     Node       node       node       node
704 
705     The Sentinel Node is the user handle on the list.
706     The first node corresponds to thread 0's statistics.
707     The second node corresponds to thread 1's statistics and so on...
708 
709     Each node has a _timers, _counters, and _explicitTimers array to hold that
710     thread's statistics. The _explicitTimers point to the correct _timer and
711     update its statistics at every stop() call. The explicitTimers' pointers are
712     set up in the constructor. Each node also has an event vector to hold that
713     thread's timing events. The event vector expands as necessary and records
714     the start-stop times for each timer.
715 
716     The nestLevel variable is for plotting events and is related
717     to the bar width in the timeline graph.
718 
719     Every thread will have a thread local pointer to its node in
720     the list.  The sentinel node is used by the primary thread to
721     store "dummy" statistics before __kmp_create_worker() is called.
722 **************************************************************** */
723 class kmp_stats_list {
724   int gtid;
725   timeStat _timers[TIMER_LAST + 1];
726   counter _counters[COUNTER_LAST + 1];
727   explicitTimer thread_life_timer;
728   partitionedTimers _partitionedTimers;
729   int _nestLevel; // one per thread
730   kmp_stats_event_vector _event_vector;
731   kmp_stats_list *next;
732   kmp_stats_list *prev;
733   stats_state_e state;
734   int thread_is_idle_flag;
735 
736 public:
737   kmp_stats_list()
738       : thread_life_timer(&_timers[TIMER_OMP_worker_thread_life],
739                           TIMER_OMP_worker_thread_life),
740         _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE),
741         thread_is_idle_flag(0) {}
742   ~kmp_stats_list() {}
743   inline timeStat *getTimer(timer_e idx) { return &_timers[idx]; }
744   inline counter *getCounter(counter_e idx) { return &_counters[idx]; }
745   inline partitionedTimers *getPartitionedTimers() {
746     return &_partitionedTimers;
747   }
748   inline timeStat *getTimers() { return _timers; }
749   inline counter *getCounters() { return _counters; }
750   inline kmp_stats_event_vector &getEventVector() { return _event_vector; }
751   inline void startLife() { thread_life_timer.start(tsc_tick_count::now()); }
752   inline void endLife() { thread_life_timer.stop(tsc_tick_count::now(), this); }
753   inline void resetEventVector() { _event_vector.reset(); }
754   inline void incrementNestValue() { _nestLevel++; }
755   inline int getNestValue() { return _nestLevel; }
756   inline void decrementNestValue() { _nestLevel--; }
757   inline int getGtid() const { return gtid; }
758   inline void setGtid(int newgtid) { gtid = newgtid; }
759   inline void setState(stats_state_e newstate) { state = newstate; }
760   inline stats_state_e getState() const { return state; }
761   inline stats_state_e *getStatePointer() { return &state; }
762   inline bool isIdle() { return thread_is_idle_flag == 1; }
763   inline void setIdleFlag() { thread_is_idle_flag = 1; }
764   inline void resetIdleFlag() { thread_is_idle_flag = 0; }
765   kmp_stats_list *push_back(int gtid); // returns newly created list node
766   inline void push_event(uint64_t start_time, uint64_t stop_time,
767                          int nest_level, timer_e name) {
768     _event_vector.push_back(start_time, stop_time, nest_level, name);
769   }
770   void deallocate();
771   class iterator;
772   kmp_stats_list::iterator begin();
773   kmp_stats_list::iterator end();
774   int size();
775   class iterator {
776     kmp_stats_list *ptr;
777     friend kmp_stats_list::iterator kmp_stats_list::begin();
778     friend kmp_stats_list::iterator kmp_stats_list::end();
779 
780   public:
781     iterator();
782     ~iterator();
783     iterator operator++();
784     iterator operator++(int dummy);
785     iterator operator--();
786     iterator operator--(int dummy);
787     bool operator!=(const iterator &rhs);
788     bool operator==(const iterator &rhs);
789     kmp_stats_list *operator*() const; // dereference operator
790   };
791 };
792 
793 /* ****************************************************************
794    Class to encapsulate all output functions and the environment variables
795 
796    This module holds filenames for various outputs (normal stats, events, plot
797    file), as well as coloring information for the plot file.
798 
799    The filenames and flags variables are read from environment variables.
800    These are read once by the constructor of the global variable
801    __kmp_stats_output which calls init().
802 
803    During this init() call, event flags for the timeStat::timerInfo[] global
804    array are cleared if KMP_STATS_EVENTS is not true (on, 1, yes).
805 
806    The only interface function that is public is outputStats(heading).  This
807    function should print out everything it needs to, either to files or stderr,
808    depending on the environment variables described below
809 
810    ENVIRONMENT VARIABLES:
811    KMP_STATS_FILE -- if set, all statistics (not events) will be printed to this
812                      file, otherwise, print to stderr
813    KMP_STATS_THREADS -- if set to "on", then will print per thread statistics to
814                         either KMP_STATS_FILE or stderr
815    KMP_STATS_PLOT_FILE -- if set, print the ploticus plot file to this filename,
816                           otherwise, the plot file is sent to "events.plt"
817    KMP_STATS_EVENTS -- if set to "on", then log events, otherwise, don't log
818                        events
819    KMP_STATS_EVENTS_FILE -- if set, all events are outputted to this file,
820                             otherwise, output is sent to "events.dat"
821 **************************************************************** */
822 class kmp_stats_output_module {
823 
824 public:
825   struct rgb_color {
826     float r;
827     float g;
828     float b;
829   };
830 
831 private:
832   std::string outputFileName;
833   static const char *eventsFileName;
834   static const char *plotFileName;
835   static int printPerThreadFlag;
836   static int printPerThreadEventsFlag;
837   static const rgb_color globalColorArray[];
838   static rgb_color timerColorInfo[];
839 
840   void init();
841   static void setupEventColors();
842   static void printPloticusFile();
843   static void printHeaderInfo(FILE *statsOut);
844   static void printTimerStats(FILE *statsOut, statistic const *theStats,
845                               statistic const *totalStats);
846   static void printCounterStats(FILE *statsOut, statistic const *theStats);
847   static void printCounters(FILE *statsOut, counter const *theCounters);
848   static void printEvents(FILE *eventsOut, kmp_stats_event_vector *theEvents,
849                           int gtid);
850   static rgb_color getEventColor(timer_e e) { return timerColorInfo[e]; }
851   static void windupExplicitTimers();
852   bool eventPrintingEnabled() const { return printPerThreadEventsFlag; }
853 
854 public:
855   kmp_stats_output_module() { init(); }
856   void outputStats(const char *heading);
857 };
858 
859 #ifdef __cplusplus
860 extern "C" {
861 #endif
862 void __kmp_stats_init();
863 void __kmp_stats_fini();
864 void __kmp_reset_stats();
865 void __kmp_output_stats(const char *);
866 void __kmp_accumulate_stats_at_exit(void);
867 // thread local pointer to stats node within list
868 extern KMP_THREAD_LOCAL kmp_stats_list *__kmp_stats_thread_ptr;
869 // head to stats list.
870 extern kmp_stats_list *__kmp_stats_list;
871 // lock for __kmp_stats_list
872 extern kmp_tas_lock_t __kmp_stats_lock;
873 // reference start time
874 extern tsc_tick_count __kmp_stats_start_time;
875 // interface to output
876 extern kmp_stats_output_module __kmp_stats_output;
877 
878 #ifdef __cplusplus
879 }
880 #endif
881 
882 // Simple, standard interfaces that drop out completely if stats aren't enabled
883 
884 /*!
885  * \brief Adds value to specified timer (name).
886  *
887  * @param name timer name as specified under the KMP_FOREACH_TIMER() macro
888  * @param value double precision sample value to add to statistics for the timer
889  *
890  * \details Use KMP_COUNT_VALUE(name, value) macro to add a particular value to
891  * a timer statistics.
892  *
893  * @ingroup STATS_GATHERING
894  */
895 #define KMP_COUNT_VALUE(name, value)                                           \
896   __kmp_stats_thread_ptr->getTimer(TIMER_##name)->addSample((double)value)
897 
898 /*!
899  * \brief Increments specified counter (name).
900  *
901  * @param name counter name as specified under the KMP_FOREACH_COUNTER() macro
902  *
903  * \details Use KMP_COUNT_BLOCK(name, value) macro to increment a statistics
904  * counter for the executing thread.
905  *
906  * @ingroup STATS_GATHERING
907  */
908 #define KMP_COUNT_BLOCK(name)                                                  \
909   __kmp_stats_thread_ptr->getCounter(COUNTER_##name)->increment()
910 
911 /*!
912  * \brief Outputs the current thread statistics and reset them.
913  *
914  * @param heading_string heading put above the final stats output
915  *
916  * \details Explicitly stops all timers and outputs all stats. Environment
917  * variable, `OMPTB_STATSFILE=filename`, can be used to output the stats to a
918  * filename instead of stderr. Environment variable,
919  * `OMPTB_STATSTHREADS=true|undefined`, can be used to output thread specific
920  * stats. For now the `OMPTB_STATSTHREADS` environment variable can either be
921  * defined with any value, which will print out thread specific stats, or it can
922  * be undefined (not specified in the environment) and thread specific stats
923  * won't be printed. It should be noted that all statistics are reset when this
924  * macro is called.
925  *
926  * @ingroup STATS_GATHERING
927  */
928 #define KMP_OUTPUT_STATS(heading_string) __kmp_output_stats(heading_string)
929 
930 /*!
931  * \brief Initializes the partitioned timers to begin with name.
932  *
933  * @param name timer which you want this thread to begin with
934  *
935  * @ingroup STATS_GATHERING
936  */
937 #define KMP_INIT_PARTITIONED_TIMERS(name)                                      \
938   __kmp_stats_thread_ptr->getPartitionedTimers()->init(explicitTimer(          \
939       __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
940 
941 #define KMP_TIME_PARTITIONED_BLOCK(name)                                       \
942   blockPartitionedTimer __PBLOCKTIME__(                                        \
943       __kmp_stats_thread_ptr->getPartitionedTimers(),                          \
944       explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name),            \
945                     TIMER_##name))
946 
947 #define KMP_PUSH_PARTITIONED_TIMER(name)                                       \
948   __kmp_stats_thread_ptr->getPartitionedTimers()->push(explicitTimer(          \
949       __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
950 
951 #define KMP_POP_PARTITIONED_TIMER()                                            \
952   __kmp_stats_thread_ptr->getPartitionedTimers()->pop()
953 
954 #define KMP_EXCHANGE_PARTITIONED_TIMER(name)                                   \
955   __kmp_stats_thread_ptr->getPartitionedTimers()->exchange(explicitTimer(      \
956       __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
957 
958 #define KMP_SET_THREAD_STATE(state_name)                                       \
959   __kmp_stats_thread_ptr->setState(state_name)
960 
961 #define KMP_GET_THREAD_STATE() __kmp_stats_thread_ptr->getState()
962 
963 #define KMP_SET_THREAD_STATE_BLOCK(state_name)                                 \
964   blockThreadState __BTHREADSTATE__(__kmp_stats_thread_ptr->getStatePointer(), \
965                                     state_name)
966 
967 /*!
968  * \brief resets all stats (counters to 0, timers to 0 elapsed ticks)
969  *
970  * \details Reset all stats for all threads.
971  *
972  * @ingroup STATS_GATHERING
973  */
974 #define KMP_RESET_STATS() __kmp_reset_stats()
975 
976 #if (KMP_DEVELOPER_STATS)
977 #define KMP_COUNT_DEVELOPER_VALUE(n, v) KMP_COUNT_VALUE(n, v)
978 #define KMP_COUNT_DEVELOPER_BLOCK(n) KMP_COUNT_BLOCK(n)
979 #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) KMP_TIME_PARTITIONED_BLOCK(n)
980 #define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) KMP_PUSH_PARTITIONED_TIMER(n)
981 #define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) KMP_POP_PARTITIONED_TIMER(n)
982 #define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n)                            \
983   KMP_EXCHANGE_PARTITIONED_TIMER(n)
984 #else
985 // Null definitions
986 #define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
987 #define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
988 #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
989 #define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
990 #define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
991 #define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
992 #endif
993 
994 #else // KMP_STATS_ENABLED
995 
996 // Null definitions
997 #define KMP_COUNT_VALUE(n, v) ((void)0)
998 #define KMP_COUNT_BLOCK(n) ((void)0)
999 
1000 #define KMP_OUTPUT_STATS(heading_string) ((void)0)
1001 #define KMP_RESET_STATS() ((void)0)
1002 
1003 #define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
1004 #define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
1005 #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
1006 #define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
1007 #define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
1008 #define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
1009 #define KMP_INIT_PARTITIONED_TIMERS(name) ((void)0)
1010 #define KMP_TIME_PARTITIONED_BLOCK(name) ((void)0)
1011 #define KMP_PUSH_PARTITIONED_TIMER(name) ((void)0)
1012 #define KMP_POP_PARTITIONED_TIMER() ((void)0)
1013 #define KMP_SET_THREAD_STATE(state_name) ((void)0)
1014 #define KMP_GET_THREAD_STATE() ((void)0)
1015 #define KMP_SET_THREAD_STATE_BLOCK(state_name) ((void)0)
1016 #endif // KMP_STATS_ENABLED
1017 
1018 #endif // KMP_STATS_H
1019