1 /*
2  * Copyright (C) 1996-2021 The Squid Software Foundation and contributors
3  *
4  * Squid software is distributed under GPLv2+ license and includes
5  * contributions from numerous individuals and organizations.
6  * Please see the COPYING and CONTRIBUTORS files for details.
7  */
8 
9 /* DEBUG: section 81    CPU Profiling Routines */
10 
11 /**
12  * CPU Profiling implementation.
13  *
14  * \par
15  *  This library implements the Probes needed to gather stats.
16  *  See src/ProfStats.c which implements historical recording and
17  *  presentation in CacheMgr.cgi.
18  *
19  * \par
20  *  For timing we prefer on-CPU ops that retrieve cpu ticks counter.
21  *  For Intel, this is "rdtsc", which is 64-bit counter that virtually
22  *  never wraps. For alpha, this is "rpcc" which is 32-bit counter and
23  *  wraps every few seconds. Currently, no handling of wrapping counters
24  *  is implemented. Other CPU's are also not covered. Potentially all
25  *  modern CPU's has similar counters.
26  *
27  * Usage.
28  *  Insert macro PROF_state(probename) in strategic places in code.
29  *    PROF_start(probename);
30  *     ...  section of code measured ...
31  *    PROF_stop(probename);
32  *
33  *   probename must be added to the xprof_type.h enum list
34  *   with prepended "XPROF_" string.
35  *
36  * \section description Description.
37  * \par PROF
38  *  gathers stats per probename into structures. It indexes these
39  *  structures by enum type index in an array.
40  *
41  * \par PROF
 *  records best, average and worst values for delta time,
 *  also, if UNACCED is defined, it measures "empty" time during which
 *  no probes are in measuring state. This allows one to see time "unaccounted"
45  *  for. If OVERHEAD is defined, additional calculations are made at every
46  *  probe to measure approximate overhead of the probe code itself.
47  *
48  * \par
49  *  Probe data is stored in linked-list, so the more probes you define,
50  *  the more overhead is added to find the deepest nested probe. To reduce
51  *  average overhead, linked list is manipulated each time PR_start is
52  *  called, so that probe just started is moved 1 position up in linkedlist.
53  *  This way frequently used probes are moved closer to the head of list,
54  *  reducing average overhead.
 *  Note that all overhead is on the scale of one hundred CPU clock
 *  ticks, which is on the scale of sub-microseconds. Yet, to optimise really
57  *  fast and frequent sections of code, we want to reduce this overhead
58  *  to absolute minimum possible.
59  *
60  * \par
61  *  For actual measurements, probe overhead cancels out mostly. Still,
62  *  do not take the measured times as facts, they should be viewed in
63  *  relative comparison to overall CPU time and on the same platform.
64  *
65  * \par
66  *  Every 1 second, Event within squid is called that parses gathered
67  *  statistics of every probe, and accumulates that into historical
68  *  structures for last 1,5,30 secs, 1,5,30 mins, and 1,5 and 24 hours.
69  *  Each second active probe stats are reset, and only historical data
70  *  is presented in cachemgr output.
71  *
72  * \section reading Reading stats.
73  * \par
74  *  "Worst case" may be misleading. Anything can happen at any section
75  *  of code that could delay reaching to probe stop. For eg. system may
76  *  need to service interrupt routine, task switch could occur, or page
77  *  fault needs to be handled. In this sense, this is quite meaningless
78  *  metric. "Best case" shows fastest completion of probe section, and
79  *  is also somewhat useless, unless you know that amount of work is
80  *  constant. Best metric to watch is "average time" and total cumulated
81  *  time in given timeframe, which really show percentage of time spent
82  *  in given section of code, and its average completion time. This data
 *  could be used to detect bottlenecks within squid and optimise them.
84  *
85  * \par
 *  TOTALS are quite off reality. They are there just to summarise cumulative
87  *  times and percent column. Percent values over 100% shows that there
88  *  have been some probes nested into each other.
89  *
90  */
91 
92 #include "squid.h"
93 #include "profiler/Profiler.h"
94 
95 #if USE_XPROF_STATS
96 
97 #include <cassert>
98 #if HAVE_GNUMALLLOC_H
99 #include <gnumalloc.h>
100 #elif HAVE_MALLOC_H
101 #include <malloc.h>
102 #endif
103 #if HAVE_UNISTD_H
104 #include <unistd.h>
105 #endif
106 
107 /* Exported Data */
108 TimersArray *xprof_Timers = NULL;
109 
110 /* Private stuff */
111 
112 /* new stuff */
/* Maximum nesting depth of simultaneously active (started, not yet
 * stopped) probes */
#define MAXSTACKDEPTH   512

/* One frame of the probe call stack: a probe that is currently running,
 * or suspended while a nested probe runs */
struct _callstack_entry {
    int timer;      /* index into timers array */
    const char *name;
    hrtime_t start, stop, accum;    /* tick of last (re)start, stop marker, banked elapsed ticks */
};

/* Stack of active probe frames; cstack_head indexes the first free slot
 * (0 == no probe active) */
struct _callstack_entry cstack[MAXSTACKDEPTH];
int cstack_head = 0;
123 
124 #if defined(_MSC_VER) /* Microsoft C Compiler ONLY */
125 static __inline void
126 #else
127 static inline void
128 #endif
xprof_update(xprof_stats_data * head)129 xprof_update(xprof_stats_data * head)
130 {
131     if (head->delta < head->best)
132         head->best = head->delta;
133     if (head->worst < head->delta)
134         head->worst = head->delta;
135     head->summ += head->delta;
136     ++head->count;
137 }
138 
/* Shortcut to the stats of the pseudo-probe that accumulates time spent
 * outside all real probes (set up by xprof_InitLib) */
static xprof_stats_data *xp_UNACCOUNTED;
/* Non-zero once xprof_InitLib() has completed */
static int xprof_inited = 0;
141 
142 static void
xprof_InitLib(void)143 xprof_InitLib(void)
144 {
145     if (xprof_inited)
146         return;
147 
148     xprof_Timers = static_cast<TimersArray *>(calloc(XPROF_LAST + 2, sizeof(xprof_stats_node)));
149 
150     xprof_Timers[XPROF_PROF_UNACCOUNTED]->name = "PROF_UNACCOUNTED";
151     xprof_Timers[XPROF_PROF_UNACCOUNTED]->accu.start = get_tick();
152     xp_UNACCOUNTED = &xprof_Timers[XPROF_PROF_UNACCOUNTED]->accu;
153     cstack_head = 0;
154     xprof_inited = 1;
155 }
156 
157 void
xprof_start(xprof_type type,const char * timer)158 xprof_start(xprof_type type, const char *timer)
159 {
160     hrtime_t tt = get_tick();
161     if (!xprof_inited)
162         xprof_InitLib();
163 
164     /* If nested, stop current stack frame */
165     if (cstack_head > 0) {
166         cstack[cstack_head - 1].accum += get_tick() - cstack[cstack_head - 1].start;
167         cstack[cstack_head - 1].start = -1;
168     }
169 
170     /* Are we at the first level? If so; stop the unaccounted timer */
171     if (cstack_head == 0) {
172         assert(xp_UNACCOUNTED->start != -1);
173         xp_UNACCOUNTED->delta = tt - xp_UNACCOUNTED->start;
174         xp_UNACCOUNTED->start = -1;
175         xprof_update(xp_UNACCOUNTED);
176     }
177 
178     /* Allocate new stack frame */
179     cstack[cstack_head].start = tt;
180     cstack[cstack_head].stop = -1;
181     cstack[cstack_head].accum = 0;
182     cstack[cstack_head].timer = type;
183     cstack[cstack_head].name = timer;
184     ++cstack_head;
185     assert(cstack_head < MAXSTACKDEPTH);
186 
187 }
188 
189 void
xprof_stop(xprof_type type,const char * timer)190 xprof_stop(xprof_type type, const char *timer)
191 {
192     hrtime_t tt = get_tick();
193     assert(cstack_head > 0);
194     --cstack_head;
195     assert(cstack[cstack_head].timer == type);
196 
197     /* Record timer details */
198     cstack[cstack_head].accum += tt - cstack[cstack_head].start;
199     xprof_Timers[type]->accu.delta = cstack[cstack_head].accum;
200     xprof_Timers[type]->name = timer;
201 
202     /* Update */
203     xprof_update(&xprof_Timers[type]->accu);
204 
205     /* Restart previous timer if we're not at the top level */
206     if (cstack_head > 0) {
207         cstack[cstack_head - 1].start = tt;
208         cstack[cstack_head - 1].stop = 0;
209         return;
210     }
211     /* Get here? We're at the top level; unaccounted */
212     xp_UNACCOUNTED->start = tt;
213 }
214 
215 #endif /* USE_XPROF_STATS */
216 
217