1 /* dzl-counter.h
2  *
3  * Copyright (C) 2013-2015 Christian Hergert <christian@hergert.me>
4  *
5  * This file is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU Lesser General Public
7  * License as published by the Free Software Foundation; either
8  * version 2.1 of the License, or (at your option) any later version.
9  *
10  * This file is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * Lesser General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
17  *
18  * Additionally, this file does not claim copyright over the expansion
19  * of macros in your source program.
20  */
21 
22 #ifndef DZL_COUNTER_H
23 #define DZL_COUNTER_H
24 
25 #include <glib-object.h>
26 
27 #include "dzl-version-macros.h"
28 
29 /*
30  * History
31  * =======
32  *
33  * DzlCounter is a performance counter based on ideas from previous work
34  * on high performance counters. They are not guaranteed to be 100%
35  * correct, but they approach that with no synchronization given new
36  * enough hardware. In particular, we use %ecx from rdtscp (the core id)
 * to determine which cacheline to increment the counter within.
38  *
 * Given a counter, the value will be split up into NCPU cachelines where
40  * NCPU is the number of cores returned from get_nprocs() (on Linux).
41  *
42  * Updating the counter is very cheap, reading back the counter requires
43  * a volatile read of each cacheline. Again, no correctness is guaranteed.
44  *
45  * In practice, very few values are lost even during tight competing loops.
46  * A loss can happen when the thread is pre-empted between the %rdtscp
47  * instruction and the addq increment (on x86_64).
48  *
49  *
50  * Using DzlCounter
51  * ================
52  *
53  * To define a counter, you must have support for constructor attributes.
54  *
55  *   DZL_DEFINE_COUNTER (Symbol, "Category", "Name", "Description")
56  *
57  * To increment the counter in a function of your choice (but within the
58  * same module), use DZL_COUNTER_ADD, DZL_COUNTER_INC, DZL_COUNTER_DEC.
59  *
60  *   DZL_COUNTER_INC (Symbol);
61  *
62  *
63  * Architecture Support
64  * ====================
65  *
66  * If you are not on x86_64, or are missing the rdtscp instruction, a 64-bit
 * atomic will be performed using __sync_add_and_fetch(). Clearly, we can
68  * do some more work here to abstract which implementation is used, but we
69  * only support GCC and Clang today, which both have that intrinsic. Some
70  * architectures may not have it (such as 32-bit PPC), but I'm not too
71  * concerned about that at the moment.
72  *
73  * The counters are mapped into a shared memory zone using shm_open() and
74  * mmap(). An external program can then discover the available counters
75  * and print them without blocking the target program. It simply must
76  * perform the reads in a volatile manner just like the target process
77  * would need to do for readback.
78  *
79  * DzlCounterArena provides a helper to walk through the counters in the
80  * shared memory zone. dzl_counter_arena_foreach().
81  *
82  * You cannot remove a counter once it has been registered.
83  *
84  *
85  * Accessing Counters Remotely
86  * ===========================
87  *
88  * You can access the counters from out of process. By opening the SHM zone
 * and reading the contents from each cacheline, you can get the approximate
90  * state of the target application without blocking it.
91  *
92  * DzlCounterArena provides a helper for you to do this.
93  *
94  *   DzlCounterArena *arena;
95  *
96  *   arena = dzl_counter_arena_new_for_pid (other_process_pid);
97  *   dzl_counter_arena_foreach (arena, my_counter_callback, user_data);
98  *
99  *
100  * Data Layout
101  * ===========
102  *
103  * The layout of the shared memory zone is broken into "cells". Each cell
104  * is an approximate cacheline (64-bytes) on modern Intel hardware. Indexes
105  * to data locations are represented in cells to simplify the math and
106  * allow the compiler to know we are working with properly aligned structures.
107  *
108  * The base pointer in DzlCounter.values is not 64-byte aligned! It is 8-byte
109  * aligned and points to the offset within the cacheline for that counter.
110  * We pack 8 64-bit counters into a single cacheline. This allows us to avoid
111  * an extra MOV instruction when incrementing since we only need to perform
112  * the offset from the base pointer.
113  *
114  * The first two cells are the header which contain information about the
115  * underlying shm file and how large the mmap() range should be.
116  *
117  * After that, begin the counters.
118  *
 * The counters are laid out in groups of 8 counters.
120  *
121  *  [8 CounterInfo Structs (128-bytes each)][N_CPU Data Zones (64-byte each)]
122  *
123  * See dzl-counter.c for more information on the contents of these structures.
124  *
125  *
126  * Build System Requirements
127  * =========================
128  *
129  * We need to know if rdtscp is available at compile time. In an effort
130  * to keep the headers as portable as possible (if that matters here?) we
131  * require that you define DZL_HAVE_RDTSCP if the instruction is supported.
132  *
133  * An example for autoconf might be similar to the following:
134  *
135  *   AC_MSG_CHECKING([for fast counters with rdtscp])
136  *   AC_RUN_IFELSE(
137  *     [AC_LANG_SOURCE([[
138  *      #include <x86intrin.h>
139  *      int main (int argc, char *argv[]) { int cpu; __builtin_ia32_rdtscp (&cpu); return 0; }]])],
140  *     [have_rdtscp=yes],
141  *     [have_rdtscp=no])
142  *   AC_MSG_RESULT([$have_rdtscp])
143  *   AS_IF([test "$have_rdtscp" = "yes"],
144  *         [CFLAGS="$CFLAGS -DDZL_HAVE_RDTSCP"])
145  */
146 
147 G_BEGIN_DECLS
148 
/*
 * dzl_get_current_cpu():
 *
 * Expands to an expression giving the index of the per-CPU cell that
 * DZL_COUNTER_ADD() updates. The implementation is picked at compile time,
 * first match wins:
 *
 *  1. DZL_HAVE_RDTSCP defined: read the CPU id with the rdtscp instruction
 *     via the inline helper dzl_get_current_cpu_rdtscp().
 *  2. Linux: call the out-of-line dzl_get_current_cpu_call().
 *  3. 32-bit PowerPC (not reached on Linux, which matches rule 2 first):
 *     always use cell 0 and make sure DZL_COUNTER_REQUIRES_ATOMIC is not
 *     defined, so a plain add is used. Presumably this is because the
 *     64-bit atomic intrinsic may be missing on that architecture.
 *  4. Anything else: always use cell 0 and define
 *     DZL_COUNTER_REQUIRES_ATOMIC so DZL_COUNTER_ADD() uses an atomic add.
 */
#ifdef DZL_HAVE_RDTSCP
# include <x86intrin.h>
  static inline guint
  dzl_get_current_cpu_rdtscp (void)
  {
    /*
     * This extracts the IA32_TSC_AUX into the ecx register. On Linux,
     * that value contains a value with the bottom 12 bits being the
     * cpu identifier, and the next 10 bits being the node group.
     */
    guint aux;

    /* Only the auxiliary value is needed; the timestamp result is ignored. */
    __builtin_ia32_rdtscp (&aux);

    /* Keep the low 12 bits (the cpu identifier), dropping the node group. */
    return aux & 0xFFF;
  }
# define dzl_get_current_cpu() dzl_get_current_cpu_rdtscp()
#elif defined(__linux__)
# define dzl_get_current_cpu() dzl_get_current_cpu_call()
#elif defined(__powerpc__) && !defined(__powerpc64__)
# define dzl_get_current_cpu() 0
# undef DZL_COUNTER_REQUIRES_ATOMIC
#else
# define dzl_get_current_cpu() 0
# define DZL_COUNTER_REQUIRES_ATOMIC 1
#endif
173 
/**
 * DZL_DEFINE_COUNTER:
 * @Identifier: The symbol name of the counter
 * @Category: A string category for the counter.
 * @Name: A string name for the counter.
 * @Description: A string description for the counter.
 *
 * Defines a file-local #DzlCounter named `Identifier_ctr`, filled in with
 * @Category, @Name and @Description, together with a file-local function
 * `Identifier_ctr_init()` carrying the constructor attribute. That function
 * registers the counter with the arena returned by
 * dzl_counter_arena_get_default().
 *
 * The counter storage is `static`, so DZL_COUNTER_ADD() and friends must be
 * used from the same translation unit that defines the counter.
 *
 * |[<!-- language="C" -->
 * DZL_DEFINE_COUNTER (my_counter, "My", "Counter", "My Counter Description");
 * ]|
 */
#define DZL_DEFINE_COUNTER(Identifier, Category, Name, Description)            \
 static DzlCounter Identifier##_ctr = { NULL, Category, Name, Description };   \
 static void __attribute__((constructor))                                      \
 Identifier##_ctr_init (void)                                                  \
 {                                                                             \
   dzl_counter_arena_register (dzl_counter_arena_get_default (),               \
                               &Identifier##_ctr);                             \
 }
193 
/**
 * DZL_COUNTER_INC:
 * @Identifier: The identifier of the counter.
 *
 * Adds one to the counter @Identifier. This is shorthand for
 * DZL_COUNTER_ADD() with a count of 1.
 */
#define DZL_COUNTER_INC(Identifier) \
  DZL_COUNTER_ADD (Identifier, G_GINT64_CONSTANT (1))
201 
/**
 * DZL_COUNTER_DEC:
 * @Identifier: The identifier of the counter.
 *
 * Takes one away from the counter @Identifier by adding the 64-bit
 * constant -1 through DZL_COUNTER_ADD().
 */
#define DZL_COUNTER_DEC(Identifier) \
  DZL_COUNTER_ADD (Identifier, (-G_GINT64_CONSTANT (1)))
209 
/**
 * DZL_COUNTER_SUB:
 * @Identifier: The identifier of the counter.
 * @Count: the amount to subtract.
 *
 * Subtracts @Count from the counter identified by @Identifier by adding
 * its negation through DZL_COUNTER_ADD().
 *
 * @Count is converted to #gint64 before it is negated. Negating first
 * would be done in the argument's own type, so an unsigned 32-bit value
 * such as `1u` would wrap to 4294967295 and the counter would grow
 * instead of shrink.
 */
#define DZL_COUNTER_SUB(Identifier, Count) \
  DZL_COUNTER_ADD (Identifier, (-((gint64)(Count))))
218 
/**
 * DZL_COUNTER_ADD:
 * @Identifier: The identifier of the counter.
 * @Count: the amount to add to the counter.
 *
 * Adds @Count, converted to #gint64, to the counter @Identifier.
 *
 * Full correctness is not guaranteed; the macro trades some accuracy for
 * speed. Unless %DZL_COUNTER_REQUIRES_ATOMIC is defined, the add is a plain
 * read-modify-write on the cell selected by dzl_get_current_cpu(). When the
 * %rdtscp instruction is used to pick that cell, the thread may still be
 * moved to another CPU between %rdtscp and the %addq (on 64-bit Intel), so
 * an update can occasionally collide with another thread and be lost.
 *
 * When %DZL_COUNTER_REQUIRES_ATOMIC is defined, every update instead goes
 * to cell 0 through an atomic add (and therefore a memory barrier).
 *
 * See #DzlCounter for more information.
 */
#ifndef DZL_COUNTER_REQUIRES_ATOMIC
/* Unsynchronized add into the cell chosen for the current CPU. */
# define DZL_COUNTER_ADD(Identifier, Count)                                  \
  G_STMT_START {                                                             \
    Identifier##_ctr.values[dzl_get_current_cpu ()].value +=                 \
      (gint64)(Count);                                                       \
  } G_STMT_END
#else
/* Single shared cell, updated atomically. */
# define DZL_COUNTER_ADD(Identifier, Count)                                  \
  G_STMT_START {                                                             \
    __sync_add_and_fetch (&Identifier##_ctr.values[0].value,                 \
                          (gint64)(Count));                                  \
  } G_STMT_END
#endif
248 
/* Forward declarations; DzlCounterArena stays opaque in this header. */
typedef struct _DzlCounterArena DzlCounterArena;
typedef struct _DzlCounterValue DzlCounterValue;
typedef struct _DzlCounter      DzlCounter;
252 
253 /**
254  * DzlCounterForeachFunc:
255  * @counter: the counter.
256  * @user_data: data supplied to dzl_counter_arena_foreach().
257  *
258  * Function prototype for callbacks provided to dzl_counter_arena_foreach().
259  */
260 typedef void (*DzlCounterForeachFunc) (DzlCounter *counter,
261                                        gpointer    user_data);
262 
263 struct _DzlCounter
264 {
265   /*< Private >*/
266   DzlCounterValue *values;
267   const gchar     *category;
268   const gchar     *name;
269   const gchar     *description;
270 } __attribute__ ((aligned(8)));
271 
272 struct _DzlCounterValue
273 {
274   volatile gint64 value;
275   gint64          padding [7];
276 } __attribute__ ((aligned(8)));
277 
278 DZL_AVAILABLE_IN_ALL
279 GType            dzl_counter_arena_get_type     (void);
280 DZL_AVAILABLE_IN_ALL
281 guint            dzl_get_current_cpu_call       (void);
282 DZL_AVAILABLE_IN_ALL
283 DzlCounterArena *dzl_counter_arena_get_default  (void);
284 DZL_AVAILABLE_IN_ALL
285 DzlCounterArena *dzl_counter_arena_new_for_pid  (GPid                   pid);
286 DZL_AVAILABLE_IN_ALL
287 DzlCounterArena *dzl_counter_arena_ref          (DzlCounterArena       *arena);
288 DZL_AVAILABLE_IN_ALL
289 void             dzl_counter_arena_unref        (DzlCounterArena       *arena);
290 DZL_AVAILABLE_IN_ALL
291 void             dzl_counter_arena_register     (DzlCounterArena       *arena,
292                                                  DzlCounter            *counter);
293 DZL_AVAILABLE_IN_ALL
294 void             dzl_counter_arena_foreach      (DzlCounterArena       *arena,
295                                                  DzlCounterForeachFunc  func,
296                                                  gpointer               user_data);
297 DZL_AVAILABLE_IN_ALL
298 void             dzl_counter_reset              (DzlCounter            *counter);
299 DZL_AVAILABLE_IN_ALL
300 gint64           dzl_counter_get                (DzlCounter            *counter);
301 
302 G_END_DECLS
303 
304 #endif /* DZL_COUNTER_H */
305