1 /*
2 * rdtsc.h
3 *
4 * Created on: 09.08.2016
5 * Author: kruppa
6 */
7
8 #ifndef RDTSC_H_
9 #define RDTSC_H_
10
#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

#include "macros.h"
13
14 /* Enable at most one of these three defines, preferably in the including
15 compilation unit */
16 // #define USE_INTEL_PCM 1
17 // #define USE_PERF 1
18 // #define USE_JEVENTS 1
19
20 #include <stdlib.h>
21
/* Combine two 32-bit halves into one 64-bit value: (high << 32) + low.
   Declared static inline: this header may be included from several
   compilation units, and the previous non-static definition would cause
   duplicate-symbol link errors. */
static inline uint64_t u32_to_64(uint32_t low, uint32_t high)
{ return ((uint64_t) high << 32) + (uint64_t) low; }
24
25 #ifdef USE_INTEL_PCM
26
27 #include <sched.h>
28 #include <stdio.h>
29 #include "cpucounters.h"
30 static PCM * m;
31 static CoreCounterState before_sstate, after_sstate;
32
33 #elif defined USE_PERF
34
35 #include "libperf.h" /* standard libperf include */
36 static struct libperf_data* pd;
37 static uint64_t start_time, end_time;
38
39 #elif USE_JEVENTS
40
41 #ifdef __cplusplus
42 extern "C" {
43 #endif
44 #include "rdpmc.h"
45 #ifdef __cplusplus
46 }
47 #endif
48 struct rdpmc_ctx ctx;
49 #define IMMEDIATE_RDPMC 1
50 #ifdef IMMEDIATE_RDPMC
51 struct {unsigned lo, hi;} start_time;
52 static unsigned long end_time;
53 #else
54 static uint64_t start_time, end_time;
55 #endif
56
57 #else
58
59 #define START_COUNTER rdtscpl
60 #define END_COUNTER rdtscpl
61 static uint64_t start_time, end_time;
62
63 #endif
64
65 #if defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
66
/* Execute CPUID, which is a serializing instruction: it does not retire
   until all preceding instructions have completed, preventing earlier
   work from leaking past this point. EAX is forced to 0 (leaf 0) via the
   "+a" constraint; EBX/ECX/EDX are declared as clobbers since CPUID
   overwrites them. */
__attribute__((__unused__, __always_inline__)) ATTRIBUTE_ARTIFICIAL
static inline void serialize()
{
    unsigned long id = 0;
    __asm__ volatile("CPUID\n\t"
                     : "+a" (id)
                     :: "%rbx", "%rcx", "%rdx"
                     );
}
76
/* Use RDTSC to read CPU core clock cycles except that on contemporary CPUs,
   it actually reads some natural-time counter (the TSC ticks at a constant
   rate regardless of core frequency scaling).
   RDTSC returns the 64-bit counter split across EDX:EAX, stored into
   *high / *low. Note: RDTSC is NOT serializing; pair it with serialize()
   or use rdtscp() when ordering matters. */
__attribute__((__unused__, __always_inline__)) ATTRIBUTE_ARTIFICIAL
static inline void rdtsc(uint32_t *low, uint32_t *high)
{
    __asm__ volatile("RDTSC\n\t"
                     : "=d" (*high), "=a" (*low)
                     );
}
86
/* Use RDTSCP to serialize, then read TSC. Instructions following RDTSCP may
   start to execute before RDTSCP finishes (it only waits for preceding
   instructions). RDTSCP additionally writes IA32_TSC_AUX into ECX, which
   we discard — hence the "%rcx" clobber. */
__attribute__((__unused__, __always_inline__)) ATTRIBUTE_ARTIFICIAL
static inline void rdtscp(uint32_t *low, uint32_t *high)
{
    __asm__ volatile("RDTSCP\n\t"
                     : "=d" (*high), "=a" (*low)
                     :: "%rcx"
                     );
}
97
/* Read a performance measurement counter. The counter to read is chosen
   by "selector", passed in ECX; RDPMC returns its value in EDX:EAX.
   User-space execution of RDPMC requires CR4.PCE to be enabled — see the
   long comment further down in this file. */
__attribute__((__unused__, __always_inline__)) ATTRIBUTE_ARTIFICIAL
static inline void
rdpmc(uint32_t *low, uint32_t *high, const unsigned int selector)
{
    __asm__ volatile("rdpmc" : "=a" (*low), "=d" (*high) : "c" (selector));
}
105
106 #else /* defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM) */
107
108 __attribute__((__unused__, __always_inline__)) ATTRIBUTE_ARTIFICIAL
serialize()109 static inline void serialize() {}
110
111 __attribute__((__unused__, __always_inline__)) ATTRIBUTE_ARTIFICIAL
rdtsc(uint32_t * low,uint32_t * high)112 static inline void rdtsc(uint32_t *low, uint32_t *high)
113 { *low = *high = 0; }
114
115 __attribute__((__unused__, __always_inline__)) ATTRIBUTE_ARTIFICIAL
rdtscp(uint32_t * low,uint32_t * high)116 static inline void rdtscp(uint32_t *low, uint32_t *high)
117 { *low = *high = 0; }
118
119 __attribute__((__unused__, __always_inline__)) ATTRIBUTE_ARTIFICIAL
120 static inline void
rdpmc(uint32_t * low,uint32_t * high,const unsigned int selector MAYBE_UNUSED)121 rdpmc(uint32_t *low, uint32_t *high, const unsigned int selector MAYBE_UNUSED)
122 { *low = *high = 0; }
123
124 #endif
125
126 __attribute__((__unused__, __always_inline__)) ATTRIBUTE_ARTIFICIAL
rdtscl()127 static inline uint64_t rdtscl()
128 {
129 uint32_t high, low;
130 rdtsc(&low, &high);
131 return u32_to_64(low, high);
132 }
133
134 __attribute__((__unused__, __always_inline__)) ATTRIBUTE_ARTIFICIAL
rdtscpl()135 static inline uint64_t rdtscpl()
136 {
137 uint32_t high, low;
138 rdtscp(&low, &high);
139 return u32_to_64(low, high);
140 }
141
142 __attribute__((__unused__, __always_inline__)) ATTRIBUTE_ARTIFICIAL
rdpmcl(const unsigned int selector)143 static inline uint64_t rdpmcl(const unsigned int selector)
144 {
145 uint32_t high, low;
146 rdpmc(&low, &high, selector);
147 return u32_to_64(low, high);
148 }
149
150 // rdpmcl_cycles uses a "fixed-function" performance counter to return
151 // the count of actual CPU core cycles executed by the current core.
152 // Core cycles are not accumulated while the processor is in the "HALT"
153 // state, which is used when the operating system has no task(s) to run
154 // on a processor core.
155 // Note that this counter continues to increment during system calls
156 // and task switches. As such, it may be unreliable for timing long
157 // functions where the CPU may serve an interrupt request or where
158 // the kernel may preempt execution and switch to another process.
159 // It is best used for timing short intervals which usually run
160 // uninterrupted, and where occurrences of interruption are easily
161 // detected by an abnormally large cycle count.
162
163 // The RDPMC instruction must be enabled for execution in user-space.
164 // This requires a total of three bits to be set in CR4 and MSRs of
165 // the CPU:
166 // Bit 1<<8 in CR4 must be set to 1. On Linux, this can be effected by
167 // executing as root:
168 // echo 1 >> /sys/devices/cpu/rdpmc
169 // Bit 1<<33 must be set to 1 in the MSR_CORE_PERF_GLOBAL_CTRL
170 // (MSR address 0x38f). This enables the cycle counter so that it
// actually increments with each clock cycle; while this bit is 0,
172 // the counter value stays fixed.
173 // Bit 1<<5 must be set to 1 in the MSR_CORE_PERF_FIXED_CTR_CTRL
174 // (MSR address 0x38d) which causes the counter to be incremented
175 // on non-halted clock cycles that occur while the CPL is >0
176 // (user-mode). If bit 1<<4 is set to 1, then the counter will
177 // increment while CPL is 0 (kernel mode), e.g., during interrupts,
178 // etc.
179
180 // The only reliable way I found to enable all these bits is through
181 // the JEVENTS library which uses the PERF kernel interface which lets
182 // it know that the bits are supposed to stay on. Doing it manually
183 // with, e.g., a kernel module causes the kernel to clear the bits
184 // again next time it updates any of these CR/MSR.
185
186
187 __attribute__((__unused__, __always_inline__)) ATTRIBUTE_ARTIFICIAL
188 static inline void
rdpmc_cycles(uint32_t * low,uint32_t * high)189 rdpmc_cycles(uint32_t *low, uint32_t *high)
190 {
191 const unsigned c = (1U<<30) + 1; /* Second Fixed-function counter:
192 clock cycles in non-HALT */
193 rdpmc(low, high, c);
194 }
195
196 __attribute__((__unused__, __always_inline__)) ATTRIBUTE_ARTIFICIAL
rdpmcl_cycles()197 static inline uint64_t rdpmcl_cycles()
198 {
199 uint32_t low, high;
200 rdpmc_cycles(&low, &high);
201 return u32_to_64(low, high);
202 }
203
/* One-time initialisation of the selected timing backend. Exactly one of
   USE_INTEL_PCM / USE_PERF / USE_JEVENTS selects a backend; with none
   defined, plain RDTSC/RDTSCP is used and only a diagnostic line is
   printed. Exits the process if the backend fails to initialise. */
static inline void init_timing()
{
#ifdef USE_INTEL_PCM
    printf("# Using Intel PCM library\n");
    m = PCM::getInstance();
    if (m->program() != PCM::Success) {
        printf("Could not initialise PCM\n");
        exit(EXIT_FAILURE);
    }

    /* Report whether the CPU uses SMT (hyper-threading) */
    const bool have_smt = m->getSMT();
    if (have_smt) {
        printf("# CPU uses SMT\n");
    } else {
        printf("# CPU does not use SMT\n");
    }
#elif defined(USE_PERF)
    printf("# Using PERF library\n");
    pd = libperf_initialize(-1,-1); /* init lib */
    libperf_enablecounter(pd, LIBPERF_COUNT_HW_CPU_CYCLES);
                                    /* enable HW counter */
#elif defined(USE_JEVENTS)
    printf("# Using jevents library\n");
#ifdef TIMING_SERIALIZE
    printf("# Serializing with CPUID before timing start and with RDTSCP at "
           "timing end\n");
#endif
    /* Opening the counter through the perf interface also enables
       user-space RDPMC (see the long comment above) */
    if (rdpmc_open(PERF_COUNT_HW_CPU_CYCLES, &ctx) < 0)
        exit(EXIT_FAILURE);
#else
    printf("# Using " CADO_STRINGIZE(START_COUNTER) " to start and "
           CADO_STRINGIZE(END_COUNTER) " to end measurement\n");
#endif
}
238
/* Release whatever resources init_timing() acquired for the selected
   backend. Safe to call once after timing is finished. */
static inline void clear_timing()
{
#ifdef USE_INTEL_PCM
    m->cleanup();
#elif defined(USE_PERF)
    libperf_close(pd);
    pd = NULL;
#elif defined(USE_JEVENTS)
    rdpmc_close (&ctx);
#else
    /* Timing with RDTSC[P] does not allocate anything */
#endif
}
252
/* Take the starting counter reading and store it in the backend-specific
   start_time state; paired with end_timing()/get_diff_timing(). */
static inline void start_timing()
{
#ifdef USE_INTEL_PCM
    const int cpu = sched_getcpu();
    before_sstate = getCoreCounterState(cpu);
#elif defined(USE_PERF)
    start_time = libperf_readcounter(pd, LIBPERF_COUNT_HW_CPU_CYCLES);
#elif defined(USE_JEVENTS)
#ifdef TIMING_SERIALIZE
    /* CPUID barrier: keep earlier instructions out of the timed section */
    serialize();
#endif
#ifdef IMMEDIATE_RDPMC
    /* Inlined RDPMC; the two 32-bit halves are combined later, in
       get_diff_timing() */
    rdpmc_cycles(&start_time.lo, &start_time.hi);
#else
    start_time = rdpmc_read(&ctx);
#endif
#else
    start_time = START_COUNTER();
#endif
}
273
/* Take the ending counter reading and store it in the backend-specific
   end_time state; paired with start_timing()/get_diff_timing(). */
static inline void end_timing()
{
#ifdef USE_INTEL_PCM
    const int cpu = sched_getcpu();
    after_sstate = getCoreCounterState(cpu);
#elif defined(USE_PERF)
    end_time = libperf_readcounter(pd, LIBPERF_COUNT_HW_CPU_CYCLES);
#elif defined(USE_JEVENTS)
#ifdef TIMING_SERIALIZE
    /* RDTSCP waits for preceding instructions to finish; its return
       value is discarded — it serves only as a barrier here */
    rdtscpl();
#endif
#ifdef IMMEDIATE_RDPMC
    end_time = rdpmcl_cycles();
#else
    end_time = rdpmc_read(&ctx);
#endif
#else
    end_time = END_COUNTER();
#endif
}
294
/* Return the number of counted events (cycles) elapsed between the most
   recent start_timing() and end_timing() calls. */
static inline uint64_t get_diff_timing()
{
#ifdef USE_INTEL_PCM
    return getCycles(before_sstate,after_sstate);
#elif defined(USE_PERF)
    return end_time - start_time;
#elif defined(USE_JEVENTS)
#ifdef IMMEDIATE_RDPMC
    /* start_time was stored as raw lo/hi halves; widen it here */
    return end_time - u32_to_64(start_time.lo, start_time.hi);
#else
    return end_time - start_time;
#endif
#else
    return end_time - start_time;
#endif

}
312
313 #ifdef USE_JEVENTS
314 static uint64_t pmc0, pmc1, pmc2, pmc3;
315 #endif
316
/* Snapshot the first four programmable performance counters into the
   pmc0..pmc3 globals; a no-op unless USE_JEVENTS is enabled.
   Made static inline: a non-static definition in a header causes
   duplicate-symbol link errors when included from several compilation
   units, and matches every other function in this file. */
static inline void
readAllPmc()
{
#ifdef USE_JEVENTS
    pmc0 = rdpmcl(0);
    pmc1 = rdpmcl(1);
    pmc2 = rdpmcl(2);
    pmc3 = rdpmcl(3);
#endif
}
327
/* Replace each pmcN snapshot with the delta accumulated since the last
   readAllPmc() call; a no-op unless USE_JEVENTS is enabled.
   Bug fix: pmc2 and pmc3 previously re-read counters 0 and 1 (an obvious
   copy-paste error), producing meaningless deltas; they now read
   counters 2 and 3. Also made static inline to avoid duplicate-symbol
   link errors from a header-level non-static definition. */
static inline void
diffAllPmc()
{
#ifdef USE_JEVENTS
    pmc0 = rdpmcl(0) - pmc0;
    pmc1 = rdpmcl(1) - pmc1;
    pmc2 = rdpmcl(2) - pmc2;
    pmc3 = rdpmcl(3) - pmc3;
#endif
}
338
/* Print the four pmcN values (snapshots or deltas, depending on whether
   diffAllPmc() ran) to stdout; a no-op unless USE_JEVENTS is enabled.
   Fixes: "%lu" is the wrong format for uint64_t on ILP32/LLP64 targets
   (undefined behavior) — use PRIu64 from <inttypes.h>. Also made static
   inline to avoid duplicate-symbol link errors from a header-level
   non-static definition. */
static inline void
printAllPmc()
{
#ifdef USE_JEVENTS
    printf(" (pmc0: %" PRIu64 ", pmc1: %" PRIu64
           ", pmc2: %" PRIu64 ", pmc3: %" PRIu64 ")",
           pmc0, pmc1, pmc2, pmc3);
#endif
}
347
348 #endif /* RDTSC_H_ */
349