1 /*
2  * rdtsc.h
3  *
4  *  Created on: 09.08.2016
5  *      Author: kruppa
6  */
7 
8 #ifndef RDTSC_H_
9 #define RDTSC_H_
10 
#include <stdint.h>
#include <inttypes.h>

#include "macros.h"
13 
14 /* Enable at most one of these three defines, preferably in the including
15    compilation unit */
16 // #define USE_INTEL_PCM 1
17 // #define USE_PERF 1
18 // #define USE_JEVENTS 1
19 
20 #include <stdlib.h>
21 
/* Combine two 32-bit halves into one 64-bit value (low word in "low",
   high word in "high").
   Marked static inline: this is a header, so a non-static definition
   would produce duplicate-symbol errors at link time whenever the
   header is included from more than one compilation unit. */
static inline uint64_t u32_to_64(uint32_t low, uint32_t high)
{ return ((uint64_t) high << 32) | (uint64_t) low; }
24 
25 #ifdef USE_INTEL_PCM
26 
27 #include <sched.h>
28 #include <stdio.h>
29 #include "cpucounters.h"
30 static PCM * m;
31 static CoreCounterState before_sstate, after_sstate;
32 
33 #elif defined USE_PERF
34 
35 #include "libperf.h"  /* standard libperf include */
36 static struct libperf_data* pd;
37 static uint64_t start_time, end_time;
38 
39 #elif USE_JEVENTS
40 
41 #ifdef __cplusplus
42 extern "C" {
43 #endif
44 #include "rdpmc.h"
45 #ifdef __cplusplus
46 }
47 #endif
48 struct rdpmc_ctx ctx;
49 #define IMMEDIATE_RDPMC 1
50 #ifdef IMMEDIATE_RDPMC
51 struct {unsigned lo, hi;} start_time;
52 static unsigned long end_time;
53 #else
54 static uint64_t start_time, end_time;
55 #endif
56 
57 #else
58 
59 #define START_COUNTER rdtscpl
60 #define END_COUNTER rdtscpl
61 static uint64_t start_time, end_time;
62 
63 #endif
64 
65 #if defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
66 
/* Serialize the instruction stream with CPUID: all preceding
   instructions must retire before CPUID executes, so no earlier work
   can leak past a timing boundary. Clobbers EAX (leaf 0 requested),
   EBX, ECX and EDX. */
__attribute__((__unused__, __always_inline__)) ATTRIBUTE_ARTIFICIAL
static inline void serialize()
{
    unsigned long id = 0;
    __asm__ volatile("CPUID\n\t"
		     : "+a" (id)
		     :: "%rbx", "%rcx", "%rdx"
	);
}
76 
77 /* Use RDTSC to read CPU core clock cycles except that on contemporary CPUs,
78    it actually reads some natural-time counter */
/* Read the time-stamp counter; low 32 bits into *low, high 32 bits
   into *high. Non-serializing: the CPU may reorder it relative to
   surrounding instructions (use serialize()/rdtscp() for barriers). */
__attribute__((__unused__, __always_inline__)) ATTRIBUTE_ARTIFICIAL
static inline void rdtsc(uint32_t *low, uint32_t *high)
{
    __asm__ volatile("RDTSC\n\t"
		     : "=d" (*high), "=a" (*low)
	);
}
86 
87 /* Use RDTSCP to serialize, then read TSC. Instructions following RDTSCP may
88    start to execute before RDTSCP finishes. */
/* Read the TSC with RDTSCP; low half into *low, high half into *high.
   RDTSCP waits for all preceding instructions to complete before
   reading the counter, but later instructions may begin before it
   finishes. ECX (the processor-ID output) is clobbered and discarded. */
__attribute__((__unused__, __always_inline__)) ATTRIBUTE_ARTIFICIAL
static inline void rdtscp(uint32_t *low, uint32_t *high)
{
    __asm__ volatile("RDTSCP\n\t"
		     : "=d" (*high), "=a" (*low)
		     :: "%rcx"
	);
}
97 
98 /* Read a performance measurement counter */
/* Read performance-monitoring counter "selector" (passed in ECX) into
   *low/*high. Executing RDPMC from user space requires CR4.PCE and the
   relevant MSR bits to be set — see the long comment further below. */
__attribute__((__unused__, __always_inline__)) ATTRIBUTE_ARTIFICIAL
static inline void
rdpmc(uint32_t *low, uint32_t *high, const unsigned int selector)
{
    __asm__ volatile("rdpmc" : "=a" (*low), "=d" (*high) : "c" (selector));
}
105 
106 #else /* defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM) */
107 
/* Stub for platforms without amd64 inline asm: no serialization done. */
__attribute__((__unused__, __always_inline__)) ATTRIBUTE_ARTIFICIAL
static inline void serialize() {}
110 
111 __attribute__((__unused__, __always_inline__)) ATTRIBUTE_ARTIFICIAL
rdtsc(uint32_t * low,uint32_t * high)112 static inline void rdtsc(uint32_t *low, uint32_t *high)
113 { *low = *high = 0; }
114 
115 __attribute__((__unused__, __always_inline__)) ATTRIBUTE_ARTIFICIAL
rdtscp(uint32_t * low,uint32_t * high)116 static inline void rdtscp(uint32_t *low, uint32_t *high)
117 { *low = *high = 0; }
118 
119 __attribute__((__unused__, __always_inline__)) ATTRIBUTE_ARTIFICIAL
120 static inline void
rdpmc(uint32_t * low,uint32_t * high,const unsigned int selector MAYBE_UNUSED)121 rdpmc(uint32_t *low, uint32_t *high, const unsigned int selector MAYBE_UNUSED)
122 { *low = *high = 0; }
123 
124 #endif
125 
126 __attribute__((__unused__, __always_inline__)) ATTRIBUTE_ARTIFICIAL
rdtscl()127 static inline uint64_t rdtscl()
128 {
129     uint32_t high, low;
130     rdtsc(&low, &high);
131     return u32_to_64(low, high);
132 }
133 
134 __attribute__((__unused__, __always_inline__)) ATTRIBUTE_ARTIFICIAL
rdtscpl()135 static inline uint64_t rdtscpl()
136 {
137     uint32_t high, low;
138     rdtscp(&low, &high);
139     return u32_to_64(low, high);
140 }
141 
142 __attribute__((__unused__, __always_inline__)) ATTRIBUTE_ARTIFICIAL
rdpmcl(const unsigned int selector)143 static inline uint64_t rdpmcl(const unsigned int selector)
144 {
145     uint32_t high, low;
146     rdpmc(&low, &high, selector);
147     return u32_to_64(low, high);
148 }
149 
150 // rdpmcl_cycles uses a "fixed-function" performance counter to return
151 // the count of actual CPU core cycles executed by the current core.
152 // Core cycles are not accumulated while the processor is in the "HALT"
153 // state, which is used when the operating system has no task(s) to run
154 // on a processor core.
155 // Note that this counter continues to increment during system calls
156 // and task switches. As such, it may be unreliable for timing long
157 // functions where the CPU may serve an interrupt request or where
158 // the kernel may preempt execution and switch to another process.
159 // It is best used for timing short intervals which usually run
160 // uninterrupted, and where occurrences of interruption are easily
161 // detected by an abnormally large cycle count.
162 
163 // The RDPMC instruction must be enabled for execution in user-space.
164 // This requires a total of three bits to be set in CR4 and MSRs of
165 // the CPU:
166 // Bit 1<<8 in CR4 must be set to 1. On Linux, this can be effected by
167 // executing as root:
168 //   echo 1 >> /sys/devices/cpu/rdpmc
169 // Bit 1<<33 must be set to 1 in the MSR_CORE_PERF_GLOBAL_CTRL
170 // (MSR address 0x38f). This enables the cycle counter so that it
// actually increments with each clock cycle; while this bit is 0,
172 // the counter value stays fixed.
173 // Bit 1<<5 must be set to 1 in the MSR_CORE_PERF_FIXED_CTR_CTRL
174 // (MSR address 0x38d) which causes the counter to be incremented
175 // on non-halted clock cycles that occur while the CPL is >0
176 // (user-mode). If bit 1<<4 is set to 1, then the counter will
177 // increment while CPL is 0 (kernel mode), e.g., during interrupts,
178 // etc.
179 
180 // The only reliable way I found to enable all these bits is through
181 // the JEVENTS library which uses the PERF kernel interface which lets
182 // it know that the bits are supposed to stay on. Doing it manually
183 // with, e.g., a kernel module causes the kernel to clear the bits
184 // again next time it updates any of these CR/MSR.
185 
186 
187 __attribute__((__unused__, __always_inline__)) ATTRIBUTE_ARTIFICIAL
188 static inline void
rdpmc_cycles(uint32_t * low,uint32_t * high)189 rdpmc_cycles(uint32_t *low, uint32_t *high)
190 {
191     const unsigned c = (1U<<30) + 1; /* Second Fixed-function counter:
192                                         clock cycles in non-HALT */
193     rdpmc(low, high, c);
194 }
195 
196 __attribute__((__unused__, __always_inline__)) ATTRIBUTE_ARTIFICIAL
rdpmcl_cycles()197 static inline uint64_t rdpmcl_cycles()
198 {
199     uint32_t low, high;
200     rdpmc_cycles(&low, &high);
201     return u32_to_64(low, high);
202 }
203 
/* One-time setup for the timing backend selected via the
   USE_INTEL_PCM / USE_PERF / USE_JEVENTS defines (default: plain
   RDTSC[P], which needs no setup). Prints the chosen method to stdout
   and calls exit() on initialisation failure. */
static inline void init_timing()
{
#ifdef USE_INTEL_PCM
    printf("# Using Intel PCM library\n");
    m = PCM::getInstance();
    if (m->program() != PCM::Success) {
	printf("Could not initialise PCM\n");
	exit(EXIT_FAILURE);
    }

    /* Report whether the CPU runs with SMT (hyper-threading), as that
       affects how per-core counter readings should be interpreted. */
    const bool have_smt = m->getSMT();
    if (have_smt) {
	printf("# CPU uses SMT\n");
    } else {
	printf("# CPU does not use SMT\n");
  }
#elif defined(USE_PERF)
    printf("# Using PERF library\n");
    pd = libperf_initialize(-1,-1); /* init lib */
    libperf_enablecounter(pd, LIBPERF_COUNT_HW_CPU_CYCLES);
    /* enable HW counter */
#elif defined(USE_JEVENTS)
    printf("# Using jevents library\n");
#ifdef TIMING_SERIALIZE
    printf("# Serializing with CPUID before timing start and with RDTSCP at "
	   "timing end\n");
#endif
    /* rdpmc_open() goes through the perf kernel interface to enable
       user-space RDPMC for the cycle counter (see long comment above). */
    if (rdpmc_open(PERF_COUNT_HW_CPU_CYCLES, &ctx) < 0)
	exit(EXIT_FAILURE);
#else
    printf("# Using " CADO_STRINGIZE(START_COUNTER) " to start and "
	   CADO_STRINGIZE(END_COUNTER) " to end measurement\n");
#endif
}
238 
/* Release whatever resources init_timing() acquired for the selected
   backend; a no-op for the plain RDTSC[P] default. */
static inline void clear_timing()
{
#ifdef USE_INTEL_PCM
    m->cleanup();
#elif defined(USE_PERF)
    libperf_close(pd);
    pd = NULL;
#elif defined(USE_JEVENTS)
    rdpmc_close (&ctx);
#else
    /* Timing with RDTSC[P] does not allocate anything */
#endif
}
252 
/* Record the starting counter value for the selected backend into the
   backend's static start state. */
static inline void start_timing()
{
#ifdef USE_INTEL_PCM
    /* Snapshot counters of the core we are currently scheduled on. */
    const int cpu = sched_getcpu();
    before_sstate = getCoreCounterState(cpu);
#elif defined(USE_PERF)
    start_time = libperf_readcounter(pd, LIBPERF_COUNT_HW_CPU_CYCLES);
#elif defined(USE_JEVENTS)
#ifdef TIMING_SERIALIZE
    /* CPUID barrier: keep earlier instructions from drifting into the
       timed region. */
    serialize();
#endif
#ifdef IMMEDIATE_RDPMC
    /* Inline RDPMC read, stored as raw 32-bit halves (combined only in
       get_diff_timing()) to keep start overhead minimal. */
    rdpmc_cycles(&start_time.lo, &start_time.hi);
#else
    start_time = rdpmc_read(&ctx);
#endif
#else
    start_time = START_COUNTER();
#endif
}
273 
/* Record the ending counter value for the selected backend into the
   backend's static end state. */
static inline void end_timing()
{
#ifdef USE_INTEL_PCM
    const int cpu = sched_getcpu();
    after_sstate = getCoreCounterState(cpu);
#elif defined(USE_PERF)
    end_time = libperf_readcounter(pd, LIBPERF_COUNT_HW_CPU_CYCLES);
#elif defined(USE_JEVENTS)
#ifdef TIMING_SERIALIZE
    /* RDTSCP waits for preceding instructions to complete, acting as an
       end-of-region barrier; its counter value is discarded. */
    rdtscpl();
#endif
#ifdef IMMEDIATE_RDPMC
    end_time = rdpmcl_cycles();
#else
    end_time = rdpmc_read(&ctx);
#endif
#else
    end_time = END_COUNTER();
#endif
}
294 
/* Return the number of counted events (CPU cycles, for all current
   backends) elapsed between start_timing() and end_timing(). */
static inline uint64_t get_diff_timing()
{
#ifdef USE_INTEL_PCM
    return getCycles(before_sstate,after_sstate);
#elif defined(USE_PERF)
    return end_time - start_time;
#elif defined(USE_JEVENTS)
#ifdef IMMEDIATE_RDPMC
    /* start_time was stored as raw 32-bit halves; combine them here. */
    return end_time - u32_to_64(start_time.lo, start_time.hi);
#else
    return end_time - start_time;
#endif
#else
    return end_time - start_time;
#endif

}
312 
313 #ifdef USE_JEVENTS
314 static uint64_t pmc0, pmc1, pmc2, pmc3;
315 #endif
316 
/* Snapshot the first four programmable performance counters into the
   static pmc0..pmc3 variables; a no-op unless USE_JEVENTS is enabled.
   Marked static inline: a non-static definition in this header would
   cause duplicate-symbol link errors when the header is included from
   several compilation units. */
static inline void
readAllPmc()
{
#ifdef USE_JEVENTS
    pmc0 = rdpmcl(0);
    pmc1 = rdpmcl(1);
    pmc2 = rdpmcl(2);
    pmc3 = rdpmcl(3);
#endif
}
327 
/* Replace each pmc0..pmc3 snapshot with the difference between the
   counter's current value and the snapshot taken by readAllPmc();
   a no-op unless USE_JEVENTS is enabled.
   static inline for the same header-linkage reason as the other
   functions in this file. */
static inline void
diffAllPmc()
{
#ifdef USE_JEVENTS
    pmc0 = rdpmcl(0) - pmc0;
    pmc1 = rdpmcl(1) - pmc1;
    /* Bug fix: these previously re-read counters 0 and 1, so pmc2 and
       pmc3 held meaningless cross-counter differences. */
    pmc2 = rdpmcl(2) - pmc2;
    pmc3 = rdpmcl(3) - pmc3;
#endif
}
338 
/* Print the pmc0..pmc3 values (as set by readAllPmc()/diffAllPmc()) to
   stdout, without a trailing newline; a no-op unless USE_JEVENTS is
   enabled. static inline for the same header-linkage reason as the
   other functions in this file. */
static inline void
printAllPmc()
{
#ifdef USE_JEVENTS
    /* PRIu64 instead of "%lu": the pmc variables are uint64_t, and
       "%lu" is wrong wherever unsigned long is 32 bits (LLP64). */
    printf(" (pmc0: %" PRIu64 ", pmc1: %" PRIu64
	   ", pmc2: %" PRIu64 ", pmc3: %" PRIu64 ")",
	   pmc0, pmc1, pmc2, pmc3);
#endif
}
347 
348 #endif /* RDTSC_H_ */
349