/*	$NetBSD: tsc.c,v 1.57 2021/10/15 18:12:48 jmcneill Exp $	*/

/*-
 * Copyright (c) 2008, 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tsc.c,v 1.57 2021/10/15 18:12:48 jmcneill Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/timetc.h>
#include <sys/lwp.h>
#include <sys/atomic.h>
#include <sys/kernel.h>
#include <sys/cpu.h>
#include <sys/xcall.h>
#include <sys/lock.h>

#include <machine/cpu_counter.h>
#include <machine/cpuvar.h>
#include <machine/cpufunc.h>
#include <machine/specialreg.h>
#include <machine/cputypes.h>

#include "tsc.h"
#define	TSC_SYNC_ROUNDS		1000
#define	ABS(a)			((a) >= 0 ? (a) : -(a))

static u_int	tsc_get_timecount(struct timecounter *);

static void	tsc_delay(unsigned int);

static uint64_t	tsc_dummy_cacheline __cacheline_aligned;
uint64_t	tsc_freq __read_mostly;	/* exported for sysctl */
static int64_t	tsc_drift_max = 1000;	/* max cycles */
static int64_t	tsc_drift_observed;
uint64_t	(*rdtsc)(void) = rdtsc_cpuid;
uint64_t	(*cpu_counter)(void) = cpu_counter_cpuid;
uint32_t	(*cpu_counter32)(void) = cpu_counter32_cpuid;

int tsc_user_enabled = 1;

static volatile int64_t		tsc_sync_val;
static volatile struct cpu_info	*tsc_sync_cpu;

static struct timecounter tsc_timecounter = {
	.tc_get_timecount = tsc_get_timecount,
	.tc_counter_mask = ~0U,
	.tc_name = "TSC",
	.tc_quality = 3000,
};

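/*
 * Determine whether the TSC is believed to be invariant on the current
 * CPU, i.e. whether it runs at a constant rate regardless of power
 * management state.
 */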
bool
tsc_is_invariant(void)
{
	struct cpu_info *ci;
	uint32_t descs[4];
	uint32_t family;
	bool invariant;

	if (!cpu_hascounter())
		return false;

	ci = curcpu();
	invariant = false;

	if (cpu_vendor == CPUVENDOR_INTEL) {
		/*
		 * From Intel(tm) 64 and IA-32 Architectures Software
		 * Developer's Manual Volume 3A: System Programming Guide,
		 * Part 1, 17.13 TIME_STAMP COUNTER, these are the processors
		 * where the TSC is known invariant:
		 *
		 * Pentium 4, Intel Xeon (family 0f, models 03 and higher)
		 * Core Solo and Core Duo processors (family 06, model 0e)
		 * Xeon 5100 series and Core 2 Duo (family 06, model 0f)
		 * Core 2 and Xeon (family 06, model 17)
		 * Atom (family 06, model 1c)
		 *
		 * We'll also assume that it's safe on the Pentium, and
		 * that it's safe on P-II and P-III Xeons due to the
		 * typical configuration of those systems.
		 */
		switch (CPUID_TO_BASEFAMILY(ci->ci_signature)) {
		case 0x05:
			invariant = true;
			break;
		case 0x06:
			invariant = CPUID_TO_MODEL(ci->ci_signature) == 0x0e ||
			    CPUID_TO_MODEL(ci->ci_signature) == 0x0f ||
			    CPUID_TO_MODEL(ci->ci_signature) == 0x17 ||
			    CPUID_TO_MODEL(ci->ci_signature) == 0x1c;
			break;
		case 0x0f:
			invariant = CPUID_TO_MODEL(ci->ci_signature) >= 0x03;
			break;
		}
	} else if (cpu_vendor == CPUVENDOR_AMD) {
		/*
		 * TSC and Power Management Events on AMD Processors
		 * Nov 2, 2005 Rich Brunner, AMD Fellow
		 * http://lkml.org/lkml/2005/11/4/173
		 *
		 * See Appendix E.4.7 CPUID Fn8000_0007_EDX Advanced Power
		 * Management Features, AMD64 Architecture Programmer's
		 * Manual Volume 3: General-Purpose and System Instructions.
		 * The check is done below.
		 */
	}

	/*
	 * The most reliable way to check whether the TSC is invariant
	 * is the ITSC bit in CPUID leaf 0x80000007.
	 */
	family = CPUID_TO_BASEFAMILY(ci->ci_signature);
	if (((cpu_vendor == CPUVENDOR_INTEL) || (cpu_vendor == CPUVENDOR_AMD))
	    && ((family == 0x06) || (family == 0x0f))) {
		x86_cpuid(0x80000000, descs);
		if (descs[0] >= 0x80000007) {
			x86_cpuid(0x80000007, descs);
			invariant = (descs[3] & CPUID_APM_ITSC) != 0;
		}
	}

	return invariant;
}

/* Set up the function pointers for rdtsc() and timecounter(9). */
void
tsc_setfunc(struct cpu_info *ci)
{
	bool use_lfence, use_mfence;

	use_lfence = use_mfence = false;

	/*
	 * XXX On AMD, we might be able to use lfence in some cases:
	 * a) if MSR_DE_CFG exists and bit 1 is set.
	 * b) family == 0x0f or 0x11.  Those have no MSR_DE_CFG and
	 *    lfence is always serializing.
	 *
	 * We don't do so because testing showed mfence performed better
	 * than lfence with MSR_DE_CFG.
	 */
	if (cpu_vendor == CPUVENDOR_AMD)
		use_mfence = true;
	else if (cpu_vendor == CPUVENDOR_INTEL)
		use_lfence = true;

	/* LFENCE and MFENCE are applicable if SSE2 is set. */
	if ((ci->ci_feat_val[0] & CPUID_SSE2) == 0)
		use_lfence = use_mfence = false;

#define TSC_SETFUNC(fence)				\
	do {						\
		rdtsc = rdtsc_##fence;			\
		cpu_counter = cpu_counter_##fence;	\
		cpu_counter32 = cpu_counter32_##fence;	\
	} while (/* CONSTCOND */ 0)

	if (use_lfence)
		TSC_SETFUNC(lfence);
	else if (use_mfence)
		TSC_SETFUNC(mfence);
	else
		TSC_SETFUNC(cpuid);

	aprint_verbose_dev(ci->ci_dev, "Use %s to serialize rdtsc\n",
	    use_lfence ? "lfence" : (use_mfence ? "mfence" : "cpuid"));
}

/*
 * Initialize the timecounter(9) and the DELAY() implementation based
 * on the TSC.
 *
 * This function is called after all secondary processors have been
 * brought up and drift has been measured, and after any other potential
 * delay functions have been installed (e.g. lapic_delay()).
 */
void
tsc_tc_init(void)
{
	struct cpu_info *ci;
	bool invariant;

	if (!cpu_hascounter())
		return;

	ci = curcpu();
	tsc_freq = ci->ci_data.cpu_cc_freq;
	invariant = tsc_is_invariant();
	if (!invariant) {
		aprint_debug("TSC not known invariant on this CPU\n");
		tsc_timecounter.tc_quality = -100;
	} else if (tsc_drift_observed > tsc_drift_max) {
		aprint_error("ERROR: %lld cycle TSC drift observed\n",
		    (long long)tsc_drift_observed);
		tsc_timecounter.tc_quality = -100;
		invariant = false;
	} else if (vm_guest == VM_GUEST_NO) {
		delay_func = tsc_delay;
	} else if (vm_guest == VM_GUEST_VIRTUALBOX) {
		tsc_timecounter.tc_quality = -100;
	}

	if (tsc_freq != 0) {
		tsc_timecounter.tc_frequency = tsc_freq;
		tc_init(&tsc_timecounter);
	}
}

/*
 * Record drift (in clock cycles).  Called during AP startup.
 */
void
tsc_sync_drift(int64_t drift)
{

	if (drift < 0)
		drift = -drift;
	if (drift > tsc_drift_observed)
		tsc_drift_observed = drift;
}

/*
 * Called during startup of APs, by the boot processor.  Interrupts
 * are disabled on entry.
 */
static void __noinline
tsc_read_bp(struct cpu_info *ci, uint64_t *bptscp, uint64_t *aptscp)
{
	uint64_t bptsc;

	if (atomic_swap_ptr(&tsc_sync_cpu, ci) != NULL) {
		panic("tsc_sync_bp: 1");
	}

	/* Prepare a cache miss for the other side. */
	(void)atomic_swap_uint((void *)&tsc_dummy_cacheline, 0);

	/* Flag our readiness. */
	atomic_or_uint(&ci->ci_flags, CPUF_SYNCTSC);

	/* Wait for the other side, then read our TSC. */
	while ((ci->ci_flags & CPUF_SYNCTSC) != 0) {
		__insn_barrier();
	}
	bptsc = rdtsc();

	/* Wait for the results to come in. */
	while (tsc_sync_cpu == ci) {
		x86_pause();
	}
	if (tsc_sync_cpu != NULL) {
		panic("tsc_sync_bp: 2");
	}

	*bptscp = bptsc;
	*aptscp = tsc_sync_val;
}

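/*
 * Measure the TSC skew between the boot processor and an AP: run
 * several synchronization rounds and record the smallest observed
 * difference as the AP's cpu_cc_skew.
 */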
void
tsc_sync_bp(struct cpu_info *ci)
{
	uint64_t bptsc, aptsc;
	int64_t val, diff;

	if (!cpu_hascounter())
		return;

	val = INT64_MAX;
	for (int i = 0; i < TSC_SYNC_ROUNDS; i++) {
		tsc_read_bp(ci, &bptsc, &aptsc);
		diff = bptsc - aptsc;
		if (ABS(diff) < ABS(val)) {
			val = diff;
		}
	}

	ci->ci_data.cpu_cc_skew = val;
}

/*
 * Called during startup of an AP, by the AP itself.  Interrupts are
 * disabled on entry.
 */
static void __noinline
tsc_post_ap(struct cpu_info *ci)
{
	uint64_t tsc;

	/* Wait for go-ahead from primary. */
	while ((ci->ci_flags & CPUF_SYNCTSC) == 0) {
		__insn_barrier();
	}

	/* Instruct primary to read its counter. */
	atomic_and_uint(&ci->ci_flags, ~CPUF_SYNCTSC);

	/* Suffer a cache miss, then read TSC. */
	__insn_barrier();
	tsc = tsc_dummy_cacheline;
	__insn_barrier();
	tsc += rdtsc();

	/* Post result.  Ensure the whole value goes out atomically. */
	(void)atomic_swap_64(&tsc_sync_val, tsc);

	if (atomic_swap_ptr(&tsc_sync_cpu, NULL) != ci) {
		panic("tsc_sync_ap");
	}
}

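/*
 * AP side of the TSC synchronization: answer the boot processor's
 * synchronization rounds.  Called during AP startup, with interrupts
 * disabled.
 */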
void
tsc_sync_ap(struct cpu_info *ci)
{

	if (!cpu_hascounter())
		return;

	for (int i = 0; i < TSC_SYNC_ROUNDS; i++) {
		tsc_post_ap(ci);
	}
}

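/*
 * Cross-call handler: clear CR4_TSD to permit userland rdtsc, or set
 * it to forbid it, on the running CPU.
 */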
static void
tsc_apply_cpu(void *arg1, void *arg2)
{
	bool enable = arg1 != NULL;
	if (enable) {
		lcr4(rcr4() & ~CR4_TSD);
	} else {
		lcr4(rcr4() | CR4_TSD);
	}
}

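/*
 * Allow userland to execute rdtsc on all CPUs (clears CR4_TSD
 * everywhere via a broadcast cross-call).
 */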
void
tsc_user_enable(void)
{
	uint64_t xc;

	xc = xc_broadcast(0, tsc_apply_cpu, (void *)true, NULL);
	xc_wait(xc);
}

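/*
 * Forbid userland from executing rdtsc on all CPUs (sets CR4_TSD
 * everywhere via a broadcast cross-call).
 */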
void
tsc_user_disable(void)
{
	uint64_t xc;

	xc = xc_broadcast(0, tsc_apply_cpu, (void *)false, NULL);
	xc_wait(xc);
}

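/* Return the estimated cycle counter frequency of the given CPU. */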
uint64_t
cpu_frequency(struct cpu_info *ci)
{

	return ci->ci_data.cpu_cc_freq;
}

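/* Return non-zero if the CPU advertises the TSC feature in CPUID. */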
int
cpu_hascounter(void)
{

	return cpu_feature[0] & CPUID_TSC;
}

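/*
 * TSC-based DELAY(): busy-wait until the requested number of
 * microseconds, converted to TSC cycles, has elapsed.
 */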
static void
tsc_delay(unsigned int us)
{
	uint64_t start, delta;

	start = cpu_counter();
	delta = (uint64_t)us * tsc_freq / 1000000;

	while ((cpu_counter() - start) < delta) {
		x86_pause();
	}
}

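/*
 * timecounter(9) callback.  On LP64 also check that the TSC has not
 * gone backwards relative to the last value seen by the current LWP,
 * and print a rate-limited warning if it has.
 */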
static u_int
tsc_get_timecount(struct timecounter *tc)
{
#ifdef _LP64 /* requires atomic 64-bit store */
	static __cpu_simple_lock_t lock = __SIMPLELOCK_UNLOCKED;
	static int lastwarn;
	uint64_t cur, prev;
	lwp_t *l = curlwp;
	int ticks;

	/*
	 * The previous value must be read before the counter is read,
	 * and the new value stored afterwards, because this routine can
	 * be called from interrupt context and may run over the top of
	 * an existing invocation.  Ordering is guaranteed by "volatile"
	 * on md_tsc.
	 */
	prev = l->l_md.md_tsc;
	cur = cpu_counter();
	if (__predict_false(cur < prev)) {
		if ((cur >> 63) == (prev >> 63) &&
		    __cpu_simple_lock_try(&lock)) {
			ticks = getticks();
			if (ticks - lastwarn >= hz) {
				printf(
				    "WARNING: TSC time went backwards by %u - "
				    "change sysctl(7) kern.timecounter?\n",
				    (unsigned)(prev - cur));
				lastwarn = ticks;
			}
			__cpu_simple_unlock(&lock);
		}
	}
	l->l_md.md_tsc = cur;
	return (uint32_t)cur;
#else
	return cpu_counter32();
#endif
}

/*
 * The TSC has been reset; zero the cached TSC of every LWP in the
 * system so we don't spuriously report that the TSC has gone backwards.
 * Caller must ensure all LWPs are quiescent (except the current one,
 * obviously) and interrupts are blocked while we update this.
 */
void
tsc_tc_reset(void)
{
	struct lwp *l;

	LIST_FOREACH(l, &alllwp, l_list)
		l->l_md.md_tsc = 0;
}