/*	$NetBSD: tsc.c,v 1.57 2021/10/15 18:12:48 jmcneill Exp $	*/

/*-
 * Copyright (c) 2008, 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tsc.c,v 1.57 2021/10/15 18:12:48 jmcneill Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/timetc.h>
#include <sys/lwp.h>
#include <sys/atomic.h>
#include <sys/kernel.h>
#include <sys/cpu.h>
#include <sys/xcall.h>
#include <sys/lock.h>

#include <machine/cpu_counter.h>
#include <machine/cpuvar.h>
#include <machine/cpufunc.h>
#include <machine/specialreg.h>
#include <machine/cputypes.h>

#include "tsc.h"

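/*
 * The boot processor measures its skew against each AP by taking
 * TSC_SYNC_ROUNDS paired readings and keeping the round with the
 * smallest absolute difference, which discards rounds that were
 * disturbed (e.g. by an SMI or a cache miss on one side).
 */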
#define	TSC_SYNC_ROUNDS		1000
#define	ABS(a)			((a) >= 0 ? (a) : -(a))

static u_int	tsc_get_timecount(struct timecounter *);

static void	tsc_delay(unsigned int);

static uint64_t	tsc_dummy_cacheline __cacheline_aligned;
uint64_t	tsc_freq __read_mostly;	/* exported for sysctl */
static int64_t	tsc_drift_max = 1000;	/* max cycles */
static int64_t	tsc_drift_observed;
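
/*
 * Function pointers for reading the TSC.  They default to the variants
 * that serialize with CPUID and are switched to the LFENCE/MFENCE
 * versions by tsc_setfunc() once the CPU's capabilities are known.
 */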
uint64_t	(*rdtsc)(void) = rdtsc_cpuid;
uint64_t	(*cpu_counter)(void) = cpu_counter_cpuid;
uint32_t	(*cpu_counter32)(void) = cpu_counter32_cpuid;

int tsc_user_enabled = 1;

static volatile int64_t	tsc_sync_val;
static volatile struct cpu_info	*tsc_sync_cpu;

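/*
 * A quality of 3000 makes the TSC the preferred timecounter when it is
 * usable; tsc_tc_init() lowers it to -100 so that it will not be chosen
 * automatically if the TSC is not invariant, drifts too much, or runs
 * under VirtualBox.
 */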
static struct timecounter tsc_timecounter = {
	.tc_get_timecount = tsc_get_timecount,
	.tc_counter_mask = ~0U,
	.tc_name = "TSC",
	.tc_quality = 3000,
};

bool
tsc_is_invariant(void)
{
	struct cpu_info *ci;
	uint32_t descs[4];
	uint32_t family;
	bool invariant;

	if (!cpu_hascounter())
		return false;

	ci = curcpu();
	invariant = false;

	if (cpu_vendor == CPUVENDOR_INTEL) {
		/*
		 * From Intel(tm) 64 and IA-32 Architectures Software
		 * Developer's Manual Volume 3A: System Programming Guide,
		 * Part 1, 17.13 TIME_STAMP COUNTER, these are the processors
		 * where the TSC is known invariant:
		 *
		 * Pentium 4, Intel Xeon (family 0f, models 03 and higher)
		 * Core Solo and Core Duo processors (family 06, model 0e)
		 * Xeon 5100 series and Core 2 Duo (family 06, model 0f)
		 * Core 2 and Xeon (family 06, model 17)
		 * Atom (family 06, model 1c)
		 *
		 * We'll also assume that it's safe on the Pentium, and
		 * that it's safe on P-II and P-III Xeons due to the
		 * typical configuration of those systems.
		 */
		switch (CPUID_TO_BASEFAMILY(ci->ci_signature)) {
		case 0x05:
			invariant = true;
			break;
		case 0x06:
			invariant = CPUID_TO_MODEL(ci->ci_signature) == 0x0e ||
			    CPUID_TO_MODEL(ci->ci_signature) == 0x0f ||
			    CPUID_TO_MODEL(ci->ci_signature) == 0x17 ||
			    CPUID_TO_MODEL(ci->ci_signature) == 0x1c;
			break;
		case 0x0f:
			invariant = CPUID_TO_MODEL(ci->ci_signature) >= 0x03;
			break;
		}
	} else if (cpu_vendor == CPUVENDOR_AMD) {
		/*
		 * TSC and Power Management Events on AMD Processors
		 * Nov 2, 2005 Rich Brunner, AMD Fellow
		 * http://lkml.org/lkml/2005/11/4/173
		 *
		 * See Appendix E.4.7 CPUID Fn8000_0007_EDX Advanced Power
		 * Management Features, AMD64 Architecture Programmer's
		 * Manual Volume 3: General-Purpose and System Instructions.
		 * The check is done below.
		 */
	}

	/*
	 * The best way to check whether the TSC counter is invariant or not
	 * is to check CPUID 80000007.
	 */
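	/*
	 * CPUID_APM_ITSC is the "invariant TSC" bit: %edx bit 8 of
	 * CPUID leaf 0x80000007 on both Intel and AMD.
	 */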
	family = CPUID_TO_BASEFAMILY(ci->ci_signature);
	if (((cpu_vendor == CPUVENDOR_INTEL) || (cpu_vendor == CPUVENDOR_AMD))
	    && ((family == 0x06) || (family == 0x0f))) {
		x86_cpuid(0x80000000, descs);
		if (descs[0] >= 0x80000007) {
			x86_cpuid(0x80000007, descs);
			invariant = (descs[3] & CPUID_APM_ITSC) != 0;
		}
	}

	return invariant;
}

/* Set up the function pointers for rdtsc() and timecounter(9). */
void
tsc_setfunc(struct cpu_info *ci)
{
	bool use_lfence, use_mfence;

	use_lfence = use_mfence = false;

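	/*
	 * RDTSC is not a serializing instruction and can be reordered
	 * with surrounding loads, so a fence (or CPUID as a fallback)
	 * is issued before the counter is read.
	 */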
	/*
	 * XXX On AMD, we might be able to use lfence in some cases:
	 *   a) if MSR_DE_CFG exists and bit 1 is set.
	 *   b) family == 0x0f or 0x11.  Those have no MSR_DE_CFG and
	 *      lfence is always serializing.
	 *
	 * We don't do this because testing showed that mfence performed
	 * better than lfence with MSR_DE_CFG set.
	 */
	if (cpu_vendor == CPUVENDOR_AMD)
		use_mfence = true;
	else if (cpu_vendor == CPUVENDOR_INTEL)
		use_lfence = true;

	/* LFENCE and MFENCE are only available when SSE2 is present. */
	if ((ci->ci_feat_val[0] & CPUID_SSE2) == 0)
		use_lfence = use_mfence = false;

#define TSC_SETFUNC(fence)						      \
	do {								      \
		rdtsc = rdtsc_##fence;					      \
		cpu_counter = cpu_counter_##fence;			      \
		cpu_counter32 = cpu_counter32_##fence;			      \
	} while (/* CONSTCOND */ 0)

	if (use_lfence)
		TSC_SETFUNC(lfence);
	else if (use_mfence)
		TSC_SETFUNC(mfence);
	else
		TSC_SETFUNC(cpuid);

	aprint_verbose_dev(ci->ci_dev, "Use %s to serialize rdtsc\n",
	    use_lfence ? "lfence" : (use_mfence ? "mfence" : "cpuid"));
}

/*
 * Initialize timecounter(9) and the TSC-based DELAY() function.
 *
 * This function is called after all secondary processors have been
 * brought up and drift has been measured, and after any other potential
 * delay functions have been installed (e.g. lapic_delay()).
 */
void
tsc_tc_init(void)
{
	struct cpu_info *ci;
	bool invariant;

	if (!cpu_hascounter())
		return;

	ci = curcpu();
	tsc_freq = ci->ci_data.cpu_cc_freq;
	invariant = tsc_is_invariant();
	if (!invariant) {
		aprint_debug("TSC not known invariant on this CPU\n");
		tsc_timecounter.tc_quality = -100;
	} else if (tsc_drift_observed > tsc_drift_max) {
		aprint_error("ERROR: %lld cycle TSC drift observed\n",
		    (long long)tsc_drift_observed);
		tsc_timecounter.tc_quality = -100;
		invariant = false;
	} else if (vm_guest == VM_GUEST_NO) {
		delay_func = tsc_delay;
	} else if (vm_guest == VM_GUEST_VIRTUALBOX) {
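		/*
		 * The TSC as virtualized by VirtualBox has proven
		 * unreliable, so avoid selecting it automatically.
		 */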
		tsc_timecounter.tc_quality = -100;
	}

	if (tsc_freq != 0) {
		tsc_timecounter.tc_frequency = tsc_freq;
		tc_init(&tsc_timecounter);
	}
}

/*
 * Record drift (in clock cycles).  Called during AP startup.
 */
void
tsc_sync_drift(int64_t drift)
{

	if (drift < 0)
		drift = -drift;
	if (drift > tsc_drift_observed)
		tsc_drift_observed = drift;
}

/*
 * Called during startup of APs, by the boot processor.  Interrupts
 * are disabled on entry.
 */
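/*
 * Handshake with tsc_post_ap(): the BP registers itself in tsc_sync_cpu,
 * primes tsc_dummy_cacheline so the AP will take a cache miss, raises
 * CPUF_SYNCTSC and waits for the AP to clear it, at which point both
 * sides read their TSCs as close together as possible.  The AP then
 * publishes its reading through tsc_sync_val and clears tsc_sync_cpu.
 */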
static void __noinline
tsc_read_bp(struct cpu_info *ci, uint64_t *bptscp, uint64_t *aptscp)
{
	uint64_t bptsc;

	if (atomic_swap_ptr(&tsc_sync_cpu, ci) != NULL) {
		panic("tsc_sync_bp: 1");
	}

	/* Prepare a cache miss for the other side. */
	(void)atomic_swap_uint((void *)&tsc_dummy_cacheline, 0);

	/* Flag our readiness. */
	atomic_or_uint(&ci->ci_flags, CPUF_SYNCTSC);

	/* Wait for other side then read our TSC. */
	while ((ci->ci_flags & CPUF_SYNCTSC) != 0) {
		__insn_barrier();
	}
	bptsc = rdtsc();

	/* Wait for the results to come in. */
	while (tsc_sync_cpu == ci) {
		x86_pause();
	}
	if (tsc_sync_cpu != NULL) {
		panic("tsc_sync_bp: 2");
	}

	*bptscp = bptsc;
	*aptscp = tsc_sync_val;
}

void
tsc_sync_bp(struct cpu_info *ci)
{
	int64_t bptsc, aptsc, val, diff;

	if (!cpu_hascounter())
		return;

	val = INT64_MAX;
	for (int i = 0; i < TSC_SYNC_ROUNDS; i++) {
		tsc_read_bp(ci, &bptsc, &aptsc);
		diff = bptsc - aptsc;
		if (ABS(diff) < ABS(val)) {
			val = diff;
		}
	}

	ci->ci_data.cpu_cc_skew = val;
}

/*
 * Called during startup of an AP, by the AP itself.  Interrupts are
 * disabled on entry.
 */
static void __noinline
tsc_post_ap(struct cpu_info *ci)
{
	uint64_t tsc;

	/* Wait for go-ahead from primary. */
	while ((ci->ci_flags & CPUF_SYNCTSC) == 0) {
		__insn_barrier();
	}

	/* Instruct primary to read its counter. */
	atomic_and_uint(&ci->ci_flags, ~CPUF_SYNCTSC);

	/* Suffer a cache miss, then read TSC. */
	__insn_barrier();
	tsc = tsc_dummy_cacheline;
	__insn_barrier();
	tsc += rdtsc();

	/* Post result.  Ensure the whole value goes out atomically. */
	(void)atomic_swap_64(&tsc_sync_val, tsc);

	if (atomic_swap_ptr(&tsc_sync_cpu, NULL) != ci) {
		panic("tsc_sync_ap");
	}
}

void
tsc_sync_ap(struct cpu_info *ci)
{

	if (!cpu_hascounter())
		return;

	for (int i = 0; i < TSC_SYNC_ROUNDS; i++) {
		tsc_post_ap(ci);
	}
}

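/*
 * CR4.TSD (time stamp disable) makes RDTSC and RDTSCP privileged
 * instructions.  Clearing it lets userland read the TSC directly;
 * the cross calls below apply the change on every CPU.
 */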
static void
tsc_apply_cpu(void *arg1, void *arg2)
{
	bool enable = arg1 != NULL;
	if (enable) {
		lcr4(rcr4() & ~CR4_TSD);
	} else {
		lcr4(rcr4() | CR4_TSD);
	}
}

void
tsc_user_enable(void)
{
	uint64_t xc;

	xc = xc_broadcast(0, tsc_apply_cpu, (void *)true, NULL);
	xc_wait(xc);
}

void
tsc_user_disable(void)
{
	uint64_t xc;

	xc = xc_broadcast(0, tsc_apply_cpu, (void *)false, NULL);
	xc_wait(xc);
}

uint64_t
cpu_frequency(struct cpu_info *ci)
{

	return ci->ci_data.cpu_cc_freq;
}

int
cpu_hascounter(void)
{

	return cpu_feature[0] & CPUID_TSC;
}

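/*
 * Busy-wait for the given number of microseconds by spinning until
 * the TSC has advanced by us * tsc_freq / 1000000 cycles.
 */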
static void
tsc_delay(unsigned int us)
{
	uint64_t start, delta;

	start = cpu_counter();
	delta = (uint64_t)us * tsc_freq / 1000000;

	while ((cpu_counter() - start) < delta) {
		x86_pause();
	}
}

static u_int
tsc_get_timecount(struct timecounter *tc)
{
#ifdef _LP64 /* requires atomic 64-bit store */
	static __cpu_simple_lock_t lock = __SIMPLELOCK_UNLOCKED;
	static int lastwarn;
	uint64_t cur, prev;
	lwp_t *l = curlwp;
	int ticks;

	/*
	 * The previous value must be read before the counter and stored
	 * back after it, because this routine can be called from interrupt
	 * context and may run over the top of an existing invocation.
	 * Ordering is guaranteed by "volatile" on md_tsc.
	 */
	prev = l->l_md.md_tsc;
	cur = cpu_counter();
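	/*
	 * If the counter appears to have gone backwards, warn at most
	 * once per second; the simple lock keeps concurrent callers
	 * from interleaving their messages.
	 */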
	if (__predict_false(cur < prev)) {
		if ((cur >> 63) == (prev >> 63) &&
		    __cpu_simple_lock_try(&lock)) {
			ticks = getticks();
			if (ticks - lastwarn >= hz) {
				printf(
				    "WARNING: TSC time went backwards by %u - "
				    "change sysctl(7) kern.timecounter?\n",
				    (unsigned)(prev - cur));
				lastwarn = ticks;
			}
			__cpu_simple_unlock(&lock);
		}
	}
	l->l_md.md_tsc = cur;
	return (uint32_t)cur;
#else
	return cpu_counter32();
#endif
}

/*
 * The TSC has been reset; zero the cached TSC of every LWP in the
 * system so we don't spuriously report that the TSC has gone backward.
 * The caller must ensure that all LWPs are quiescent (except the
 * current one, obviously) and that interrupts are blocked while we
 * update this.
 */
void
tsc_tc_reset(void)
{
	struct lwp *l;

	LIST_FOREACH(l, &alllwp, l_list)
		l->l_md.md_tsc = 0;
}
459