xref: /freebsd/sys/x86/x86/tsc.c (revision c03c5b1c)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 1998-2003 Poul-Henning Kamp
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include "opt_clock.h"
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/bus.h>
37 #include <sys/cpu.h>
38 #include <sys/eventhandler.h>
39 #include <sys/limits.h>
40 #include <sys/malloc.h>
41 #include <sys/proc.h>
42 #include <sys/sched.h>
43 #include <sys/sysctl.h>
44 #include <sys/time.h>
45 #include <sys/timetc.h>
46 #include <sys/kernel.h>
47 #include <sys/smp.h>
48 #include <sys/vdso.h>
49 #include <machine/clock.h>
50 #include <machine/cputypes.h>
51 #include <machine/fpu.h>
52 #include <machine/md_var.h>
53 #include <machine/specialreg.h>
54 #include <x86/vmware.h>
55 #include <dev/acpica/acpi_hpet.h>
56 #include <contrib/dev/acpica/include/acpi.h>
57 
58 #include "cpufreq_if.h"
59 
60 uint64_t	tsc_freq;
61 int		tsc_is_invariant;
62 int		tsc_perf_stat;
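/*
 * Set when the early TSC frequency was obtained from an exact source such as
 * a hypervisor CPUID leaf or hypercall, in which case late calibration is
 * skipped.
 */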
63 static int	tsc_early_calib_exact;
64 
65 static eventhandler_tag tsc_levels_tag, tsc_pre_tag, tsc_post_tag;
66 
67 SYSCTL_INT(_kern_timecounter, OID_AUTO, invariant_tsc, CTLFLAG_RDTUN,
68     &tsc_is_invariant, 0, "Indicates whether the TSC is P-state invariant");
69 
70 #ifdef SMP
71 int	smp_tsc;
72 SYSCTL_INT(_kern_timecounter, OID_AUTO, smp_tsc, CTLFLAG_RDTUN, &smp_tsc, 0,
73     "Indicates whether the TSC is safe to use in SMP mode");
74 
75 int	smp_tsc_adjust = 0;
76 SYSCTL_INT(_kern_timecounter, OID_AUTO, smp_tsc_adjust, CTLFLAG_RDTUN,
77     &smp_tsc_adjust, 0, "Try to adjust TSC on APs to match BSP");
78 #endif
79 
80 static int	tsc_shift = 1;
81 SYSCTL_INT(_kern_timecounter, OID_AUTO, tsc_shift, CTLFLAG_RDTUN,
82     &tsc_shift, 0, "Shift to pre-apply for the maximum TSC frequency");
83 
84 static int	tsc_disabled;
85 SYSCTL_INT(_machdep, OID_AUTO, disable_tsc, CTLFLAG_RDTUN, &tsc_disabled, 0,
86     "Disable x86 Time Stamp Counter");
87 
88 static int	tsc_skip_calibration;
89 SYSCTL_INT(_machdep, OID_AUTO, disable_tsc_calibration, CTLFLAG_RDTUN,
90     &tsc_skip_calibration, 0,
91     "Disable early TSC frequency calibration");
92 
93 static void tsc_freq_changed(void *arg, const struct cf_level *level,
94     int status);
95 static void tsc_freq_changing(void *arg, const struct cf_level *level,
96     int *status);
97 static u_int tsc_get_timecount(struct timecounter *tc);
98 static inline u_int tsc_get_timecount_low(struct timecounter *tc);
99 static u_int tsc_get_timecount_lfence(struct timecounter *tc);
100 static u_int tsc_get_timecount_low_lfence(struct timecounter *tc);
101 static u_int tsc_get_timecount_mfence(struct timecounter *tc);
102 static u_int tsc_get_timecount_low_mfence(struct timecounter *tc);
103 static u_int tscp_get_timecount(struct timecounter *tc);
104 static u_int tscp_get_timecount_low(struct timecounter *tc);
105 static void tsc_levels_changed(void *arg, int unit);
106 static uint32_t x86_tsc_vdso_timehands(struct vdso_timehands *vdso_th,
107     struct timecounter *tc);
108 #ifdef COMPAT_FREEBSD32
109 static uint32_t x86_tsc_vdso_timehands32(struct vdso_timehands32 *vdso_th32,
110     struct timecounter *tc);
111 #endif
112 
113 static struct timecounter tsc_timecounter = {
114 	.tc_get_timecount =		tsc_get_timecount,
115 	.tc_counter_mask =		~0u,
116 	.tc_name =			"TSC",
117 	.tc_quality =			800,	/* adjusted in code */
118 	.tc_fill_vdso_timehands = 	x86_tsc_vdso_timehands,
119 #ifdef COMPAT_FREEBSD32
120 	.tc_fill_vdso_timehands32 = 	x86_tsc_vdso_timehands32,
121 #endif
122 };
123 
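/*
 * Query the hypervisor generic CPUID leaf 0x40000010, which reports the TSC
 * frequency in kHz in EAX (and, on hypervisors that implement it, the APIC
 * bus frequency in kHz in EBX).  A frequency obtained this way is exact.
 */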
124 static bool
125 tsc_freq_cpuid_vm(void)
126 {
127 	u_int regs[4];
128 
129 	if (vm_guest == VM_GUEST_NO)
130 		return (false);
131 	if (hv_high < 0x40000010)
132 		return (false);
133 	do_cpuid(0x40000010, regs);
134 	tsc_freq = (uint64_t)(regs[0]) * 1000;
135 	tsc_early_calib_exact = 1;
136 	return (true);
137 }
138 
139 static void
140 tsc_freq_vmware(void)
141 {
142 	u_int regs[4];
143 
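	/*
	 * The GETHZ hypercall returns the TSC frequency in Hz split across
	 * EAX (low 32 bits) and EBX (high 32 bits); an EBX of UINT_MAX is
	 * treated as "no frequency reported".
	 */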
144 	vmware_hvcall(VMW_HVCMD_GETHZ, regs);
145 	if (regs[1] != UINT_MAX)
146 		tsc_freq = regs[0] | ((uint64_t)regs[1] << 32);
147 	tsc_early_calib_exact = 1;
148 }
149 
150 /*
151  * Calculate TSC frequency using information from the CPUID leaf 0x15 'Time
152  * Stamp Counter and Nominal Core Crystal Clock'.  If leaf 0x15 is not
153  * functional, as it is on Skylake/Kabylake, try 0x16 'Processor Frequency
154  * Information'.  Leaf 0x16 is described in the SDM as informational only, but
155  * we can use this value until late calibration is complete.
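 *
 * As a worked example with hypothetical register values: a 24MHz core
 * crystal (ECX = 24000000) and a TSC/crystal ratio of 88/2 (EBX = 88,
 * EAX = 2) give a TSC frequency of 24000000 * 88 / 2 = 1056000000 Hz.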
156  */
157 static bool
158 tsc_freq_cpuid(uint64_t *res)
159 {
160 	u_int regs[4];
161 
162 	if (cpu_high < 0x15)
163 		return (false);
164 	do_cpuid(0x15, regs);
165 	if (regs[0] != 0 && regs[1] != 0 && regs[2] != 0) {
166 		*res = (uint64_t)regs[2] * regs[1] / regs[0];
167 		return (true);
168 	}
169 
170 	if (cpu_high < 0x16)
171 		return (false);
172 	do_cpuid(0x16, regs);
173 	if (regs[0] != 0) {
174 		*res = (uint64_t)regs[0] * 1000000;
175 		return (true);
176 	}
177 
178 	return (false);
179 }
180 
181 static bool
182 tsc_freq_intel_brand(uint64_t *res)
183 {
184 	char brand[48];
185 	u_int regs[4];
186 	uint64_t freq;
187 	char *p;
188 	u_int i;
189 
190 	/*
191 	 * Intel Processor Identification and the CPUID Instruction
192 	 * Application Note 485.
193 	 * http://www.intel.com/assets/pdf/appnote/241618.pdf
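	 *
	 * The nominal frequency is parsed out of the trailing frequency token
	 * of the brand string; e.g. a hypothetical "... CPU 2.40GHz" yields
	 * 2400 * 1000 * 1000 = 2400000000 Hz.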
194 	 */
195 	if (cpu_exthigh >= 0x80000004) {
196 		p = brand;
197 		for (i = 0x80000002; i < 0x80000005; i++) {
198 			do_cpuid(i, regs);
199 			memcpy(p, regs, sizeof(regs));
200 			p += sizeof(regs);
201 		}
202 		p = NULL;
203 		for (i = 0; i < sizeof(brand) - 1; i++)
204 			if (brand[i] == 'H' && brand[i + 1] == 'z')
205 				p = brand + i;
206 		if (p != NULL) {
207 			p -= 5;
208 			switch (p[4]) {
209 			case 'M':
210 				i = 1;
211 				break;
212 			case 'G':
213 				i = 1000;
214 				break;
215 			case 'T':
216 				i = 1000000;
217 				break;
218 			default:
219 				return (false);
220 			}
221 #define	C2D(c)	((c) - '0')
222 			if (p[1] == '.') {
223 				freq = C2D(p[0]) * 1000;
224 				freq += C2D(p[2]) * 100;
225 				freq += C2D(p[3]) * 10;
226 				freq *= i * 1000;
227 			} else {
228 				freq = C2D(p[0]) * 1000;
229 				freq += C2D(p[1]) * 100;
230 				freq += C2D(p[2]) * 10;
231 				freq += C2D(p[3]);
232 				freq *= i * 1000000;
233 			}
234 #undef C2D
235 			*res = freq;
236 			return (true);
237 		}
238 	}
239 	return (false);
240 }
241 
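/*
 * Crude calibration against DELAY(): first estimate the overhead of a pair of
 * rdtsc_ordered() reads, then count TSC cycles across a 100ms delay and scale
 * by 10 to get cycles per second.
 */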
242 static void
243 tsc_freq_tc(uint64_t *res)
244 {
245 	uint64_t tsc1, tsc2;
246 	int64_t overhead;
247 	int count, i;
248 
249 	overhead = 0;
250 	for (i = 0, count = 8; i < count; i++) {
251 		tsc1 = rdtsc_ordered();
252 		DELAY(0);
253 		tsc2 = rdtsc_ordered();
254 		if (i > 0)
255 			overhead += tsc2 - tsc1;
256 	}
257 	overhead /= count;
258 
259 	tsc1 = rdtsc_ordered();
260 	DELAY(100000);
261 	tsc2 = rdtsc_ordered();
262 	*res = (tsc2 - tsc1 - overhead) * 10;
263 }
264 
265 /*
266  * Try to determine the TSC frequency using CPUID or hypercalls.  If successful,
267  * this lets us use the TSC for early DELAY() calls instead of the 8254 timer,
268  * which may be unreliable or entirely absent on contemporary systems.  However,
269  * avoid calibrating using the 8254 here so as to give hypervisors a chance to
270  * register a timecounter that can be used instead.
271  */
272 static void
273 probe_tsc_freq_early(void)
274 {
275 #ifdef __i386__
276 	/* The TSC is known to be broken on certain CPUs. */
277 	switch (cpu_vendor_id) {
278 	case CPU_VENDOR_AMD:
279 		switch (cpu_id & 0xFF0) {
280 		case 0x500:
281 			/* K5 Model 0 */
282 			tsc_disabled = 1;
283 			return;
284 		}
285 		break;
286 	case CPU_VENDOR_CENTAUR:
287 		switch (cpu_id & 0xff0) {
288 		case 0x540:
289 			/*
290 			 * http://www.centtech.com/c6_data_sheet.pdf
291 			 *
292 			 * I-12 RDTSC may return incoherent values in EDX:EAX
293 			 * I-13 RDTSC hangs when certain event counters are used
294 			 */
295 			tsc_disabled = 1;
296 			return;
297 		}
298 		break;
299 	case CPU_VENDOR_NSC:
300 		switch (cpu_id & 0xff0) {
301 		case 0x540:
302 			if ((cpu_id & CPUID_STEPPING) == 0) {
303 				tsc_disabled = 1;
304 				return;
305 			}
306 			break;
307 		}
308 		break;
309 	}
310 #endif
311 
312 	switch (cpu_vendor_id) {
313 	case CPU_VENDOR_AMD:
314 	case CPU_VENDOR_HYGON:
315 		if ((amd_pminfo & AMDPM_TSC_INVARIANT) != 0 ||
316 		    (vm_guest == VM_GUEST_NO &&
317 		    CPUID_TO_FAMILY(cpu_id) >= 0x10))
318 			tsc_is_invariant = 1;
319 		if (cpu_feature & CPUID_SSE2) {
320 			tsc_timecounter.tc_get_timecount =
321 			    tsc_get_timecount_mfence;
322 		}
323 		break;
324 	case CPU_VENDOR_INTEL:
325 		if ((amd_pminfo & AMDPM_TSC_INVARIANT) != 0 ||
326 		    (vm_guest == VM_GUEST_NO &&
327 		    ((CPUID_TO_FAMILY(cpu_id) == 0x6 &&
328 		    CPUID_TO_MODEL(cpu_id) >= 0xe) ||
329 		    (CPUID_TO_FAMILY(cpu_id) == 0xf &&
330 		    CPUID_TO_MODEL(cpu_id) >= 0x3))))
331 			tsc_is_invariant = 1;
332 		if (cpu_feature & CPUID_SSE2) {
333 			tsc_timecounter.tc_get_timecount =
334 			    tsc_get_timecount_lfence;
335 		}
336 		break;
337 	case CPU_VENDOR_CENTAUR:
338 		if (vm_guest == VM_GUEST_NO &&
339 		    CPUID_TO_FAMILY(cpu_id) == 0x6 &&
340 		    CPUID_TO_MODEL(cpu_id) >= 0xf &&
341 		    (rdmsr(0x1203) & 0x100000000ULL) == 0)
342 			tsc_is_invariant = 1;
343 		if (cpu_feature & CPUID_SSE2) {
344 			tsc_timecounter.tc_get_timecount =
345 			    tsc_get_timecount_lfence;
346 		}
347 		break;
348 	}
349 
350 	if (tsc_freq_cpuid_vm()) {
351 		if (bootverbose)
352 			printf(
353 		    "Early TSC frequency %juHz derived from hypervisor CPUID\n",
354 			    (uintmax_t)tsc_freq);
355 	} else if (vm_guest == VM_GUEST_VMWARE) {
356 		tsc_freq_vmware();
357 		if (bootverbose)
358 			printf(
359 		    "Early TSC frequency %juHz derived from VMware hypercall\n",
360 			    (uintmax_t)tsc_freq);
361 	} else if (tsc_freq_cpuid(&tsc_freq)) {
362 		/*
363 		 * If possible, use the value obtained from CPUID as the initial
364 		 * frequency.  This will be refined later during boot but is
365 		 * good enough for now.  The 8254 PIT is not functional on some
366 		 * newer platforms anyway, so don't delay our boot for what
367 		 * might be a garbage result.  Late calibration is required if
368 		 * the initial frequency was obtained from CPUID.16H, as the
369 		 * derived value may be off by as much as 1%.
370 		 */
371 		if (bootverbose)
372 			printf("Early TSC frequency %juHz derived from CPUID\n",
373 			    (uintmax_t)tsc_freq);
374 	}
375 }
376 
377 /*
378  * If we were unable to determine the TSC frequency via CPU registers, try
379  * to calibrate against a known clock.
380  */
381 static void
382 probe_tsc_freq_late(void)
383 {
384 	if (tsc_freq != 0)
385 		return;
386 
387 	if (tsc_skip_calibration) {
388 		/*
389 		 * Try to parse the brand string to obtain the nominal TSC
390 		 * frequency.
391 		 */
392 		if (cpu_vendor_id == CPU_VENDOR_INTEL &&
393 		    tsc_freq_intel_brand(&tsc_freq)) {
394 			if (bootverbose)
395 				printf(
396 		    "Early TSC frequency %juHz derived from brand string\n",
397 				    (uintmax_t)tsc_freq);
398 		} else {
399 			tsc_disabled = 1;
400 		}
401 	} else {
402 		/*
403 		 * Calibrate against a timecounter or the 8254 PIT.  This
404 		 * estimate will be refined later in tsc_calib().
405 		 */
406 		tsc_freq_tc(&tsc_freq);
407 		if (bootverbose)
408 			printf(
409 		    "Early TSC frequency %juHz calibrated from 8254 PIT\n",
410 			    (uintmax_t)tsc_freq);
411 	}
412 }
413 
414 void
415 start_TSC(void)
416 {
417 	if ((cpu_feature & CPUID_TSC) == 0 || tsc_disabled)
418 		return;
419 
420 	probe_tsc_freq_late();
421 
422 	if (cpu_power_ecx & CPUID_PERF_STAT) {
423 		/*
424 		 * XXX Some emulators expose host CPUID without actual support
425 		 * for these MSRs.  We must test whether they really work.
426 		 */
427 		wrmsr(MSR_MPERF, 0);
428 		wrmsr(MSR_APERF, 0);
429 		DELAY(10);
430 		if (rdmsr(MSR_MPERF) > 0 && rdmsr(MSR_APERF) > 0)
431 			tsc_perf_stat = 1;
432 	}
433 
434 	/*
435 	 * Inform CPU accounting about our boot-time clock rate.  This will
436 	 * be updated if someone loads a cpufreq driver after boot that
437 	 * discovers a new max frequency.
438 	 *
439 	 * The frequency may also be updated after late calibration is complete;
440 	 * however, we register the TSC as the ticker now to avoid switching
441 	 * counters after much of the kernel has already booted and potentially
442 	 * sampled the CPU clock.
443 	 */
444 	if (tsc_freq != 0)
445 		set_cputicker(rdtsc, tsc_freq, !tsc_is_invariant);
446 
447 	if (tsc_is_invariant)
448 		return;
449 
450 	/* Register to find out about changes in CPU frequency. */
451 	tsc_pre_tag = EVENTHANDLER_REGISTER(cpufreq_pre_change,
452 	    tsc_freq_changing, NULL, EVENTHANDLER_PRI_FIRST);
453 	tsc_post_tag = EVENTHANDLER_REGISTER(cpufreq_post_change,
454 	    tsc_freq_changed, NULL, EVENTHANDLER_PRI_FIRST);
455 	tsc_levels_tag = EVENTHANDLER_REGISTER(cpufreq_levels_changed,
456 	    tsc_levels_changed, NULL, EVENTHANDLER_PRI_ANY);
457 }
458 
459 #ifdef SMP
460 
461 /*
462  * RDTSC is not a serializing instruction, and does not drain the
463  * instruction stream, so we need to drain the stream before executing
464  * it.  It could be fixed by use of RDTSCP, except the instruction is
465  * not available everywhere.
466  *
467  * Use CPUID for draining in the boot-time SMP consistency test.  The
468  * timecounters use MFENCE for AMD CPUs, and LFENCE for others (Intel
469  * and VIA) when SSE2 is present, and nothing on older machines which
470  * also do not issue RDTSC prematurely.  There, testing for SSE2 and
471  * vendor is too cumbersome, and we learn about TSC presence from CPUID.
472  *
473  * Do not use do_cpuid(), since we do not need CPUID results, which
474  * have to be written into memory with do_cpuid().
475  */
476 #define	TSC_READ(x)							\
477 static void								\
478 tsc_read_##x(void *arg)							\
479 {									\
480 	uint64_t *tsc = arg;						\
481 	u_int cpu = PCPU_GET(cpuid);					\
482 									\
483 	__asm __volatile("cpuid" : : : "eax", "ebx", "ecx", "edx");	\
484 	tsc[cpu * 3 + x] = rdtsc();					\
485 }
486 TSC_READ(0)
487 TSC_READ(1)
488 TSC_READ(2)
489 #undef TSC_READ
490 
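/*
 * N is the number of sampling rounds used by the SMP synchronization test.
 * Each round stores three per-CPU readings at tsc[cpu * 3 + 0..2];
 * comp_smp_tsc() checks that readings from different CPUs interleave
 * monotonically, and adj_smp_tsc() uses the same samples to estimate a
 * per-CPU offset to apply.
 */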
491 #define	N	1000
492 
493 static void
494 comp_smp_tsc(void *arg)
495 {
496 	uint64_t *tsc;
497 	int64_t d1, d2;
498 	u_int cpu = PCPU_GET(cpuid);
499 	u_int i, j, size;
500 
501 	size = (mp_maxid + 1) * 3;
502 	for (i = 0, tsc = arg; i < N; i++, tsc += size)
503 		CPU_FOREACH(j) {
504 			if (j == cpu)
505 				continue;
506 			d1 = tsc[cpu * 3 + 1] - tsc[j * 3];
507 			d2 = tsc[cpu * 3 + 2] - tsc[j * 3 + 1];
508 			if (d1 <= 0 || d2 <= 0) {
509 				smp_tsc = 0;
510 				return;
511 			}
512 		}
513 }
514 
515 static void
516 adj_smp_tsc(void *arg)
517 {
518 	uint64_t *tsc;
519 	int64_t d, min, max;
520 	u_int cpu = PCPU_GET(cpuid);
521 	u_int first, i, size;
522 
523 	first = CPU_FIRST();
524 	if (cpu == first)
525 		return;
526 	min = INT64_MIN;
527 	max = INT64_MAX;
528 	size = (mp_maxid + 1) * 3;
529 	for (i = 0, tsc = arg; i < N; i++, tsc += size) {
530 		d = tsc[first * 3] - tsc[cpu * 3 + 1];
531 		if (d > min)
532 			min = d;
533 		d = tsc[first * 3 + 1] - tsc[cpu * 3 + 2];
534 		if (d > min)
535 			min = d;
536 		d = tsc[first * 3 + 1] - tsc[cpu * 3];
537 		if (d < max)
538 			max = d;
539 		d = tsc[first * 3 + 2] - tsc[cpu * 3 + 1];
540 		if (d < max)
541 			max = d;
542 	}
543 	if (min > max)
544 		return;
545 	d = min / 2 + max / 2;
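	/*
	 * Advance this CPU's TSC by the midpoint of the observed skew window.
	 * MSR 0x10 is IA32_TIME_STAMP_COUNTER; the asm below does a 64-bit
	 * read-modify-write, adding the delta passed in ESI:EDI.
	 */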
546 	__asm __volatile (
547 		"movl $0x10, %%ecx\n\t"
548 		"rdmsr\n\t"
549 		"addl %%edi, %%eax\n\t"
550 		"adcl %%esi, %%edx\n\t"
551 		"wrmsr\n"
552 		: /* No output */
553 		: "D" ((uint32_t)d), "S" ((uint32_t)(d >> 32))
554 		: "ax", "cx", "dx", "cc"
555 	);
556 }
557 
558 static int
559 test_tsc(int adj_max_count)
560 {
561 	uint64_t *data, *tsc;
562 	u_int i, size, adj;
563 
564 	if ((!smp_tsc && !tsc_is_invariant))
565 		return (-100);
566 	/*
567 	 * Misbehavior of TSC under VirtualBox has been observed.  In
568 	 * particular, threads doing small (~1 second) sleeps may miss their
569 	 * wakeup and hang around in sleep state, causing hangs on shutdown.
570 	 */
571 	if (vm_guest == VM_GUEST_VBOX)
572 		return (0);
573 
574 	TSENTER();
575 	size = (mp_maxid + 1) * 3;
576 	data = malloc(sizeof(*data) * size * N, M_TEMP, M_WAITOK);
577 	adj = 0;
578 retry:
579 	for (i = 0, tsc = data; i < N; i++, tsc += size)
580 		smp_rendezvous(tsc_read_0, tsc_read_1, tsc_read_2, tsc);
581 	smp_tsc = 1;	/* XXX */
582 	smp_rendezvous(smp_no_rendezvous_barrier, comp_smp_tsc,
583 	    smp_no_rendezvous_barrier, data);
584 	if (!smp_tsc && adj < adj_max_count) {
585 		adj++;
586 		smp_rendezvous(smp_no_rendezvous_barrier, adj_smp_tsc,
587 		    smp_no_rendezvous_barrier, data);
588 		goto retry;
589 	}
590 	free(data, M_TEMP);
591 	if (bootverbose)
592 		printf("SMP: %sed TSC synchronization test%s\n",
593 		    smp_tsc ? "pass" : "fail",
594 		    adj > 0 ? " after adjustment" : "");
595 	TSEXIT();
596 	if (smp_tsc && tsc_is_invariant) {
597 		switch (cpu_vendor_id) {
598 		case CPU_VENDOR_AMD:
599 		case CPU_VENDOR_HYGON:
600 			/*
601 			 * Processor Programming Reference (PPR) for AMD
602 			 * Family 17h states that the TSC uses a common
603 			 * reference for all sockets, cores and threads.
604 			 */
605 			if (CPUID_TO_FAMILY(cpu_id) >= 0x17)
606 				return (1000);
607 			/*
608 			 * Starting with Family 15h processors, TSC clock
609 			 * source is in the north bridge.  Check whether
610 			 * we have a single-socket/multi-core platform.
611 			 * XXX Need more work for complex cases.
612 			 */
613 			if (CPUID_TO_FAMILY(cpu_id) < 0x15 ||
614 			    (amd_feature2 & AMDID2_CMP) == 0 ||
615 			    smp_cpus > (cpu_procinfo2 & AMDID_CMP_CORES) + 1)
616 				break;
617 			return (1000);
618 		case CPU_VENDOR_INTEL:
619 			/*
620 			 * XXX Assume Intel platforms have synchronized TSCs.
621 			 */
622 			return (1000);
623 		}
624 		return (800);
625 	}
626 	return (-100);
627 }
628 
629 #undef N
630 
631 #endif /* SMP */
632 
633 static void
634 init_TSC_tc(void)
635 {
636 	uint64_t max_freq;
637 	int shift;
638 
639 	if ((cpu_feature & CPUID_TSC) == 0 || tsc_disabled)
640 		return;
641 
642 	/*
643 	 * Limit timecounter frequency to fit in an int and prevent it from
644 	 * overflowing too fast.
645 	 */
646 	max_freq = UINT_MAX;
647 
648 	/*
649 	 * Intel CPUs without a C-state invariant TSC can stop the TSC
650 	 * in either C2 or C3.  Disable use of C2 and C3 while using
651 	 * the TSC as the timecounter.  The timecounter can be changed
652 	 * to enable C2 and C3.
653 	 *
654 	 * Note that the TSC is used as the cputicker for computing
655 	 * thread runtime regardless of the timecounter setting, so
656 	 * using an alternate timecounter and enabling C2 or C3 can
657 	 * result in incorrect runtimes for kernel idle threads (but not
658 	 * for any non-idle threads).
659 	 */
660 	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
661 	    (amd_pminfo & AMDPM_TSC_INVARIANT) == 0) {
662 		tsc_timecounter.tc_flags |= TC_FLAGS_C2STOP;
663 		if (bootverbose)
664 			printf("TSC timecounter disables C2 and C3.\n");
665 	}
666 
667 	/*
668 	 * We cannot use the TSC in SMP mode unless the TSCs on all CPUs
669 	 * are synchronized.  If the user is sure that the system has
670 	 * synchronized TSCs, set kern.timecounter.smp_tsc tunable to a
671 	 * non-zero value.  The TSC seems unreliable in virtualized SMP
672 	 * environments, so it is set to a negative quality in those cases.
673 	 */
674 #ifdef SMP
675 	if (mp_ncpus > 1)
676 		tsc_timecounter.tc_quality = test_tsc(smp_tsc_adjust);
677 	else
678 #endif /* SMP */
679 	if (tsc_is_invariant)
680 		tsc_timecounter.tc_quality = 1000;
681 	max_freq >>= tsc_shift;
682 
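	/*
	 * Find the smallest shift that brings the counter frequency at or
	 * below max_freq; when it is non-zero, the low-order bits are
	 * discarded and the "TSC-low" timecounter variants are used.
	 */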
683 	for (shift = 0; shift <= 31 && (tsc_freq >> shift) > max_freq; shift++)
684 		;
685 
686 	/*
687 	 * Timecounter implementation selection, top to bottom:
688 	 * - If RDTSCP is available, use RDTSCP.
689 	 * - If fence instructions are provided (SSE2), use LFENCE;RDTSC
690 	 *   on Intel, and MFENCE;RDTSC on AMD.
691 	 * - For really old CPUs, just use RDTSC.
692 	 */
693 	if ((amd_feature & AMDID_RDTSCP) != 0) {
694 		tsc_timecounter.tc_get_timecount = shift > 0 ?
695 		    tscp_get_timecount_low : tscp_get_timecount;
696 	} else if ((cpu_feature & CPUID_SSE2) != 0 && mp_ncpus > 1) {
697 		if (cpu_vendor_id == CPU_VENDOR_AMD ||
698 		    cpu_vendor_id == CPU_VENDOR_HYGON) {
699 			tsc_timecounter.tc_get_timecount = shift > 0 ?
700 			    tsc_get_timecount_low_mfence :
701 			    tsc_get_timecount_mfence;
702 		} else {
703 			tsc_timecounter.tc_get_timecount = shift > 0 ?
704 			    tsc_get_timecount_low_lfence :
705 			    tsc_get_timecount_lfence;
706 		}
707 	} else {
708 		tsc_timecounter.tc_get_timecount = shift > 0 ?
709 		    tsc_get_timecount_low : tsc_get_timecount;
710 	}
711 	if (shift > 0) {
712 		tsc_timecounter.tc_name = "TSC-low";
713 		if (bootverbose)
714 			printf("TSC timecounter discards lower %d bit(s)\n",
715 			    shift);
716 	}
717 	if (tsc_freq != 0) {
718 		tsc_timecounter.tc_frequency = tsc_freq >> shift;
719 		tsc_timecounter.tc_priv = (void *)(intptr_t)shift;
720 
721 		/*
722 		 * Timecounter registration is deferred until after late
723 		 * calibration is finished.
724 		 */
725 	}
726 }
727 SYSINIT(tsc_tc, SI_SUB_SMP, SI_ORDER_ANY, init_TSC_tc, NULL);
728 
729 static void
730 tsc_update_freq(uint64_t new_freq)
731 {
732 	atomic_store_rel_64(&tsc_freq, new_freq);
733 	atomic_store_rel_64(&tsc_timecounter.tc_frequency,
734 	    new_freq >> (int)(intptr_t)tsc_timecounter.tc_priv);
735 }
736 
737 void
738 tsc_init(void)
739 {
740 	if ((cpu_feature & CPUID_TSC) == 0 || tsc_disabled)
741 		return;
742 
743 	probe_tsc_freq_early();
744 }
745 
746 /*
747  * Perform late calibration of the TSC frequency once ACPI-based timecounters
748  * are available.  At this point timehands are not set up, so we read the
749  * highest-quality timecounter directly rather than using (s)binuptime().
750  */
751 void
752 tsc_calibrate(void)
753 {
754 	uint64_t freq;
755 
756 	if (tsc_disabled)
757 		return;
758 	if (tsc_early_calib_exact)
759 		goto calibrated;
760 
761 	fpu_kern_enter(curthread, NULL, FPU_KERN_NOCTX);
762 	freq = clockcalib(rdtsc_ordered, "TSC");
763 	fpu_kern_leave(curthread, NULL);
764 	tsc_update_freq(freq);
765 
766 calibrated:
767 	tc_init(&tsc_timecounter);
768 	set_cputicker(rdtsc, tsc_freq, !tsc_is_invariant);
769 }
770 
771 void
772 resume_TSC(void)
773 {
774 #ifdef SMP
775 	int quality;
776 
777 	/* If TSC was not good on boot, it is unlikely to become good now. */
778 	if (tsc_timecounter.tc_quality < 0)
779 		return;
780 	/* Nothing to do with UP. */
781 	if (mp_ncpus < 2)
782 		return;
783 
784 	/*
785 	 * If TSC was good, a single synchronization should be enough,
786 	 * but honour smp_tsc_adjust if it's set.
787 	 */
788 	quality = test_tsc(MAX(smp_tsc_adjust, 1));
789 	if (quality != tsc_timecounter.tc_quality) {
790 		printf("TSC timecounter quality changed: %d -> %d\n",
791 		    tsc_timecounter.tc_quality, quality);
792 		tsc_timecounter.tc_quality = quality;
793 	}
794 #endif /* SMP */
795 }
796 
797 /*
798  * When cpufreq levels change, find out about the (new) max frequency.  We
799  * use this to update CPU accounting in case it got a lower estimate at boot.
800  */
801 static void
802 tsc_levels_changed(void *arg, int unit)
803 {
804 	device_t cf_dev;
805 	struct cf_level *levels;
806 	int count, error;
807 	uint64_t max_freq;
808 
809 	/* Only use values from the first CPU, assuming all are equal. */
810 	if (unit != 0)
811 		return;
812 
813 	/* Find the appropriate cpufreq device instance. */
814 	cf_dev = devclass_get_device(devclass_find("cpufreq"), unit);
815 	if (cf_dev == NULL) {
816 		printf("tsc_levels_changed() called but no cpufreq device?\n");
817 		return;
818 	}
819 
820 	/* Get settings from the device and find the max frequency. */
821 	count = 64;
822 	levels = malloc(count * sizeof(*levels), M_TEMP, M_NOWAIT);
823 	if (levels == NULL)
824 		return;
825 	error = CPUFREQ_LEVELS(cf_dev, levels, &count);
826 	if (error == 0 && count != 0) {
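		/*
		 * Levels are assumed to be ordered from highest to lowest
		 * frequency, so the first entry carries the maximum rate in
		 * MHz.
		 */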
827 		max_freq = (uint64_t)levels[0].total_set.freq * 1000000;
828 		set_cputicker(rdtsc, max_freq, 1);
829 	} else
830 		printf("tsc_levels_changed: no max freq found\n");
831 	free(levels, M_TEMP);
832 }
833 
834 /*
835  * If the TSC timecounter is in use, veto the pending change.  It may be
836  * possible in the future to handle a dynamically-changing timecounter rate.
837  */
838 static void
839 tsc_freq_changing(void *arg, const struct cf_level *level, int *status)
840 {
841 
842 	if (*status != 0 || timecounter != &tsc_timecounter)
843 		return;
844 
845 	printf("timecounter TSC must not be in use when "
846 	    "changing frequencies; change denied\n");
847 	*status = EBUSY;
848 }
849 
850 /* Update TSC freq with the value indicated by the caller. */
851 static void
852 tsc_freq_changed(void *arg, const struct cf_level *level, int status)
853 {
854 	uint64_t freq;
855 
856 	/* If there was an error during the transition, don't do anything. */
857 	if (tsc_disabled || status != 0)
858 		return;
859 
860 	/* Total setting for this level gives the new frequency in MHz. */
861 	freq = (uint64_t)level->total_set.freq * 1000000;
862 	tsc_update_freq(freq);
863 }
864 
865 static int
866 sysctl_machdep_tsc_freq(SYSCTL_HANDLER_ARGS)
867 {
868 	int error;
869 	uint64_t freq;
870 
871 	freq = atomic_load_acq_64(&tsc_freq);
872 	if (freq == 0)
873 		return (EOPNOTSUPP);
874 	error = sysctl_handle_64(oidp, &freq, 0, req);
875 	if (error == 0 && req->newptr != NULL)
876 		tsc_update_freq(freq);
877 	return (error);
878 }
879 SYSCTL_PROC(_machdep, OID_AUTO, tsc_freq,
880     CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE,
881     0, 0, sysctl_machdep_tsc_freq, "QU",
882     "Time Stamp Counter frequency");
883 
884 static u_int
885 tsc_get_timecount(struct timecounter *tc __unused)
886 {
887 
888 	return (rdtsc32());
889 }
890 
891 static u_int
892 tscp_get_timecount(struct timecounter *tc __unused)
893 {
894 
895 	return (rdtscp32());
896 }
897 
898 static inline u_int
899 tsc_get_timecount_low(struct timecounter *tc)
900 {
901 	uint32_t rv;
902 
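	/*
	 * SHRD shifts the 64-bit TSC value in EDX:EAX right by tc_priv bits
	 * and leaves the low 32 bits of the result in EAX, dropping the
	 * least significant bits for the "TSC-low" timecounter.
	 */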
903 	__asm __volatile("rdtsc; shrd %%cl, %%edx, %0"
904 	    : "=a" (rv) : "c" ((int)(intptr_t)tc->tc_priv) : "edx");
905 	return (rv);
906 }
907 
908 static u_int
909 tscp_get_timecount_low(struct timecounter *tc)
910 {
911 	uint32_t rv;
912 
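	/*
	 * RDTSCP overwrites ECX with IA32_TSC_AUX, so the shift count must be
	 * reloaded from memory after the read rather than passed in ECX up
	 * front.
	 */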
913 	__asm __volatile("rdtscp; movl %1, %%ecx; shrd %%cl, %%edx, %0"
914 	    : "=&a" (rv) : "m" (tc->tc_priv) : "ecx", "edx");
915 	return (rv);
916 }
917 
918 static u_int
919 tsc_get_timecount_lfence(struct timecounter *tc __unused)
920 {
921 
922 	lfence();
923 	return (rdtsc32());
924 }
925 
926 static u_int
927 tsc_get_timecount_low_lfence(struct timecounter *tc)
928 {
929 
930 	lfence();
931 	return (tsc_get_timecount_low(tc));
932 }
933 
934 static u_int
935 tsc_get_timecount_mfence(struct timecounter *tc __unused)
936 {
937 
938 	mfence();
939 	return (rdtsc32());
940 }
941 
942 static u_int
943 tsc_get_timecount_low_mfence(struct timecounter *tc)
944 {
945 
946 	mfence();
947 	return (tsc_get_timecount_low(tc));
948 }
949 
950 static uint32_t
951 x86_tsc_vdso_timehands(struct vdso_timehands *vdso_th, struct timecounter *tc)
952 {
953 
954 	vdso_th->th_algo = VDSO_TH_ALGO_X86_TSC;
955 	vdso_th->th_x86_shift = (int)(intptr_t)tc->tc_priv;
956 	vdso_th->th_x86_hpet_idx = 0xffffffff;
957 	vdso_th->th_x86_pvc_last_systime = 0;
958 	vdso_th->th_x86_pvc_stable_mask = 0;
959 	bzero(vdso_th->th_res, sizeof(vdso_th->th_res));
960 	return (1);
961 }
962 
963 #ifdef COMPAT_FREEBSD32
964 static uint32_t
965 x86_tsc_vdso_timehands32(struct vdso_timehands32 *vdso_th32,
966     struct timecounter *tc)
967 {
968 
969 	vdso_th32->th_algo = VDSO_TH_ALGO_X86_TSC;
970 	vdso_th32->th_x86_shift = (int)(intptr_t)tc->tc_priv;
971 	vdso_th32->th_x86_hpet_idx = 0xffffffff;
972 	vdso_th32->th_x86_pvc_last_systime = 0;
973 	vdso_th32->th_x86_pvc_stable_mask = 0;
974 	bzero(vdso_th32->th_res, sizeof(vdso_th32->th_res));
975 	return (1);
976 }
977 #endif
978