xref: /dragonfly/sys/platform/pc64/x86_64/machdep.c (revision 2b3f93ea)
1 /*-
2  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
3  * Copyright (c) 1992 Terrence R. Lambert.
4  * Copyright (c) 2003 Peter Wemm.
5  * Copyright (c) 2008-2017 The DragonFly Project.
6  * All rights reserved.
7  *
8  * This code is derived from software contributed to Berkeley by
9  * William Jolitz.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. All advertising materials mentioning features or use of this software
20  *    must display the following acknowledgement:
21  *	This product includes software developed by the University of
22  *	California, Berkeley and its contributors.
23  * 4. Neither the name of the University nor the names of its contributors
24  *    may be used to endorse or promote products derived from this software
25  *    without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37  * SUCH DAMAGE.
38  *
39  * from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
40  * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
41  */
42 
43 #include "use_isa.h"
44 #include "opt_cpu.h"
45 #include "opt_ddb.h"
46 #include "opt_inet.h"
47 #include "opt_maxmem.h"
48 #include "opt_msgbuf.h"
49 #include "opt_swap.h"
50 
51 #include <sys/param.h>
52 #include <sys/systm.h>
53 #include <sys/sysmsg.h>
54 #include <sys/signalvar.h>
55 #include <sys/kernel.h>
56 #include <sys/linker.h>
57 #include <sys/malloc.h>
58 #include <sys/proc.h>
59 #include <sys/caps.h>
60 #include <sys/buf.h>
61 #include <sys/reboot.h>
62 #include <sys/mbuf.h>
63 #include <sys/msgbuf.h>
64 #include <sys/sysent.h>
65 #include <sys/sysctl.h>
66 #include <sys/vmmeter.h>
67 #include <sys/bus.h>
68 #include <sys/usched.h>
69 #include <sys/reg.h>
70 #include <sys/sbuf.h>
71 #include <sys/ctype.h>
72 #include <sys/serialize.h>
73 #include <sys/systimer.h>
74 
75 #include <vm/vm.h>
76 #include <vm/vm_param.h>
77 #include <sys/lock.h>
78 #include <vm/vm_kern.h>
79 #include <vm/vm_object.h>
80 #include <vm/vm_page.h>
81 #include <vm/vm_map.h>
82 #include <vm/vm_pager.h>
83 #include <vm/vm_extern.h>
84 
85 #include <sys/thread2.h>
86 #include <sys/mplock2.h>
87 
88 #include <sys/exec.h>
89 #include <sys/cons.h>
90 
91 #include <sys/efi.h>
92 
93 #include <ddb/ddb.h>
94 
95 #include <machine/cpu.h>
96 #include <machine/clock.h>
97 #include <machine/specialreg.h>
98 #if 0 /* JG */
99 #include <machine/bootinfo.h>
100 #endif
101 #include <machine/md_var.h>
102 #include <machine/metadata.h>
103 #include <machine/pc/bios.h>
104 #include <machine/pcb_ext.h>
105 #include <machine/globaldata.h>		/* CPU_prvspace */
106 #include <machine/smp.h>
107 #include <machine/cputypes.h>
108 #include <machine/intr_machdep.h>
109 #include <machine/framebuffer.h>
110 
111 #ifdef OLD_BUS_ARCH
112 #include <bus/isa/isa_device.h>
113 #endif
114 #include <machine_base/isa/isa_intr.h>
115 #include <bus/isa/rtc.h>
116 #include <sys/random.h>
117 #include <sys/ptrace.h>
118 #include <machine/sigframe.h>
119 
120 #include <sys/machintr.h>
121 #include <machine_base/icu/icu_abi.h>
122 #include <machine_base/icu/elcr_var.h>
123 #include <machine_base/apic/lapic.h>
124 #include <machine_base/apic/ioapic.h>
125 #include <machine_base/apic/ioapic_abi.h>
126 #include <machine/mptable.h>
127 
128 #define PHYSMAP_ENTRIES		10
129 #define MAXBUFSTRUCTSIZE	((size_t)512 * 1024 * 1024)
130 
131 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
132 
133 extern void printcpuinfo(void);	/* XXX header file */
134 extern void identify_cpu(void);
135 extern void panicifcpuunsupported(void);
136 
137 static void cpu_startup(void *);
138 static void pic_finish(void *);
139 static void cpu_finish(void *);
140 
141 static void set_fpregs_xmm(struct save87 *, struct savexmm *);
142 static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
143 static void init_locks(void);
144 
145 extern void pcpu_timer_always(struct intrframe *);
146 
147 SYSINIT(cpu, SI_BOOT2_START_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
148 SYSINIT(pic_finish, SI_BOOT2_FINISH_PIC, SI_ORDER_FIRST, pic_finish, NULL);
149 SYSINIT(cpu_finish, SI_BOOT2_FINISH_CPU, SI_ORDER_FIRST, cpu_finish, NULL);
150 
151 #ifdef DDB
152 extern vm_offset_t ksym_start, ksym_end;
153 #endif
154 
155 struct privatespace CPU_prvspace_bsp __aligned(4096);
156 struct privatespace *CPU_prvspace[MAXCPU] = { &CPU_prvspace_bsp };
157 
158 vm_paddr_t efi_systbl_phys;
159 int	_udatasel, _ucodesel, _ucode32sel;
160 u_long	atdevbase;
161 int64_t tsc_offsets[MAXCPU];
162 cpumask_t smp_idleinvl_mask;
163 cpumask_t smp_idleinvl_reqs;
164 
165  /* MWAIT hint (EAX) or CPU_MWAIT_HINT_ */
166 __read_mostly static int cpu_mwait_halt_global;
167 __read_mostly static int clock_debug1;
168 __read_mostly static int flame_poll_debug;
169 
170 SYSCTL_INT(_debug, OID_AUTO, flame_poll_debug,
171 	CTLFLAG_RW, &flame_poll_debug, 0, "");
172 TUNABLE_INT("debug.flame_poll_debug", &flame_poll_debug);
173 
174 #if defined(SWTCH_OPTIM_STATS)
175 extern int swtch_optim_stats;
176 SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats,
177 	CTLFLAG_RD, &swtch_optim_stats, 0, "");
178 SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count,
179 	CTLFLAG_RD, &tlb_flush_count, 0, "");
180 #endif
181 SYSCTL_INT(_debug, OID_AUTO, clock_debug1,
182 	CTLFLAG_RW, &clock_debug1, 0, "");
183 SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_halt,
184 	CTLFLAG_RD, &cpu_mwait_halt_global, 0, "");
185 SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_spin,
186 	CTLFLAG_RD, &cpu_mwait_spin, 0, "monitor/mwait target state");
187 
188 #define CPU_MWAIT_HAS_CX	\
189 	((cpu_feature2 & CPUID2_MON) && \
190 	 (cpu_mwait_feature & CPUID_MWAIT_EXT))
191 
192 #define CPU_MWAIT_CX_NAMELEN	16
193 
194 #define CPU_MWAIT_C1		1
195 #define CPU_MWAIT_C2		2
196 #define CPU_MWAIT_C3		3
197 #define CPU_MWAIT_CX_MAX	8
198 
199 #define CPU_MWAIT_HINT_AUTO	-1	/* C1 and C2 */
200 #define CPU_MWAIT_HINT_AUTODEEP	-2	/* C3+ */
201 
202 SYSCTL_NODE(_machdep, OID_AUTO, mwait, CTLFLAG_RW, 0, "MWAIT features");
203 SYSCTL_NODE(_machdep_mwait, OID_AUTO, CX, CTLFLAG_RW, 0, "MWAIT Cx settings");
204 
205 struct cpu_mwait_cx {
206 	int			subcnt;
207 	char			name[4];
208 	struct sysctl_ctx_list	sysctl_ctx;
209 	struct sysctl_oid	*sysctl_tree;
210 };
211 static struct cpu_mwait_cx	cpu_mwait_cx_info[CPU_MWAIT_CX_MAX];
212 static char			cpu_mwait_cx_supported[256];
213 
214 static int			cpu_mwait_c1_hints_cnt;
215 static int			cpu_mwait_hints_cnt;
216 static int			*cpu_mwait_hints;
217 
218 static int			cpu_mwait_deep_hints_cnt;
219 static int			*cpu_mwait_deep_hints;
220 
221 #define CPU_IDLE_REPEAT_DEFAULT	750
222 
223 static u_int			cpu_idle_repeat = CPU_IDLE_REPEAT_DEFAULT;
224 static u_long			cpu_idle_repeat_max = CPU_IDLE_REPEAT_DEFAULT;
225 static u_int			cpu_mwait_repeat_shift = 1;
226 
227 #define CPU_MWAIT_C3_PREAMBLE_BM_ARB	0x1
228 #define CPU_MWAIT_C3_PREAMBLE_BM_STS	0x2
229 
230 static int			cpu_mwait_c3_preamble =
231 				    CPU_MWAIT_C3_PREAMBLE_BM_ARB |
232 				    CPU_MWAIT_C3_PREAMBLE_BM_STS;
233 
234 SYSCTL_STRING(_machdep_mwait_CX, OID_AUTO, supported, CTLFLAG_RD,
235     cpu_mwait_cx_supported, 0, "MWAIT supported C states");
236 SYSCTL_INT(_machdep_mwait_CX, OID_AUTO, c3_preamble, CTLFLAG_RD,
237     &cpu_mwait_c3_preamble, 0, "C3+ preamble mask");
238 
239 static int	cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS,
240 		    int *, boolean_t);
241 static int	cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS);
242 static int	cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS);
243 static int	cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS);
244 
245 SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, idle, CTLTYPE_STRING|CTLFLAG_RW,
246     NULL, 0, cpu_mwait_cx_idle_sysctl, "A", "");
247 SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, spin, CTLTYPE_STRING|CTLFLAG_RW,
248     NULL, 0, cpu_mwait_cx_spin_sysctl, "A", "");
249 SYSCTL_UINT(_machdep_mwait_CX, OID_AUTO, repeat_shift, CTLFLAG_RW,
250     &cpu_mwait_repeat_shift, 0, "");
251 
252 long physmem = 0;
253 
254 u_long ebda_addr = 0;
255 
256 int imcr_present = 0;
257 
258 int naps = 0; /* # of Applications processors */
259 
260 u_int base_memory;
261 
262 static int
263 sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
264 {
265 	u_long pmem = ctob(physmem);
266 	int error;
267 
268 	error = sysctl_handle_long(oidp, &pmem, 0, req);
269 
270 	return (error);
271 }
272 
273 SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_ULONG|CTLFLAG_RD,
274 	0, 0, sysctl_hw_physmem, "LU",
275 	"Total system memory in bytes (number of pages * page size)");
276 
277 static int
278 sysctl_hw_usermem(SYSCTL_HANDLER_ARGS)
279 {
280 	u_long usermem = ctob(physmem - vmstats.v_wire_count);
281 	int error;
282 
283 	error = sysctl_handle_long(oidp, &usermem, 0, req);
284 
285 	return (error);
286 }
287 
288 SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_ULONG|CTLFLAG_RD,
289 	0, 0, sysctl_hw_usermem, "LU", "");
290 
291 static int
292 sysctl_hw_availpages(SYSCTL_HANDLER_ARGS)
293 {
294 	int error;
295 	u_long availpages;
296 
297 	availpages = x86_64_btop(avail_end - avail_start);
298 	error = sysctl_handle_long(oidp, &availpages, 0, req);
299 
300 	return (error);
301 }
302 
303 SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_ULONG|CTLFLAG_RD,
304 	0, 0, sysctl_hw_availpages, "LU", "");
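
/*
 * Illustrative sketch, not part of the kernel source: the hw.physmem
 * handler registered above can be queried from userland through the
 * static CTL_HW/HW_PHYSMEM MIB.  A minimal example, assuming a DragonFly
 * or other BSD userland with sysctl(3):
 */
#if 0	/* example only, never compiled into the kernel */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int mib[2] = { CTL_HW, HW_PHYSMEM };
	unsigned long bytes;		/* handler reports ctob(physmem) */
	size_t len = sizeof(bytes);

	if (sysctl(mib, 2, &bytes, &len, NULL, 0) == 0)
		printf("hw.physmem: %lu bytes\n", bytes);
	return 0;
}
#endif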
305 
306 vm_paddr_t Maxmem;
307 vm_paddr_t Realmem;
308 
309 /*
310  * The number of PHYSMAP entries must be one less than the number of
311  * PHYSSEG entries because the PHYSMAP entry that spans the largest
312  * physical address that is accessible by ISA DMA is split into two
313  * PHYSSEG entries.
314  */
315 vm_phystable_t phys_avail[VM_PHYSSEG_MAX + 1];
316 vm_phystable_t dump_avail[VM_PHYSSEG_MAX + 1];
317 
318 /* must be 1 less so 0 0 can signal end of chunks */
319 #define PHYS_AVAIL_ARRAY_END (NELEM(phys_avail) - 1)
320 #define DUMP_AVAIL_ARRAY_END (NELEM(dump_avail) - 1)
321 
322 static vm_offset_t buffer_sva, buffer_eva;
323 vm_offset_t clean_sva, clean_eva;
324 static vm_offset_t pager_sva, pager_eva;
325 static struct trapframe proc0_tf;
326 
327 static void cpu_implement_smap(void);
328 
329 static void
330 cpu_startup(void *dummy)
331 {
332 	caddr_t v;
333 	vm_size_t size = 0;
334 	vm_offset_t firstaddr;
335 
336 	/*
337 	 * Good {morning,afternoon,evening,night}.
338 	 */
339 	kprintf("%s", version);
340 	startrtclock();
341 	printcpuinfo();
342 	panicifcpuunsupported();
343 	if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
344 		cpu_implement_smap();
345 
346 	kprintf("real memory  = %ju (%ju MB)\n",
347 		(intmax_t)Realmem,
348 		(intmax_t)Realmem / 1024 / 1024);
349 	/*
350 	 * Display any holes after the first chunk of extended memory.
351 	 */
352 	if (bootverbose) {
353 		int indx;
354 
355 		kprintf("Physical memory chunk(s):\n");
356 		for (indx = 0; phys_avail[indx].phys_end != 0; ++indx) {
357 			vm_paddr_t size1;
358 
359 			size1 = phys_avail[indx].phys_end -
360 				phys_avail[indx].phys_beg;
361 
362 			kprintf("0x%08jx - 0x%08jx, %ju bytes (%ju pages)\n",
363 				(intmax_t)phys_avail[indx].phys_beg,
364 				(intmax_t)phys_avail[indx].phys_end - 1,
365 				(intmax_t)size1,
366 				(intmax_t)(size1 / PAGE_SIZE));
367 		}
368 	}
369 
370 	/*
371 	 * Allocate space for system data structures.
372 	 * The first available kernel virtual address is in "v".
373 	 * As pages of kernel virtual memory are allocated, "v" is incremented.
374 	 * As pages of memory are allocated and cleared,
375 	 * "firstaddr" is incremented.
376 	 * An index into the kernel page table corresponding to the
377 	 * virtual memory address maintained in "v" is kept in "mapaddr".
378 	 */
379 
380 	/*
381 	 * Make two passes.  The first pass calculates how much memory is
382 	 * needed and allocates it.  The second pass assigns virtual
383 	 * addresses to the various data structures.
384 	 */
385 	firstaddr = 0;
386 again:
387 	v = (caddr_t)firstaddr;
388 
389 #define	valloc(name, type, num) \
390 	    (name) = (type *)v; v = (caddr_t)((name)+(num))
391 #define	valloclim(name, type, num, lim) \
392 	    (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num)))
393 
394 	/*
395 	 * Calculate nbuf such that maxbufspace uses approximately 1/20
396 	 * of physical memory by default, with a minimum of 50 buffers.
397 	 *
398 	 * The calculation is made after discounting 128MB.
399 	 *
400 	 * NOTE: maxbufspace is (nbuf * NBUFCALCSIZE) (NBUFCALCSIZE ~= 16KB).
401 	 *	 nbuf = (kbytes / factor) would cover all of memory.
402 	 */
403 	if (nbuf == 0) {
404 		long factor = NBUFCALCSIZE / 1024;		/* KB/nbuf */
405 		long kbytes = physmem * (PAGE_SIZE / 1024);	/* physmem */
406 
407 		nbuf = 50;
408 		if (kbytes > 128 * 1024)
409 			nbuf += (kbytes - 128 * 1024) / (factor * 20);
410 		if (maxbcache && nbuf > maxbcache / NBUFCALCSIZE)
411 			nbuf = maxbcache / NBUFCALCSIZE;
412 		if ((size_t)nbuf * sizeof(struct buf) > MAXBUFSTRUCTSIZE) {
413 			kprintf("Warning: nbuf capped at %ld due to the "
414 				"reasonability limit\n", nbuf);
415 			nbuf = MAXBUFSTRUCTSIZE / sizeof(struct buf);
416 		}
417 	}
418 
419 	/*
420 	 * Do not allow the buffer_map to be more than 1/2 the size of the
421 	 * kernel_map.
422 	 */
423 	if (nbuf > (virtual_end - virtual_start +
424 		    virtual2_end - virtual2_start) / (MAXBSIZE * 2)) {
425 		nbuf = (virtual_end - virtual_start +
426 			virtual2_end - virtual2_start) / (MAXBSIZE * 2);
427 		kprintf("Warning: nbufs capped at %ld due to kvm\n", nbuf);
428 	}
429 
430 	/*
431 	 * Do not allow the buffer_map to use more than 50% of available
432 	 * physical-equivalent memory.  Since the VM pages which back
433 	 * individual buffers are typically wired, having too many bufs
434 	 * can prevent the system from paging properly.
435 	 */
436 	if (nbuf > physmem * PAGE_SIZE / (NBUFCALCSIZE * 2)) {
437 		nbuf = physmem * PAGE_SIZE / (NBUFCALCSIZE * 2);
438 		kprintf("Warning: nbufs capped at %ld due to physmem\n", nbuf);
439 	}
440 
441 	/*
442 	 * Do not allow the sizeof(struct buf) * nbuf to exceed 1/4 of
443 	 * the valloc space which is just the virtual_end - virtual_start
444 	 * section.  This is typically ~2GB regardless of the amount of
445 	 * memory, so we use 500MB as a metric.
446 	 *
447 	 * This is because we use valloc() to allocate the buf header array.
448 	 *
449 	 * NOTE: buffer space in bytes is limited by vfs.*bufspace sysctls.
450 	 */
451 	if (nbuf > (virtual_end - virtual_start) / (sizeof(struct buf) * 4)) {
452 		nbuf = (virtual_end - virtual_start) /
453 		       (sizeof(struct buf) * 4);
454 		kprintf("Warning: nbufs capped at %ld due to "
455 			"valloc considerations\n",
456 			nbuf);
457 	}
458 
459 	nswbuf_mem = lmax(lmin(nbuf / 32, 512), 8);
460 #ifdef NSWBUF_MIN
461 	if (nswbuf_mem < NSWBUF_MIN)
462 		nswbuf_mem = NSWBUF_MIN;
463 #endif
464 	nswbuf_kva = lmax(lmin(nbuf / 4, 512), 16);
465 #ifdef NSWBUF_MIN
466 	if (nswbuf_kva < NSWBUF_MIN)
467 		nswbuf_kva = NSWBUF_MIN;
468 #endif
469 
470 	valloc(swbuf_mem, struct buf, nswbuf_mem);
471 	valloc(swbuf_kva, struct buf, nswbuf_kva);
472 	valloc(buf, struct buf, nbuf);
473 
474 	/*
475 	 * End of first pass, size has been calculated so allocate memory
476 	 */
477 	if (firstaddr == 0) {
478 		size = (vm_size_t)(v - firstaddr);
479 		firstaddr = kmem_alloc(kernel_map, round_page(size),
480 				       VM_SUBSYS_BUF);
481 		if (firstaddr == 0)
482 			panic("startup: no room for tables");
483 		goto again;
484 	}
485 
486 	/*
487 	 * End of second pass, addresses have been assigned
488 	 *
489 	 * nbuf is an int, make sure we don't overflow the field.
490 	 *
491 	 * On 64-bit systems we always reserve maximal allocations for
492 	 * buffer cache buffers and there are no fragmentation issues,
493 	 * so the KVA segment does not have to be excessively oversized.
494 	 */
495 	if ((vm_size_t)(v - firstaddr) != size)
496 		panic("startup: table size inconsistency");
497 
498 	kmem_suballoc(kernel_map, clean_map, &clean_sva, &clean_eva,
499 		      ((vm_offset_t)(nbuf + 16) * MAXBSIZE) +
500 		      ((nswbuf_mem + nswbuf_kva) * MAXPHYS) + pager_map_size);
501 	kmem_suballoc(clean_map, buffer_map, &buffer_sva, &buffer_eva,
502 		      ((vm_offset_t)(nbuf + 16) * MAXBSIZE));
503 	buffer_map->system_map = 1;
504 	kmem_suballoc(clean_map, pager_map, &pager_sva, &pager_eva,
505 		      ((vm_offset_t)(nswbuf_mem + nswbuf_kva) * MAXPHYS) +
506 		      pager_map_size);
507 	pager_map->system_map = 1;
508 	kprintf("avail memory = %ju (%ju MB)\n",
509 		(uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages),
510 		(uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages) /
511 		1024 / 1024);
512 }
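
/*
 * Illustrative sketch, not part of the kernel source: the default-nbuf
 * heuristic computed in cpu_startup() above (a minimum of 50 buffers,
 * plus roughly 1/20 of physical memory beyond 128MB), reproduced as a
 * standalone userland program.  The 16KB NBUFCALCSIZE and 4KB page size
 * are assumptions mirroring a common build configuration.
 */
#if 0	/* example only, never compiled into the kernel */
#include <stdio.h>

int
main(void)
{
	long page_size = 4096;			/* assumed PAGE_SIZE */
	long nbufcalcsize = 16384;		/* assumed NBUFCALCSIZE */
	long physmem_pages = 4L * 1024 * 1024 * 1024 / page_size;
						/* 4GB of RAM, LP64 assumed */
	long factor = nbufcalcsize / 1024;	/* KB per buffer */
	long kbytes = physmem_pages * (page_size / 1024);
	long nbuf = 50;				/* minimum of 50 buffers */

	if (kbytes > 128 * 1024)
		nbuf += (kbytes - 128 * 1024) / (factor * 20);
	printf("default nbuf for a 4GB machine: %ld\n", nbuf);
	return 0;
}
#endif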
513 
514 struct cpu_idle_stat {
515 	int	hint;
516 	int	reserved;
517 	u_long	halt;
518 	u_long	spin;
519 	u_long	repeat;
520 	u_long	repeat_last;
521 	u_long	repeat_delta;
522 	u_long	mwait_cx[CPU_MWAIT_CX_MAX];
523 } __cachealign;
524 
525 #define CPU_IDLE_STAT_HALT	-1
526 #define CPU_IDLE_STAT_SPIN	-2
527 
528 static struct cpu_idle_stat	cpu_idle_stats[MAXCPU];
529 
530 static int
531 sysctl_cpu_idle_cnt(SYSCTL_HANDLER_ARGS)
532 {
533 	int idx = arg2, cpu, error;
534 	u_long val = 0;
535 
536 	if (idx == CPU_IDLE_STAT_HALT) {
537 		for (cpu = 0; cpu < ncpus; ++cpu)
538 			val += cpu_idle_stats[cpu].halt;
539 	} else if (idx == CPU_IDLE_STAT_SPIN) {
540 		for (cpu = 0; cpu < ncpus; ++cpu)
541 			val += cpu_idle_stats[cpu].spin;
542 	} else {
543 		KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
544 		    ("invalid index %d", idx));
545 		for (cpu = 0; cpu < ncpus; ++cpu)
546 			val += cpu_idle_stats[cpu].mwait_cx[idx];
547 	}
548 
549 	error = sysctl_handle_quad(oidp, &val, 0, req);
550         if (error || req->newptr == NULL)
551 	        return error;
552 
553 	if (idx == CPU_IDLE_STAT_HALT) {
554 		for (cpu = 0; cpu < ncpus; ++cpu)
555 			cpu_idle_stats[cpu].halt = 0;
556 		cpu_idle_stats[0].halt = val;
557 	} else if (idx == CPU_IDLE_STAT_SPIN) {
558 		for (cpu = 0; cpu < ncpus; ++cpu)
559 			cpu_idle_stats[cpu].spin = 0;
560 		cpu_idle_stats[0].spin = val;
561 	} else {
562 		KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
563 		    ("invalid index %d", idx));
564 		for (cpu = 0; cpu < ncpus; ++cpu)
565 			cpu_idle_stats[cpu].mwait_cx[idx] = 0;
566 		cpu_idle_stats[0].mwait_cx[idx] = val;
567 	}
568 	return 0;
569 }
570 
571 static void
572 cpu_mwait_attach(void)
573 {
574 	struct sbuf sb;
575 	int hint_idx, i;
576 
577 	if (!CPU_MWAIT_HAS_CX)
578 		return;
579 
580 	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
581 	    (CPUID_TO_FAMILY(cpu_id) > 0xf ||
582 	     (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
583 	      CPUID_TO_MODEL(cpu_id) >= 0xf))) {
584 		int bm_sts = 1;
585 
586 		/*
587 		 * Pentium dual-core, Core 2 and beyond do not need any
588 		 * additional activities to enter deep C-state, i.e. C3(+).
589 		 */
590 		cpu_mwait_cx_no_bmarb();
591 
592 		TUNABLE_INT_FETCH("machdep.cpu.mwait.bm_sts", &bm_sts);
593 		if (!bm_sts)
594 			cpu_mwait_cx_no_bmsts();
595 	}
596 
597 	sbuf_new(&sb, cpu_mwait_cx_supported,
598 	    sizeof(cpu_mwait_cx_supported), SBUF_FIXEDLEN);
599 
600 	for (i = 0; i < CPU_MWAIT_CX_MAX; ++i) {
601 		struct cpu_mwait_cx *cx = &cpu_mwait_cx_info[i];
602 		int sub;
603 
604 		ksnprintf(cx->name, sizeof(cx->name), "C%d", i);
605 
606 		sysctl_ctx_init(&cx->sysctl_ctx);
607 		cx->sysctl_tree = SYSCTL_ADD_NODE(&cx->sysctl_ctx,
608 		    SYSCTL_STATIC_CHILDREN(_machdep_mwait), OID_AUTO,
609 		    cx->name, CTLFLAG_RW, NULL, "Cx control/info");
610 		if (cx->sysctl_tree == NULL)
611 			continue;
612 
613 		cx->subcnt = CPUID_MWAIT_CX_SUBCNT(cpu_mwait_extemu, i);
614 		SYSCTL_ADD_INT(&cx->sysctl_ctx,
615 		    SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
616 		    "subcnt", CTLFLAG_RD, &cx->subcnt, 0,
617 		    "sub-state count");
618 		SYSCTL_ADD_PROC(&cx->sysctl_ctx,
619 		    SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
620 		    "entered", (CTLTYPE_QUAD | CTLFLAG_RW), 0,
621 		    i, sysctl_cpu_idle_cnt, "Q", "# of times entered");
622 
623 		for (sub = 0; sub < cx->subcnt; ++sub)
624 			sbuf_printf(&sb, "C%d/%d ", i, sub);
625 	}
626 	sbuf_trim(&sb);
627 	sbuf_finish(&sb);
628 
629 	/*
630 	 * Non-deep C-states
631 	 */
632 	cpu_mwait_c1_hints_cnt = cpu_mwait_cx_info[CPU_MWAIT_C1].subcnt;
633 	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i)
634 		cpu_mwait_hints_cnt += cpu_mwait_cx_info[i].subcnt;
635 	cpu_mwait_hints = kmalloc(sizeof(int) * cpu_mwait_hints_cnt,
636 				  M_DEVBUF, M_WAITOK);
637 
638 	hint_idx = 0;
639 	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i) {
640 		int j, subcnt;
641 
642 		subcnt = cpu_mwait_cx_info[i].subcnt;
643 		for (j = 0; j < subcnt; ++j) {
644 			KASSERT(hint_idx < cpu_mwait_hints_cnt,
645 			    ("invalid mwait hint index %d", hint_idx));
646 			cpu_mwait_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
647 			++hint_idx;
648 		}
649 	}
650 	KASSERT(hint_idx == cpu_mwait_hints_cnt,
651 	    ("mwait hint count %d != index %d",
652 	     cpu_mwait_hints_cnt, hint_idx));
653 
654 	if (bootverbose) {
655 		kprintf("MWAIT hints (%d C1 hints):\n", cpu_mwait_c1_hints_cnt);
656 		for (i = 0; i < cpu_mwait_hints_cnt; ++i) {
657 			int hint = cpu_mwait_hints[i];
658 
659 			kprintf("  C%d/%d hint 0x%04x\n",
660 			    MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),
661 			    hint);
662 		}
663 	}
664 
665 	/*
666 	 * Deep C-states
667 	 */
668 	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i)
669 		cpu_mwait_deep_hints_cnt += cpu_mwait_cx_info[i].subcnt;
670 	cpu_mwait_deep_hints = kmalloc(sizeof(int) * cpu_mwait_deep_hints_cnt,
671 	    M_DEVBUF, M_WAITOK);
672 
673 	hint_idx = 0;
674 	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i) {
675 		int j, subcnt;
676 
677 		subcnt = cpu_mwait_cx_info[i].subcnt;
678 		for (j = 0; j < subcnt; ++j) {
679 			KASSERT(hint_idx < cpu_mwait_deep_hints_cnt,
680 			    ("invalid mwait deep hint index %d", hint_idx));
681 			cpu_mwait_deep_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
682 			++hint_idx;
683 		}
684 	}
685 	KASSERT(hint_idx == cpu_mwait_deep_hints_cnt,
686 	    ("mwait deep hint count %d != index %d",
687 	     cpu_mwait_deep_hints_cnt, hint_idx));
688 
689 	if (bootverbose) {
690 		kprintf("MWAIT deep hints:\n");
691 		for (i = 0; i < cpu_mwait_deep_hints_cnt; ++i) {
692 			int hint = cpu_mwait_deep_hints[i];
693 
694 			kprintf("  C%d/%d hint 0x%04x\n",
695 			    MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),
696 			    hint);
697 		}
698 	}
699 	cpu_idle_repeat_max = 256 * cpu_mwait_deep_hints_cnt;
700 
701 	for (i = 0; i < ncpus; ++i) {
702 		char name[16];
703 
704 		ksnprintf(name, sizeof(name), "idle%d", i);
705 		SYSCTL_ADD_PROC(NULL,
706 		    SYSCTL_STATIC_CHILDREN(_machdep_mwait_CX), OID_AUTO,
707 		    name, (CTLTYPE_STRING | CTLFLAG_RW), &cpu_idle_stats[i],
708 		    0, cpu_mwait_cx_pcpu_idle_sysctl, "A", "");
709 	}
710 }
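
/*
 * Illustrative sketch, not part of the kernel source: decoding the MWAIT
 * EAX hints that cpu_mwait_attach() builds above, assuming the
 * conventional Intel encoding (bits 7:4 select the target C-state minus
 * one, bits 3:0 the sub-state).  The SKETCH_* macros are local to this
 * example and merely mirror how MWAIT_EAX_TO_CX()/MWAIT_EAX_TO_CX_SUB()
 * are used in the bootverbose printout.
 */
#if 0	/* example only, never compiled into the kernel */
#include <stdio.h>

#define SKETCH_EAX_TO_CX(eax)		((((eax) >> 4) & 0xf) + 1)
#define SKETCH_EAX_TO_CX_SUB(eax)	((eax) & 0xf)

int
main(void)
{
	int hints[] = { 0x00, 0x10, 0x20, 0x21 };
	int i;

	for (i = 0; i < 4; i++)
		printf("hint 0x%04x -> C%d/%d\n", hints[i],
		    SKETCH_EAX_TO_CX(hints[i]),
		    SKETCH_EAX_TO_CX_SUB(hints[i]));
	return 0;
}
#endif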
711 
712 static void
713 cpu_finish(void *dummy __unused)
714 {
715 	cpu_setregs();
716 	cpu_mwait_attach();
717 }
718 
719 static void
720 pic_finish(void *dummy __unused)
721 {
722 	/* Log ELCR information */
723 	elcr_dump();
724 
725 	/* Log MPTABLE information */
726 	mptable_pci_int_dump();
727 
728 	/* Finalize PCI */
729 	MachIntrABI.finalize();
730 }
731 
732 /*
733  * Send an interrupt to process.
734  *
735  * Stack is set up to allow sigcode stored
736  * at top to call routine, followed by kcall
737  * to sigreturn routine below.  After sigreturn
738  * resets the signal mask, the stack, and the
739  * frame pointer, it returns to the user
740  * specified pc, psl.
741  */
742 void
743 sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
744 {
745 	struct lwp *lp = curthread->td_lwp;
746 	struct proc *p = lp->lwp_proc;
747 	struct trapframe *regs;
748 	struct sigacts *psp = p->p_sigacts;
749 	struct sigframe sf, *sfp;
750 	int oonstack;
751 	char *sp;
752 
753 	regs = lp->lwp_md.md_regs;
754 	oonstack = (lp->lwp_sigstk.ss_flags & SS_ONSTACK) ? 1 : 0;
755 
756 	/* Save user context */
757 	bzero(&sf, sizeof(struct sigframe));
758 	sf.sf_uc.uc_sigmask = *mask;
759 	sf.sf_uc.uc_stack = lp->lwp_sigstk;
760 	sf.sf_uc.uc_mcontext.mc_onstack = oonstack;
761 	KKASSERT(__offsetof(struct trapframe, tf_rdi) == 0);
762 	/* gcc errors out on optimized bcopy */
763 	_bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(struct trapframe));
764 
765 	/* Make the size of the saved context visible to userland */
766 	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext);
767 
768 	/* Allocate and validate space for the signal handler context. */
769         if ((lp->lwp_flags & LWP_ALTSTACK) != 0 && !oonstack &&
770 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
771 		sp = (char *)lp->lwp_sigstk.ss_sp + lp->lwp_sigstk.ss_size -
772 		    sizeof(struct sigframe);
773 		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
774 	} else {
775 		/* We take red zone into account */
776 		sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128;
777 	}
778 
779 	/*
780 	 * XXX AVX needs 64-byte alignment but sigframe has other fields and
781 	 * the embedded ucontext is not at the front, so aligning this won't
782 	 * help us.  Fortunately we bcopy in/out of the sigframe, so the
783 	 * kernel is ok.
784 	 *
785 	 * The problem though is if userland winds up trying to use the
786 	 * context directly.
787 	 */
788 	sfp = (struct sigframe *)((intptr_t)sp & ~(intptr_t)0xF);
789 
790 	/* Translate the signal if appropriate */
791 	if (p->p_sysent->sv_sigtbl) {
792 		if (sig <= p->p_sysent->sv_sigsize)
793 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
794 	}
795 
796 	/*
797 	 * Build the argument list for the signal handler.
798 	 *
799 	 * Arguments are in registers (%rdi, %rsi, %rdx, %rcx)
800 	 */
801 	regs->tf_rdi = sig;				/* argument 1 */
802 	regs->tf_rdx = (register_t)&sfp->sf_uc;		/* argument 3 */
803 
804 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
805 		/*
806 		 * Signal handler installed with SA_SIGINFO.
807 		 *
808 		 * action(signo, siginfo, ucontext)
809 		 */
810 		regs->tf_rsi = (register_t)&sfp->sf_si;	/* argument 2 */
811 		regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
812 		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
813 
814 		/* fill siginfo structure */
815 		sf.sf_si.si_signo = sig;
816 		sf.sf_si.si_pid = psp->ps_frominfo[sig].pid;
817 		sf.sf_si.si_uid = psp->ps_frominfo[sig].uid;
818 		sf.sf_si.si_code = code;
819 		sf.sf_si.si_addr = (void *)regs->tf_addr;
820 	} else {
821 		/*
822 		 * Old FreeBSD-style arguments.
823 		 *
824 		 * handler (signo, code, [uc], addr)
825 		 */
826 		regs->tf_rsi = (register_t)code;	/* argument 2 */
827 		regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
828 		sf.sf_ahu.sf_handler = catcher;
829 	}
830 
831 	/*
832 	 * If we're a vm86 process, we want to save the segment registers.
833 	 * We also change eflags to be our emulated eflags, not the actual
834 	 * eflags.
835 	 */
836 #if 0 /* JG */
837 	if (regs->tf_eflags & PSL_VM) {
838 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
839 		struct vm86_kernel *vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;
840 
841 		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
842 		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
843 		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
844 		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;
845 
846 		if (vm86->vm86_has_vme == 0)
847 			sf.sf_uc.uc_mcontext.mc_eflags =
848 			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
849 			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
850 
851 		/*
852 		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
853 		 * syscalls made by the signal handler.  This just avoids
854 		 * wasting time for our lazy fixup of such faults.  PSL_NT
855 		 * does nothing in vm86 mode, but vm86 programs can set it
856 		 * almost legitimately in probes for old cpu types.
857 		 */
858 		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
859 	}
860 #endif
861 
862 	/*
863 	 * Save the FPU state and reinit the FP unit
864 	 */
865 	npxpush(&sf.sf_uc.uc_mcontext);
866 
867 	/*
868 	 * Copy the sigframe out to the user's stack.
869 	 */
870 	if (copyout(&sf, sfp, sizeof(struct sigframe)) != 0) {
871 		/*
872 		 * Something is wrong with the stack pointer.
873 		 * ...Kill the process.
874 		 */
875 		sigexit(lp, SIGILL);
876 	}
877 
878 	regs->tf_rsp = (register_t)sfp;
879 	regs->tf_rip = trunc_page64(PS_STRINGS - *(p->p_sysent->sv_szsigcode));
880 	regs->tf_rip -= SZSIGCODE_EXTRA_BYTES;
881 
882 	/*
883 	 * x86 abi specifies that the direction flag must be cleared
884 	 * on function entry
885 	 */
886 	regs->tf_rflags &= ~(PSL_T | PSL_D);
887 
888 	/*
889 	 * 64 bit mode has a code and stack selector but
890 	 * no data or extra selector.  %fs and %gs are not
891 	 * stored in-context.
892 	 */
893 	regs->tf_cs = _ucodesel;
894 	regs->tf_ss = _udatasel;
895 	clear_quickret();
896 }
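
/*
 * Illustrative sketch, not part of the kernel source: a userland handler
 * installed with SA_SIGINFO receives the (signo, siginfo, ucontext)
 * arguments that sendsig() above loads into %rdi, %rsi and %rdx.
 * (printf(3) is not async-signal-safe; it is used here only to keep the
 * example short.)
 */
#if 0	/* example only, never compiled into the kernel */
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static void
handler(int signo, siginfo_t *si, void *ucp)
{
	printf("sig %d from pid %ld, ucontext at %p\n",
	    signo, (long)si->si_pid, ucp);
}

int
main(void)
{
	struct sigaction sa;

	sigemptyset(&sa.sa_mask);
	sa.sa_sigaction = handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGUSR1, &sa, NULL);
	kill(getpid(), SIGUSR1);
	return 0;
}
#endif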
897 
898 /*
899  * Sanitize the trapframe for a virtual kernel passing control to a custom
900  * VM context.  Remove any items that would otherwise create a privilage
901  * issue.
902  *
903  * XXX at the moment we allow userland to set the resume flag.  Is this a
904  * bad idea?
905  */
906 int
907 cpu_sanitize_frame(struct trapframe *frame)
908 {
909 	frame->tf_cs = _ucodesel;
910 	frame->tf_ss = _udatasel;
911 	/* XXX VM (8086) mode not supported? */
912 	frame->tf_rflags &= (PSL_RF | PSL_USERCHANGE | PSL_VM_UNSUPP);
913 	frame->tf_rflags |= PSL_RESERVED_DEFAULT | PSL_I;
914 
915 	return(0);
916 }
917 
918 /*
919  * Sanitize the tls so loading the descriptor does not blow up
920  * on us.  For x86_64 we don't have to do anything.
921  */
922 int
923 cpu_sanitize_tls(struct savetls *tls)
924 {
925 	return(0);
926 }
927 
928 /*
929  * sigreturn(ucontext_t *sigcntxp)
930  *
931  * System call to cleanup state after a signal
932  * has been taken.  Reset signal mask and
933  * stack state from context left by sendsig (above).
934  * Return to previous pc and psl as specified by
935  * context left by sendsig. Check carefully to
936  * make sure that the user has not modified the
937  * state to gain improper privileges.
938  *
939  * MPSAFE
940  */
941 #define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
942 #define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
943 
944 int
945 sys_sigreturn(struct sysmsg *sysmsg, const struct sigreturn_args *uap)
946 {
947 	struct lwp *lp = curthread->td_lwp;
948 	struct trapframe *regs;
949 	ucontext_t uc;
950 	ucontext_t *ucp;
951 	register_t rflags;
952 	int cs;
953 	int error;
954 
955 	/*
956 	 * We have to copy the information into kernel space so userland
957 	 * can't modify it while we are sniffing it.
958 	 */
959 	regs = lp->lwp_md.md_regs;
960 	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
961 	if (error)
962 		return (error);
963 	ucp = &uc;
964 	rflags = ucp->uc_mcontext.mc_rflags;
965 
966 	/* VM (8086) mode not supported */
967 	rflags &= ~PSL_VM_UNSUPP;
968 
969 #if 0 /* JG */
970 	if (eflags & PSL_VM) {
971 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
972 		struct vm86_kernel *vm86;
973 
974 		/*
975 		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
976 		 * set up the vm86 area, and we can't enter vm86 mode.
977 		 */
978 		if (lp->lwp_thread->td_pcb->pcb_ext == 0)
979 			return (EINVAL);
980 		vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;
981 		if (vm86->vm86_inited == 0)
982 			return (EINVAL);
983 
984 		/* go back to user mode if both flags are set */
985 		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
986 			trapsignal(lp, SIGBUS, 0);
987 
988 		if (vm86->vm86_has_vme) {
989 			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
990 			    (eflags & VME_USERCHANGE) | PSL_VM;
991 		} else {
992 			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
993 			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
994 			    (eflags & VM_USERCHANGE) | PSL_VM;
995 		}
996 		bcopy(&ucp->uc_mcontext.mc_gs, tf, sizeof(struct trapframe));
997 		tf->tf_eflags = eflags;
998 		tf->tf_vm86_ds = tf->tf_ds;
999 		tf->tf_vm86_es = tf->tf_es;
1000 		tf->tf_vm86_fs = tf->tf_fs;
1001 		tf->tf_vm86_gs = tf->tf_gs;
1002 		tf->tf_ds = _udatasel;
1003 		tf->tf_es = _udatasel;
1004 		tf->tf_fs = _udatasel;
1005 		tf->tf_gs = _udatasel;
1006 	} else
1007 #endif
1008 	{
1009 		/*
1010 		 * Don't allow users to change privileged or reserved flags.
1011 		 */
1012 		/*
1013 		 * XXX do allow users to change the privileged flag PSL_RF.
1014 		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
1015 		 * should sometimes set it there too.  tf_eflags is kept in
1016 		 * the signal context during signal handling and there is no
1017 		 * other place to remember it, so the PSL_RF bit may be
1018 		 * corrupted by the signal handler without us knowing.
1019 		 * Corruption of the PSL_RF bit at worst causes one more or
1020 		 * one less debugger trap, so allowing it is fairly harmless.
1021 		 */
1022 		if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
1023 			kprintf("sigreturn: rflags = 0x%lx\n", (long)rflags);
1024 			return(EINVAL);
1025 		}
1026 
1027 		/*
1028 		 * Don't allow users to load a valid privileged %cs.  Let the
1029 		 * hardware check for invalid selectors, excess privilege in
1030 		 * other selectors, invalid %eip's and invalid %esp's.
1031 		 */
1032 		cs = ucp->uc_mcontext.mc_cs;
1033 		if (!CS_SECURE(cs)) {
1034 			kprintf("sigreturn: cs = 0x%x\n", cs);
1035 			trapsignal(lp, SIGBUS, T_PROTFLT);
1036 			return(EINVAL);
1037 		}
1038 		/* gcc errors out on optimized bcopy */
1039 		_bcopy(&ucp->uc_mcontext.mc_rdi, regs,
1040 		       sizeof(struct trapframe));
1041 	}
1042 
1043 	/*
1044 	 * Restore the FPU state from the frame
1045 	 */
1046 	crit_enter();
1047 	npxpop(&ucp->uc_mcontext);
1048 
1049 	if (ucp->uc_mcontext.mc_onstack & 1)
1050 		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
1051 	else
1052 		lp->lwp_sigstk.ss_flags &= ~SS_ONSTACK;
1053 
1054 	lp->lwp_sigmask = ucp->uc_sigmask;
1055 	SIG_CANTMASK(lp->lwp_sigmask);
1056 	clear_quickret();
1057 	crit_exit();
1058 	return(EJUSTRETURN);
1059 }
1060 
1061 /*
1062  * Machine dependent boot() routine
1063  *
1064  * I haven't seen anything to put here yet
1065  * Possibly some stuff might be grafted back here from boot()
1066  */
1067 void
1068 cpu_boot(int howto)
1069 {
1070 }
1071 
1072 /*
1073  * Shutdown the CPU as much as possible
1074  */
1075 void
1076 cpu_halt(void)
1077 {
1078 	for (;;)
1079 		__asm__ __volatile("hlt");
1080 }
1081 
1082 /*
1083  * cpu_idle() represents the idle LWKT.  You cannot return from this function
1084  * (unless you want to blow things up!).  Instead we look for runnable threads
1085  * and loop or halt as appropriate.  Giant is not held on entry to the thread.
1086  *
1087  * The main loop is entered with a critical section held, we must release
1088  * the critical section before doing anything else.  lwkt_switch() will
1089  * check for pending interrupts due to entering and exiting its own
1090  * critical section.
1091  *
1092  * NOTE: On an SMP system we rely on a scheduler IPI to wake a HLTed cpu up.
1093  *	 However, there are cases where the idlethread will be entered with
1094  *	 the possibility that no IPI will occur and in such cases
1095  *	 lwkt_switch() sets TDF_IDLE_NOHLT.
1096  *
1097  * NOTE: cpu_idle_repeat determines how many entries into the idle thread
1098  *	 must occur before it starts using ACPI halt.
1099  *
1100  * NOTE: Value overridden in hammer_time().
1101  */
1102 static int	cpu_idle_hlt = 2;
1103 SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
1104     &cpu_idle_hlt, 0, "Idle loop HLT enable");
1105 SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_repeat, CTLFLAG_RW,
1106     &cpu_idle_repeat, 0, "Idle entries before acpi hlt");
1107 
1108 SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_hltcnt, (CTLTYPE_QUAD | CTLFLAG_RW),
1109     0, CPU_IDLE_STAT_HALT, sysctl_cpu_idle_cnt, "Q", "Idle loop entry halts");
1110 SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_spincnt, (CTLTYPE_QUAD | CTLFLAG_RW),
1111     0, CPU_IDLE_STAT_SPIN, sysctl_cpu_idle_cnt, "Q", "Idle loop entry spins");
1112 
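
/*
 * Illustrative sketch, not part of the kernel source: the
 * machdep.cpu_idle_hlt knob and the halt counter registered above can
 * be read from userland with sysctlbyname(3); writing cpu_idle_hlt
 * (values 0-5, documented in cpu_idle() below) requires root.
 */
#if 0	/* example only, never compiled into the kernel */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int mode;
	uint64_t halts;
	size_t len;

	len = sizeof(mode);
	if (sysctlbyname("machdep.cpu_idle_hlt", &mode, &len, NULL, 0) == 0)
		printf("machdep.cpu_idle_hlt = %d\n", mode);

	len = sizeof(halts);
	if (sysctlbyname("machdep.cpu_idle_hltcnt", &halts, &len,
	    NULL, 0) == 0)
		printf("machdep.cpu_idle_hltcnt = %llu\n",
		    (unsigned long long)halts);
	return 0;
}
#endif
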
1113 static void
1114 cpu_idle_default_hook(void)
1115 {
1116 	/*
1117 	 * We must guarantee that hlt is exactly the instruction
1118 	 * following the sti.
1119 	 */
1120 	__asm __volatile("sti; hlt");
1121 }
1122 
1123 /* Other subsystems (e.g., ACPI) can hook this later. */
1124 void (*cpu_idle_hook)(void) = cpu_idle_default_hook;
1125 
1126 static __inline int
1127 cpu_mwait_cx_hint(struct cpu_idle_stat *stat)
1128 {
1129 	int hint, cx_idx;
1130 	u_int idx;
1131 
1132 	hint = stat->hint;
1133 	if (hint >= 0)
1134 		goto done;
1135 
1136 	idx = (stat->repeat + stat->repeat_last + stat->repeat_delta) >>
1137 	    cpu_mwait_repeat_shift;
1138 	if (idx >= cpu_mwait_c1_hints_cnt) {
1139 		/* Step up faster, once we walked through all C1 states */
1140 		stat->repeat_delta += 1 << (cpu_mwait_repeat_shift + 1);
1141 	}
1142 	if (hint == CPU_MWAIT_HINT_AUTODEEP) {
1143 		if (idx >= cpu_mwait_deep_hints_cnt)
1144 			idx = cpu_mwait_deep_hints_cnt - 1;
1145 		hint = cpu_mwait_deep_hints[idx];
1146 	} else {
1147 		if (idx >= cpu_mwait_hints_cnt)
1148 			idx = cpu_mwait_hints_cnt - 1;
1149 		hint = cpu_mwait_hints[idx];
1150 	}
1151 done:
1152 	cx_idx = MWAIT_EAX_TO_CX(hint);
1153 	if (cx_idx >= 0 && cx_idx < CPU_MWAIT_CX_MAX)
1154 		stat->mwait_cx[cx_idx]++;
1155 	return hint;
1156 }
1157 
1158 void
1159 cpu_idle(void)
1160 {
1161 	globaldata_t gd = mycpu;
1162 	struct cpu_idle_stat *stat = &cpu_idle_stats[gd->gd_cpuid];
1163 	struct thread *td __debugvar = gd->gd_curthread;
1164 	int reqflags;
1165 
1166 	stat->repeat = stat->repeat_last = cpu_idle_repeat_max;
1167 
1168 	crit_exit();
1169 	KKASSERT(td->td_critcount == 0);
1170 
1171 	for (;;) {
1172 		/*
1173 		 * See if there are any LWKTs ready to go.
1174 		 */
1175 		lwkt_switch();
1176 
1177 		/*
1178 		 * When halting inside a cli we must check for reqflags
1179 		 * races, particularly [re]schedule requests.  Running
1180 		 * splz() does the job.
1181 		 *
1182 		 * cpu_idle_hlt:
1183 		 *	0	Never halt, just spin
1184 		 *
1185 		 *	1	Always use MONITOR/MWAIT if avail, HLT
1186 		 *		otherwise.
1187 		 *
1188 		 *		Better default for modern (Haswell+) Intel
1189 		 *		cpus.
1190 		 *
1191 		 *	2	Use HLT/MONITOR/MWAIT up to a point and then
1192 		 *		use the ACPI halt (default).  This is a hybrid
1193 		 *		approach.  See machdep.cpu_idle_repeat.
1194 		 *
1195 		 *		Better default for modern AMD cpus and older
1196 		 *		Intel cpus.
1197 		 *
1198 		 *	3	Always use the ACPI halt.  This typically
1199 		 *		eats the least amount of power but the cpu
1200 		 *		will be slow waking up.  Slows down e.g.
1201 		 *		compiles and other pipe/event oriented stuff.
1202 		 *
1203 		 *		Usually the best default for AMD cpus.
1204 		 *
1205 		 *	4	Always use HLT.
1206 		 *
1207 		 *	5	Always spin.
1208 		 *
1209 		 * NOTE: Interrupts are enabled and we are not in a critical
1210 		 *	 section.
1211 		 *
1212 		 * NOTE: Preemptions do not reset gd_idle_repeat.   Also we
1213 		 *	 don't bother capping gd_idle_repeat, it is ok if
1214 		 *	 it overflows (we do make it unsigned, however).
1215 		 *
1216 		 * Implement optimized invltlb operations when halted
1217 		 * in idle.  By setting the bit in smp_idleinvl_mask
1218 		 * we inform other cpus that they can set _reqs to
1219 		 * request an invltlb.  Currently the code to do that
1220 		 * sets the bits in _reqs anyway, but then checks _mask
1221 		 * to determine if they can assume the invltlb will execute.
1222 		 *
1223 		 * A critical section is required to ensure that interrupts
1224 		 * do not fully run until after we've had a chance to execute
1225 		 * the request.
1226 		 */
1227 		if (gd->gd_idle_repeat == 0) {
1228 			stat->repeat = (stat->repeat + stat->repeat_last) >> 1;
1229 			if (stat->repeat > cpu_idle_repeat_max)
1230 				stat->repeat = cpu_idle_repeat_max;
1231 			stat->repeat_last = 0;
1232 			stat->repeat_delta = 0;
1233 		}
1234 		++stat->repeat_last;
1235 
1236 		/*
1237 		 * General idle thread halt code
1238 		 *
1239 		 * IBRS NOTES - IBRS is a SPECTRE mitigation.  When going
1240 		 *		idle, disable IBRS to reduce hyperthread
1241 		 *		overhead.
1242 		 */
1243 		++gd->gd_idle_repeat;
1244 
1245 		switch(cpu_idle_hlt) {
1246 		default:
1247 		case 0:
1248 			/*
1249 			 * Always spin
1250 			 */
1251 			;
1252 do_spin:
1253 			splz();
1254 			__asm __volatile("sti");
1255 			stat->spin++;
1256 			crit_enter_gd(gd);
1257 			crit_exit_gd(gd);
1258 			break;
1259 		case 2:
1260 			/*
1261 			 * Use MONITOR/MWAIT (or HLT) for a few cycles,
1262 			 * then start using the ACPI halt code if we
1263 			 * continue to be idle.
1264 			 */
1265 			if (gd->gd_idle_repeat >= cpu_idle_repeat)
1266 				goto do_acpi;
1267 			/* FALL THROUGH */
1268 		case 1:
1269 			/*
1270 			 * Always use MONITOR/MWAIT (will use HLT if
1271 			 * MONITOR/MWAIT not available).
1272 			 */
1273 			if (cpu_mi_feature & CPU_MI_MONITOR) {
1274 				splz(); /* XXX */
1275 				reqflags = gd->gd_reqflags;
1276 				if (reqflags & RQF_IDLECHECK_WK_MASK)
1277 					goto do_spin;
1278 				crit_enter_gd(gd);
1279 				ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask, gd->gd_cpuid);
1280 				/*
1281 				 * IBRS/STIBP
1282 				 */
1283 				if (pscpu->trampoline.tr_pcb_spec_ctrl[1] &
1284 				    SPEC_CTRL_DUMMY_ENABLE) {
1285 					wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[1] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
1286 				}
1287 				cpu_mmw_pause_int(&gd->gd_reqflags, reqflags,
1288 						  cpu_mwait_cx_hint(stat), 0);
1289 				if (pscpu->trampoline.tr_pcb_spec_ctrl[0] &
1290 				    SPEC_CTRL_DUMMY_ENABLE) {
1291 					wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[0] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
1292 				}
1293 				stat->halt++;
1294 				ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask, gd->gd_cpuid);
1295 				if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
1296 							      gd->gd_cpuid)) {
1297 					cpu_invltlb();
1298 					cpu_mfence();
1299 				}
1300 				crit_exit_gd(gd);
1301 				break;
1302 			}
1303 			/* FALLTHROUGH */
1304 		case 4:
1305 			/*
1306 			 * Use HLT
1307 			 */
1308 			__asm __volatile("cli");
1309 			splz();
1310 			crit_enter_gd(gd);
1311 			if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
1312 				ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask,
1313 						     gd->gd_cpuid);
1314 				if (pscpu->trampoline.tr_pcb_spec_ctrl[1] &
1315 				    SPEC_CTRL_DUMMY_ENABLE) {
1316 					wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[1] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
1317 				}
1318 				cpu_idle_default_hook();
1319 				if (pscpu->trampoline.tr_pcb_spec_ctrl[0] &
1320 				    SPEC_CTRL_DUMMY_ENABLE) {
1321 					wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[0] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
1322 				}
1323 				ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask,
1324 						       gd->gd_cpuid);
1325 				if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
1326 							      gd->gd_cpuid)) {
1327 					cpu_invltlb();
1328 					cpu_mfence();
1329 				}
1330 			}
1331 			__asm __volatile("sti");
1332 			stat->halt++;
1333 			crit_exit_gd(gd);
1334 			break;
1335 		case 3:
1336 			/*
1337 			 * Use ACPI halt
1338 			 */
1339 			;
1340 do_acpi:
1341 			__asm __volatile("cli");
1342 			splz();
1343 			crit_enter_gd(gd);
1344 			if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
1345 				ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask,
1346 						     gd->gd_cpuid);
1347 				if (pscpu->trampoline.tr_pcb_spec_ctrl[1] &
1348 				    SPEC_CTRL_DUMMY_ENABLE) {
1349 					wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[1] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
1350 				}
1351 				cpu_idle_hook();
1352 				if (pscpu->trampoline.tr_pcb_spec_ctrl[0] &
1353 				    SPEC_CTRL_DUMMY_ENABLE) {
1354 					wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[0] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
1355 				}
1356 				ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask,
1357 						       gd->gd_cpuid);
1358 				if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
1359 							      gd->gd_cpuid)) {
1360 					cpu_invltlb();
1361 					cpu_mfence();
1362 				}
1363 			}
1364 			__asm __volatile("sti");
1365 			stat->halt++;
1366 			crit_exit_gd(gd);
1367 			break;
1368 		}
1369 	}
1370 }
1371 
1372 /*
1373  * Called from deep ACPI via cpu_idle_hook() (see above) to actually halt
1374  * the cpu in C1.  ACPI might use other halt methods for deeper states
1375  * and not reach here.
1376  *
1377  * For now we always use HLT as we are not sure what ACPI may have actually
1378  * done.  MONITOR/MWAIT might not be appropriate.
1379  *
1380  * NOTE: MONITOR/MWAIT does not appear to throttle AMD cpus, while HLT
1381  *	 does.  On Intel, MONITOR/MWAIT does appear to throttle the cpu.
1382  */
1383 void
1384 cpu_idle_halt(void)
1385 {
1386 	globaldata_t gd;
1387 
1388 	gd = mycpu;
1389 #if 0
1390 	/* DISABLED FOR NOW */
1391 	struct cpu_idle_stat *stat;
1392 	int reqflags;
1393 
1394 
1395 	if ((cpu_idle_hlt == 1 || cpu_idle_hlt == 2) &&
1396 	    (cpu_mi_feature & CPU_MI_MONITOR) &&
1397 	    cpu_vendor_id != CPU_VENDOR_AMD) {
1398 		/*
1399 		 * Use MONITOR/MWAIT
1400 		 *
1401 		 * (NOTE: On ryzen, MWAIT does not throttle clocks, so we
1402 		 *	  have to use HLT)
1403 		 */
1404 		stat = &cpu_idle_stats[gd->gd_cpuid];
1405 		reqflags = gd->gd_reqflags;
1406 		if ((reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
1407 			__asm __volatile("sti");
1408 			cpu_mmw_pause_int(&gd->gd_reqflags, reqflags,
1409 					  cpu_mwait_cx_hint(stat), 0);
1410 		} else {
1411 			__asm __volatile("sti; pause");
1412 		}
1413 	} else
1414 #endif
1415 	{
1416 		/*
1417 		 * Use HLT
1418 		 */
1419 		if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0)
1420 			__asm __volatile("sti; hlt");
1421 		else
1422 			__asm __volatile("sti; pause");
1423 	}
1424 }
1425 
1426 
1427 /*
1428  * Called in a loop indirectly via Xcpustop
1429  */
1430 void
1431 cpu_smp_stopped(void)
1432 {
1433 	globaldata_t gd = mycpu;
1434 	volatile __uint64_t *ptr;
1435 	__uint64_t ovalue;
1436 
1437 	ptr = CPUMASK_ADDR(started_cpus, gd->gd_cpuid);
1438 	ovalue = *ptr;
1439 	if ((ovalue & CPUMASK_SIMPLE(gd->gd_cpuid & 63)) == 0) {
1440 		if (cpu_mi_feature & CPU_MI_MONITOR) {
1441 			if (cpu_mwait_hints) {
1442 				cpu_mmw_pause_long(__DEVOLATILE(void *, ptr),
1443 					   ovalue,
1444 					   cpu_mwait_hints[
1445 						cpu_mwait_hints_cnt - 1], 0);
1446 			} else {
1447 				cpu_mmw_pause_long(__DEVOLATILE(void *, ptr),
1448 					   ovalue, 0, 0);
1449 			}
1450 		} else {
1451 			cpu_halt();	/* depend on lapic timer */
1452 		}
1453 	}
1454 }
1455 
1456 /*
1457  * This routine is called if a spinlock has been held through the
1458  * exponential backoff period and is seriously contested.  On a real cpu
1459  * we let it spin.
1460  */
1461 void
1462 cpu_spinlock_contested(void)
1463 {
1464 	cpu_pause();
1465 }
1466 
1467 /*
1468  * Clear registers on exec
1469  */
1470 void
1471 exec_setregs(u_long entry, u_long stack, u_long ps_strings)
1472 {
1473 	struct thread *td = curthread;
1474 	struct lwp *lp = td->td_lwp;
1475 	struct pcb *pcb = td->td_pcb;
1476 	struct trapframe *regs = lp->lwp_md.md_regs;
1477 
1478 	user_ldt_free(pcb);
1479 
1480 	clear_quickret();
1481 	bzero((char *)regs, sizeof(struct trapframe));
1482 	regs->tf_rip = entry;
1483 	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; /* align the stack */
1484 	regs->tf_rdi = stack;		/* argv */
1485 	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
1486 	regs->tf_ss = _udatasel;
1487 	regs->tf_cs = _ucodesel;
1488 	regs->tf_rbx = ps_strings;
1489 
1490 	/*
1491 	 * Reset the hardware debug registers if they were in use.
1492 	 * They won't have any meaning for the newly exec'd process.
1493 	 */
1494 	if (pcb->pcb_flags & PCB_DBREGS) {
1495 		pcb->pcb_dr0 = 0;
1496 		pcb->pcb_dr1 = 0;
1497 		pcb->pcb_dr2 = 0;
1498 		pcb->pcb_dr3 = 0;
1499 		pcb->pcb_dr6 = 0;
1500 		pcb->pcb_dr7 = 0; /* JG set bit 10? */
1501 		if (pcb == td->td_pcb) {
1502 			/*
1503 			 * Clear the debug registers on the running
1504 			 * CPU, otherwise they will end up affecting
1505 			 * the next process we switch to.
1506 			 */
1507 			reset_dbregs();
1508 		}
1509 		pcb->pcb_flags &= ~PCB_DBREGS;
1510 	}
1511 
1512 	/*
1513 	 * Initialize the math emulator (if any) for the current process.
1514 	 * Actually, just clear the bit that says that the emulator has
1515 	 * been initialized.  Initialization is delayed until the process
1516 	 * traps to the emulator (if it is done at all) mainly because
1517 	 * emulators don't provide an entry point for initialization.
1518 	 */
1519 	pcb->pcb_flags &= ~FP_SOFTFP;
1520 
1521 	/*
1522 	 * NOTE: do not set CR0_TS here.  npxinit() must do it after clearing
1523 	 *	 gd_npxthread.  Otherwise a preemptive interrupt thread
1524 	 *	 may panic in npxdna().
1525 	 */
1526 	crit_enter();
1527 	load_cr0(rcr0() | CR0_MP);
1528 
1529 	/*
1530 	 * NOTE: The MSR values must be correct so we can return to
1531 	 *	 userland.  gd_user_fs/gs must be correct so the switch
1532 	 *	 code knows what the current MSR values are.
1533 	 */
1534 	pcb->pcb_fsbase = 0;	/* Values loaded from PCB on switch */
1535 	pcb->pcb_gsbase = 0;
1536 	mdcpu->gd_user_fs = 0;	/* Cache of current MSR values */
1537 	mdcpu->gd_user_gs = 0;
1538 	wrmsr(MSR_FSBASE, 0);	/* Set MSR values for return to userland */
1539 	wrmsr(MSR_KGSBASE, 0);
1540 
1541 	/* Initialize the npx (if any) for the current process. */
1542 	npxinit();
1543 	crit_exit();
1544 
1545 	pcb->pcb_ds = _udatasel;
1546 	pcb->pcb_es = _udatasel;
1547 	pcb->pcb_fs = _udatasel;
1548 	pcb->pcb_gs = _udatasel;
1549 }
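
/*
 * Illustrative sketch, not part of the kernel source: the expression
 * exec_setregs() uses for tf_rsp above, ((stack - 8) & ~0xFul) + 8,
 * always leaves the stack pointer congruent to 8 modulo 16.  Evaluated
 * here for a few hypothetical stack tops.
 */
#if 0	/* example only, never compiled into the kernel */
#include <stdio.h>

int
main(void)
{
	unsigned long samples[] = {
		0x7fffffffe000UL, 0x7fffffffe0f7UL, 0x7fffffffe008UL
	};
	int i;

	for (i = 0; i < 3; i++) {
		unsigned long rsp = ((samples[i] - 8) & ~0xFUL) + 8;

		printf("stack %#lx -> rsp %#lx (mod 16 = %lu)\n",
		    samples[i], rsp, rsp % 16);
	}
	return 0;
}
#endif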
1550 
1551 void
1552 cpu_setregs(void)
1553 {
1554 	register_t cr0;
1555 
1556 	cr0 = rcr0();
1557 	cr0 |= CR0_NE;			/* Done by npxinit() */
1558 	cr0 |= CR0_MP | CR0_TS;		/* Done at every execve() too. */
1559 	cr0 |= CR0_WP | CR0_AM;
1560 	load_cr0(cr0);
1561 	load_gs(_udatasel);
1562 }
1563 
1564 static int
1565 sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
1566 {
1567 	int error;
1568 	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
1569 		req);
1570 	if (!error && req->newptr)
1571 		resettodr();
1572 	return (error);
1573 }
1574 
1575 SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
1576 	&adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");
1577 
1578 SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set,
1579 	CTLFLAG_RW, &disable_rtc_set, 0, "");
1580 
1581 #if 0 /* JG */
1582 SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo,
1583 	CTLFLAG_RD, &bootinfo, bootinfo, "");
1584 #endif
1585 
1586 SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock,
1587 	CTLFLAG_RW, &wall_cmos_clock, 0, "");
1588 
1589 static int
1590 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1591 {
1592 	struct efi_map_header *efihdr;
1593 	caddr_t kmdp;
1594 	uint32_t efisize;
1595 
1596 	kmdp = preload_search_by_type("elf kernel");
1597 	if (kmdp == NULL)
1598 		kmdp = preload_search_by_type("elf64 kernel");
1599 	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1600 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
1601 	if (efihdr == NULL)
1602 		return (0);
1603 	efisize = *((uint32_t *)efihdr - 1);
1604 	return (SYSCTL_OUT(req, efihdr, efisize));
1605 }
1606 SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
1607     efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
1608 
1609 /*
1610  * Initialize x86 and configure to run kernel
1611  */
1612 
1613 /*
1614  * Initialize segments & interrupt table
1615  */
1616 
1617 int _default_ldt;
1618 struct user_segment_descriptor gdt_cpu0[MAXGDT_COUNT];
1619 struct gate_descriptor idt_arr[MAXCPU][NIDT];
1620 #if 0 /* JG */
1621 union descriptor ldt[NLDT];		/* local descriptor table */
1622 #endif
1623 
1624 /* table descriptors - used to load tables by cpu */
1625 struct region_descriptor r_gdt;
1626 struct region_descriptor r_idt_arr[MAXCPU];
1627 
1628 /* JG proc0paddr is a virtual address */
1629 void *proc0paddr;
1630 /* JG alignment? */
1631 char proc0paddr_buff[LWKT_THREAD_STACK];
1632 
1633 
1634 /* software prototypes -- in more palatable form */
1635 struct soft_segment_descriptor gdt_segs[] = {
1636 /* GNULL_SEL	0 Null Descriptor */
1637 {	0x0,			/* segment base address  */
1638 	0x0,			/* length */
1639 	0,			/* segment type */
1640 	0,			/* segment descriptor priority level */
1641 	0,			/* segment descriptor present */
1642 	0,			/* long */
1643 	0,			/* default 32 vs 16 bit size */
1644 	0			/* limit granularity (byte/page units)*/ },
1645 /* GCODE_SEL	1 Code Descriptor for kernel */
1646 {	0x0,			/* segment base address  */
1647 	0xfffff,		/* length - all address space */
1648 	SDT_MEMERA,		/* segment type */
1649 	SEL_KPL,		/* segment descriptor priority level */
1650 	1,			/* segment descriptor present */
1651 	1,			/* long */
1652 	0,			/* default 32 vs 16 bit size */
1653 	1			/* limit granularity (byte/page units)*/ },
1654 /* GDATA_SEL	2 Data Descriptor for kernel */
1655 {	0x0,			/* segment base address  */
1656 	0xfffff,		/* length - all address space */
1657 	SDT_MEMRWA,		/* segment type */
1658 	SEL_KPL,		/* segment descriptor priority level */
1659 	1,			/* segment descriptor present */
1660 	1,			/* long */
1661 	0,			/* default 32 vs 16 bit size */
1662 	1			/* limit granularity (byte/page units)*/ },
1663 /* GUCODE32_SEL	3 32 bit Code Descriptor for user */
1664 {	0x0,			/* segment base address  */
1665 	0xfffff,		/* length - all address space */
1666 	SDT_MEMERA,		/* segment type */
1667 	SEL_UPL,		/* segment descriptor priority level */
1668 	1,			/* segment descriptor present */
1669 	0,			/* long */
1670 	1,			/* default 32 vs 16 bit size */
1671 	1			/* limit granularity (byte/page units)*/ },
1672 /* GUDATA_SEL	4 32/64 bit Data Descriptor for user */
1673 {	0x0,			/* segment base address  */
1674 	0xfffff,		/* length - all address space */
1675 	SDT_MEMRWA,		/* segment type */
1676 	SEL_UPL,		/* segment descriptor priority level */
1677 	1,			/* segment descriptor present */
1678 	0,			/* long */
1679 	1,			/* default 32 vs 16 bit size */
1680 	1			/* limit granularity (byte/page units)*/ },
1681 /* GUCODE_SEL	5 64 bit Code Descriptor for user */
1682 {	0x0,			/* segment base address  */
1683 	0xfffff,		/* length - all address space */
1684 	SDT_MEMERA,		/* segment type */
1685 	SEL_UPL,		/* segment descriptor priority level */
1686 	1,			/* segment descriptor present */
1687 	1,			/* long */
1688 	0,			/* default 32 vs 16 bit size */
1689 	1			/* limit granularity (byte/page units)*/ },
1690 /* GPROC0_SEL	6 Proc 0 Tss Descriptor */
1691 {
1692 	0x0,			/* segment base address */
1693 	sizeof(struct x86_64tss)-1,/* length - all address space */
1694 	SDT_SYSTSS,		/* segment type */
1695 	SEL_KPL,		/* segment descriptor priority level */
1696 	1,			/* segment descriptor present */
1697 	0,			/* long */
1698 	0,			/* unused - default 32 vs 16 bit size */
1699 	0			/* limit granularity (byte/page units)*/ },
1700 /* Actually, the TSS is a system descriptor which is double size */
1701 {	0x0,			/* segment base address  */
1702 	0x0,			/* length */
1703 	0,			/* segment type */
1704 	0,			/* segment descriptor priority level */
1705 	0,			/* segment descriptor present */
1706 	0,			/* long */
1707 	0,			/* default 32 vs 16 bit size */
1708 	0			/* limit granularity (byte/page units)*/ },
1709 /* GUGS32_SEL	8 32 bit GS Descriptor for user */
1710 {	0x0,			/* segment base address  */
1711 	0xfffff,		/* length - all address space */
1712 	SDT_MEMRWA,		/* segment type */
1713 	SEL_UPL,		/* segment descriptor priority level */
1714 	1,			/* segment descriptor present */
1715 	0,			/* long */
1716 	1,			/* default 32 vs 16 bit size */
1717 	1			/* limit granularity (byte/page units)*/ },
1718 };
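
/*
 * Note (added for clarity): with ssd_gran set, the 20-bit limit of 0xfffff
 * is interpreted in 4KB pages, so these flat descriptors nominally cover
 * the full 32-bit address space.  In 64-bit mode the cpu ignores base and
 * limit for code/data segments (compatibility-mode segments such as
 * GUCODE32_SEL still honor them), so the type, DPL, present, long and
 * def32 bits are what really matter here.
 */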
1719 
1720 void
1721 setidt_global(int idx, inthand_t *func, int typ, int dpl, int ist)
1722 {
1723 	int cpu;
1724 
1725 	for (cpu = 0; cpu < MAXCPU; ++cpu) {
1726 		struct gate_descriptor *ip = &idt_arr[cpu][idx];
1727 
1728 		ip->gd_looffset = (uintptr_t)func;
1729 		ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
1730 		ip->gd_ist = ist;
1731 		ip->gd_xx = 0;
1732 		ip->gd_type = typ;
1733 		ip->gd_dpl = dpl;
1734 		ip->gd_p = 1;
1735 		ip->gd_hioffset = ((uintptr_t)func)>>16 ;
1736 	}
1737 }
1738 
1739 void
1740 setidt(int idx, inthand_t *func, int typ, int dpl, int ist, int cpu)
1741 {
1742 	struct gate_descriptor *ip;
1743 
1744 	KASSERT(cpu >= 0 && cpu < ncpus, ("invalid cpu %d", cpu));
1745 
1746 	ip = &idt_arr[cpu][idx];
1747 	ip->gd_looffset = (uintptr_t)func;
1748 	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
1749 	ip->gd_ist = ist;
1750 	ip->gd_xx = 0;
1751 	ip->gd_type = typ;
1752 	ip->gd_dpl = dpl;
1753 	ip->gd_p = 1;
1754 	ip->gd_hioffset = ((uintptr_t)func)>>16 ;
1755 }
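
/*
 * Illustration only (not compiled): how the handler address recorded by
 * setidt()/setidt_global() could be reassembled from a gate descriptor.
 * The low 16 bits live in gd_looffset and the remaining bits in
 * gd_hioffset.  This helper is a sketch, not something used elsewhere.
 */
#if 0
static inline uintptr_t
idt_gate_offset(const struct gate_descriptor *ip)
{
	return ((uintptr_t)ip->gd_looffset |
		((uintptr_t)ip->gd_hioffset << 16));
}
#endif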
1756 
1757 #define	IDTVEC(name)	__CONCAT(X,name)
1758 
1759 extern inthand_t
1760 	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
1761 	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
1762 	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
1763 	IDTVEC(page), IDTVEC(mchk), IDTVEC(fpu), IDTVEC(align),
1764 	IDTVEC(xmm), IDTVEC(dblfault),
1765 	IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
1766 
1767 extern inthand_t
1768 	IDTVEC(rsvd00), IDTVEC(rsvd01), IDTVEC(rsvd02), IDTVEC(rsvd03),
1769 	IDTVEC(rsvd04), IDTVEC(rsvd05), IDTVEC(rsvd06), IDTVEC(rsvd07),
1770 	IDTVEC(rsvd08), IDTVEC(rsvd09), IDTVEC(rsvd0a), IDTVEC(rsvd0b),
1771 	IDTVEC(rsvd0c), IDTVEC(rsvd0d), IDTVEC(rsvd0e), IDTVEC(rsvd0f),
1772 	IDTVEC(rsvd10), IDTVEC(rsvd11), IDTVEC(rsvd12), IDTVEC(rsvd13),
1773 	IDTVEC(rsvd14), IDTVEC(rsvd15), IDTVEC(rsvd16), IDTVEC(rsvd17),
1774 	IDTVEC(rsvd18), IDTVEC(rsvd19), IDTVEC(rsvd1a), IDTVEC(rsvd1b),
1775 	IDTVEC(rsvd1c), IDTVEC(rsvd1d), IDTVEC(rsvd1e), IDTVEC(rsvd1f),
1776 	IDTVEC(rsvd20), IDTVEC(rsvd21), IDTVEC(rsvd22), IDTVEC(rsvd23),
1777 	IDTVEC(rsvd24), IDTVEC(rsvd25), IDTVEC(rsvd26), IDTVEC(rsvd27),
1778 	IDTVEC(rsvd28), IDTVEC(rsvd29), IDTVEC(rsvd2a), IDTVEC(rsvd2b),
1779 	IDTVEC(rsvd2c), IDTVEC(rsvd2d), IDTVEC(rsvd2e), IDTVEC(rsvd2f),
1780 	IDTVEC(rsvd30), IDTVEC(rsvd31), IDTVEC(rsvd32), IDTVEC(rsvd33),
1781 	IDTVEC(rsvd34), IDTVEC(rsvd35), IDTVEC(rsvd36), IDTVEC(rsvd37),
1782 	IDTVEC(rsvd38), IDTVEC(rsvd39), IDTVEC(rsvd3a), IDTVEC(rsvd3b),
1783 	IDTVEC(rsvd3c), IDTVEC(rsvd3d), IDTVEC(rsvd3e), IDTVEC(rsvd3f),
1784 	IDTVEC(rsvd40), IDTVEC(rsvd41), IDTVEC(rsvd42), IDTVEC(rsvd43),
1785 	IDTVEC(rsvd44), IDTVEC(rsvd45), IDTVEC(rsvd46), IDTVEC(rsvd47),
1786 	IDTVEC(rsvd48), IDTVEC(rsvd49), IDTVEC(rsvd4a), IDTVEC(rsvd4b),
1787 	IDTVEC(rsvd4c), IDTVEC(rsvd4d), IDTVEC(rsvd4e), IDTVEC(rsvd4f),
1788 	IDTVEC(rsvd50), IDTVEC(rsvd51), IDTVEC(rsvd52), IDTVEC(rsvd53),
1789 	IDTVEC(rsvd54), IDTVEC(rsvd55), IDTVEC(rsvd56), IDTVEC(rsvd57),
1790 	IDTVEC(rsvd58), IDTVEC(rsvd59), IDTVEC(rsvd5a), IDTVEC(rsvd5b),
1791 	IDTVEC(rsvd5c), IDTVEC(rsvd5d), IDTVEC(rsvd5e), IDTVEC(rsvd5f),
1792 	IDTVEC(rsvd60), IDTVEC(rsvd61), IDTVEC(rsvd62), IDTVEC(rsvd63),
1793 	IDTVEC(rsvd64), IDTVEC(rsvd65), IDTVEC(rsvd66), IDTVEC(rsvd67),
1794 	IDTVEC(rsvd68), IDTVEC(rsvd69), IDTVEC(rsvd6a), IDTVEC(rsvd6b),
1795 	IDTVEC(rsvd6c), IDTVEC(rsvd6d), IDTVEC(rsvd6e), IDTVEC(rsvd6f),
1796 	IDTVEC(rsvd70), IDTVEC(rsvd71), IDTVEC(rsvd72), IDTVEC(rsvd73),
1797 	IDTVEC(rsvd74), IDTVEC(rsvd75), IDTVEC(rsvd76), IDTVEC(rsvd77),
1798 	IDTVEC(rsvd78), IDTVEC(rsvd79), IDTVEC(rsvd7a), IDTVEC(rsvd7b),
1799 	IDTVEC(rsvd7c), IDTVEC(rsvd7d), IDTVEC(rsvd7e), IDTVEC(rsvd7f),
1800 	IDTVEC(rsvd80), IDTVEC(rsvd81), IDTVEC(rsvd82), IDTVEC(rsvd83),
1801 	IDTVEC(rsvd84), IDTVEC(rsvd85), IDTVEC(rsvd86), IDTVEC(rsvd87),
1802 	IDTVEC(rsvd88), IDTVEC(rsvd89), IDTVEC(rsvd8a), IDTVEC(rsvd8b),
1803 	IDTVEC(rsvd8c), IDTVEC(rsvd8d), IDTVEC(rsvd8e), IDTVEC(rsvd8f),
1804 	IDTVEC(rsvd90), IDTVEC(rsvd91), IDTVEC(rsvd92), IDTVEC(rsvd93),
1805 	IDTVEC(rsvd94), IDTVEC(rsvd95), IDTVEC(rsvd96), IDTVEC(rsvd97),
1806 	IDTVEC(rsvd98), IDTVEC(rsvd99), IDTVEC(rsvd9a), IDTVEC(rsvd9b),
1807 	IDTVEC(rsvd9c), IDTVEC(rsvd9d), IDTVEC(rsvd9e), IDTVEC(rsvd9f),
1808 	IDTVEC(rsvda0), IDTVEC(rsvda1), IDTVEC(rsvda2), IDTVEC(rsvda3),
1809 	IDTVEC(rsvda4), IDTVEC(rsvda5), IDTVEC(rsvda6), IDTVEC(rsvda7),
1810 	IDTVEC(rsvda8), IDTVEC(rsvda9), IDTVEC(rsvdaa), IDTVEC(rsvdab),
1811 	IDTVEC(rsvdac), IDTVEC(rsvdad), IDTVEC(rsvdae), IDTVEC(rsvdaf),
1812 	IDTVEC(rsvdb0), IDTVEC(rsvdb1), IDTVEC(rsvdb2), IDTVEC(rsvdb3),
1813 	IDTVEC(rsvdb4), IDTVEC(rsvdb5), IDTVEC(rsvdb6), IDTVEC(rsvdb7),
1814 	IDTVEC(rsvdb8), IDTVEC(rsvdb9), IDTVEC(rsvdba), IDTVEC(rsvdbb),
1815 	IDTVEC(rsvdbc), IDTVEC(rsvdbd), IDTVEC(rsvdbe), IDTVEC(rsvdbf),
1816 	IDTVEC(rsvdc0), IDTVEC(rsvdc1), IDTVEC(rsvdc2), IDTVEC(rsvdc3),
1817 	IDTVEC(rsvdc4), IDTVEC(rsvdc5), IDTVEC(rsvdc6), IDTVEC(rsvdc7),
1818 	IDTVEC(rsvdc8), IDTVEC(rsvdc9), IDTVEC(rsvdca), IDTVEC(rsvdcb),
1819 	IDTVEC(rsvdcc), IDTVEC(rsvdcd), IDTVEC(rsvdce), IDTVEC(rsvdcf),
1820 	IDTVEC(rsvdd0), IDTVEC(rsvdd1), IDTVEC(rsvdd2), IDTVEC(rsvdd3),
1821 	IDTVEC(rsvdd4), IDTVEC(rsvdd5), IDTVEC(rsvdd6), IDTVEC(rsvdd7),
1822 	IDTVEC(rsvdd8), IDTVEC(rsvdd9), IDTVEC(rsvdda), IDTVEC(rsvddb),
1823 	IDTVEC(rsvddc), IDTVEC(rsvddd), IDTVEC(rsvdde), IDTVEC(rsvddf),
1824 	IDTVEC(rsvde0), IDTVEC(rsvde1), IDTVEC(rsvde2), IDTVEC(rsvde3),
1825 	IDTVEC(rsvde4), IDTVEC(rsvde5), IDTVEC(rsvde6), IDTVEC(rsvde7),
1826 	IDTVEC(rsvde8), IDTVEC(rsvde9), IDTVEC(rsvdea), IDTVEC(rsvdeb),
1827 	IDTVEC(rsvdec), IDTVEC(rsvded), IDTVEC(rsvdee), IDTVEC(rsvdef),
1828 	IDTVEC(rsvdf0), IDTVEC(rsvdf1), IDTVEC(rsvdf2), IDTVEC(rsvdf3),
1829 	IDTVEC(rsvdf4), IDTVEC(rsvdf5), IDTVEC(rsvdf6), IDTVEC(rsvdf7),
1830 	IDTVEC(rsvdf8), IDTVEC(rsvdf9), IDTVEC(rsvdfa), IDTVEC(rsvdfb),
1831 	IDTVEC(rsvdfc), IDTVEC(rsvdfd), IDTVEC(rsvdfe), IDTVEC(rsvdff);
1832 
1833 inthand_t *rsvdary[NIDT] = {
1834 	&IDTVEC(rsvd00), &IDTVEC(rsvd01), &IDTVEC(rsvd02), &IDTVEC(rsvd03),
1835 	&IDTVEC(rsvd04), &IDTVEC(rsvd05), &IDTVEC(rsvd06), &IDTVEC(rsvd07),
1836 	&IDTVEC(rsvd08), &IDTVEC(rsvd09), &IDTVEC(rsvd0a), &IDTVEC(rsvd0b),
1837 	&IDTVEC(rsvd0c), &IDTVEC(rsvd0d), &IDTVEC(rsvd0e), &IDTVEC(rsvd0f),
1838 	&IDTVEC(rsvd10), &IDTVEC(rsvd11), &IDTVEC(rsvd12), &IDTVEC(rsvd13),
1839 	&IDTVEC(rsvd14), &IDTVEC(rsvd15), &IDTVEC(rsvd16), &IDTVEC(rsvd17),
1840 	&IDTVEC(rsvd18), &IDTVEC(rsvd19), &IDTVEC(rsvd1a), &IDTVEC(rsvd1b),
1841 	&IDTVEC(rsvd1c), &IDTVEC(rsvd1d), &IDTVEC(rsvd1e), &IDTVEC(rsvd1f),
1842 	&IDTVEC(rsvd20), &IDTVEC(rsvd21), &IDTVEC(rsvd22), &IDTVEC(rsvd23),
1843 	&IDTVEC(rsvd24), &IDTVEC(rsvd25), &IDTVEC(rsvd26), &IDTVEC(rsvd27),
1844 	&IDTVEC(rsvd28), &IDTVEC(rsvd29), &IDTVEC(rsvd2a), &IDTVEC(rsvd2b),
1845 	&IDTVEC(rsvd2c), &IDTVEC(rsvd2d), &IDTVEC(rsvd2e), &IDTVEC(rsvd2f),
1846 	&IDTVEC(rsvd30), &IDTVEC(rsvd31), &IDTVEC(rsvd32), &IDTVEC(rsvd33),
1847 	&IDTVEC(rsvd34), &IDTVEC(rsvd35), &IDTVEC(rsvd36), &IDTVEC(rsvd37),
1848 	&IDTVEC(rsvd38), &IDTVEC(rsvd39), &IDTVEC(rsvd3a), &IDTVEC(rsvd3b),
1849 	&IDTVEC(rsvd3c), &IDTVEC(rsvd3d), &IDTVEC(rsvd3e), &IDTVEC(rsvd3f),
1850 	&IDTVEC(rsvd40), &IDTVEC(rsvd41), &IDTVEC(rsvd42), &IDTVEC(rsvd43),
1851 	&IDTVEC(rsvd44), &IDTVEC(rsvd45), &IDTVEC(rsvd46), &IDTVEC(rsvd47),
1852 	&IDTVEC(rsvd48), &IDTVEC(rsvd49), &IDTVEC(rsvd4a), &IDTVEC(rsvd4b),
1853 	&IDTVEC(rsvd4c), &IDTVEC(rsvd4d), &IDTVEC(rsvd4e), &IDTVEC(rsvd4f),
1854 	&IDTVEC(rsvd50), &IDTVEC(rsvd51), &IDTVEC(rsvd52), &IDTVEC(rsvd53),
1855 	&IDTVEC(rsvd54), &IDTVEC(rsvd55), &IDTVEC(rsvd56), &IDTVEC(rsvd57),
1856 	&IDTVEC(rsvd58), &IDTVEC(rsvd59), &IDTVEC(rsvd5a), &IDTVEC(rsvd5b),
1857 	&IDTVEC(rsvd5c), &IDTVEC(rsvd5d), &IDTVEC(rsvd5e), &IDTVEC(rsvd5f),
1858 	&IDTVEC(rsvd60), &IDTVEC(rsvd61), &IDTVEC(rsvd62), &IDTVEC(rsvd63),
1859 	&IDTVEC(rsvd64), &IDTVEC(rsvd65), &IDTVEC(rsvd66), &IDTVEC(rsvd67),
1860 	&IDTVEC(rsvd68), &IDTVEC(rsvd69), &IDTVEC(rsvd6a), &IDTVEC(rsvd6b),
1861 	&IDTVEC(rsvd6c), &IDTVEC(rsvd6d), &IDTVEC(rsvd6e), &IDTVEC(rsvd6f),
1862 	&IDTVEC(rsvd70), &IDTVEC(rsvd71), &IDTVEC(rsvd72), &IDTVEC(rsvd73),
1863 	&IDTVEC(rsvd74), &IDTVEC(rsvd75), &IDTVEC(rsvd76), &IDTVEC(rsvd77),
1864 	&IDTVEC(rsvd78), &IDTVEC(rsvd79), &IDTVEC(rsvd7a), &IDTVEC(rsvd7b),
1865 	&IDTVEC(rsvd7c), &IDTVEC(rsvd7d), &IDTVEC(rsvd7e), &IDTVEC(rsvd7f),
1866 	&IDTVEC(rsvd80), &IDTVEC(rsvd81), &IDTVEC(rsvd82), &IDTVEC(rsvd83),
1867 	&IDTVEC(rsvd84), &IDTVEC(rsvd85), &IDTVEC(rsvd86), &IDTVEC(rsvd87),
1868 	&IDTVEC(rsvd88), &IDTVEC(rsvd89), &IDTVEC(rsvd8a), &IDTVEC(rsvd8b),
1869 	&IDTVEC(rsvd8c), &IDTVEC(rsvd8d), &IDTVEC(rsvd8e), &IDTVEC(rsvd8f),
1870 	&IDTVEC(rsvd90), &IDTVEC(rsvd91), &IDTVEC(rsvd92), &IDTVEC(rsvd93),
1871 	&IDTVEC(rsvd94), &IDTVEC(rsvd95), &IDTVEC(rsvd96), &IDTVEC(rsvd97),
1872 	&IDTVEC(rsvd98), &IDTVEC(rsvd99), &IDTVEC(rsvd9a), &IDTVEC(rsvd9b),
1873 	&IDTVEC(rsvd9c), &IDTVEC(rsvd9d), &IDTVEC(rsvd9e), &IDTVEC(rsvd9f),
1874 	&IDTVEC(rsvda0), &IDTVEC(rsvda1), &IDTVEC(rsvda2), &IDTVEC(rsvda3),
1875 	&IDTVEC(rsvda4), &IDTVEC(rsvda5), &IDTVEC(rsvda6), &IDTVEC(rsvda7),
1876 	&IDTVEC(rsvda8), &IDTVEC(rsvda9), &IDTVEC(rsvdaa), &IDTVEC(rsvdab),
1877 	&IDTVEC(rsvdac), &IDTVEC(rsvdad), &IDTVEC(rsvdae), &IDTVEC(rsvdaf),
1878 	&IDTVEC(rsvdb0), &IDTVEC(rsvdb1), &IDTVEC(rsvdb2), &IDTVEC(rsvdb3),
1879 	&IDTVEC(rsvdb4), &IDTVEC(rsvdb5), &IDTVEC(rsvdb6), &IDTVEC(rsvdb7),
1880 	&IDTVEC(rsvdb8), &IDTVEC(rsvdb9), &IDTVEC(rsvdba), &IDTVEC(rsvdbb),
1881 	&IDTVEC(rsvdbc), &IDTVEC(rsvdbd), &IDTVEC(rsvdbe), &IDTVEC(rsvdbf),
1882 	&IDTVEC(rsvdc0), &IDTVEC(rsvdc1), &IDTVEC(rsvdc2), &IDTVEC(rsvdc3),
1883 	&IDTVEC(rsvdc4), &IDTVEC(rsvdc5), &IDTVEC(rsvdc6), &IDTVEC(rsvdc7),
1884 	&IDTVEC(rsvdc8), &IDTVEC(rsvdc9), &IDTVEC(rsvdca), &IDTVEC(rsvdcb),
1885 	&IDTVEC(rsvdcc), &IDTVEC(rsvdcd), &IDTVEC(rsvdce), &IDTVEC(rsvdcf),
1886 	&IDTVEC(rsvdd0), &IDTVEC(rsvdd1), &IDTVEC(rsvdd2), &IDTVEC(rsvdd3),
1887 	&IDTVEC(rsvdd4), &IDTVEC(rsvdd5), &IDTVEC(rsvdd6), &IDTVEC(rsvdd7),
1888 	&IDTVEC(rsvdd8), &IDTVEC(rsvdd9), &IDTVEC(rsvdda), &IDTVEC(rsvddb),
1889 	&IDTVEC(rsvddc), &IDTVEC(rsvddd), &IDTVEC(rsvdde), &IDTVEC(rsvddf),
1890 	&IDTVEC(rsvde0), &IDTVEC(rsvde1), &IDTVEC(rsvde2), &IDTVEC(rsvde3),
1891 	&IDTVEC(rsvde4), &IDTVEC(rsvde5), &IDTVEC(rsvde6), &IDTVEC(rsvde7),
1892 	&IDTVEC(rsvde8), &IDTVEC(rsvde9), &IDTVEC(rsvdea), &IDTVEC(rsvdeb),
1893 	&IDTVEC(rsvdec), &IDTVEC(rsvded), &IDTVEC(rsvdee), &IDTVEC(rsvdef),
1894 	&IDTVEC(rsvdf0), &IDTVEC(rsvdf1), &IDTVEC(rsvdf2), &IDTVEC(rsvdf3),
1895 	&IDTVEC(rsvdf4), &IDTVEC(rsvdf5), &IDTVEC(rsvdf6), &IDTVEC(rsvdf7),
1896 	&IDTVEC(rsvdf8), &IDTVEC(rsvdf9), &IDTVEC(rsvdfa), &IDTVEC(rsvdfb),
1897 	&IDTVEC(rsvdfc), &IDTVEC(rsvdfd), &IDTVEC(rsvdfe), &IDTVEC(rsvdff)
1898 };
1899 
1900 void
1901 sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
1902 {
1903 	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
1904 	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
1905 	ssd->ssd_type  = sd->sd_type;
1906 	ssd->ssd_dpl   = sd->sd_dpl;
1907 	ssd->ssd_p     = sd->sd_p;
1908 	ssd->ssd_def32 = sd->sd_def32;
1909 	ssd->ssd_gran  = sd->sd_gran;
1910 }
1911 
1912 void
1913 ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
1914 {
1915 
1916 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
1917 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
1918 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
1919 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
1920 	sd->sd_type  = ssd->ssd_type;
1921 	sd->sd_dpl   = ssd->ssd_dpl;
1922 	sd->sd_p     = ssd->ssd_p;
1923 	sd->sd_long  = ssd->ssd_long;
1924 	sd->sd_def32 = ssd->ssd_def32;
1925 	sd->sd_gran  = ssd->ssd_gran;
1926 }
1927 
1928 void
1929 ssdtosyssd(struct soft_segment_descriptor *ssd,
1930     struct system_segment_descriptor *sd)
1931 {
1932 
1933 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
1934 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
1935 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
1936 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
1937 	sd->sd_type  = ssd->ssd_type;
1938 	sd->sd_dpl   = ssd->ssd_dpl;
1939 	sd->sd_p     = ssd->ssd_p;
1940 	sd->sd_gran  = ssd->ssd_gran;
1941 }
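
/*
 * Illustration only (not compiled): the conversions above split ssd_base
 * into 24-bit low / high parts and ssd_limit into 16 + 4 bits.  A quick
 * sanity sketch of the round trip for one of the flat user descriptors:
 */
#if 0
static void
ssd_roundtrip_check(void)
{
	struct soft_segment_descriptor in = gdt_segs[GUDATA_SEL];
	struct soft_segment_descriptor out;
	struct user_segment_descriptor hw;

	ssdtosd(&in, &hw);
	sdtossd(&hw, &out);
	KKASSERT(in.ssd_base == out.ssd_base);
	KKASSERT(in.ssd_limit == out.ssd_limit);
}
#endif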
1942 
1943 /*
1944  * Populate the (physmap) array with base/bound pairs describing the
1945  * available physical memory in the system, then test this memory and
1946  * build the phys_avail array describing the actually-available memory.
1947  *
1948  * If we cannot accurately determine the physical memory map, then use
1949  * the value from the 0xE801 call, and failing that, the RTC.
1950  *
1951  * Total memory size may be set by the kernel environment variable
1952  * hw.physmem or the compile-time define MAXMEM.
1953  *
1954  * Memory is aligned to PHYSMAP_ALIGN which must be a multiple
1955  * of PAGE_SIZE.  This also greatly reduces the memory test time
1956  * which would otherwise be excessive on machines with > 8G of ram.
1957  *
1958  * XXX first should be vm_paddr_t.
1959  */
1960 
1961 #define PHYSMAP_ALIGN		(vm_paddr_t)(128 * 1024)
1962 #define PHYSMAP_ALIGN_MASK	(vm_paddr_t)(PHYSMAP_ALIGN - 1)
1963 #define PHYSMAP_SIZE		VM_PHYSSEG_MAX
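
/*
 * Illustration only (not compiled): the rounding applied to physmap[]
 * entries in getmemsize() below.  Segment starts are rounded up and
 * segment ends rounded down to PHYSMAP_ALIGN (128KB); e.g. a start of
 * 0x1f000 becomes 0x20000 and an end of 0x9fc00 becomes 0x80000.
 */
#if 0
static inline vm_paddr_t
physmap_align_up(vm_paddr_t pa)
{
	return ((pa + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK);
}

static inline vm_paddr_t
physmap_align_down(vm_paddr_t pa)
{
	return (pa & ~PHYSMAP_ALIGN_MASK);
}
#endif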
1964 
1965 vm_paddr_t physmap[PHYSMAP_SIZE];
1966 struct bios_smap *smapbase, *smap, *smapend;
1967 struct efi_map_header *efihdrbase;
1968 u_int32_t smapsize;
1969 
1970 #define PHYSMAP_HANDWAVE	(vm_paddr_t)(2 * 1024 * 1024)
1971 #define PHYSMAP_HANDWAVE_MASK	(PHYSMAP_HANDWAVE - 1)
1972 
1973 static void
1974 add_smap_entries(int *physmap_idx)
1975 {
1976 	int i;
1977 
1978 	smapsize = *((u_int32_t *)smapbase - 1);
1979 	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
1980 
1981 	for (smap = smapbase; smap < smapend; smap++) {
1982 		if (boothowto & RB_VERBOSE)
1983 			kprintf("SMAP type=%02x base=%016lx len=%016lx\n",
1984 			    smap->type, smap->base, smap->length);
1985 
1986 		if (smap->type != SMAP_TYPE_MEMORY)
1987 			continue;
1988 
1989 		if (smap->length == 0)
1990 			continue;
1991 
1992 		for (i = 0; i <= *physmap_idx; i += 2) {
1993 			if (smap->base < physmap[i + 1]) {
1994 				if (boothowto & RB_VERBOSE) {
1995 					kprintf("Overlapping or non-monotonic "
1996 						"memory region, ignoring "
1997 						"second region\n");
1998 				}
1999 				break;
2000 			}
2001 		}
2002 		if (i <= *physmap_idx)
2003 			continue;
2004 
2005 		Realmem += smap->length;
2006 
2007 		/*
2008 		 * NOTE: This little bit of code initially expands
2009 		 *	 physmap[1] as well as later entries.
2010 		 */
2011 		if (smap->base == physmap[*physmap_idx + 1]) {
2012 			physmap[*physmap_idx + 1] += smap->length;
2013 			continue;
2014 		}
2015 
2016 		*physmap_idx += 2;
2017 		if (*physmap_idx == PHYSMAP_SIZE) {
2018 			kprintf("Too many segments in the physical "
2019 				"address map, giving up\n");
2020 			break;
2021 		}
2022 		physmap[*physmap_idx] = smap->base;
2023 		physmap[*physmap_idx + 1] = smap->base + smap->length;
2024 	}
2025 }
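
/*
 * Worked example (added for clarity): given two contiguous
 * SMAP_TYPE_MEMORY entries, base=0x100000/len=0x700000 followed by
 * base=0x800000/len=0x800000, the second base equals the running end in
 * physmap[*physmap_idx + 1], so the loop above simply extends that entry
 * to 0x1000000 instead of consuming another physmap[] slot.
 */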
2026 
2027 static void
2028 add_efi_map_entries(int *physmap_idx)
2029 {
2030 	struct efi_md *map, *p;
2031 	const char *type;
2032 	size_t efisz;
2033 	int i, ndesc;
2034 
2035 	static const char *types[] = {
2036 		"Reserved",
2037 		"LoaderCode",
2038 		"LoaderData",
2039 		"BootServicesCode",
2040 		"BootServicesData",
2041 		"RuntimeServicesCode",
2042 		"RuntimeServicesData",
2043 		"ConventionalMemory",
2044 		"UnusableMemory",
2045 		"ACPIReclaimMemory",
2046 		"ACPIMemoryNVS",
2047 		"MemoryMappedIO",
2048 		"MemoryMappedIOPortSpace",
2049 		"PalCode"
2050 	 };
2051 
2052 	/*
2053 	 * Memory map data provided by UEFI via the GetMemoryMap
2054 	 * Boot Services API.
2055 	 */
2056 	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
2057 	map = (struct efi_md *)((uint8_t *)efihdrbase + efisz);
2058 
2059 	if (efihdrbase->descriptor_size == 0)
2060 		return;
2061 	ndesc = efihdrbase->memory_size / efihdrbase->descriptor_size;
2062 
2063 	if (boothowto & RB_VERBOSE)
2064 		kprintf("%23s %12s %12s %8s %4s\n",
2065 		    "Type", "Physical", "Virtual", "#Pages", "Attr");
2066 
2067 	for (i = 0, p = map; i < ndesc; i++,
2068 	    p = efi_next_descriptor(p, efihdrbase->descriptor_size)) {
2069 		if (boothowto & RB_VERBOSE) {
2070 			if (p->md_type <= EFI_MD_TYPE_PALCODE)
2071 				type = types[p->md_type];
2072 			else
2073 				type = "<INVALID>";
2074 			kprintf("%23s %012lx %12p %08lx ", type, p->md_phys,
2075 			    p->md_virt, p->md_pages);
2076 			if (p->md_attr & EFI_MD_ATTR_UC)
2077 				kprintf("UC ");
2078 			if (p->md_attr & EFI_MD_ATTR_WC)
2079 				kprintf("WC ");
2080 			if (p->md_attr & EFI_MD_ATTR_WT)
2081 				kprintf("WT ");
2082 			if (p->md_attr & EFI_MD_ATTR_WB)
2083 				kprintf("WB ");
2084 			if (p->md_attr & EFI_MD_ATTR_UCE)
2085 				kprintf("UCE ");
2086 			if (p->md_attr & EFI_MD_ATTR_WP)
2087 				kprintf("WP ");
2088 			if (p->md_attr & EFI_MD_ATTR_RP)
2089 				kprintf("RP ");
2090 			if (p->md_attr & EFI_MD_ATTR_XP)
2091 				kprintf("XP ");
2092 			if (p->md_attr & EFI_MD_ATTR_RT)
2093 				kprintf("RUNTIME");
2094 			kprintf("\n");
2095 		}
2096 
2097 		switch (p->md_type) {
2098 		case EFI_MD_TYPE_CODE:
2099 		case EFI_MD_TYPE_DATA:
2100 		case EFI_MD_TYPE_BS_CODE:
2101 		case EFI_MD_TYPE_BS_DATA:
2102 		case EFI_MD_TYPE_FREE:
2103 			/*
2104 			 * We're allowed to use any entry with these types.
2105 			 */
2106 			break;
2107 		default:
2108 			continue;
2109 		}
2110 
2111 		Realmem += p->md_pages * PAGE_SIZE;
2112 
2113 		/*
2114 		 * NOTE: This little bit of code initially expands
2115 		 *	 physmap[1] as well as later entries.
2116 		 */
2117 		if (p->md_phys == physmap[*physmap_idx + 1]) {
2118 			physmap[*physmap_idx + 1] += p->md_pages * PAGE_SIZE;
2119 			continue;
2120 		}
2121 
2122 		*physmap_idx += 2;
2123 		if (*physmap_idx == PHYSMAP_SIZE) {
2124 			kprintf("Too many segments in the physical "
2125 				"address map, giving up\n");
2126 			break;
2127 		}
2128 		physmap[*physmap_idx] = p->md_phys;
2129 		physmap[*physmap_idx + 1] = p->md_phys + p->md_pages * PAGE_SIZE;
2130 	 }
2131 }
2132 
2133 struct fb_info efi_fb_info;
2134 static int have_efi_framebuffer = 0;
2135 
2136 static void
2137 efi_fb_init_vaddr(int direct_map)
2138 {
2139 	uint64_t sz;
2140 	vm_offset_t addr, v;
2141 
2142 	v = efi_fb_info.vaddr;
2143 	sz = efi_fb_info.stride * efi_fb_info.height;
2144 
2145 	if (direct_map) {
2146 		addr = PHYS_TO_DMAP(efi_fb_info.paddr);
2147 		if (addr >= DMAP_MIN_ADDRESS && addr + sz <= DMapMaxAddress)
2148 			efi_fb_info.vaddr = addr;
2149 	} else {
2150 		efi_fb_info.vaddr =
2151 			(vm_offset_t)pmap_mapdev_attr(efi_fb_info.paddr,
2152 						      sz,
2153 						      PAT_WRITE_COMBINING);
2154 	}
2155 }
2156 
2157 static u_int
2158 efifb_color_depth(struct efi_fb *efifb)
2159 {
2160 	uint32_t mask;
2161 	u_int depth;
2162 
2163 	mask = efifb->fb_mask_red | efifb->fb_mask_green |
2164 	    efifb->fb_mask_blue | efifb->fb_mask_reserved;
2165 	if (mask == 0)
2166 		return (0);
2167 	for (depth = 1; mask != 1; depth++)
2168 		mask >>= 1;
2169 	return (depth);
2170 }
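
/*
 * Worked example (added for clarity): for a typical 8:8:8 framebuffer the
 * combined mask is 0x00ffffff, which efifb_color_depth() shifts right 23
 * times before it reaches 1, giving a depth of 24.  With an 8-bit
 * reserved channel the mask is 0xffffffff and the depth comes out as 32.
 */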
2171 
2172 int
2173 probe_efi_fb(int early)
2174 {
2175 	struct efi_fb	*efifb;
2176 	caddr_t		kmdp;
2177 	u_int		depth;
2178 
2179 	if (have_efi_framebuffer) {
2180 		if (!early &&
2181 		    (efi_fb_info.vaddr == 0 ||
2182 		     efi_fb_info.vaddr == PHYS_TO_DMAP(efi_fb_info.paddr)))
2183 			efi_fb_init_vaddr(0);
2184 		return 0;
2185 	}
2186 
2187 	kmdp = preload_search_by_type("elf kernel");
2188 	if (kmdp == NULL)
2189 		kmdp = preload_search_by_type("elf64 kernel");
2190 	efifb = (struct efi_fb *)preload_search_info(kmdp,
2191 	    MODINFO_METADATA | MODINFOMD_EFI_FB);
2192 	if (efifb == NULL)
2193 		return 1;
2194 
2195 	depth = efifb_color_depth(efifb);
2196 	/*
2197 	 * Our bootloader should already have noticed if we won't be able
2198 	 * to use the UEFI framebuffer.
2199 	 */
2200 	if (depth != 24 && depth != 32)
2201 		return 1;
2202 
2203 	have_efi_framebuffer = 1;
2204 
2205 	efi_fb_info.is_vga_boot_display = 1;
2206 	efi_fb_info.width = efifb->fb_width;
2207 	efi_fb_info.height = efifb->fb_height;
2208 	efi_fb_info.depth = depth;
2209 	efi_fb_info.stride = efifb->fb_stride * (depth / 8);
2210 	efi_fb_info.paddr = efifb->fb_addr;
2211 	if (early) {
2212 		efi_fb_info.vaddr = 0;
2213 	} else {
2214 		efi_fb_init_vaddr(0);
2215 	}
2216 	efi_fb_info.fbops.fb_set_par = NULL;
2217 	efi_fb_info.fbops.fb_blank = NULL;
2218 	efi_fb_info.fbops.fb_debug_enter = NULL;
2219 	efi_fb_info.device = NULL;
2220 
2221 	return 0;
2222 }
2223 
2224 static void
2225 efifb_startup(void *arg)
2226 {
2227 	probe_efi_fb(0);
2228 }
2229 
2230 SYSINIT(efi_fb_info, SI_BOOT1_POST, SI_ORDER_FIRST, efifb_startup, NULL);
2231 
2232 static void
2233 getmemsize(caddr_t kmdp, u_int64_t first)
2234 {
2235 	int off, physmap_idx, pa_indx, da_indx;
2236 	int i, j;
2237 	vm_paddr_t pa;
2238 	vm_paddr_t msgbuf_size;
2239 	u_long physmem_tunable;
2240 	pt_entry_t *pte;
2241 	quad_t dcons_addr, dcons_size;
2242 
2243 	bzero(physmap, sizeof(physmap));
2244 	physmap_idx = 0;
2245 
2246 	/*
2247 	 * get memory map from INT 15:E820, kindly supplied by the loader.
2248 	 *
2249 	 * subr_module.c says:
2250 	 * "Consumer may safely assume that size value precedes data."
2251 	 * ie: an int32_t immediately precedes smap.
2252 	 */
2253 	efihdrbase = (struct efi_map_header *)preload_search_info(kmdp,
2254 		     MODINFO_METADATA | MODINFOMD_EFI_MAP);
2255 	smapbase = (struct bios_smap *)preload_search_info(kmdp,
2256 		   MODINFO_METADATA | MODINFOMD_SMAP);
2257 	if (smapbase == NULL && efihdrbase == NULL)
2258 		panic("No BIOS smap or EFI map info from loader!");
2259 
2260 	if (efihdrbase == NULL)
2261 		add_smap_entries(&physmap_idx);
2262 	else
2263 		add_efi_map_entries(&physmap_idx);
2264 
2265 	base_memory = physmap[1] / 1024;
2266 	/* make hole for AP bootstrap code */
2267 	physmap[1] = mp_bootaddress(base_memory);
2268 
2269 	/* Save EBDA address, if any */
2270 	ebda_addr = (u_long)(*(u_short *)(KERNBASE + 0x40e));
2271 	ebda_addr <<= 4;
2272 
2273 	/*
2274 	 * Maxmem isn't the "maximum memory", it's one larger than the
2275 	 * highest page of the physical address space.  It should be
2276 	 * called something like "Maxphyspage".  We may adjust this
2277 	 * based on ``hw.physmem'' and the results of the memory test.
2278 	 */
2279 	Maxmem = atop(physmap[physmap_idx + 1]);
2280 
2281 #ifdef MAXMEM
2282 	Maxmem = MAXMEM / 4;
2283 #endif
2284 
2285 	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
2286 		Maxmem = atop(physmem_tunable);
2287 
2288 	/*
2289 	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
2290 	 * in the system.
2291 	 */
2292 	if (Maxmem > atop(physmap[physmap_idx + 1]))
2293 		Maxmem = atop(physmap[physmap_idx + 1]);
2294 
2295 	/*
2296 	 * Blowing out the DMAP will blow up the system.
2297 	 */
2298 	if (Maxmem > atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS)) {
2299 		kprintf("Limiting Maxmem due to DMAP size\n");
2300 		Maxmem = atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS);
2301 	}
2302 
2303 	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
2304 	    (boothowto & RB_VERBOSE)) {
2305 		kprintf("Physical memory use set to %ldK\n", Maxmem * 4);
2306 	}
2307 
2308 	/*
2309 	 * Call pmap initialization to make new kernel address space
2310 	 *
2311 	 * Mask off page 0.
2312 	 */
2313 	pmap_bootstrap(&first);
2314 	physmap[0] = PAGE_SIZE;
2315 
2316 	/*
2317 	 * Align the physmap to PHYSMAP_ALIGN and cut out anything
2318 	 * exceeding Maxmem.
2319 	 */
2320 	for (i = j = 0; i <= physmap_idx; i += 2) {
2321 		if (physmap[i+1] > ptoa(Maxmem))
2322 			physmap[i+1] = ptoa(Maxmem);
2323 		physmap[i] = (physmap[i] + PHYSMAP_ALIGN_MASK) &
2324 			     ~PHYSMAP_ALIGN_MASK;
2325 		physmap[i+1] = physmap[i+1] & ~PHYSMAP_ALIGN_MASK;
2326 
2327 		physmap[j] = physmap[i];
2328 		physmap[j+1] = physmap[i+1];
2329 
2330 		if (physmap[i] < physmap[i+1])
2331 			j += 2;
2332 	}
2333 	physmap_idx = j - 2;
2334 
2335 	/*
2336 	 * Align anything else used in the validation loop.
2337 	 *
2338 	 * Also make sure that our 2MB kernel text+data+bss mappings
2339 	 * do not overlap potentially allocatable space.
2340 	 */
2341 	first = (first + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;
2342 
2343 	/*
2344 	 * Size up each available chunk of physical memory.
2345 	 */
2346 	pa_indx = 0;
2347 	da_indx = 0;
2348 	phys_avail[pa_indx].phys_beg = physmap[0];
2349 	phys_avail[pa_indx].phys_end = physmap[0];
2350 	dump_avail[da_indx].phys_beg = 0;
2351 	dump_avail[da_indx].phys_end = physmap[0];
2352 	pte = CMAP1;
2353 
2354 	/*
2355 	 * Get dcons buffer address
2356 	 */
2357 	if (kgetenv_quad("dcons.addr", &dcons_addr) == 0 ||
2358 	    kgetenv_quad("dcons.size", &dcons_size) == 0)
2359 		dcons_addr = 0;
2360 
2361 	/*
2362 	 * Validate the physical memory.  The physical memory segments
2363 	 * have already been aligned to PHYSMAP_ALIGN which is a multiple
2364 	 * of PAGE_SIZE.
2365 	 *
2366 	 * We no longer perform an exhaustive memory test.  Instead we
2367 	 * simply test the first and last word in each physmap[]
2368 	 * segment.
2369 	 */
2370 	for (i = 0; i <= physmap_idx; i += 2) {
2371 		vm_paddr_t end;
2372 		vm_paddr_t incr;
2373 
2374 		end = physmap[i + 1];
2375 
2376 		for (pa = physmap[i]; pa < end; pa += incr) {
2377 			int page_bad, full;
2378 			volatile uint64_t *ptr = (uint64_t *)CADDR1;
2379 			uint64_t tmp;
2380 
2381 			full = FALSE;
2382 
2383 			/*
2384 			 * Calculate incr.  Just test the first and
2385 			 * last page in each physmap[] segment.
2386 			 */
2387 			if (pa == end - PAGE_SIZE)
2388 				incr = PAGE_SIZE;
2389 			else
2390 				incr = end - pa - PAGE_SIZE;
2391 
2392 			/*
2393 			 * Make sure we don't skip blacked out areas.
2394 			 */
2395 			if (pa < 0x200000 && 0x200000 < end) {
2396 				incr = 0x200000 - pa;
2397 			}
2398 			if (dcons_addr > 0 &&
2399 			    pa < dcons_addr &&
2400 			    dcons_addr < end) {
2401 				incr = dcons_addr - pa;
2402 			}
2403 
2404 			/*
2405 			 * Block out kernel memory as not available.
2406 			 */
2407 			if (pa >= 0x200000 && pa < first) {
2408 				incr = first - pa;
2409 				if (pa + incr > end)
2410 					incr = end - pa;
2411 				goto do_dump_avail;
2412 			}
2413 
2414 			/*
2415 			 * Block out the dcons buffer if it exists.
2416 			 */
2417 			if (dcons_addr > 0 &&
2418 			    pa >= trunc_page(dcons_addr) &&
2419 			    pa < dcons_addr + dcons_size) {
2420 				incr = dcons_addr + dcons_size - pa;
2421 				incr = (incr + PAGE_MASK) &
2422 				       ~(vm_paddr_t)PAGE_MASK;
2423 				if (pa + incr > end)
2424 					incr = end - pa;
2425 				goto do_dump_avail;
2426 			}
2427 
2428 			page_bad = FALSE;
2429 
2430 			/*
2431 			 * Map the page non-cacheable for the memory
2432 			 * test.
2433 			 */
2434 			*pte = pa |
2435 			    kernel_pmap->pmap_bits[PG_V_IDX] |
2436 			    kernel_pmap->pmap_bits[PG_RW_IDX] |
2437 			    kernel_pmap->pmap_bits[PG_N_IDX];
2438 			cpu_invlpg(__DEVOLATILE(void *, ptr));
2439 			cpu_mfence();
2440 
2441 			/*
2442 			 * Save original value for restoration later.
2443 			 */
2444 			tmp = *ptr;
2445 
2446 			/*
2447 			 * Test for alternating 1's and 0's
2448 			 */
2449 			*ptr = 0xaaaaaaaaaaaaaaaaLLU;
2450 			cpu_mfence();
2451 			if (*ptr != 0xaaaaaaaaaaaaaaaaLLU)
2452 				page_bad = TRUE;
2453 			/*
2454 			 * Test for alternating 0's and 1's
2455 			 */
2456 			*ptr = 0x5555555555555555LLU;
2457 			cpu_mfence();
2458 			if (*ptr != 0x5555555555555555LLU)
2459 				page_bad = TRUE;
2460 			/*
2461 			 * Test for all 1's
2462 			 */
2463 			*ptr = 0xffffffffffffffffLLU;
2464 			cpu_mfence();
2465 			if (*ptr != 0xffffffffffffffffLLU)
2466 				page_bad = TRUE;
2467 			/*
2468 			 * Test for all 0's
2469 			 */
2470 			*ptr = 0x0;
2471 			cpu_mfence();
2472 			if (*ptr != 0x0)
2473 				page_bad = TRUE;
2474 
2475 			/*
2476 			 * Restore original value.
2477 			 */
2478 			*ptr = tmp;
2479 
2480 			/*
2481 			 * Adjust array of valid/good pages.
2482 			 */
2483 			if (page_bad == TRUE) {
2484 				incr = PAGE_SIZE;
2485 				continue;
2486 			}
2487 
2488 			/*
2489 			 * Collapse page address into phys_avail[].  Do a
2490 			 * continuation of the current phys_avail[] index
2491 			 * when possible.
2492 			 */
2493 			if (phys_avail[pa_indx].phys_end == pa) {
2494 				/*
2495 				 * Continuation
2496 				 */
2497 				phys_avail[pa_indx].phys_end += incr;
2498 			} else if (phys_avail[pa_indx].phys_beg ==
2499 				   phys_avail[pa_indx].phys_end) {
2500 				/*
2501 				 * Current phys_avail is completely empty,
2502 				 * reuse the index.
2503 				 */
2504 				phys_avail[pa_indx].phys_beg = pa;
2505 				phys_avail[pa_indx].phys_end = pa + incr;
2506 			} else {
2507 				/*
2508 				 * Allocate next phys_avail index.
2509 				 */
2510 				++pa_indx;
2511 				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
2512 					kprintf(
2513 		"Too many holes in the physical address space, giving up\n");
2514 					--pa_indx;
2515 					full = TRUE;
2516 					goto do_dump_avail;
2517 				}
2518 				phys_avail[pa_indx].phys_beg = pa;
2519 				phys_avail[pa_indx].phys_end = pa + incr;
2520 			}
2521 			physmem += incr / PAGE_SIZE;
2522 
2523 			/*
2524 			 * pa available for dumping
2525 			 */
2526 do_dump_avail:
2527 			if (dump_avail[da_indx].phys_end == pa) {
2528 				dump_avail[da_indx].phys_end += incr;
2529 			} else {
2530 				++da_indx;
2531 				if (da_indx == DUMP_AVAIL_ARRAY_END) {
2532 					--da_indx;
2533 					goto do_next;
2534 				}
2535 				dump_avail[da_indx].phys_beg = pa;
2536 				dump_avail[da_indx].phys_end = pa + incr;
2537 			}
2538 do_next:
2539 			if (full)
2540 				break;
2541 		}
2542 	}
2543 	*pte = 0;
2544 	cpu_invltlb();
2545 	cpu_mfence();
2546 
2547 	/*
2548 	 * The last chunk must contain at least one page plus the message
2549 	 * buffer to avoid complicating other code (message buffer address
2550 	 * calculation, etc.).
2551 	 */
2552 	msgbuf_size = (MSGBUF_SIZE + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;
2553 
2554 	while (phys_avail[pa_indx].phys_beg + PHYSMAP_ALIGN + msgbuf_size >=
2555 	       phys_avail[pa_indx].phys_end) {
2556 		physmem -= atop(phys_avail[pa_indx].phys_end -
2557 				phys_avail[pa_indx].phys_beg);
2558 		phys_avail[pa_indx].phys_beg = 0;
2559 		phys_avail[pa_indx].phys_end = 0;
2560 		--pa_indx;
2561 	}
2562 
2563 	Maxmem = atop(phys_avail[pa_indx].phys_end);
2564 
2565 	/* Trim off space for the message buffer. */
2566 	phys_avail[pa_indx].phys_end -= msgbuf_size;
2567 
2568 	avail_end = phys_avail[pa_indx].phys_end;
2569 
2570 	/* Map the message buffer. */
2571 	for (off = 0; off < msgbuf_size; off += PAGE_SIZE) {
2572 		pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);
2573 	}
2574 
2575 	/*
2576 	 * Try to get EFI framebuffer working as early as possible.
2577 	 *
2578 	 * WARN: Some BIOSes do not list the EFI framebuffer memory, causing
2579 	 * the pmap probe code to create a DMAP that does not cover its
2580 	 * physical address space; in that case efi_fb_init_vaddr(1) might
2581 	 * not set up an initialized framebuffer base pointer.  The later
2582 	 * efi_fb_init_vaddr(0) call will then deal with it.
2583 	 */
2584 	if (have_efi_framebuffer)
2585 		efi_fb_init_vaddr(1);
2586 }
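
/*
 * Illustration only (not compiled): the essence of the quick memory probe
 * getmemsize() performs on the first and last page of each physmap[]
 * segment -- write alternating and solid bit patterns through the
 * non-cacheable CMAP1/CADDR1 mapping and verify each reads back intact.
 */
#if 0
static int
quick_word_test(volatile uint64_t *ptr)
{
	static const uint64_t patterns[] = {
		0xaaaaaaaaaaaaaaaaLLU,
		0x5555555555555555LLU,
		0xffffffffffffffffLLU,
		0x0LLU
	};
	uint64_t saved = *ptr;
	int i, bad = 0;

	for (i = 0; i < 4; ++i) {
		*ptr = patterns[i];
		cpu_mfence();
		if (*ptr != patterns[i])
			bad = 1;
	}
	*ptr = saved;		/* restore the original contents */
	return (bad);
}
#endif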
2587 
2588 struct machintr_abi MachIntrABI;
2589 
2590 /*
2591  * IDT VECTORS:
2592  *	0	Divide by zero
2593  *	1	Debug
2594  *	2	NMI
2595  *	3	BreakPoint
2596  *	4	OverFlow
2597  *	5	Bound-Range
2598  *	6	Invalid OpCode
2599  *	7	Device Not Available (x87)
2600  *	8	Double-Fault
2601  *	9	Coprocessor Segment overrun (unsupported, reserved)
2602  *	10	Invalid-TSS
2603  *	11	Segment not present
2604  *	12	Stack
2605  *	13	General Protection
2606  *	14	Page Fault
2607  *	15	Reserved
2608  *	16	x87 FP Exception pending
2609  *	17	Alignment Check
2610  *	18	Machine Check
2611  *	19	SIMD floating point
2612  *	20-31	reserved
2613  *	32-255	INTn/external sources
2614  */
2615 u_int64_t
2616 hammer_time(u_int64_t modulep, u_int64_t physfree)
2617 {
2618 	caddr_t kmdp;
2619 	int gsel_tss, x, cpu;
2620 #if 0 /* JG */
2621 	int metadata_missing, off;
2622 #endif
2623 	struct mdglobaldata *gd;
2624 	struct privatespace *ps;
2625 	u_int64_t msr;
2626 
2627 	/*
2628 	 * Prevent lowering of the ipl if we call tsleep() early.
2629 	 */
2630 	gd = &CPU_prvspace[0]->mdglobaldata;
2631 	ps = (struct privatespace *)gd;
2632 	bzero(gd, sizeof(*gd));
2633 	bzero(&ps->common_tss, sizeof(ps->common_tss));
2634 
2635 	/*
2636 	 * Note: on both UP and SMP curthread must be set non-NULL
2637 	 * early in the boot sequence because the system assumes
2638 	 * that 'curthread' is never NULL.
2639 	 */
2640 
2641 	gd->mi.gd_curthread = &thread0;
2642 	thread0.td_gd = &gd->mi;
2643 
2644 	atdevbase = ISA_HOLE_START + PTOV_OFFSET;
2645 
2646 #if 0 /* JG */
2647 	metadata_missing = 0;
2648 	if (bootinfo.bi_modulep) {
2649 		preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
2650 		preload_bootstrap_relocate(KERNBASE);
2651 	} else {
2652 		metadata_missing = 1;
2653 	}
2654 	if (bootinfo.bi_envp)
2655 		kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;
2656 #endif
2657 
2658 	preload_metadata = (caddr_t)(uintptr_t)(modulep + PTOV_OFFSET);
2659 	preload_bootstrap_relocate(PTOV_OFFSET);
2660 	kmdp = preload_search_by_type("elf kernel");
2661 	if (kmdp == NULL)
2662 		kmdp = preload_search_by_type("elf64 kernel");
2663 	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
2664 	kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + PTOV_OFFSET;
2665 #ifdef DDB
2666 	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
2667 	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
2668 #endif
2669 	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
2670 
2671 	if (boothowto & RB_VERBOSE)
2672 		bootverbose++;
2673 
2674 	/*
2675 	 * Default MachIntrABI to ICU
2676 	 */
2677 	MachIntrABI = MachIntrABI_ICU;
2678 
2679 	/*
2680 	 * Start with one cpu.  Note: with one cpu, ncpus_fit_mask remains 0.
2681 	 */
2682 	ncpus = 1;
2683 	ncpus_fit = 1;
2684 	/* Init basic tunables, hz etc */
2685 	init_param1();
2686 
2687 	/*
2688 	 * make gdt memory segments
2689 	 */
2690 	gdt_segs[GPROC0_SEL].ssd_base =
2691 		(uintptr_t) &CPU_prvspace[0]->common_tss;
2692 
2693 	gd->mi.gd_prvspace = CPU_prvspace[0];
2694 
2695 	for (x = 0; x < NGDT; x++) {
2696 		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1))
2697 			ssdtosd(&gdt_segs[x], &gdt_cpu0[x]);
2698 	}
2699 	ssdtosyssd(&gdt_segs[GPROC0_SEL],
2700 	    (struct system_segment_descriptor *)&gdt_cpu0[GPROC0_SEL]);
2701 
2702 	/*
2703 	 * WARNING!  Due to an Intel quirk, VMX exits set the gdt[] table
2704 	 *	     limit to 0xFFFF.  To avoid having to do a heavy-weight
2705 	 *	     reload, we just make ours maximally sized.
2706 	 */
2707 	r_gdt.rd_limit = MAXGDT_LIMIT - 1;
2708 	r_gdt.rd_base = (long)gdt_cpu0;
2709 	lgdt(&r_gdt);
2710 
2711 	wrmsr(MSR_FSBASE, 0);		/* User value */
2712 	wrmsr(MSR_GSBASE, (u_int64_t)&gd->mi);
2713 	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
2714 
2715 	mi_gdinit(&gd->mi, 0);
2716 	cpu_gdinit(gd, 0);
2717 	proc0paddr = proc0paddr_buff;
2718 	mi_proc0init(&gd->mi, proc0paddr);
2719 	safepri = TDPRI_MAX;
2720 
2721 	/* spinlocks and the BGL */
2722 	init_locks();
2723 
2724 	/* exceptions */
2725 	for (x = 0; x < NIDT; x++)
2726 		setidt_global(x, rsvdary[x], SDT_SYSIGT, SEL_KPL, 0);
2727 	setidt_global(IDT_DE, &IDTVEC(div),  SDT_SYSIGT, SEL_KPL, 0);
2728 	setidt_global(IDT_DB, &IDTVEC(dbg),  SDT_SYSIGT, SEL_KPL, 2);
2729 	setidt_global(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 1);
2730 	setidt_global(IDT_BP, &IDTVEC(bpt),  SDT_SYSIGT, SEL_UPL, 0);
2731 	setidt_global(IDT_OF, &IDTVEC(ofl),  SDT_SYSIGT, SEL_KPL, 0);
2732 	setidt_global(IDT_BR, &IDTVEC(bnd),  SDT_SYSIGT, SEL_KPL, 0);
2733 	setidt_global(IDT_UD, &IDTVEC(ill),  SDT_SYSIGT, SEL_KPL, 0);
2734 	setidt_global(IDT_NM, &IDTVEC(dna),  SDT_SYSIGT, SEL_KPL, 0);
2735 	setidt_global(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
2736 	setidt_global(IDT_FPUGP, &IDTVEC(fpusegm),  SDT_SYSIGT, SEL_KPL, 0);
2737 	setidt_global(IDT_TS, &IDTVEC(tss),  SDT_SYSIGT, SEL_KPL, 0);
2738 	setidt_global(IDT_NP, &IDTVEC(missing),  SDT_SYSIGT, SEL_KPL, 0);
2739 	setidt_global(IDT_SS, &IDTVEC(stk),  SDT_SYSIGT, SEL_KPL, 0);
2740 	setidt_global(IDT_GP, &IDTVEC(prot),  SDT_SYSIGT, SEL_KPL, 0);
2741 	setidt_global(IDT_PF, &IDTVEC(page),  SDT_SYSIGT, SEL_KPL, 0);
2742 	setidt_global(IDT_MF, &IDTVEC(fpu),  SDT_SYSIGT, SEL_KPL, 0);
2743 	setidt_global(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
2744 	setidt_global(IDT_MC, &IDTVEC(mchk),  SDT_SYSIGT, SEL_KPL, 0);
2745 	setidt_global(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);
2746 
2747 	for (cpu = 0; cpu < MAXCPU; ++cpu) {
2748 		r_idt_arr[cpu].rd_limit = sizeof(idt_arr[cpu]) - 1;
2749 		r_idt_arr[cpu].rd_base = (long) &idt_arr[cpu][0];
2750 	}
2751 
2752 	lidt(&r_idt_arr[0]);
2753 
2754 	/*
2755 	 * Initialize the console before we print anything out.
2756 	 */
2757 	cninit();
2758 
2759 #if 0 /* JG */
2760 	if (metadata_missing)
2761 		kprintf("WARNING: loader(8) metadata is missing!\n");
2762 #endif
2763 
2764 #if	NISA > 0
2765 	elcr_probe();
2766 	isa_defaultirq();
2767 #endif
2768 	rand_initialize();
2769 
2770 	/*
2771 	 * Initialize IRQ mapping
2772 	 *
2773 	 * NOTE:
2774 	 * SHOULD be after elcr_probe()
2775 	 */
2776 	MachIntrABI_ICU.initmap();
2777 	MachIntrABI_IOAPIC.initmap();
2778 
2779 #ifdef DDB
2780 	kdb_init();
2781 	if (boothowto & RB_KDB)
2782 		Debugger("Boot flags requested debugger");
2783 #endif
2784 
2785 	identify_cpu();		/* Final stage of CPU initialization */
2786 	initializecpu(0);	/* Initialize CPU registers */
2787 
2788 	/*
2789 	 * On modern Intel cpus, Haswell or later, cpu_idle_hlt=1 is better
2790 	 * because the cpu does significant power management in MWAIT
2791 	 * (also suggested is to set sysctl machdep.mwait.CX.idle=AUTODEEP).
2792 	 *
2793 	 * On many AMD cpus cpu_idle_hlt=3 is better, because the cpu does
2794 	 * significant power management only when using ACPI halt mode.
2795 	 * (However, on Ryzen, mode 4 (HLT) also does power management).
2796 	 *
2797 	 * On older AMD or Intel cpus, cpu_idle_hlt=2 is better because ACPI
2798 	 * is needed to reduce power consumption, but wakeup times are often
2799 	 * too long.
2800 	 */
2801 	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
2802 	    CPUID_TO_MODEL(cpu_id) >= 0x3C) {	/* Haswell or later */
2803 		cpu_idle_hlt = 1;
2804 	}
2805 	if (cpu_vendor_id == CPU_VENDOR_AMD) {
2806 		if (CPUID_TO_FAMILY(cpu_id) >= 0x17) {
2807 			/* Ryzen or later */
2808 			cpu_idle_hlt = 3;
2809 		} else if (CPUID_TO_FAMILY(cpu_id) >= 0x14) {
2810 			/* Bobcat or later */
2811 			cpu_idle_hlt = 3;
2812 		}
2813 	}
2814 
2815 	TUNABLE_INT_FETCH("hw.apic_io_enable", &ioapic_enable); /* for compat */
2816 	TUNABLE_INT_FETCH("hw.ioapic_enable", &ioapic_enable);
2817 	TUNABLE_INT_FETCH("hw.lapic_enable", &lapic_enable);
2818 	TUNABLE_INT_FETCH("machdep.cpu_idle_hlt", &cpu_idle_hlt);
2819 
2820 	/*
2821 	 * By default always enable the ioapic.  Certain virtual machines
2822 	 * may not work with the I/O apic enabled and can be specified in
2823 	 * the case statement below.  On the other hand, disabling the
2824 	 * I/O apic for virtual machines which DO work with it can make
2825 	 * the virtual machine implode.
2826 	 *
2827 	 * For now enable the ioapic for all guests.
2828 	 *
2829 	 * NOTE: This must be done after identify_cpu(), which sets
2830 	 *	 'cpu_feature2'.
2831 	 */
2832 	if (ioapic_enable < 0) {
2833 		ioapic_enable = 1;
2834 		switch(vmm_guest) {
2835 		case VMM_GUEST_NONE:	/* should be enabled on real HW */
2836 		case VMM_GUEST_KVM:	/* must be enabled or VM implodes */
2837 			ioapic_enable = 1;
2838 			break;
2839 		default:		/* enable by default for other VMs */
2840 			ioapic_enable = 1;
2841 			break;
2842 		}
2843 	}
2844 
2845 	/*
2846 	 * TSS entry point for interrupts, traps, and exceptions
2847 	 * (sans NMI).  This will always go to near the top of the pcpu
2848 	 * trampoline area.  Hardware-pushed data will be copied into
2849 	 * the trap-frame on entry, and (if necessary) returned to the
2850 	 * trampoline on exit.
2851 	 *
2852 	 * We store some pcb data for the trampoline code above the
2853 	 * stack the cpu hw pushes into, and arrange things so the
2854 	 * address of tr_pcb_rsp is the same as the desired top of
2855 	 * stack.
2856 	 */
2857 	ps->common_tss.tss_rsp0 = (register_t)&ps->trampoline.tr_pcb_rsp;
2858 	ps->trampoline.tr_pcb_rsp = ps->common_tss.tss_rsp0;
2859 	ps->trampoline.tr_pcb_gs_kernel = (register_t)gd;
2860 	ps->trampoline.tr_pcb_cr3 = KPML4phys;	/* adj to user cr3 live */
2861 	ps->dbltramp.tr_pcb_gs_kernel = (register_t)gd;
2862 	ps->dbltramp.tr_pcb_cr3 = KPML4phys;
2863 	ps->dbgtramp.tr_pcb_gs_kernel = (register_t)gd;
2864 	ps->dbgtramp.tr_pcb_cr3 = KPML4phys;
2865 
2866 	/* double fault stack */
2867 	ps->common_tss.tss_ist1 = (register_t)&ps->dbltramp.tr_pcb_rsp;
2868 	/* #DB debugger needs its own stack */
2869 	ps->common_tss.tss_ist2 = (register_t)&ps->dbgtramp.tr_pcb_rsp;
2870 
2871 	/* Set the IO permission bitmap (empty due to tss seg limit) */
2872 	ps->common_tss.tss_iobase = sizeof(struct x86_64tss);
2873 
2874 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
2875 	gd->gd_gdt = &gdt_cpu0[0];
2876 	gd->gd_tss_gdt = &gd->gd_gdt[GPROC0_SEL];
2877 	gd->gd_common_tssd = *gd->gd_tss_gdt;
2878 	ltr(gsel_tss);
2879 
2880 	/* Set up the fast syscall stuff */
2881 	msr = rdmsr(MSR_EFER) | EFER_SCE;
2882 	wrmsr(MSR_EFER, msr);
2883 	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
2884 	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
2885 	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
2886 	      ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
2887 	wrmsr(MSR_STAR, msr);
2888 	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_IOPL|PSL_AC);
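	/*
	 * Background note (added): MSR_STAR holds the kernel selector base
	 * for SYSCALL in bits 47:32 and the user selector base for SYSRET
	 * in bits 63:48.  SYSRET loads %cs from that base+16 and %ss from
	 * base+8, which is why the 32-bit user code selector is used as
	 * the base here.  MSR_SF_MASK lists the rflags bits the cpu clears
	 * on SYSCALL entry (traps, interrupts, direction, IOPL, AC, ...).
	 */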
2889 
2890 	getmemsize(kmdp, physfree);
2891 	init_param2(physmem);
2892 
2893 	/* now running on new page tables, configured, and u/iom is accessible */
2894 
2895 	/* Map the message buffer. */
2896 #if 0 /* JG */
2897 	for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
2898 		pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);
2899 #endif
2900 
2901 	msgbufinit(msgbufp, MSGBUF_SIZE);
2902 
2903 
2904 	/* transfer to user mode */
2905 
2906 	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
2907 	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
2908 	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
2909 
2910 	load_ds(_udatasel);
2911 	load_es(_udatasel);
2912 	load_fs(_udatasel);
2913 
2914 	/* setup proc 0's pcb */
2915 	thread0.td_pcb->pcb_flags = 0;
2916 	thread0.td_pcb->pcb_cr3 = KPML4phys;
2917 	thread0.td_pcb->pcb_cr3_iso = 0;
2918 	thread0.td_pcb->pcb_ext = NULL;
2919 	lwp0.lwp_md.md_regs = &proc0_tf;	/* XXX needed? */
2920 
2921 	/* Location of kernel stack for locore */
2922 	return ((u_int64_t)thread0.td_pcb);
2923 }
2924 
2925 /*
2926  * Initialize machine-dependent portions of the global data structure.
2927  * Note that the global data area and cpu0's idlestack in the private
2928  * data space were allocated in locore.
2929  *
2930  * Note: the idlethread's cpl is 0
2931  *
2932  * WARNING!  Called from early boot, 'mycpu' may not work yet.
2933  */
2934 void
2935 cpu_gdinit(struct mdglobaldata *gd, int cpu)
2936 {
2937 	if (cpu)
2938 		gd->mi.gd_curthread = &gd->mi.gd_idlethread;
2939 
2940 	lwkt_init_thread(&gd->mi.gd_idlethread,
2941 			gd->mi.gd_prvspace->idlestack,
2942 			sizeof(gd->mi.gd_prvspace->idlestack),
2943 			0, &gd->mi);
2944 	lwkt_set_comm(&gd->mi.gd_idlethread, "idle_%d", cpu);
2945 	gd->mi.gd_idlethread.td_switch = cpu_lwkt_switch;
2946 	gd->mi.gd_idlethread.td_sp -= sizeof(void *);
2947 	*(void **)gd->mi.gd_idlethread.td_sp = cpu_idle_restore;
2948 }
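
/*
 * Note (added for clarity): the td_sp adjustment above hand-crafts a one
 * entry stack frame so that the first switch into the idle thread pops
 * cpu_idle_restore as its return address and enters the idle loop from a
 * clean state.
 */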
2949 
2950 /*
2951  * We only have to check for DMAP bounds, the globaldata space is
2952  * actually part of the kernel_map so we don't have to waste time
2953  * checking CPU_prvspace[*].
2954  */
2955 int
2956 is_globaldata_space(vm_offset_t saddr, vm_offset_t eaddr)
2957 {
2958 #if 0
2959 	if (saddr >= (vm_offset_t)&CPU_prvspace[0] &&
2960 	    eaddr <= (vm_offset_t)&CPU_prvspace[MAXCPU]) {
2961 		return (TRUE);
2962 	}
2963 #endif
2964 	if (saddr >= DMAP_MIN_ADDRESS && eaddr <= DMAP_MAX_ADDRESS)
2965 		return (TRUE);
2966 	return (FALSE);
2967 }
2968 
2969 struct globaldata *
2970 globaldata_find(int cpu)
2971 {
2972 	KKASSERT(cpu >= 0 && cpu < ncpus);
2973 	return(&CPU_prvspace[cpu]->mdglobaldata.mi);
2974 }
2975 
2976 /*
2977  * This path should be safe from the SYSRET issue because only stopped threads
2978  * can have their %rip adjusted this way (and all heavy weight thread switches
2979  * clear QUICKREF and thus do not use SYSRET).  However, the code path is
2980  * convoluted, so add a safety check by forcing %rip to be canonical.
2981  */
2982 int
2983 ptrace_set_pc(struct lwp *lp, unsigned long addr)
2984 {
2985 	if (addr & 0x0000800000000000LLU)
2986 		lp->lwp_md.md_regs->tf_rip = addr | 0xFFFF000000000000LLU;
2987 	else
2988 		lp->lwp_md.md_regs->tf_rip = addr & 0x0000FFFFFFFFFFFFLLU;
2989 	return (0);
2990 }
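
/*
 * Worked example (added for clarity): with 48-bit virtual addresses the
 * cpu requires bits 63:47 to be identical.  ptrace_set_pc() enforces this
 * by sign-extending bit 47, e.g. 0x00007fffffffe000 is left unchanged
 * while 0x0000800000001000 becomes 0xffff800000001000, so a non-canonical
 * %rip can never be installed via this path.
 */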
2991 
2992 int
2993 ptrace_single_step(struct lwp *lp)
2994 {
2995 	lp->lwp_md.md_regs->tf_rflags |= PSL_T;
2996 	return (0);
2997 }
2998 
2999 int
3000 fill_regs(struct lwp *lp, struct reg *regs)
3001 {
3002 	struct trapframe *tp;
3003 
3004 	if ((tp = lp->lwp_md.md_regs) == NULL)
3005 		return EINVAL;
3006 	bcopy(&tp->tf_rdi, &regs->r_rdi, sizeof(*regs));
3007 	return (0);
3008 }
3009 
3010 int
3011 set_regs(struct lwp *lp, struct reg *regs)
3012 {
3013 	struct trapframe *tp;
3014 
3015 	tp = lp->lwp_md.md_regs;
3016 	if (!EFL_SECURE(regs->r_rflags, tp->tf_rflags) ||
3017 	    !CS_SECURE(regs->r_cs))
3018 		return (EINVAL);
3019 	bcopy(&regs->r_rdi, &tp->tf_rdi, sizeof(*regs));
3020 	clear_quickret();
3021 	return (0);
3022 }
3023 
3024 static void
3025 fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)
3026 {
3027 	struct env87 *penv_87 = &sv_87->sv_env;
3028 	struct envxmm *penv_xmm = &sv_xmm->sv_env;
3029 	int i;
3030 
3031 	/* FPU control/status */
3032 	penv_87->en_cw = penv_xmm->en_cw;
3033 	penv_87->en_sw = penv_xmm->en_sw;
3034 	penv_87->en_tw = penv_xmm->en_tw;
3035 	penv_87->en_fip = penv_xmm->en_fip;
3036 	penv_87->en_fcs = penv_xmm->en_fcs;
3037 	penv_87->en_opcode = penv_xmm->en_opcode;
3038 	penv_87->en_foo = penv_xmm->en_foo;
3039 	penv_87->en_fos = penv_xmm->en_fos;
3040 
3041 	/* FPU registers */
3042 	for (i = 0; i < 8; ++i)
3043 		sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
3044 }
3045 
3046 static void
3047 set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm)
3048 {
3049 	struct env87 *penv_87 = &sv_87->sv_env;
3050 	struct envxmm *penv_xmm = &sv_xmm->sv_env;
3051 	int i;
3052 
3053 	/* FPU control/status */
3054 	penv_xmm->en_cw = penv_87->en_cw;
3055 	penv_xmm->en_sw = penv_87->en_sw;
3056 	penv_xmm->en_tw = penv_87->en_tw;
3057 	penv_xmm->en_fip = penv_87->en_fip;
3058 	penv_xmm->en_fcs = penv_87->en_fcs;
3059 	penv_xmm->en_opcode = penv_87->en_opcode;
3060 	penv_xmm->en_foo = penv_87->en_foo;
3061 	penv_xmm->en_fos = penv_87->en_fos;
3062 
3063 	/* FPU registers */
3064 	for (i = 0; i < 8; ++i)
3065 		sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
3066 }
3067 
3068 int
3069 fill_fpregs(struct lwp *lp, struct fpreg *fpregs)
3070 {
3071 	if (lp->lwp_thread == NULL || lp->lwp_thread->td_pcb == NULL)
3072 		return EINVAL;
3073 	if (cpu_fxsr) {
3074 		fill_fpregs_xmm(&lp->lwp_thread->td_pcb->pcb_save.sv_xmm,
3075 				(struct save87 *)fpregs);
3076 		return (0);
3077 	}
3078 	bcopy(&lp->lwp_thread->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs);
3079 	return (0);
3080 }
3081 
3082 int
3083 set_fpregs(struct lwp *lp, struct fpreg *fpregs)
3084 {
3085 	if (cpu_fxsr) {
3086 		set_fpregs_xmm((struct save87 *)fpregs,
3087 			       &lp->lwp_thread->td_pcb->pcb_save.sv_xmm);
3088 		return (0);
3089 	}
3090 	bcopy(fpregs, &lp->lwp_thread->td_pcb->pcb_save.sv_87, sizeof *fpregs);
3091 	return (0);
3092 }
3093 
3094 int
3095 fill_dbregs(struct lwp *lp, struct dbreg *dbregs)
3096 {
3097 	struct pcb *pcb;
3098 
3099 	if (lp == NULL) {
3100 		dbregs->dr[0] = rdr0();
3101 		dbregs->dr[1] = rdr1();
3102 		dbregs->dr[2] = rdr2();
3103 		dbregs->dr[3] = rdr3();
3104 		dbregs->dr[4] = rdr4();
3105 		dbregs->dr[5] = rdr5();
3106 		dbregs->dr[6] = rdr6();
3107 		dbregs->dr[7] = rdr7();
3108 		return (0);
3109 	}
3110 	if (lp->lwp_thread == NULL || (pcb = lp->lwp_thread->td_pcb) == NULL)
3111 		return EINVAL;
3112 	dbregs->dr[0] = pcb->pcb_dr0;
3113 	dbregs->dr[1] = pcb->pcb_dr1;
3114 	dbregs->dr[2] = pcb->pcb_dr2;
3115 	dbregs->dr[3] = pcb->pcb_dr3;
3116 	dbregs->dr[4] = 0;
3117 	dbregs->dr[5] = 0;
3118 	dbregs->dr[6] = pcb->pcb_dr6;
3119 	dbregs->dr[7] = pcb->pcb_dr7;
3120 	return (0);
3121 }
3122 
3123 int
3124 set_dbregs(struct lwp *lp, struct dbreg *dbregs)
3125 {
3126 	if (lp == NULL) {
3127 		load_dr0(dbregs->dr[0]);
3128 		load_dr1(dbregs->dr[1]);
3129 		load_dr2(dbregs->dr[2]);
3130 		load_dr3(dbregs->dr[3]);
3131 		load_dr4(dbregs->dr[4]);
3132 		load_dr5(dbregs->dr[5]);
3133 		load_dr6(dbregs->dr[6]);
3134 		load_dr7(dbregs->dr[7]);
3135 	} else {
3136 		struct pcb *pcb;
3137 		struct ucred *ucred;
3138 		int i;
3139 		uint64_t mask1, mask2;
3140 
3141 		/*
3142 		 * Don't let an illegal value for dr7 get set.	Specifically,
3143 		 * check for undefined settings.  Setting these bit patterns
3144 		 * results in undefined behaviour and can lead to an unexpected
3145 		 * TRCTRAP.
3146 		 */
3147 		/* JG this loop looks unreadable */
3148 		/* Check 4 2-bit fields for invalid patterns.
3149 		 * These fields are R/Wi, for i = 0..3
3150 		 */
3151 		/* Is 10 in LENi allowed when running in compatibility mode? */
3152 		/* Pattern 10 in R/Wi might be used to indicate a
3153 		 * breakpoint on I/O.  Further analysis should be
3154 		 * carried out to decide if it is safe and useful to
3155 		 * provide access to that capability.
3156 		 */
3157 		for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 4;
3158 		     i++, mask1 <<= 4, mask2 <<= 4)
3159 			if ((dbregs->dr[7] & mask1) == mask2)
3160 				return (EINVAL);
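		/*
		 * Note (added for clarity): the loop above walks the four
		 * 2-bit R/Wi fields of dr7 -- bits 17:16, 21:20, 25:24 and
		 * 29:28 -- and rejects the request if any of them holds
		 * the undefined pattern 10b.
		 */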
3161 
3162 		pcb = lp->lwp_thread->td_pcb;
3163 		ucred = lp->lwp_proc->p_ucred;
3164 
3165 		/*
3166 		 * Don't let a process set a breakpoint that is not within the
3167 		 * process's address space.  If a process could do this, it
3168 		 * could halt the system by setting a breakpoint in the kernel
3169 		 * (if ddb was enabled).  Thus, we need to check to make sure
3170 		 * that no breakpoints are being enabled for addresses outside
3171 		 * the process's address space, unless, perhaps, we were called by
3172 		 * uid 0.
3173 		 *
3174 		 * XXX - what about when the watched area of the user's
3175 		 * address space is written into from within the kernel
3176 		 * ... wouldn't that still cause a breakpoint to be generated
3177 		 * from within kernel mode?
3178 		 */
3179 
3180 		if (caps_priv_check(ucred, SYSCAP_RESTRICTEDROOT) != 0) {
3181 			if (dbregs->dr[7] & 0x3) {
3182 				/* dr0 is enabled */
3183 				if (dbregs->dr[0] >= VM_MAX_USER_ADDRESS)
3184 					return (EINVAL);
3185 			}
3186 
3187 			if (dbregs->dr[7] & (0x3<<2)) {
3188 				/* dr1 is enabled */
3189 				if (dbregs->dr[1] >= VM_MAX_USER_ADDRESS)
3190 					return (EINVAL);
3191 			}
3192 
3193 			if (dbregs->dr[7] & (0x3<<4)) {
3194 				/* dr2 is enabled */
3195 				if (dbregs->dr[2] >= VM_MAX_USER_ADDRESS)
3196 					return (EINVAL);
3197 			}
3198 
3199 			if (dbregs->dr[7] & (0x3<<6)) {
3200 				/* dr3 is enabled */
3201 				if (dbregs->dr[3] >= VM_MAX_USER_ADDRESS)
3202 					return (EINVAL);
3203 			}
3204 		}
3205 
3206 		pcb->pcb_dr0 = dbregs->dr[0];
3207 		pcb->pcb_dr1 = dbregs->dr[1];
3208 		pcb->pcb_dr2 = dbregs->dr[2];
3209 		pcb->pcb_dr3 = dbregs->dr[3];
3210 		pcb->pcb_dr6 = dbregs->dr[6];
3211 		pcb->pcb_dr7 = dbregs->dr[7];
3212 
3213 		pcb->pcb_flags |= PCB_DBREGS;
3214 	}
3215 
3216 	return (0);
3217 }
3218 
3219 /*
3220  * Return > 0 if a hardware breakpoint has been hit, and the
3221  * breakpoint was in user space.  Return 0, otherwise.
3222  */
3223 int
3224 user_dbreg_trap(void)
3225 {
3226 	u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */
3227 	u_int64_t bp;       /* breakpoint bits extracted from dr6 */
3228 	int nbp;            /* number of breakpoints that triggered */
3229 	caddr_t addr[4];    /* breakpoint addresses */
3230 	int i;
3231 
3232 	dr7 = rdr7();
3233 	if ((dr7 & 0xff) == 0) {
3234 		/*
3235 		 * none of the breakpoint enable bits (L0-L3/G0-G3) in
3236 		 * dr7 are set, thus the trap couldn't have been caused
3237 		 * by the hardware debug registers
3238 		 */
3239 		return 0;
3240 	}
3241 
3242 	nbp = 0;
3243 	dr6 = rdr6();
3244 	bp = dr6 & 0xf;
3245 
3246 	if (bp == 0) {
3247 		/*
3248 		 * None of the breakpoint bits are set, meaning this
3249 		 * trap was not caused by any of the debug registers.
3250 		 */
3251 		return 0;
3252 	}
3253 
3254 	/*
3255 	 * At least one of the breakpoints was hit; check to see
3256 	 * which ones and whether any of them are user space addresses.
3257 	 */
3258 
3259 	if (bp & 0x01) {
3260 		addr[nbp++] = (caddr_t)rdr0();
3261 	}
3262 	if (bp & 0x02) {
3263 		addr[nbp++] = (caddr_t)rdr1();
3264 	}
3265 	if (bp & 0x04) {
3266 		addr[nbp++] = (caddr_t)rdr2();
3267 	}
3268 	if (bp & 0x08) {
3269 		addr[nbp++] = (caddr_t)rdr3();
3270 	}
3271 
3272 	for (i = 0; i < nbp; i++) {
3273 		if (addr[i] < (caddr_t)VM_MAX_USER_ADDRESS) {
3274 			/*
3275 			 * addr[i] is in user space
3276 			 */
3277 			return nbp;
3278 		}
3279 	}
3280 
3281 	/*
3282 	 * None of the breakpoints are in user space.
3283 	 */
3284 	return 0;
3285 }
3286 
3287 
3288 #ifndef DDB
3289 void
3290 Debugger(const char *msg)
3291 {
3292 	kprintf("Debugger(\"%s\") called.\n", msg);
3293 }
3294 #endif /* no DDB */
3295 
3296 #ifdef DDB
3297 
3298 /*
3299  * Provide inb() and outb() as functions.  They are normally only
3300  * available as macros calling inlined functions, thus cannot be
3301  * called inside DDB.
3302  *
3303  * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
3304  */
3305 
3306 #undef inb
3307 #undef outb
3308 
3309 /* silence compiler warnings */
3310 u_char inb(u_int);
3311 void outb(u_int, u_char);
3312 
3313 u_char
3314 inb(u_int port)
3315 {
3316 	u_char	data;
3317 	/*
3318 	 * We use %%dx and not %1 here because i/o is done at %dx and not at
3319 	 * %edx, while gcc generates inferior code (movw instead of movl)
3320 	 * if we tell it to load (u_short) port.
3321 	 */
3322 	__asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port));
3323 	return (data);
3324 }
3325 
3326 void
3327 outb(u_int port, u_char data)
3328 {
3329 	u_char	al;
3330 	/*
3331 	 * Use an unnecessary assignment to help gcc's register allocator.
3332 	 * This makes a large difference for gcc-1.40 and a tiny difference
3333 	 * for gcc-2.6.0.  For gcc-1.40, al had to be ``asm("ax")'' for
3334 	 * best results.  gcc-2.6.0 can't handle this.
3335 	 */
3336 	al = data;
3337 	__asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port));
3338 }
3339 
3340 #endif /* DDB */
3341 
3342 
3343 
3344 /*
3345  * Initialize the early SMP locks and the LWKT token pool
3346  */
3347 
3348 /* critical region when masking or unmasking interrupts */
3349 struct spinlock_deprecated imen_spinlock;
3350 
3351 /* locks com (tty) data/hardware accesses: a FASTINTR() */
3352 struct spinlock_deprecated com_spinlock;
3353 
3354 /* lock regions around the clock hardware */
3355 struct spinlock_deprecated clock_spinlock;
3356 
3357 static void
3358 init_locks(void)
3359 {
3360 	/*
3361 	 * Get the initial mplock with a count of 1 for the BSP.
3362 	 * This uses a LOGICAL cpu ID, ie BSP == 0.
3363 	 * This uses a LOGICAL cpu ID, i.e., BSP == 0.
3364 	cpu_get_initial_mplock();
3365 	/* DEPRECATED */
3366 	spin_init_deprecated(&imen_spinlock);
3367 	spin_init_deprecated(&com_spinlock);
3368 	spin_init_deprecated(&clock_spinlock);
3369 
3370 	/* our token pool needs to work early */
3371 	lwkt_token_pool_init();
3372 }
3373 
3374 boolean_t
3375 cpu_mwait_hint_valid(uint32_t hint)
3376 {
3377 	int cx_idx, sub;
3378 
3379 	cx_idx = MWAIT_EAX_TO_CX(hint);
3380 	if (cx_idx >= CPU_MWAIT_CX_MAX)
3381 		return FALSE;
3382 
3383 	sub = MWAIT_EAX_TO_CX_SUB(hint);
3384 	if (sub >= cpu_mwait_cx_info[cx_idx].subcnt)
3385 		return FALSE;
3386 
3387 	return TRUE;
3388 }
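
/*
 * Typical (illustrative) use by a caller that accepts hints from outside:
 *
 *	if (!cpu_mwait_hint_valid(hint))
 *		return EINVAL;
 *
 * Both the C-state index and the sub-state are bounds-checked against
 * cpu_mwait_cx_info[].
 */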
3389 
3390 void
3391 cpu_mwait_cx_no_bmsts(void)
3392 {
3393 	atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_STS);
3394 }
3395 
3396 void
3397 cpu_mwait_cx_no_bmarb(void)
3398 {
3399 	atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_ARB);
3400 }
3401 
3402 static int
3403 cpu_mwait_cx_hint2name(int hint, char *name, int namelen, boolean_t allow_auto)
3404 {
3405 	int old_cx_idx, sub = 0;
3406 
3407 	if (hint >= 0) {
3408 		old_cx_idx = MWAIT_EAX_TO_CX(hint);
3409 		sub = MWAIT_EAX_TO_CX_SUB(hint);
3410 	} else if (hint == CPU_MWAIT_HINT_AUTO) {
3411 		old_cx_idx = allow_auto ? CPU_MWAIT_C2 : CPU_MWAIT_CX_MAX;
3412 	} else if (hint == CPU_MWAIT_HINT_AUTODEEP) {
3413 		old_cx_idx = allow_auto ? CPU_MWAIT_C3 : CPU_MWAIT_CX_MAX;
3414 	} else {
3415 		old_cx_idx = CPU_MWAIT_CX_MAX;
3416 	}
3417 
3418 	if (!CPU_MWAIT_HAS_CX)
3419 		strlcpy(name, "NONE", namelen);
3420 	else if (allow_auto && hint == CPU_MWAIT_HINT_AUTO)
3421 		strlcpy(name, "AUTO", namelen);
3422 	else if (allow_auto && hint == CPU_MWAIT_HINT_AUTODEEP)
3423 		strlcpy(name, "AUTODEEP", namelen);
3424 	else if (old_cx_idx >= CPU_MWAIT_CX_MAX ||
3425 	    sub >= cpu_mwait_cx_info[old_cx_idx].subcnt)
3426 		strlcpy(name, "INVALID", namelen);
3427 	else
3428 		ksnprintf(name, namelen, "C%d/%d", old_cx_idx, sub);
3429 
3430 	return old_cx_idx;
3431 }
3432 
3433 static int
3434 cpu_mwait_cx_name2hint(char *name, int *hint0, boolean_t allow_auto)
3435 {
3436 	int cx_idx, sub, hint;
3437 	char *ptr, *start;
3438 
3439 	if (allow_auto && strcmp(name, "AUTO") == 0) {
3440 		hint = CPU_MWAIT_HINT_AUTO;
3441 		cx_idx = CPU_MWAIT_C2;
3442 		goto done;
3443 	}
3444 	if (allow_auto && strcmp(name, "AUTODEEP") == 0) {
3445 		hint = CPU_MWAIT_HINT_AUTODEEP;
3446 		cx_idx = CPU_MWAIT_C3;
3447 		goto done;
3448 	}
3449 
3450 	if (strlen(name) < 4 || toupper(name[0]) != 'C')
3451 		return -1;
3452 	start = &name[1];
3453 	ptr = NULL;
3454 
3455 	cx_idx = strtol(start, &ptr, 10);
3456 	if (ptr == start || *ptr != '/')
3457 		return -1;
3458 	if (cx_idx < 0 || cx_idx >= CPU_MWAIT_CX_MAX)
3459 		return -1;
3460 
3461 	start = ptr + 1;
3462 	ptr = NULL;
3463 
3464 	sub = strtol(start, &ptr, 10);
3465 	if (*ptr != '\0')
3466 		return -1;
3467 	if (sub < 0 || sub >= cpu_mwait_cx_info[cx_idx].subcnt)
3468 		return -1;
3469 
3470 	hint = MWAIT_EAX_HINT(cx_idx, sub);
3471 done:
3472 	*hint0 = hint;
3473 	return cx_idx;
3474 }
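
/*
 * For reference, the names accepted/produced above are "C<idx>/<sub>"
 * (e.g. "C1/0", "C3/1"), plus "AUTO" and "AUTODEEP" when allow_auto is
 * TRUE; hints that cannot be represented come back as "NONE" or "INVALID".
 */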
3475 
3476 static int
3477 cpu_mwait_cx_transit(int old_cx_idx, int cx_idx)
3478 {
3479 	if (cx_idx >= CPU_MWAIT_C3 && cpu_mwait_c3_preamble)
3480 		return EOPNOTSUPP;
3481 	if (old_cx_idx < CPU_MWAIT_C3 && cx_idx >= CPU_MWAIT_C3) {
3482 		int error;
3483 
3484 		error = cputimer_intr_powersave_addreq();
3485 		if (error)
3486 			return error;
3487 	} else if (old_cx_idx >= CPU_MWAIT_C3 && cx_idx < CPU_MWAIT_C3) {
3488 		cputimer_intr_powersave_remreq();
3489 	}
3490 	return 0;
3491 }
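
/*
 * Note (interpretation of the calls above): C3 and deeper states may stop
 * the per-cpu timer hardware, so moving into them registers a power-save
 * request with the interrupt cputimer code, and moving back out releases
 * it.  The transition is refused outright while a C3 preamble (bus-master
 * handling) is still required.
 */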
3492 
3493 static int
3494 cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS, int *hint0,
3495     boolean_t allow_auto)
3496 {
3497 	int error, cx_idx, old_cx_idx, hint;
3498 	char name[CPU_MWAIT_CX_NAMELEN];
3499 
3500 	hint = *hint0;
3501 	old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name),
3502 	    allow_auto);
3503 
3504 	error = sysctl_handle_string(oidp, name, sizeof(name), req);
3505 	if (error != 0 || req->newptr == NULL)
3506 		return error;
3507 
3508 	if (!CPU_MWAIT_HAS_CX)
3509 		return EOPNOTSUPP;
3510 
3511 	cx_idx = cpu_mwait_cx_name2hint(name, &hint, allow_auto);
3512 	if (cx_idx < 0)
3513 		return EINVAL;
3514 
3515 	error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
3516 	if (error)
3517 		return error;
3518 
3519 	*hint0 = hint;
3520 	return 0;
3521 }
3522 
3523 static int
3524 cpu_mwait_cx_setname(struct cpu_idle_stat *stat, const char *cx_name)
3525 {
3526 	int error, cx_idx, old_cx_idx, hint;
3527 	char name[CPU_MWAIT_CX_NAMELEN];
3528 
3529 	KASSERT(CPU_MWAIT_HAS_CX, ("cpu does not support mwait CX extension"));
3530 
3531 	hint = stat->hint;
3532 	old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);
3533 
3534 	strlcpy(name, cx_name, sizeof(name));
3535 	cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
3536 	if (cx_idx < 0)
3537 		return EINVAL;
3538 
3539 	error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
3540 	if (error)
3541 		return error;
3542 
3543 	stat->hint = hint;
3544 	return 0;
3545 }
3546 
3547 static int
3548 cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS)
3549 {
3550 	int hint = cpu_mwait_halt_global;
3551 	int error, cx_idx, cpu;
3552 	char name[CPU_MWAIT_CX_NAMELEN], cx_name[CPU_MWAIT_CX_NAMELEN];
3553 
3554 	cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);
3555 
3556 	error = sysctl_handle_string(oidp, name, sizeof(name), req);
3557 	if (error != 0 || req->newptr == NULL)
3558 		return error;
3559 
3560 	if (!CPU_MWAIT_HAS_CX)
3561 		return EOPNOTSUPP;
3562 
3563 	/* Save name for later per-cpu CX configuration */
3564 	strlcpy(cx_name, name, sizeof(cx_name));
3565 
3566 	cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
3567 	if (cx_idx < 0)
3568 		return EINVAL;
3569 
3570 	/* Change per-cpu CX configuration */
3571 	for (cpu = 0; cpu < ncpus; ++cpu) {
3572 		error = cpu_mwait_cx_setname(&cpu_idle_stats[cpu], cx_name);
3573 		if (error)
3574 			return error;
3575 	}
3576 
3577 	cpu_mwait_halt_global = hint;
3578 	return 0;
3579 }
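
/*
 * Illustrative usage from userland, assuming this handler is attached as
 * the machdep.mwait.CX.idle sysctl (the sysctl wiring lives elsewhere in
 * this file):
 *
 *	# sysctl machdep.mwait.CX.idle=AUTODEEP
 *	# sysctl machdep.mwait.CX.idle=C1/0
 *
 * Each new value is parsed, validated, and applied to every cpu's idle
 * state before the global hint is updated.
 */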
3580 
3581 static int
3582 cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
3583 {
3584 	struct cpu_idle_stat *stat = arg1;
3585 	int error;
3586 
3587 	error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
3588 	    &stat->hint, TRUE);
3589 	return error;
3590 }
3591 
3592 static int
3593 cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS)
3594 {
3595 	int error;
3596 
3597 	error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
3598 	    &cpu_mwait_spin, FALSE);
3599 	return error;
3600 }
3601 
3602 /*
3603  * This manual debugging code is called unconditionally from Xtimer
3604  * (the per-cpu timer interrupt), whether or not the current thread is
3605  * in a critical section, and can be useful in tracking down lockups.
3606  *
3607  * NOTE: MANUAL DEBUG CODE
3608  */
3609 #if 0
3610 static int saveticks[SMP_MAXCPU];
3611 static int savecounts[SMP_MAXCPU];
3612 #endif
3613 static tsc_uclock_t last_tsc[SMP_MAXCPU];
3614 
3615 void
3616 pcpu_timer_always(struct intrframe *frame)
3617 {
3618 	globaldata_t gd;
3619 	thread_t td;
3620 	char *top;
3621 	char *bot;
3622 	char *rbp;
3623 	char *rip;
3624 	int n;
3625 	tsc_uclock_t tsc;
3626 
3627 	if (flame_poll_debug == 0)
3628 		return;
3629 	gd = mycpu;
3630 	tsc = rdtsc() - last_tsc[gd->gd_cpuid];
3631 	if (tsc_frequency == 0 || tsc < tsc_frequency)
3632 		return;
3633 	last_tsc[gd->gd_cpuid] = rdtsc();
3634 
3635 	td = gd->gd_curthread;
3636 	if (td == NULL)
3637 		return;
3638 	bot = (char *)td->td_kstack + PAGE_SIZE;        /* skip guard */
3639 	top = (char *)td->td_kstack + td->td_kstack_size;
3640 	if (bot >= top)
3641 		return;
3642 
3643 	rip = (char *)(intptr_t)frame->if_rip;
3644 	kprintf("POLL%02d %016lx", gd->gd_cpuid, (intptr_t)rip);
3645 	rbp = (char *)(intptr_t)frame->if_rbp;
3646 
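	/*
	 * Walk the %rbp chain for up to seven frames: with frame pointers,
	 * 0(%rbp) holds the caller's %rbp and 8(%rbp) the return address.
	 * Bail out if the pointer leaves the kernel stack, is misaligned,
	 * or stops increasing.
	 */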
3647 	for (n = 1; n < 8; ++n) {
3648 		if (rbp < bot || rbp > top - 8 || ((intptr_t)rbp & 7))
3649 			break;
3650 		kprintf("<-%016lx", (intptr_t)*(char **)(rbp + 8));
3651 		if (*(char **)rbp <= rbp)
3652 			break;
3653 		rbp = *(char **)rbp;
3654 	}
3655 	kprintf("\n");
3656 	cpu_sfence();
3657 }
3658 
3659 SET_DECLARE(smap_open, char);
3660 SET_DECLARE(smap_close, char);
3661 
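/*
 * The smap_open/smap_close linker sets collect the addresses of 3-byte NOP
 * placeholders emitted around user-memory access windows.  When the CPU
 * supports SMAP these placeholders are patched in place: 0F 01 CB is STAC
 * (permit supervisor access to user pages) and 0F 01 CA is CLAC (revoke it).
 */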
3662 static void
3663 cpu_implement_smap(void)
3664 {
3665 	char **scan;
3666 
3667 	for (scan = SET_BEGIN(smap_open);		/* nop -> stac */
3668 	     scan < SET_LIMIT(smap_open); ++scan) {
3669 		(*scan)[0] = 0x0F;
3670 		(*scan)[1] = 0x01;
3671 		(*scan)[2] = 0xCB;
3672 	}
3673 	for (scan = SET_BEGIN(smap_close);		/* nop -> clac */
3674 	     scan < SET_LIMIT(smap_close); ++scan) {
3675 		(*scan)[0] = 0x0F;
3676 		(*scan)[1] = 0x01;
3677 		(*scan)[2] = 0xCA;
3678 	}
3679 }
3680 
3681 /*
3682  * From a hard interrupt: return non-zero if interrupt handling is active or pending on this cpu.
3683  */
3684 int
3685 cpu_interrupt_running(struct thread *td)
3686 {
3687 	struct mdglobaldata *gd = mdcpu;
3688 
3689 	if (clock_debug1 > 0) {
3690 		--clock_debug1;
3691 		kprintf("%d %016lx %016lx %016lx\n",
3692 			((td->td_flags & TDF_INTTHREAD) != 0),
3693 			gd->gd_ipending[0],
3694 			gd->gd_ipending[1],
3695 			gd->gd_ipending[2]);
3696 		if (td->td_flags & TDF_CLKTHREAD) {
3697 			kprintf("CLKTD %s PREEMPT %s\n",
3698 				td->td_comm,
3699 				(td->td_preempted ?
3700 				 td->td_preempted->td_comm : ""));
3701 		} else {
3702 			kprintf("NORTD %s\n", td->td_comm);
3703 		}
3704 	}
3705 	if ((td->td_flags & TDF_INTTHREAD) ||
3706 	    gd->gd_ipending[0] ||
3707 	    gd->gd_ipending[1] ||
3708 	    gd->gd_ipending[2]) {
3709 		return 1;
3710 	} else {
3711 		return 0;
3712 	}
3713 }
3714