xref: /dragonfly/sys/platform/pc64/x86_64/machdep.c (revision b9a6fe08)
1 /*-
2  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
3  * Copyright (c) 1992 Terrence R. Lambert.
4  * Copyright (c) 2003 Peter Wemm.
5  * Copyright (c) 2008-2017 The DragonFly Project.
6  * All rights reserved.
7  *
8  * This code is derived from software contributed to Berkeley by
9  * William Jolitz.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. All advertising materials mentioning features or use of this software
20  *    must display the following acknowledgement:
21  *	This product includes software developed by the University of
22  *	California, Berkeley and its contributors.
23  * 4. Neither the name of the University nor the names of its contributors
24  *    may be used to endorse or promote products derived from this software
25  *    without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37  * SUCH DAMAGE.
38  *
39  * from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
40  * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
41  */
42 
43 //#include "use_npx.h"
44 #include "use_isa.h"
45 #include "opt_cpu.h"
46 #include "opt_ddb.h"
47 #include "opt_inet.h"
48 #include "opt_msgbuf.h"
49 #include "opt_swap.h"
50 
51 #include <sys/param.h>
52 #include <sys/systm.h>
53 #include <sys/sysmsg.h>
54 #include <sys/signalvar.h>
55 #include <sys/kernel.h>
56 #include <sys/linker.h>
57 #include <sys/malloc.h>
58 #include <sys/proc.h>
59 #include <sys/priv.h>
60 #include <sys/buf.h>
61 #include <sys/reboot.h>
62 #include <sys/mbuf.h>
63 #include <sys/msgbuf.h>
64 #include <sys/sysent.h>
65 #include <sys/sysctl.h>
66 #include <sys/vmmeter.h>
67 #include <sys/bus.h>
68 #include <sys/usched.h>
69 #include <sys/reg.h>
70 #include <sys/sbuf.h>
71 #include <sys/ctype.h>
72 #include <sys/serialize.h>
73 #include <sys/systimer.h>
74 
75 #include <vm/vm.h>
76 #include <vm/vm_param.h>
77 #include <sys/lock.h>
78 #include <vm/vm_kern.h>
79 #include <vm/vm_object.h>
80 #include <vm/vm_page.h>
81 #include <vm/vm_map.h>
82 #include <vm/vm_pager.h>
83 #include <vm/vm_extern.h>
84 
85 #include <sys/thread2.h>
86 #include <sys/mplock2.h>
87 
88 #include <sys/exec.h>
89 #include <sys/cons.h>
90 
91 #include <sys/efi.h>
92 
93 #include <ddb/ddb.h>
94 
95 #include <machine/cpu.h>
96 #include <machine/clock.h>
97 #include <machine/specialreg.h>
98 #if 0 /* JG */
99 #include <machine/bootinfo.h>
100 #endif
101 #include <machine/md_var.h>
102 #include <machine/metadata.h>
103 #include <machine/pc/bios.h>
104 #include <machine/pcb_ext.h>
105 #include <machine/globaldata.h>		/* CPU_prvspace */
106 #include <machine/smp.h>
107 #include <machine/cputypes.h>
108 #include <machine/intr_machdep.h>
109 #include <machine/framebuffer.h>
110 
111 #ifdef OLD_BUS_ARCH
112 #include <bus/isa/isa_device.h>
113 #endif
114 #include <machine_base/isa/isa_intr.h>
115 #include <bus/isa/rtc.h>
116 #include <sys/random.h>
117 #include <sys/ptrace.h>
118 #include <machine/sigframe.h>
119 
120 #include <sys/machintr.h>
121 #include <machine_base/icu/icu_abi.h>
122 #include <machine_base/icu/elcr_var.h>
123 #include <machine_base/apic/lapic.h>
124 #include <machine_base/apic/ioapic.h>
125 #include <machine_base/apic/ioapic_abi.h>
126 #include <machine/mptable.h>
127 
128 #define PHYSMAP_ENTRIES		10
129 #define MAXBUFSTRUCTSIZE	((size_t)512 * 1024 * 1024)
130 
131 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
132 
133 extern void printcpuinfo(void);	/* XXX header file */
134 extern void identify_cpu(void);
135 extern void panicifcpuunsupported(void);
136 
137 static void cpu_startup(void *);
138 static void pic_finish(void *);
139 static void cpu_finish(void *);
140 
141 static void set_fpregs_xmm(struct save87 *, struct savexmm *);
142 static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
143 static void init_locks(void);
144 
145 extern void pcpu_timer_always(struct intrframe *);
146 
147 SYSINIT(cpu, SI_BOOT2_START_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
148 SYSINIT(pic_finish, SI_BOOT2_FINISH_PIC, SI_ORDER_FIRST, pic_finish, NULL);
149 SYSINIT(cpu_finish, SI_BOOT2_FINISH_CPU, SI_ORDER_FIRST, cpu_finish, NULL);
150 
151 #ifdef DDB
152 extern vm_offset_t ksym_start, ksym_end;
153 #endif
154 
155 struct privatespace CPU_prvspace_bsp __aligned(4096);
156 struct privatespace *CPU_prvspace[MAXCPU] = { &CPU_prvspace_bsp };
157 
158 vm_paddr_t efi_systbl_phys;
159 int	_udatasel, _ucodesel, _ucode32sel;
160 u_long	atdevbase;
161 int64_t tsc_offsets[MAXCPU];
162 cpumask_t smp_idleinvl_mask;
163 cpumask_t smp_idleinvl_reqs;
164 
165 /* MWAIT hint (EAX) or a CPU_MWAIT_HINT_* value */
166 __read_mostly static int cpu_mwait_halt_global;
167 __read_mostly static int clock_debug1;
168 
169 #if defined(SWTCH_OPTIM_STATS)
170 extern int swtch_optim_stats;
171 SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats,
172 	CTLFLAG_RD, &swtch_optim_stats, 0, "");
173 SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count,
174 	CTLFLAG_RD, &tlb_flush_count, 0, "");
175 #endif
176 SYSCTL_INT(_debug, OID_AUTO, clock_debug1,
177 	CTLFLAG_RW, &clock_debug1, 0, "");
178 SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_halt,
179 	CTLFLAG_RD, &cpu_mwait_halt_global, 0, "");
180 SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_spin,
181 	CTLFLAG_RD, &cpu_mwait_spin, 0, "monitor/mwait target state");
182 
183 #define CPU_MWAIT_HAS_CX	\
184 	((cpu_feature2 & CPUID2_MON) && \
185 	 (cpu_mwait_feature & CPUID_MWAIT_EXT))
186 
187 #define CPU_MWAIT_CX_NAMELEN	16
188 
189 #define CPU_MWAIT_C1		1
190 #define CPU_MWAIT_C2		2
191 #define CPU_MWAIT_C3		3
192 #define CPU_MWAIT_CX_MAX	8
193 
194 #define CPU_MWAIT_HINT_AUTO	-1	/* C1 and C2 */
195 #define CPU_MWAIT_HINT_AUTODEEP	-2	/* C3+ */
196 
197 SYSCTL_NODE(_machdep, OID_AUTO, mwait, CTLFLAG_RW, 0, "MWAIT features");
198 SYSCTL_NODE(_machdep_mwait, OID_AUTO, CX, CTLFLAG_RW, 0, "MWAIT Cx settings");
199 
200 struct cpu_mwait_cx {
201 	int			subcnt;
202 	char			name[4];
203 	struct sysctl_ctx_list	sysctl_ctx;
204 	struct sysctl_oid	*sysctl_tree;
205 };
206 static struct cpu_mwait_cx	cpu_mwait_cx_info[CPU_MWAIT_CX_MAX];
207 static char			cpu_mwait_cx_supported[256];
208 
209 static int			cpu_mwait_c1_hints_cnt;
210 static int			cpu_mwait_hints_cnt;
211 static int			*cpu_mwait_hints;
212 
213 static int			cpu_mwait_deep_hints_cnt;
214 static int			*cpu_mwait_deep_hints;
215 
216 #define CPU_IDLE_REPEAT_DEFAULT	750
217 
218 static u_int			cpu_idle_repeat = CPU_IDLE_REPEAT_DEFAULT;
219 static u_long			cpu_idle_repeat_max = CPU_IDLE_REPEAT_DEFAULT;
220 static u_int			cpu_mwait_repeat_shift = 1;
221 
222 #define CPU_MWAIT_C3_PREAMBLE_BM_ARB	0x1
223 #define CPU_MWAIT_C3_PREAMBLE_BM_STS	0x2
224 
225 static int			cpu_mwait_c3_preamble =
226 				    CPU_MWAIT_C3_PREAMBLE_BM_ARB |
227 				    CPU_MWAIT_C3_PREAMBLE_BM_STS;
228 
229 SYSCTL_STRING(_machdep_mwait_CX, OID_AUTO, supported, CTLFLAG_RD,
230     cpu_mwait_cx_supported, 0, "MWAIT supported C states");
231 SYSCTL_INT(_machdep_mwait_CX, OID_AUTO, c3_preamble, CTLFLAG_RD,
232     &cpu_mwait_c3_preamble, 0, "C3+ preamble mask");
233 
234 static int	cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS,
235 		    int *, boolean_t);
236 static int	cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS);
237 static int	cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS);
238 static int	cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS);
239 
240 SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, idle, CTLTYPE_STRING|CTLFLAG_RW,
241     NULL, 0, cpu_mwait_cx_idle_sysctl, "A", "");
242 SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, spin, CTLTYPE_STRING|CTLFLAG_RW,
243     NULL, 0, cpu_mwait_cx_spin_sysctl, "A", "");
244 SYSCTL_UINT(_machdep_mwait_CX, OID_AUTO, repeat_shift, CTLFLAG_RW,
245     &cpu_mwait_repeat_shift, 0, "");
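
/*
 * Illustrative use of the machdep.mwait.CX sysctls declared above; a
 * sketch only, assuming the cpu_mwait_cx_*_sysctl handlers accept the
 * "AUTO"/"AUTODEEP" keywords (matching CPU_MWAIT_HINT_AUTO/AUTODEEP)
 * and the "C1/0"-style names listed by machdep.mwait.CX.supported:
 *
 *	# sysctl machdep.mwait.CX.supported
 *	# sysctl machdep.mwait.CX.idle=AUTODEEP
 *	# sysctl machdep.mwait.CX.spin=C1/0
 */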
246 
247 long physmem = 0;
248 
249 u_long ebda_addr = 0;
250 
251 int imcr_present = 0;
252 
253 int naps = 0; /* # of Application Processors */
254 
255 u_int base_memory;
256 
257 static int
258 sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
259 {
260 	u_long pmem = ctob(physmem);
261 	int error;
262 
263 	error = sysctl_handle_long(oidp, &pmem, 0, req);
264 
265 	return (error);
266 }
267 
268 SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_ULONG|CTLFLAG_RD,
269 	0, 0, sysctl_hw_physmem, "LU",
270 	"Total system memory in bytes (number of pages * page size)");
271 
272 static int
273 sysctl_hw_usermem(SYSCTL_HANDLER_ARGS)
274 {
275 	u_long usermem = ctob(physmem - vmstats.v_wire_count);
276 	int error;
277 
278 	error = sysctl_handle_long(oidp, &usermem, 0, req);
279 
280 	return (error);
281 }
282 
283 SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_ULONG|CTLFLAG_RD,
284 	0, 0, sysctl_hw_usermem, "LU", "");
285 
286 static int
287 sysctl_hw_availpages(SYSCTL_HANDLER_ARGS)
288 {
289 	int error;
290 	u_long availpages;
291 
292 	availpages = x86_64_btop(avail_end - avail_start);
293 	error = sysctl_handle_long(oidp, &availpages, 0, req);
294 
295 	return (error);
296 }
297 
298 SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_ULONG|CTLFLAG_RD,
299 	0, 0, sysctl_hw_availpages, "LU", "");
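
/*
 * A minimal userland sketch (an assumed example, not part of the kernel)
 * for reading the read-only oids registered above via sysctlbyname(3);
 * hw.physmem and hw.usermem report bytes, hw.availpages reports pages:
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		unsigned long physmem;
 *		size_t len = sizeof(physmem);
 *
 *		if (sysctlbyname("hw.physmem", &physmem, &len, NULL, 0) == 0)
 *			printf("physmem: %lu bytes\n", physmem);
 *		return (0);
 *	}
 */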
300 
301 vm_paddr_t Maxmem;
302 vm_paddr_t Realmem;
303 
304 /*
305  * The number of PHYSMAP entries must be one less than the number of
306  * PHYSSEG entries because the PHYSMAP entry that spans the largest
307  * physical address that is accessible by ISA DMA is split into two
308  * PHYSSEG entries.
309  */
310 vm_phystable_t phys_avail[VM_PHYSSEG_MAX + 1];
311 vm_phystable_t dump_avail[VM_PHYSSEG_MAX + 1];
312 
313 /* must be 1 less so 0 0 can signal end of chunks */
314 #define PHYS_AVAIL_ARRAY_END (NELEM(phys_avail) - 1)
315 #define DUMP_AVAIL_ARRAY_END (NELEM(dump_avail) - 1)
316 
317 static vm_offset_t buffer_sva, buffer_eva;
318 vm_offset_t clean_sva, clean_eva;
319 static vm_offset_t pager_sva, pager_eva;
320 static struct trapframe proc0_tf;
321 
322 static void cpu_implement_smap(void);
323 
324 static void
325 cpu_startup(void *dummy)
326 {
327 	caddr_t v;
328 	vm_size_t size = 0;
329 	vm_offset_t firstaddr;
330 
331 	/*
332 	 * Good {morning,afternoon,evening,night}.
333 	 */
334 	kprintf("%s", version);
335 	startrtclock();
336 	printcpuinfo();
337 	panicifcpuunsupported();
338 	if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
339 		cpu_implement_smap();
340 
341 	kprintf("real memory  = %ju (%ju MB)\n",
342 		(intmax_t)Realmem,
343 		(intmax_t)Realmem / 1024 / 1024);
344 	/*
345 	 * Display any holes after the first chunk of extended memory.
346 	 */
347 	if (bootverbose) {
348 		int indx;
349 
350 		kprintf("Physical memory chunk(s):\n");
351 		for (indx = 0; phys_avail[indx].phys_end != 0; ++indx) {
352 			vm_paddr_t size1;
353 
354 			size1 = phys_avail[indx].phys_end -
355 				phys_avail[indx].phys_beg;
356 
357 			kprintf("0x%08jx - 0x%08jx, %ju bytes (%ju pages)\n",
358 				(intmax_t)phys_avail[indx].phys_beg,
359 				(intmax_t)phys_avail[indx].phys_end - 1,
360 				(intmax_t)size1,
361 				(intmax_t)(size1 / PAGE_SIZE));
362 		}
363 	}
364 
365 	/*
366 	 * Allocate space for system data structures.
367 	 * The first available kernel virtual address is in "v".
368 	 * As pages of kernel virtual memory are allocated, "v" is incremented.
369 	 * As pages of memory are allocated and cleared,
370 	 * "firstaddr" is incremented.
371 	 * An index into the kernel page table corresponding to the
372 	 * virtual memory address maintained in "v" is kept in "mapaddr".
373 	 */
374 
375 	/*
376 	 * Make two passes.  The first pass calculates how much memory is
377 	 * needed and allocates it.  The second pass assigns virtual
378 	 * addresses to the various data structures.
379 	 */
380 	firstaddr = 0;
381 again:
382 	v = (caddr_t)firstaddr;
383 
384 #define	valloc(name, type, num) \
385 	    (name) = (type *)v; v = (caddr_t)((name)+(num))
386 #define	valloclim(name, type, num, lim) \
387 	    (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num)))
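
	/*
	 * For reference, a valloc() such as
	 *
	 *	valloc(buf, struct buf, nbuf);
	 *
	 * expands to plain pointer bookkeeping against "v":
	 *
	 *	buf = (struct buf *)v; v = (caddr_t)(buf + nbuf);
	 *
	 * so the first pass (firstaddr == 0) only advances "v" to size
	 * the region, and the second pass assigns the real addresses.
	 */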
388 
389 	/*
390 	 * Calculate nbuf such that maxbufspace uses approximately 1/20
391 	 * of physical memory by default, with a minimum of 50 buffers.
392 	 *
393 	 * The calculation is made after discounting 128MB.
394 	 *
395 	 * NOTE: maxbufspace is (nbuf * NBUFCALCSIZE) (NBUFCALCSIZE ~= 16KB).
396 	 *	 nbuf = (kbytes / factor) would cover all of memory.
397 	 */
398 	if (nbuf == 0) {
399 		long factor = NBUFCALCSIZE / 1024;		/* KB/nbuf */
400 		long kbytes = physmem * (PAGE_SIZE / 1024);	/* physmem */
401 
402 		nbuf = 50;
403 		if (kbytes > 128 * 1024)
404 			nbuf += (kbytes - 128 * 1024) / (factor * 20);
405 		if (maxbcache && nbuf > maxbcache / NBUFCALCSIZE)
406 			nbuf = maxbcache / NBUFCALCSIZE;
407 		if ((size_t)nbuf * sizeof(struct buf) > MAXBUFSTRUCTSIZE) {
408 			kprintf("Warning: nbuf capped at %ld due to the "
409 				"reasonability limit\n", nbuf);
410 			nbuf = MAXBUFSTRUCTSIZE / sizeof(struct buf);
411 		}
412 	}
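
	/*
	 * Worked example of the sizing above, assuming NBUFCALCSIZE is
	 * 16KB and PAGE_SIZE is 4KB: on an 8GB machine kbytes is
	 * 8388608, so nbuf = 50 + (8388608 - 131072) / (16 * 20), i.e.
	 * about 25854, giving a maxbufspace of roughly 25854 * 16KB or
	 * ~404MB, about 1/20 of memory after the 128MB discount.
	 */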
413 
414 	/*
415 	 * Do not allow the buffer_map to be more than 1/2 the size of the
416 	 * kernel_map.
417 	 */
418 	if (nbuf > (virtual_end - virtual_start +
419 		    virtual2_end - virtual2_start) / (MAXBSIZE * 2)) {
420 		nbuf = (virtual_end - virtual_start +
421 			virtual2_end - virtual2_start) / (MAXBSIZE * 2);
422 		kprintf("Warning: nbufs capped at %ld due to kvm\n", nbuf);
423 	}
424 
425 	/*
426 	 * Do not allow the buffer_map to use more than 50% of available
427 	 * physical-equivalent memory.  Since the VM pages which back
428 	 * individual buffers are typically wired, having too many bufs
429 	 * can prevent the system from paging properly.
430 	 */
431 	if (nbuf > physmem * PAGE_SIZE / (NBUFCALCSIZE * 2)) {
432 		nbuf = physmem * PAGE_SIZE / (NBUFCALCSIZE * 2);
433 		kprintf("Warning: nbufs capped at %ld due to physmem\n", nbuf);
434 	}
435 
436 	/*
437 	 * Do not allow the sizeof(struct buf) * nbuf to exceed 1/4 of
438 	 * the valloc space which is just the virtual_end - virtual_start
439 	 * section.  This is typically ~2GB regardless of the amount of
440 	 * memory, so we use 500MB as a metric.
441 	 *
442 	 * This is because we use valloc() to allocate the buf header array.
443 	 *
444 	 * NOTE: buffer space in bytes is limited by vfs.*bufspace sysctls.
445 	 */
446 	if (nbuf > (virtual_end - virtual_start) / (sizeof(struct buf) * 4)) {
447 		nbuf = (virtual_end - virtual_start) /
448 		       (sizeof(struct buf) * 4);
449 		kprintf("Warning: nbufs capped at %ld due to "
450 			"valloc considerations\n",
451 			nbuf);
452 	}
453 
454 	nswbuf_mem = lmax(lmin(nbuf / 32, 512), 8);
455 #ifdef NSWBUF_MIN
456 	if (nswbuf_mem < NSWBUF_MIN)
457 		nswbuf_mem = NSWBUF_MIN;
458 #endif
459 	nswbuf_kva = lmax(lmin(nbuf / 4, 512), 16);
460 #ifdef NSWBUF_MIN
461 	if (nswbuf_kva < NSWBUF_MIN)
462 		nswbuf_kva = NSWBUF_MIN;
463 #endif
464 
465 	valloc(swbuf_mem, struct buf, nswbuf_mem);
466 	valloc(swbuf_kva, struct buf, nswbuf_kva);
467 	valloc(buf, struct buf, nbuf);
468 
469 	/*
470 	 * End of first pass, size has been calculated so allocate memory
471 	 */
472 	if (firstaddr == 0) {
473 		size = (vm_size_t)(v - firstaddr);
474 		firstaddr = kmem_alloc(&kernel_map, round_page(size),
475 				       VM_SUBSYS_BUF);
476 		if (firstaddr == 0)
477 			panic("startup: no room for tables");
478 		goto again;
479 	}
480 
481 	/*
482 	 * End of second pass, addresses have been assigned
483 	 *
484 	 * nbuf is an int, make sure we don't overflow the field.
485 	 *
486 	 * On 64-bit systems we always reserve maximal allocations for
487 	 * buffer cache buffers and there are no fragmentation issues,
488 	 * so the KVA segment does not have to be excessively oversized.
489 	 */
490 	if ((vm_size_t)(v - firstaddr) != size)
491 		panic("startup: table size inconsistency");
492 
493 	kmem_suballoc(&kernel_map, &clean_map, &clean_sva, &clean_eva,
494 		      ((vm_offset_t)(nbuf + 16) * MAXBSIZE) +
495 		      ((nswbuf_mem + nswbuf_kva) * MAXPHYS) + pager_map_size);
496 	kmem_suballoc(&clean_map, &buffer_map, &buffer_sva, &buffer_eva,
497 		      ((vm_offset_t)(nbuf + 16) * MAXBSIZE));
498 	buffer_map.system_map = 1;
499 	kmem_suballoc(&clean_map, &pager_map, &pager_sva, &pager_eva,
500 		      ((vm_offset_t)(nswbuf_mem + nswbuf_kva) * MAXPHYS) +
501 		      pager_map_size);
502 	pager_map.system_map = 1;
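
	/*
	 * Resulting KVA submap layout, as a sketch of the suballocations
	 * above:
	 *
	 *	kernel_map
	 *	    clean_map	(nbuf+16)*MAXBSIZE +
	 *			(nswbuf_mem+nswbuf_kva)*MAXPHYS + pager_map_size
	 *		buffer_map	(nbuf+16)*MAXBSIZE
	 *		pager_map	(nswbuf_mem+nswbuf_kva)*MAXPHYS +
	 *				pager_map_size
	 */
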
503 	kprintf("avail memory = %ju (%ju MB)\n",
504 		(uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages),
505 		(uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages) /
506 		1024 / 1024);
507 }
508 
509 struct cpu_idle_stat {
510 	int	hint;
511 	int	reserved;
512 	u_long	halt;
513 	u_long	spin;
514 	u_long	repeat;
515 	u_long	repeat_last;
516 	u_long	repeat_delta;
517 	u_long	mwait_cx[CPU_MWAIT_CX_MAX];
518 } __cachealign;
519 
520 #define CPU_IDLE_STAT_HALT	-1
521 #define CPU_IDLE_STAT_SPIN	-2
522 
523 static struct cpu_idle_stat	cpu_idle_stats[MAXCPU];
524 
525 static int
526 sysctl_cpu_idle_cnt(SYSCTL_HANDLER_ARGS)
527 {
528 	int idx = arg2, cpu, error;
529 	u_long val = 0;
530 
531 	if (idx == CPU_IDLE_STAT_HALT) {
532 		for (cpu = 0; cpu < ncpus; ++cpu)
533 			val += cpu_idle_stats[cpu].halt;
534 	} else if (idx == CPU_IDLE_STAT_SPIN) {
535 		for (cpu = 0; cpu < ncpus; ++cpu)
536 			val += cpu_idle_stats[cpu].spin;
537 	} else {
538 		KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
539 		    ("invalid index %d", idx));
540 		for (cpu = 0; cpu < ncpus; ++cpu)
541 			val += cpu_idle_stats[cpu].mwait_cx[idx];
542 	}
543 
544 	error = sysctl_handle_quad(oidp, &val, 0, req);
545 	if (error || req->newptr == NULL)
546 		return error;
547 
548 	if (idx == CPU_IDLE_STAT_HALT) {
549 		for (cpu = 0; cpu < ncpus; ++cpu)
550 			cpu_idle_stats[cpu].halt = 0;
551 		cpu_idle_stats[0].halt = val;
552 	} else if (idx == CPU_IDLE_STAT_SPIN) {
553 		for (cpu = 0; cpu < ncpus; ++cpu)
554 			cpu_idle_stats[cpu].spin = 0;
555 		cpu_idle_stats[0].spin = val;
556 	} else {
557 		KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
558 		    ("invalid index %d", idx));
559 		for (cpu = 0; cpu < ncpus; ++cpu)
560 			cpu_idle_stats[cpu].mwait_cx[idx] = 0;
561 		cpu_idle_stats[0].mwait_cx[idx] = val;
562 	}
563 	return 0;
564 }
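
/*
 * Note that the write path of sysctl_cpu_idle_cnt() above zeroes the
 * per-cpu counters and stores the written value on cpu0, so the
 * aggregate counters exported further below can be reset with, e.g.
 * (illustrative usage):
 *
 *	# sysctl machdep.cpu_idle_hltcnt=0
 */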
565 
566 static void
567 cpu_mwait_attach(void)
568 {
569 	struct sbuf sb;
570 	int hint_idx, i;
571 
572 	if (!CPU_MWAIT_HAS_CX)
573 		return;
574 
575 	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
576 	    (CPUID_TO_FAMILY(cpu_id) > 0xf ||
577 	     (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
578 	      CPUID_TO_MODEL(cpu_id) >= 0xf))) {
579 		int bm_sts = 1;
580 
581 		/*
582 		 * Pentium dual-core, Core 2 and beyond do not need any
583 		 * additional activities to enter deep C-state, i.e. C3(+).
584 		 */
585 		cpu_mwait_cx_no_bmarb();
586 
587 		TUNABLE_INT_FETCH("machdep.cpu.mwait.bm_sts", &bm_sts);
588 		if (!bm_sts)
589 			cpu_mwait_cx_no_bmsts();
590 	}
591 
592 	sbuf_new(&sb, cpu_mwait_cx_supported,
593 	    sizeof(cpu_mwait_cx_supported), SBUF_FIXEDLEN);
594 
595 	for (i = 0; i < CPU_MWAIT_CX_MAX; ++i) {
596 		struct cpu_mwait_cx *cx = &cpu_mwait_cx_info[i];
597 		int sub;
598 
599 		ksnprintf(cx->name, sizeof(cx->name), "C%d", i);
600 
601 		sysctl_ctx_init(&cx->sysctl_ctx);
602 		cx->sysctl_tree = SYSCTL_ADD_NODE(&cx->sysctl_ctx,
603 		    SYSCTL_STATIC_CHILDREN(_machdep_mwait), OID_AUTO,
604 		    cx->name, CTLFLAG_RW, NULL, "Cx control/info");
605 		if (cx->sysctl_tree == NULL)
606 			continue;
607 
608 		cx->subcnt = CPUID_MWAIT_CX_SUBCNT(cpu_mwait_extemu, i);
609 		SYSCTL_ADD_INT(&cx->sysctl_ctx,
610 		    SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
611 		    "subcnt", CTLFLAG_RD, &cx->subcnt, 0,
612 		    "sub-state count");
613 		SYSCTL_ADD_PROC(&cx->sysctl_ctx,
614 		    SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
615 		    "entered", (CTLTYPE_QUAD | CTLFLAG_RW), 0,
616 		    i, sysctl_cpu_idle_cnt, "Q", "# of times entered");
617 
618 		for (sub = 0; sub < cx->subcnt; ++sub)
619 			sbuf_printf(&sb, "C%d/%d ", i, sub);
620 	}
621 	sbuf_trim(&sb);
622 	sbuf_finish(&sb);
623 
624 	/*
625 	 * Non-deep C-states
626 	 */
627 	cpu_mwait_c1_hints_cnt = cpu_mwait_cx_info[CPU_MWAIT_C1].subcnt;
628 	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i)
629 		cpu_mwait_hints_cnt += cpu_mwait_cx_info[i].subcnt;
630 	cpu_mwait_hints = kmalloc(sizeof(int) * cpu_mwait_hints_cnt,
631 				  M_DEVBUF, M_WAITOK);
632 
633 	hint_idx = 0;
634 	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i) {
635 		int j, subcnt;
636 
637 		subcnt = cpu_mwait_cx_info[i].subcnt;
638 		for (j = 0; j < subcnt; ++j) {
639 			KASSERT(hint_idx < cpu_mwait_hints_cnt,
640 			    ("invalid mwait hint index %d", hint_idx));
641 			cpu_mwait_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
642 			++hint_idx;
643 		}
644 	}
645 	KASSERT(hint_idx == cpu_mwait_hints_cnt,
646 	    ("mwait hint count %d != index %d",
647 	     cpu_mwait_hints_cnt, hint_idx));
648 
649 	if (bootverbose) {
650 		kprintf("MWAIT hints (%d C1 hints):\n", cpu_mwait_c1_hints_cnt);
651 		for (i = 0; i < cpu_mwait_hints_cnt; ++i) {
652 			int hint = cpu_mwait_hints[i];
653 
654 			kprintf("  C%d/%d hint 0x%04x\n",
655 			    MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),
656 			    hint);
657 		}
658 	}
659 
660 	/*
661 	 * Deep C-states
662 	 */
663 	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i)
664 		cpu_mwait_deep_hints_cnt += cpu_mwait_cx_info[i].subcnt;
665 	cpu_mwait_deep_hints = kmalloc(sizeof(int) * cpu_mwait_deep_hints_cnt,
666 	    M_DEVBUF, M_WAITOK);
667 
668 	hint_idx = 0;
669 	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i) {
670 		int j, subcnt;
671 
672 		subcnt = cpu_mwait_cx_info[i].subcnt;
673 		for (j = 0; j < subcnt; ++j) {
674 			KASSERT(hint_idx < cpu_mwait_deep_hints_cnt,
675 			    ("invalid mwait deep hint index %d", hint_idx));
676 			cpu_mwait_deep_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
677 			++hint_idx;
678 		}
679 	}
680 	KASSERT(hint_idx == cpu_mwait_deep_hints_cnt,
681 	    ("mwait deep hint count %d != index %d",
682 	     cpu_mwait_deep_hints_cnt, hint_idx));
683 
684 	if (bootverbose) {
685 		kprintf("MWAIT deep hints:\n");
686 		for (i = 0; i < cpu_mwait_deep_hints_cnt; ++i) {
687 			int hint = cpu_mwait_deep_hints[i];
688 
689 			kprintf("  C%d/%d hint 0x%04x\n",
690 			    MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),
691 			    hint);
692 		}
693 	}
694 	cpu_idle_repeat_max = 256 * cpu_mwait_deep_hints_cnt;
695 
696 	for (i = 0; i < ncpus; ++i) {
697 		char name[16];
698 
699 		ksnprintf(name, sizeof(name), "idle%d", i);
700 		SYSCTL_ADD_PROC(NULL,
701 		    SYSCTL_STATIC_CHILDREN(_machdep_mwait_CX), OID_AUTO,
702 		    name, (CTLTYPE_STRING | CTLFLAG_RW), &cpu_idle_stats[i],
703 		    0, cpu_mwait_cx_pcpu_idle_sysctl, "A", "");
704 	}
705 }
706 
707 static void
708 cpu_finish(void *dummy __unused)
709 {
710 	cpu_setregs();
711 	cpu_mwait_attach();
712 }
713 
714 static void
715 pic_finish(void *dummy __unused)
716 {
717 	/* Log ELCR information */
718 	elcr_dump();
719 
720 	/* Log MPTABLE information */
721 	mptable_pci_int_dump();
722 
723 	/* Finalize PCI */
724 	MachIntrABI.finalize();
725 }
726 
727 /*
728  * Send an interrupt to process.
729  *
730  * The stack is set up to allow the sigcode stored
731  * at the top to call the handler, followed by a call
732  * to the sigreturn routine below.  After sigreturn
733  * resets the signal mask, the stack, and the
734  * frame pointer, it returns to the user-specified
735  * pc and psl.
736  */
737 void
738 sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
739 {
740 	struct lwp *lp = curthread->td_lwp;
741 	struct proc *p = lp->lwp_proc;
742 	struct trapframe *regs;
743 	struct sigacts *psp = p->p_sigacts;
744 	struct sigframe sf, *sfp;
745 	int oonstack;
746 	char *sp;
747 
748 	regs = lp->lwp_md.md_regs;
749 	oonstack = (lp->lwp_sigstk.ss_flags & SS_ONSTACK) ? 1 : 0;
750 
751 	/* Save user context */
752 	bzero(&sf, sizeof(struct sigframe));
753 	sf.sf_uc.uc_sigmask = *mask;
754 	sf.sf_uc.uc_stack = lp->lwp_sigstk;
755 	sf.sf_uc.uc_mcontext.mc_onstack = oonstack;
756 	KKASSERT(__offsetof(struct trapframe, tf_rdi) == 0);
757 	/* gcc errors out on optimized bcopy */
758 	_bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(struct trapframe));
759 
760 	/* Make the size of the saved context visible to userland */
761 	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext);
762 
763 	/* Allocate and validate space for the signal handler context. */
764 	if ((lp->lwp_flags & LWP_ALTSTACK) != 0 && !oonstack &&
765 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
766 		sp = (char *)lp->lwp_sigstk.ss_sp + lp->lwp_sigstk.ss_size -
767 		    sizeof(struct sigframe);
768 		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
769 	} else {
770 		/* We take red zone into account */
771 		sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128;
772 	}
773 
774 	/*
775 	 * XXX AVX needs 64-byte alignment but sigframe has other fields and
776 	 * the embedded ucontext is not at the front, so aligning this won't
777 	 * help us.  Fortunately we bcopy in/out of the sigframe, so the
778 	 * kernel is ok.
779 	 *
780 	 * The problem though is if userland winds up trying to use the
781 	 * context directly.
782 	 */
783 	sfp = (struct sigframe *)((intptr_t)sp & ~(intptr_t)0xF);
784 
785 	/* Translate the signal if appropriate */
786 	if (p->p_sysent->sv_sigtbl) {
787 		if (sig <= p->p_sysent->sv_sigsize)
788 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
789 	}
790 
791 	/*
792 	 * Build the argument list for the signal handler.
793 	 *
794 	 * Arguments are in registers (%rdi, %rsi, %rdx, %rcx)
795 	 */
796 	regs->tf_rdi = sig;				/* argument 1 */
797 	regs->tf_rdx = (register_t)&sfp->sf_uc;		/* argument 3 */
798 
799 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
800 		/*
801 		 * Signal handler installed with SA_SIGINFO.
802 		 *
803 		 * action(signo, siginfo, ucontext)
804 		 */
805 		regs->tf_rsi = (register_t)&sfp->sf_si;	/* argument 2 */
806 		regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
807 		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
808 
809 		/* fill siginfo structure */
810 		sf.sf_si.si_signo = sig;
811 		sf.sf_si.si_pid = psp->ps_frominfo[sig].pid;
812 		sf.sf_si.si_uid = psp->ps_frominfo[sig].uid;
813 		sf.sf_si.si_code = code;
814 		sf.sf_si.si_addr = (void *)regs->tf_addr;
815 	} else {
816 		/*
817 		 * Old FreeBSD-style arguments.
818 		 *
819 		 * handler (signo, code, [uc], addr)
820 		 */
821 		regs->tf_rsi = (register_t)code;	/* argument 2 */
822 		regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
823 		sf.sf_ahu.sf_handler = catcher;
824 	}
825 
826 	/*
827 	 * If we're a vm86 process, we want to save the segment registers.
828 	 * We also change eflags to be our emulated eflags, not the actual
829 	 * eflags.
830 	 */
831 #if 0 /* JG */
832 	if (regs->tf_eflags & PSL_VM) {
833 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
834 		struct vm86_kernel *vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;
835 
836 		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
837 		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
838 		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
839 		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;
840 
841 		if (vm86->vm86_has_vme == 0)
842 			sf.sf_uc.uc_mcontext.mc_eflags =
843 			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
844 			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
845 
846 		/*
847 		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
848 		 * syscalls made by the signal handler.  This just avoids
849 		 * wasting time for our lazy fixup of such faults.  PSL_NT
850 		 * does nothing in vm86 mode, but vm86 programs can set it
851 		 * almost legitimately in probes for old cpu types.
852 		 */
853 		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
854 	}
855 #endif
856 
857 	/*
858 	 * Save the FPU state and reinit the FP unit
859 	 */
860 	npxpush(&sf.sf_uc.uc_mcontext);
861 
862 	/*
863 	 * Copy the sigframe out to the user's stack.
864 	 */
865 	if (copyout(&sf, sfp, sizeof(struct sigframe)) != 0) {
866 		/*
867 		 * Something is wrong with the stack pointer.
868 		 * ...Kill the process.
869 		 */
870 		sigexit(lp, SIGILL);
871 	}
872 
873 	regs->tf_rsp = (register_t)sfp;
874 	regs->tf_rip = trunc_page64(PS_STRINGS - *(p->p_sysent->sv_szsigcode));
875 	regs->tf_rip -= SZSIGCODE_EXTRA_BYTES;
876 
877 	/*
878 	 * x86 abi specifies that the direction flag must be cleared
879 	 * on function entry
880 	 */
881 	regs->tf_rflags &= ~(PSL_T | PSL_D);
882 
883 	/*
884 	 * 64 bit mode has a code and stack selector but
885 	 * no data or extra selector.  %fs and %gs are not
886 	 * stored in-context.
887 	 */
888 	regs->tf_cs = _ucodesel;
889 	regs->tf_ss = _udatasel;
890 	clear_quickret();
891 }
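
/*
 * For reference, the register setup above corresponds to the usual
 * userland SA_SIGINFO consumer; a minimal illustrative sketch (the
 * handler and signal choice are arbitrary):
 *
 *	static void
 *	handler(int signo, siginfo_t *si, void *ucp)
 *	{
 *		... signo arrives in %rdi, si in %rsi, ucp in %rdx ...
 *	}
 *
 *	struct sigaction sa;
 *
 *	bzero(&sa, sizeof(sa));
 *	sa.sa_sigaction = handler;
 *	sa.sa_flags = SA_SIGINFO;
 *	sigemptyset(&sa.sa_mask);
 *	sigaction(SIGSEGV, &sa, NULL);
 */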
892 
893 /*
894  * Sanitize the trapframe for a virtual kernel passing control to a custom
895  * VM context.  Remove any items that would otherwise create a privilege
896  * issue.
897  *
898  * XXX at the moment we allow userland to set the resume flag.  Is this a
899  * bad idea?
900  */
901 int
902 cpu_sanitize_frame(struct trapframe *frame)
903 {
904 	frame->tf_cs = _ucodesel;
905 	frame->tf_ss = _udatasel;
906 	/* XXX VM (8086) mode not supported? */
907 	frame->tf_rflags &= (PSL_RF | PSL_USERCHANGE | PSL_VM_UNSUPP);
908 	frame->tf_rflags |= PSL_RESERVED_DEFAULT | PSL_I;
909 
910 	return(0);
911 }
912 
913 /*
914  * Sanitize the tls so loading the descriptor does not blow up
915  * on us.  For x86_64 we don't have to do anything.
916  */
917 int
918 cpu_sanitize_tls(struct savetls *tls)
919 {
920 	return(0);
921 }
922 
923 /*
924  * sigreturn(ucontext_t *sigcntxp)
925  *
926  * System call to clean up state after a signal
927  * has been taken.  Reset signal mask and
928  * stack state from context left by sendsig (above).
929  * Return to previous pc and psl as specified by
930  * context left by sendsig. Check carefully to
931  * make sure that the user has not modified the
932  * state to gain improper privileges.
933  *
934  * MPSAFE
935  */
936 #define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
937 #define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
938 
939 int
940 sys_sigreturn(struct sysmsg *sysmsg, const struct sigreturn_args *uap)
941 {
942 	struct lwp *lp = curthread->td_lwp;
943 	struct trapframe *regs;
944 	ucontext_t uc;
945 	ucontext_t *ucp;
946 	register_t rflags;
947 	int cs;
948 	int error;
949 
950 	/*
951 	 * We have to copy the information into kernel space so userland
952 	 * can't modify it while we are sniffing it.
953 	 */
954 	regs = lp->lwp_md.md_regs;
955 	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
956 	if (error)
957 		return (error);
958 	ucp = &uc;
959 	rflags = ucp->uc_mcontext.mc_rflags;
960 
961 	/* VM (8086) mode not supported */
962 	rflags &= ~PSL_VM_UNSUPP;
963 
964 #if 0 /* JG */
965 	if (eflags & PSL_VM) {
966 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
967 		struct vm86_kernel *vm86;
968 
969 		/*
970 		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
971 		 * set up the vm86 area, and we can't enter vm86 mode.
972 		 */
973 		if (lp->lwp_thread->td_pcb->pcb_ext == 0)
974 			return (EINVAL);
975 		vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;
976 		if (vm86->vm86_inited == 0)
977 			return (EINVAL);
978 
979 		/* go back to user mode if both flags are set */
980 		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
981 			trapsignal(lp, SIGBUS, 0);
982 
983 		if (vm86->vm86_has_vme) {
984 			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
985 			    (eflags & VME_USERCHANGE) | PSL_VM;
986 		} else {
987 			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
988 			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
989 			    (eflags & VM_USERCHANGE) | PSL_VM;
990 		}
991 		bcopy(&ucp->uc_mcontext.mc_gs, tf, sizeof(struct trapframe));
992 		tf->tf_eflags = eflags;
993 		tf->tf_vm86_ds = tf->tf_ds;
994 		tf->tf_vm86_es = tf->tf_es;
995 		tf->tf_vm86_fs = tf->tf_fs;
996 		tf->tf_vm86_gs = tf->tf_gs;
997 		tf->tf_ds = _udatasel;
998 		tf->tf_es = _udatasel;
999 		tf->tf_fs = _udatasel;
1000 		tf->tf_gs = _udatasel;
1001 	} else
1002 #endif
1003 	{
1004 		/*
1005 		 * Don't allow users to change privileged or reserved flags.
1006 		 */
1007 		/*
1008 		 * XXX do allow users to change the privileged flag PSL_RF.
1009 		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
1010 		 * should sometimes set it there too.  tf_eflags is kept in
1011 		 * the signal context during signal handling and there is no
1012 		 * other place to remember it, so the PSL_RF bit may be
1013 		 * corrupted by the signal handler without us knowing.
1014 		 * Corruption of the PSL_RF bit at worst causes one more or
1015 		 * one less debugger trap, so allowing it is fairly harmless.
1016 		 */
1017 		if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
1018 			kprintf("sigreturn: rflags = 0x%lx\n", (long)rflags);
1019 			return(EINVAL);
1020 		}
1021 
1022 		/*
1023 		 * Don't allow users to load a valid privileged %cs.  Let the
1024 		 * hardware check for invalid selectors, excess privilege in
1025 		 * other selectors, invalid %eip's and invalid %esp's.
1026 		 */
1027 		cs = ucp->uc_mcontext.mc_cs;
1028 		if (!CS_SECURE(cs)) {
1029 			kprintf("sigreturn: cs = 0x%x\n", cs);
1030 			trapsignal(lp, SIGBUS, T_PROTFLT);
1031 			return(EINVAL);
1032 		}
1033 		/* gcc errors out on optimized bcopy */
1034 		_bcopy(&ucp->uc_mcontext.mc_rdi, regs,
1035 		       sizeof(struct trapframe));
1036 	}
1037 
1038 	/*
1039 	 * Restore the FPU state from the frame
1040 	 */
1041 	crit_enter();
1042 	npxpop(&ucp->uc_mcontext);
1043 
1044 	if (ucp->uc_mcontext.mc_onstack & 1)
1045 		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
1046 	else
1047 		lp->lwp_sigstk.ss_flags &= ~SS_ONSTACK;
1048 
1049 	lp->lwp_sigmask = ucp->uc_sigmask;
1050 	SIG_CANTMASK(lp->lwp_sigmask);
1051 	clear_quickret();
1052 	crit_exit();
1053 	return(EJUSTRETURN);
1054 }
1055 
1056 /*
1057  * Machine dependent boot() routine
1058  *
1059  * I haven't seen anything to put here yet
1060  * Possibly some stuff might be grafted back here from boot()
1061  */
1062 void
1063 cpu_boot(int howto)
1064 {
1065 }
1066 
1067 /*
1068  * Shutdown the CPU as much as possible
1069  */
1070 void
1071 cpu_halt(void)
1072 {
1073 	for (;;)
1074 		__asm__ __volatile("hlt");
1075 }
1076 
1077 /*
1078  * cpu_idle() represents the idle LWKT.  You cannot return from this function
1079  * (unless you want to blow things up!).  Instead we look for runnable threads
1080  * and loop or halt as appropriate.  Giant is not held on entry to the thread.
1081  *
1082  * The main loop is entered with a critical section held, we must release
1083  * the critical section before doing anything else.  lwkt_switch() will
1084  * check for pending interrupts due to entering and exiting its own
1085  * critical section.
1086  *
1087  * NOTE: On an SMP system we rely on a scheduler IPI to wake a HLTed cpu up.
1088  *	 However, there are cases where the idlethread will be entered with
1089  *	 the possibility that no IPI will occur and in such cases
1090  *	 lwkt_switch() sets TDF_IDLE_NOHLT.
1091  *
1092  * NOTE: cpu_idle_repeat determines how many entries into the idle thread
1093  *	 must occur before it starts using ACPI halt.
1094  *
1095  * NOTE: Value overridden in hammer_time().
1096  */
1097 static int	cpu_idle_hlt = 2;
1098 SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
1099     &cpu_idle_hlt, 0, "Idle loop HLT enable");
1100 SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_repeat, CTLFLAG_RW,
1101     &cpu_idle_repeat, 0, "Idle entries before acpi hlt");
1102 
1103 SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_hltcnt, (CTLTYPE_QUAD | CTLFLAG_RW),
1104     0, CPU_IDLE_STAT_HALT, sysctl_cpu_idle_cnt, "Q", "Idle loop entry halts");
1105 SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_spincnt, (CTLTYPE_QUAD | CTLFLAG_RW),
1106     0, CPU_IDLE_STAT_SPIN, sysctl_cpu_idle_cnt, "Q", "Idle loop entry spins");
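
/*
 * Illustrative runtime tuning of the knobs above; the mode values for
 * cpu_idle_hlt are documented in the cpu_idle() comment below, and the
 * compiled-in default is overridden in hammer_time() per the NOTE
 * above:
 *
 *	# sysctl machdep.cpu_idle_hlt=1
 *	# sysctl machdep.cpu_idle_repeat=750
 */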
1107 
1108 static void
1109 cpu_idle_default_hook(void)
1110 {
1111 	/*
1112 	 * We must guarantee that hlt is exactly the instruction
1113 	 * following the sti.
1114 	 */
1115 	__asm __volatile("sti; hlt");
1116 }
1117 
1118 /* Other subsystems (e.g., ACPI) can hook this later. */
1119 void (*cpu_idle_hook)(void) = cpu_idle_default_hook;
1120 
1121 static __inline int
1122 cpu_mwait_cx_hint(struct cpu_idle_stat *stat)
1123 {
1124 	int hint, cx_idx;
1125 	u_int idx;
1126 
1127 	hint = stat->hint;
1128 	if (hint >= 0)
1129 		goto done;
1130 
1131 	idx = (stat->repeat + stat->repeat_last + stat->repeat_delta) >>
1132 	    cpu_mwait_repeat_shift;
1133 	if (idx >= cpu_mwait_c1_hints_cnt) {
1134 		/* Step up faster once we have walked through all C1 states */
1135 		stat->repeat_delta += 1 << (cpu_mwait_repeat_shift + 1);
1136 	}
1137 	if (hint == CPU_MWAIT_HINT_AUTODEEP) {
1138 		if (idx >= cpu_mwait_deep_hints_cnt)
1139 			idx = cpu_mwait_deep_hints_cnt - 1;
1140 		hint = cpu_mwait_deep_hints[idx];
1141 	} else {
1142 		if (idx >= cpu_mwait_hints_cnt)
1143 			idx = cpu_mwait_hints_cnt - 1;
1144 		hint = cpu_mwait_hints[idx];
1145 	}
1146 done:
1147 	cx_idx = MWAIT_EAX_TO_CX(hint);
1148 	if (cx_idx >= 0 && cx_idx < CPU_MWAIT_CX_MAX)
1149 		stat->mwait_cx[cx_idx]++;
1150 	return hint;
1151 }
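
/*
 * For orientation, the conventional MWAIT hint encoding (assumed to be
 * what the MWAIT_EAX_HINT()/MWAIT_EAX_TO_CX() macros implement) places
 * the target C-state minus one in EAX[7:4] and the sub-state in
 * EAX[3:0], so C1/0 is hint 0x0000, C2/0 is 0x0010 and C3/1 is 0x0021,
 * matching the "C%d/%d hint 0x%04x" lines printed when bootverbose is
 * set.
 */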
1152 
1153 void
1154 cpu_idle(void)
1155 {
1156 	globaldata_t gd = mycpu;
1157 	struct cpu_idle_stat *stat = &cpu_idle_stats[gd->gd_cpuid];
1158 	struct thread *td __debugvar = gd->gd_curthread;
1159 	int reqflags;
1160 
1161 	stat->repeat = stat->repeat_last = cpu_idle_repeat_max;
1162 
1163 	crit_exit();
1164 	KKASSERT(td->td_critcount == 0);
1165 
1166 	for (;;) {
1167 		/*
1168 		 * See if there are any LWKTs ready to go.
1169 		 */
1170 		lwkt_switch();
1171 
1172 		/*
1173 		 * When halting inside a cli we must check for reqflags
1174 		 * races, particularly [re]schedule requests.  Running
1175 		 * splz() does the job.
1176 		 *
1177 		 * cpu_idle_hlt:
1178 		 *	0	Never halt, just spin
1179 		 *
1180 		 *	1	Always use MONITOR/MWAIT if avail, HLT
1181 		 *		otherwise.
1182 		 *
1183 		 *		Better default for modern (Haswell+) Intel
1184 		 *		cpus.
1185 		 *
1186 		 *	2	Use HLT/MONITOR/MWAIT up to a point and then
1187 		 *		use the ACPI halt (default).  This is a hybrid
1188 		 *		approach.  See machdep.cpu_idle_repeat.
1189 		 *
1190 		 *		Better default for modern AMD cpus and older
1191 		 *		Intel cpus.
1192 		 *
1193 		 *	3	Always use the ACPI halt.  This typically
1194 		 *		eats the least amount of power but the cpu
1195 		 *		will be slow waking up.  Slows down e.g.
1196 		 *		compiles and other pipe/event oriented stuff.
1197 		 *
1198 		 *		Usually the best default for AMD cpus.
1199 		 *
1200 		 *	4	Always use HLT.
1201 		 *
1202 		 *	5	Always spin.
1203 		 *
1204 		 * NOTE: Interrupts are enabled and we are not in a critical
1205 		 *	 section.
1206 		 *
1207 		 * NOTE: Preemptions do not reset gd_idle_repeat.   Also we
1208 		 *	 don't bother capping gd_idle_repeat, it is ok if
1209 		 *	 it overflows (we do make it unsigned, however).
1210 		 *
1211 		 * Implement optimized invltlb operations when halted
1212 		 * in idle.  By setting the bit in smp_idleinvl_mask
1213 		 * we inform other cpus that they can set _reqs to
1214 		 * request an invltlb.  Currently the code to do that
1215 		 * sets the bits in _reqs anyway, but then checks _mask
1216 		 * to determine if they can assume the invltlb will execute.
1217 		 *
1218 		 * A critical section is required to ensure that interrupts
1219 		 * do not fully run until after we've had a chance to execute
1220 		 * the request.
1221 		 */
1222 		if (gd->gd_idle_repeat == 0) {
1223 			stat->repeat = (stat->repeat + stat->repeat_last) >> 1;
1224 			if (stat->repeat > cpu_idle_repeat_max)
1225 				stat->repeat = cpu_idle_repeat_max;
1226 			stat->repeat_last = 0;
1227 			stat->repeat_delta = 0;
1228 		}
1229 		++stat->repeat_last;
1230 
1231 		/*
1232 		 * General idle thread halt code
1233 		 *
1234 		 * IBRS NOTES - IBRS is a SPECTRE mitigation.  When going
1235 		 *		idle, disable IBRS to reduce hyperthread
1236 		 *		overhead.
1237 		 */
1238 		++gd->gd_idle_repeat;
1239 
1240 		switch(cpu_idle_hlt) {
1241 		default:
1242 		case 0:
1243 			/*
1244 			 * Always spin
1245 			 */
1246 			;
1247 do_spin:
1248 			splz();
1249 			__asm __volatile("sti");
1250 			stat->spin++;
1251 			crit_enter_gd(gd);
1252 			crit_exit_gd(gd);
1253 			break;
1254 		case 2:
1255 			/*
1256 			 * Use MONITOR/MWAIT (or HLT) for a few cycles,
1257 			 * then start using the ACPI halt code if we
1258 			 * continue to be idle.
1259 			 */
1260 			if (gd->gd_idle_repeat >= cpu_idle_repeat)
1261 				goto do_acpi;
1262 			/* FALL THROUGH */
1263 		case 1:
1264 			/*
1265 			 * Always use MONITOR/MWAIT (will use HLT if
1266 			 * MONITOR/MWAIT not available).
1267 			 */
1268 			if (cpu_mi_feature & CPU_MI_MONITOR) {
1269 				splz(); /* XXX */
1270 				reqflags = gd->gd_reqflags;
1271 				if (reqflags & RQF_IDLECHECK_WK_MASK)
1272 					goto do_spin;
1273 				crit_enter_gd(gd);
1274 				ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask, gd->gd_cpuid);
1275 				/*
1276 				 * IBRS/STIBP
1277 				 */
1278 				if (pscpu->trampoline.tr_pcb_spec_ctrl[1] &
1279 				    SPEC_CTRL_DUMMY_ENABLE) {
1280 					wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[1] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
1281 				}
1282 				cpu_mmw_pause_int(&gd->gd_reqflags, reqflags,
1283 						  cpu_mwait_cx_hint(stat), 0);
1284 				if (pscpu->trampoline.tr_pcb_spec_ctrl[0] &
1285 				    SPEC_CTRL_DUMMY_ENABLE) {
1286 					wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[0] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
1287 				}
1288 				stat->halt++;
1289 				ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask, gd->gd_cpuid);
1290 				if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
1291 							      gd->gd_cpuid)) {
1292 					cpu_invltlb();
1293 					cpu_mfence();
1294 				}
1295 				crit_exit_gd(gd);
1296 				break;
1297 			}
1298 			/* FALLTHROUGH */
1299 		case 4:
1300 			/*
1301 			 * Use HLT
1302 			 */
1303 			__asm __volatile("cli");
1304 			splz();
1305 			crit_enter_gd(gd);
1306 			if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
1307 				ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask,
1308 						     gd->gd_cpuid);
1309 				if (pscpu->trampoline.tr_pcb_spec_ctrl[1] &
1310 				    SPEC_CTRL_DUMMY_ENABLE) {
1311 					wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[1] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
1312 				}
1313 				cpu_idle_default_hook();
1314 				if (pscpu->trampoline.tr_pcb_spec_ctrl[0] &
1315 				    SPEC_CTRL_DUMMY_ENABLE) {
1316 					wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[0] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
1317 				}
1318 				ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask,
1319 						       gd->gd_cpuid);
1320 				if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
1321 							      gd->gd_cpuid)) {
1322 					cpu_invltlb();
1323 					cpu_mfence();
1324 				}
1325 			}
1326 			__asm __volatile("sti");
1327 			stat->halt++;
1328 			crit_exit_gd(gd);
1329 			break;
1330 		case 3:
1331 			/*
1332 			 * Use ACPI halt
1333 			 */
1334 			;
1335 do_acpi:
1336 			__asm __volatile("cli");
1337 			splz();
1338 			crit_enter_gd(gd);
1339 			if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
1340 				ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask,
1341 						     gd->gd_cpuid);
1342 				if (pscpu->trampoline.tr_pcb_spec_ctrl[1] &
1343 				    SPEC_CTRL_DUMMY_ENABLE) {
1344 					wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[1] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
1345 				}
1346 				cpu_idle_hook();
1347 				if (pscpu->trampoline.tr_pcb_spec_ctrl[0] &
1348 				    SPEC_CTRL_DUMMY_ENABLE) {
1349 					wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[0] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
1350 				}
1351 				ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask,
1352 						       gd->gd_cpuid);
1353 				if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
1354 							      gd->gd_cpuid)) {
1355 					cpu_invltlb();
1356 					cpu_mfence();
1357 				}
1358 			}
1359 			__asm __volatile("sti");
1360 			stat->halt++;
1361 			crit_exit_gd(gd);
1362 			break;
1363 		}
1364 	}
1365 }
1366 
1367 /*
1368  * Called from deep ACPI via cpu_idle_hook() (see above) to actually halt
1369  * the cpu in C1.  ACPI might use other halt methods for deeper states
1370  * and not reach here.
1371  *
1372  * For now we always use HLT as we are not sure what ACPI may have actually
1373  * done.  MONITOR/MWAIT might not be appropriate.
1374  *
1375  * NOTE: MONITOR/MWAIT does not appear to throttle AMD cpus, while HLT
1376  *	 does.  On Intel, MONITOR/MWAIT does appear to throttle the cpu.
1377  */
1378 void
1379 cpu_idle_halt(void)
1380 {
1381 	globaldata_t gd;
1382 
1383 	gd = mycpu;
1384 #if 0
1385 	/* DISABLED FOR NOW */
1386 	struct cpu_idle_stat *stat;
1387 	int reqflags;
1388 
1389 
1390 	if ((cpu_idle_hlt == 1 || cpu_idle_hlt == 2) &&
1391 	    (cpu_mi_feature & CPU_MI_MONITOR) &&
1392 	    cpu_vendor_id != CPU_VENDOR_AMD) {
1393 		/*
1394 		 * Use MONITOR/MWAIT
1395 		 *
1396 		 * (NOTE: On ryzen, MWAIT does not throttle clocks, so we
1397 		 *	  have to use HLT)
1398 		 */
1399 		stat = &cpu_idle_stats[gd->gd_cpuid];
1400 		reqflags = gd->gd_reqflags;
1401 		if ((reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
1402 			__asm __volatile("sti");
1403 			cpu_mmw_pause_int(&gd->gd_reqflags, reqflags,
1404 					  cpu_mwait_cx_hint(stat), 0);
1405 		} else {
1406 			__asm __volatile("sti; pause");
1407 		}
1408 	} else
1409 #endif
1410 	{
1411 		/*
1412 		 * Use HLT
1413 		 */
1414 		if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0)
1415 			__asm __volatile("sti; hlt");
1416 		else
1417 			__asm __volatile("sti; pause");
1418 	}
1419 }
1420 
1421 
1422 /*
1423  * Called in a loop indirectly via Xcpustop
1424  */
1425 void
1426 cpu_smp_stopped(void)
1427 {
1428 	globaldata_t gd = mycpu;
1429 	volatile __uint64_t *ptr;
1430 	__uint64_t ovalue;
1431 
1432 	ptr = CPUMASK_ADDR(started_cpus, gd->gd_cpuid);
1433 	ovalue = *ptr;
1434 	if ((ovalue & CPUMASK_SIMPLE(gd->gd_cpuid & 63)) == 0) {
1435 		if (cpu_mi_feature & CPU_MI_MONITOR) {
1436 			if (cpu_mwait_hints) {
1437 				cpu_mmw_pause_long(__DEVOLATILE(void *, ptr),
1438 					   ovalue,
1439 					   cpu_mwait_hints[
1440 						cpu_mwait_hints_cnt - 1], 0);
1441 			} else {
1442 				cpu_mmw_pause_long(__DEVOLATILE(void *, ptr),
1443 					   ovalue, 0, 0);
1444 			}
1445 		} else {
1446 			cpu_halt();	/* depend on lapic timer */
1447 		}
1448 	}
1449 }
1450 
1451 /*
1452  * This routine is called if a spinlock has been held through the
1453  * exponential backoff period and is seriously contested.  On a real cpu
1454  * we let it spin.
1455  */
1456 void
1457 cpu_spinlock_contested(void)
1458 {
1459 	cpu_pause();
1460 }
1461 
1462 /*
1463  * Clear registers on exec
1464  */
1465 void
1466 exec_setregs(u_long entry, u_long stack, u_long ps_strings)
1467 {
1468 	struct thread *td = curthread;
1469 	struct lwp *lp = td->td_lwp;
1470 	struct pcb *pcb = td->td_pcb;
1471 	struct trapframe *regs = lp->lwp_md.md_regs;
1472 
1473 	user_ldt_free(pcb);
1474 
1475 	clear_quickret();
1476 	bzero((char *)regs, sizeof(struct trapframe));
1477 	regs->tf_rip = entry;
1478 	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; /* align the stack */
1479 	regs->tf_rdi = stack;		/* argv */
1480 	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
1481 	regs->tf_ss = _udatasel;
1482 	regs->tf_cs = _ucodesel;
1483 	regs->tf_rbx = ps_strings;
1484 
1485 	/*
1486 	 * Reset the hardware debug registers if they were in use.
1487 	 * They won't have any meaning for the newly exec'd process.
1488 	 */
1489 	if (pcb->pcb_flags & PCB_DBREGS) {
1490 		pcb->pcb_dr0 = 0;
1491 		pcb->pcb_dr1 = 0;
1492 		pcb->pcb_dr2 = 0;
1493 		pcb->pcb_dr3 = 0;
1494 		pcb->pcb_dr6 = 0;
1495 		pcb->pcb_dr7 = 0; /* JG set bit 10? */
1496 		if (pcb == td->td_pcb) {
1497 			/*
1498 			 * Clear the debug registers on the running
1499 			 * CPU, otherwise they will end up affecting
1500 			 * the next process we switch to.
1501 			 */
1502 			reset_dbregs();
1503 		}
1504 		pcb->pcb_flags &= ~PCB_DBREGS;
1505 	}
1506 
1507 	/*
1508 	 * Initialize the math emulator (if any) for the current process.
1509 	 * Actually, just clear the bit that says that the emulator has
1510 	 * been initialized.  Initialization is delayed until the process
1511 	 * traps to the emulator (if it is done at all) mainly because
1512 	 * emulators don't provide an entry point for initialization.
1513 	 */
1514 	pcb->pcb_flags &= ~FP_SOFTFP;
1515 
1516 	/*
1517 	 * NOTE: do not set CR0_TS here.  npxinit() must do it after clearing
1518 	 *	 gd_npxthread.  Otherwise a preemptive interrupt thread
1519 	 *	 may panic in npxdna().
1520 	 */
1521 	crit_enter();
1522 	load_cr0(rcr0() | CR0_MP);
1523 
1524 	/*
1525 	 * NOTE: The MSR values must be correct so we can return to
1526 	 *	 userland.  gd_user_fs/gs must be correct so the switch
1527 	 *	 code knows what the current MSR values are.
1528 	 */
1529 	pcb->pcb_fsbase = 0;	/* Values loaded from PCB on switch */
1530 	pcb->pcb_gsbase = 0;
1531 	mdcpu->gd_user_fs = 0;	/* Cache of current MSR values */
1532 	mdcpu->gd_user_gs = 0;
1533 	wrmsr(MSR_FSBASE, 0);	/* Set MSR values for return to userland */
1534 	wrmsr(MSR_KGSBASE, 0);
1535 
1536 	/* Initialize the npx (if any) for the current process. */
1537 	npxinit();
1538 	crit_exit();
1539 
1540 	pcb->pcb_ds = _udatasel;
1541 	pcb->pcb_es = _udatasel;
1542 	pcb->pcb_fs = _udatasel;
1543 	pcb->pcb_gs = _udatasel;
1544 }
1545 
1546 void
1547 cpu_setregs(void)
1548 {
1549 	register_t cr0;
1550 
1551 	cr0 = rcr0();
1552 	cr0 |= CR0_NE;			/* Done by npxinit() */
1553 	cr0 |= CR0_MP | CR0_TS;		/* Done at every execve() too. */
1554 	cr0 |= CR0_WP | CR0_AM;
1555 	load_cr0(cr0);
1556 	load_gs(_udatasel);
1557 }
1558 
1559 static int
1560 sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
1561 {
1562 	int error;
1563 	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
1564 		req);
1565 	if (!error && req->newptr)
1566 		resettodr();
1567 	return (error);
1568 }
1569 
1570 SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
1571 	&adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");
1572 
1573 SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set,
1574 	CTLFLAG_RW, &disable_rtc_set, 0, "");
1575 
1576 #if 0 /* JG */
1577 SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo,
1578 	CTLFLAG_RD, &bootinfo, bootinfo, "");
1579 #endif
1580 
1581 SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock,
1582 	CTLFLAG_RW, &wall_cmos_clock, 0, "");
1583 
1584 static int
1585 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1586 {
1587 	struct efi_map_header *efihdr;
1588 	caddr_t kmdp;
1589 	uint32_t efisize;
1590 
1591 	kmdp = preload_search_by_type("elf kernel");
1592 	if (kmdp == NULL)
1593 		kmdp = preload_search_by_type("elf64 kernel");
1594 	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1595 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
1596 	if (efihdr == NULL)
1597 		return (0);
1598 	efisize = *((uint32_t *)efihdr - 1);
1599 	return (SYSCTL_OUT(req, efihdr, efisize));
1600 }
1601 SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
1602     efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
1603 
1604 /*
1605  * Initialize x86 and configure to run kernel
1606  */
1607 
1608 /*
1609  * Initialize segments & interrupt table
1610  */
1611 
1612 int _default_ldt;
1613 struct user_segment_descriptor gdt[NGDT * MAXCPU];	/* global descriptor table */
1614 struct gate_descriptor idt_arr[MAXCPU][NIDT];
1615 #if 0 /* JG */
1616 union descriptor ldt[NLDT];		/* local descriptor table */
1617 #endif
1618 
1619 /* table descriptors - used to load tables by cpu */
1620 struct region_descriptor r_gdt;
1621 struct region_descriptor r_idt_arr[MAXCPU];
1622 
1623 /* JG proc0paddr is a virtual address */
1624 void *proc0paddr;
1625 /* JG alignment? */
1626 char proc0paddr_buff[LWKT_THREAD_STACK];
1627 
1628 
1629 /* software prototypes -- in more palatable form */
1630 struct soft_segment_descriptor gdt_segs[] = {
1631 /* GNULL_SEL	0 Null Descriptor */
1632 {	0x0,			/* segment base address  */
1633 	0x0,			/* length */
1634 	0,			/* segment type */
1635 	0,			/* segment descriptor priority level */
1636 	0,			/* segment descriptor present */
1637 	0,			/* long */
1638 	0,			/* default 32 vs 16 bit size */
1639 	0			/* limit granularity (byte/page units)*/ },
1640 /* GCODE_SEL	1 Code Descriptor for kernel */
1641 {	0x0,			/* segment base address  */
1642 	0xfffff,		/* length - all address space */
1643 	SDT_MEMERA,		/* segment type */
1644 	SEL_KPL,		/* segment descriptor priority level */
1645 	1,			/* segment descriptor present */
1646 	1,			/* long */
1647 	0,			/* default 32 vs 16 bit size */
1648 	1			/* limit granularity (byte/page units)*/ },
1649 /* GDATA_SEL	2 Data Descriptor for kernel */
1650 {	0x0,			/* segment base address  */
1651 	0xfffff,		/* length - all address space */
1652 	SDT_MEMRWA,		/* segment type */
1653 	SEL_KPL,		/* segment descriptor priority level */
1654 	1,			/* segment descriptor present */
1655 	1,			/* long */
1656 	0,			/* default 32 vs 16 bit size */
1657 	1			/* limit granularity (byte/page units)*/ },
1658 /* GUCODE32_SEL	3 32 bit Code Descriptor for user */
1659 {	0x0,			/* segment base address  */
1660 	0xfffff,		/* length - all address space */
1661 	SDT_MEMERA,		/* segment type */
1662 	SEL_UPL,		/* segment descriptor priority level */
1663 	1,			/* segment descriptor present */
1664 	0,			/* long */
1665 	1,			/* default 32 vs 16 bit size */
1666 	1			/* limit granularity (byte/page units)*/ },
1667 /* GUDATA_SEL	4 32/64 bit Data Descriptor for user */
1668 {	0x0,			/* segment base address  */
1669 	0xfffff,		/* length - all address space */
1670 	SDT_MEMRWA,		/* segment type */
1671 	SEL_UPL,		/* segment descriptor priority level */
1672 	1,			/* segment descriptor present */
1673 	0,			/* long */
1674 	1,			/* default 32 vs 16 bit size */
1675 	1			/* limit granularity (byte/page units)*/ },
1676 /* GUCODE_SEL	5 64 bit Code Descriptor for user */
1677 {	0x0,			/* segment base address  */
1678 	0xfffff,		/* length - all address space */
1679 	SDT_MEMERA,		/* segment type */
1680 	SEL_UPL,		/* segment descriptor priority level */
1681 	1,			/* segment descriptor present */
1682 	1,			/* long */
1683 	0,			/* default 32 vs 16 bit size */
1684 	1			/* limit granularity (byte/page units)*/ },
1685 /* GPROC0_SEL	6 Proc 0 Tss Descriptor */
1686 {
1687 	0x0,			/* segment base address */
1688 	sizeof(struct x86_64tss)-1,	/* length */
1689 	SDT_SYSTSS,		/* segment type */
1690 	SEL_KPL,		/* segment descriptor priority level */
1691 	1,			/* segment descriptor present */
1692 	0,			/* long */
1693 	0,			/* unused - default 32 vs 16 bit size */
1694 	0			/* limit granularity (byte/page units)*/ },
1695 /* Actually, the TSS is a system descriptor which is double size */
1696 {	0x0,			/* segment base address  */
1697 	0x0,			/* length */
1698 	0,			/* segment type */
1699 	0,			/* segment descriptor priority level */
1700 	0,			/* segment descriptor present */
1701 	0,			/* long */
1702 	0,			/* default 32 vs 16 bit size */
1703 	0			/* limit granularity (byte/page units)*/ },
1704 /* GUGS32_SEL	8 32 bit GS Descriptor for user */
1705 {	0x0,			/* segment base address  */
1706 	0xfffff,		/* length - all address space */
1707 	SDT_MEMRWA,		/* segment type */
1708 	SEL_UPL,		/* segment descriptor priority level */
1709 	1,			/* segment descriptor present */
1710 	0,			/* long */
1711 	1,			/* default 32 vs 16 bit size */
1712 	1			/* limit granularity (byte/page units)*/ },
1713 };
1714 
1715 void
1716 setidt_global(int idx, inthand_t *func, int typ, int dpl, int ist)
1717 {
1718 	int cpu;
1719 
1720 	for (cpu = 0; cpu < MAXCPU; ++cpu) {
1721 		struct gate_descriptor *ip = &idt_arr[cpu][idx];
1722 
1723 		ip->gd_looffset = (uintptr_t)func;
1724 		ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
1725 		ip->gd_ist = ist;
1726 		ip->gd_xx = 0;
1727 		ip->gd_type = typ;
1728 		ip->gd_dpl = dpl;
1729 		ip->gd_p = 1;
1730 		ip->gd_hioffset = ((uintptr_t)func) >> 16;
1731 	}
1732 }
1733 
1734 void
1735 setidt(int idx, inthand_t *func, int typ, int dpl, int ist, int cpu)
1736 {
1737 	struct gate_descriptor *ip;
1738 
1739 	KASSERT(cpu >= 0 && cpu < ncpus, ("invalid cpu %d", cpu));
1740 
1741 	ip = &idt_arr[cpu][idx];
1742 	ip->gd_looffset = (uintptr_t)func;
1743 	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
1744 	ip->gd_ist = ist;
1745 	ip->gd_xx = 0;
1746 	ip->gd_type = typ;
1747 	ip->gd_dpl = dpl;
1748 	ip->gd_p = 1;
1749 	ip->gd_hioffset = ((uintptr_t)func) >> 16;
1750 }
1751 
1752 #define	IDTVEC(name)	__CONCAT(X,name)
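/*
 * IDTVEC(div) expands to Xdiv, and so on; these names match the low-level
 * trap/interrupt entry points provided by the platform assembly code.
 */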
1753 
1754 extern inthand_t
1755 	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
1756 	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
1757 	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
1758 	IDTVEC(page), IDTVEC(mchk), IDTVEC(fpu), IDTVEC(align),
1759 	IDTVEC(xmm), IDTVEC(dblfault),
1760 	IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
1761 
1762 extern inthand_t
1763 	IDTVEC(rsvd00), IDTVEC(rsvd01), IDTVEC(rsvd02), IDTVEC(rsvd03),
1764 	IDTVEC(rsvd04), IDTVEC(rsvd05), IDTVEC(rsvd06), IDTVEC(rsvd07),
1765 	IDTVEC(rsvd08), IDTVEC(rsvd09), IDTVEC(rsvd0a), IDTVEC(rsvd0b),
1766 	IDTVEC(rsvd0c), IDTVEC(rsvd0d), IDTVEC(rsvd0e), IDTVEC(rsvd0f),
1767 	IDTVEC(rsvd10), IDTVEC(rsvd11), IDTVEC(rsvd12), IDTVEC(rsvd13),
1768 	IDTVEC(rsvd14), IDTVEC(rsvd15), IDTVEC(rsvd16), IDTVEC(rsvd17),
1769 	IDTVEC(rsvd18), IDTVEC(rsvd19), IDTVEC(rsvd1a), IDTVEC(rsvd1b),
1770 	IDTVEC(rsvd1c), IDTVEC(rsvd1d), IDTVEC(rsvd1e), IDTVEC(rsvd1f),
1771 	IDTVEC(rsvd20), IDTVEC(rsvd21), IDTVEC(rsvd22), IDTVEC(rsvd23),
1772 	IDTVEC(rsvd24), IDTVEC(rsvd25), IDTVEC(rsvd26), IDTVEC(rsvd27),
1773 	IDTVEC(rsvd28), IDTVEC(rsvd29), IDTVEC(rsvd2a), IDTVEC(rsvd2b),
1774 	IDTVEC(rsvd2c), IDTVEC(rsvd2d), IDTVEC(rsvd2e), IDTVEC(rsvd2f),
1775 	IDTVEC(rsvd30), IDTVEC(rsvd31), IDTVEC(rsvd32), IDTVEC(rsvd33),
1776 	IDTVEC(rsvd34), IDTVEC(rsvd35), IDTVEC(rsvd36), IDTVEC(rsvd37),
1777 	IDTVEC(rsvd38), IDTVEC(rsvd39), IDTVEC(rsvd3a), IDTVEC(rsvd3b),
1778 	IDTVEC(rsvd3c), IDTVEC(rsvd3d), IDTVEC(rsvd3e), IDTVEC(rsvd3f),
1779 	IDTVEC(rsvd40), IDTVEC(rsvd41), IDTVEC(rsvd42), IDTVEC(rsvd43),
1780 	IDTVEC(rsvd44), IDTVEC(rsvd45), IDTVEC(rsvd46), IDTVEC(rsvd47),
1781 	IDTVEC(rsvd48), IDTVEC(rsvd49), IDTVEC(rsvd4a), IDTVEC(rsvd4b),
1782 	IDTVEC(rsvd4c), IDTVEC(rsvd4d), IDTVEC(rsvd4e), IDTVEC(rsvd4f),
1783 	IDTVEC(rsvd50), IDTVEC(rsvd51), IDTVEC(rsvd52), IDTVEC(rsvd53),
1784 	IDTVEC(rsvd54), IDTVEC(rsvd55), IDTVEC(rsvd56), IDTVEC(rsvd57),
1785 	IDTVEC(rsvd58), IDTVEC(rsvd59), IDTVEC(rsvd5a), IDTVEC(rsvd5b),
1786 	IDTVEC(rsvd5c), IDTVEC(rsvd5d), IDTVEC(rsvd5e), IDTVEC(rsvd5f),
1787 	IDTVEC(rsvd60), IDTVEC(rsvd61), IDTVEC(rsvd62), IDTVEC(rsvd63),
1788 	IDTVEC(rsvd64), IDTVEC(rsvd65), IDTVEC(rsvd66), IDTVEC(rsvd67),
1789 	IDTVEC(rsvd68), IDTVEC(rsvd69), IDTVEC(rsvd6a), IDTVEC(rsvd6b),
1790 	IDTVEC(rsvd6c), IDTVEC(rsvd6d), IDTVEC(rsvd6e), IDTVEC(rsvd6f),
1791 	IDTVEC(rsvd70), IDTVEC(rsvd71), IDTVEC(rsvd72), IDTVEC(rsvd73),
1792 	IDTVEC(rsvd74), IDTVEC(rsvd75), IDTVEC(rsvd76), IDTVEC(rsvd77),
1793 	IDTVEC(rsvd78), IDTVEC(rsvd79), IDTVEC(rsvd7a), IDTVEC(rsvd7b),
1794 	IDTVEC(rsvd7c), IDTVEC(rsvd7d), IDTVEC(rsvd7e), IDTVEC(rsvd7f),
1795 	IDTVEC(rsvd80), IDTVEC(rsvd81), IDTVEC(rsvd82), IDTVEC(rsvd83),
1796 	IDTVEC(rsvd84), IDTVEC(rsvd85), IDTVEC(rsvd86), IDTVEC(rsvd87),
1797 	IDTVEC(rsvd88), IDTVEC(rsvd89), IDTVEC(rsvd8a), IDTVEC(rsvd8b),
1798 	IDTVEC(rsvd8c), IDTVEC(rsvd8d), IDTVEC(rsvd8e), IDTVEC(rsvd8f),
1799 	IDTVEC(rsvd90), IDTVEC(rsvd91), IDTVEC(rsvd92), IDTVEC(rsvd93),
1800 	IDTVEC(rsvd94), IDTVEC(rsvd95), IDTVEC(rsvd96), IDTVEC(rsvd97),
1801 	IDTVEC(rsvd98), IDTVEC(rsvd99), IDTVEC(rsvd9a), IDTVEC(rsvd9b),
1802 	IDTVEC(rsvd9c), IDTVEC(rsvd9d), IDTVEC(rsvd9e), IDTVEC(rsvd9f),
1803 	IDTVEC(rsvda0), IDTVEC(rsvda1), IDTVEC(rsvda2), IDTVEC(rsvda3),
1804 	IDTVEC(rsvda4), IDTVEC(rsvda5), IDTVEC(rsvda6), IDTVEC(rsvda7),
1805 	IDTVEC(rsvda8), IDTVEC(rsvda9), IDTVEC(rsvdaa), IDTVEC(rsvdab),
1806 	IDTVEC(rsvdac), IDTVEC(rsvdad), IDTVEC(rsvdae), IDTVEC(rsvdaf),
1807 	IDTVEC(rsvdb0), IDTVEC(rsvdb1), IDTVEC(rsvdb2), IDTVEC(rsvdb3),
1808 	IDTVEC(rsvdb4), IDTVEC(rsvdb5), IDTVEC(rsvdb6), IDTVEC(rsvdb7),
1809 	IDTVEC(rsvdb8), IDTVEC(rsvdb9), IDTVEC(rsvdba), IDTVEC(rsvdbb),
1810 	IDTVEC(rsvdbc), IDTVEC(rsvdbd), IDTVEC(rsvdbe), IDTVEC(rsvdbf),
1811 	IDTVEC(rsvdc0), IDTVEC(rsvdc1), IDTVEC(rsvdc2), IDTVEC(rsvdc3),
1812 	IDTVEC(rsvdc4), IDTVEC(rsvdc5), IDTVEC(rsvdc6), IDTVEC(rsvdc7),
1813 	IDTVEC(rsvdc8), IDTVEC(rsvdc9), IDTVEC(rsvdca), IDTVEC(rsvdcb),
1814 	IDTVEC(rsvdcc), IDTVEC(rsvdcd), IDTVEC(rsvdce), IDTVEC(rsvdcf),
1815 	IDTVEC(rsvdd0), IDTVEC(rsvdd1), IDTVEC(rsvdd2), IDTVEC(rsvdd3),
1816 	IDTVEC(rsvdd4), IDTVEC(rsvdd5), IDTVEC(rsvdd6), IDTVEC(rsvdd7),
1817 	IDTVEC(rsvdd8), IDTVEC(rsvdd9), IDTVEC(rsvdda), IDTVEC(rsvddb),
1818 	IDTVEC(rsvddc), IDTVEC(rsvddd), IDTVEC(rsvdde), IDTVEC(rsvddf),
1819 	IDTVEC(rsvde0), IDTVEC(rsvde1), IDTVEC(rsvde2), IDTVEC(rsvde3),
1820 	IDTVEC(rsvde4), IDTVEC(rsvde5), IDTVEC(rsvde6), IDTVEC(rsvde7),
1821 	IDTVEC(rsvde8), IDTVEC(rsvde9), IDTVEC(rsvdea), IDTVEC(rsvdeb),
1822 	IDTVEC(rsvdec), IDTVEC(rsvded), IDTVEC(rsvdee), IDTVEC(rsvdef),
1823 	IDTVEC(rsvdf0), IDTVEC(rsvdf1), IDTVEC(rsvdf2), IDTVEC(rsvdf3),
1824 	IDTVEC(rsvdf4), IDTVEC(rsvdf5), IDTVEC(rsvdf6), IDTVEC(rsvdf7),
1825 	IDTVEC(rsvdf8), IDTVEC(rsvdf9), IDTVEC(rsvdfa), IDTVEC(rsvdfb),
1826 	IDTVEC(rsvdfc), IDTVEC(rsvdfd), IDTVEC(rsvdfe), IDTVEC(rsvdff);
1827 
1828 inthand_t *rsvdary[NIDT] = {
1829 	&IDTVEC(rsvd00), &IDTVEC(rsvd01), &IDTVEC(rsvd02), &IDTVEC(rsvd03),
1830 	&IDTVEC(rsvd04), &IDTVEC(rsvd05), &IDTVEC(rsvd06), &IDTVEC(rsvd07),
1831 	&IDTVEC(rsvd08), &IDTVEC(rsvd09), &IDTVEC(rsvd0a), &IDTVEC(rsvd0b),
1832 	&IDTVEC(rsvd0c), &IDTVEC(rsvd0d), &IDTVEC(rsvd0e), &IDTVEC(rsvd0f),
1833 	&IDTVEC(rsvd10), &IDTVEC(rsvd11), &IDTVEC(rsvd12), &IDTVEC(rsvd13),
1834 	&IDTVEC(rsvd14), &IDTVEC(rsvd15), &IDTVEC(rsvd16), &IDTVEC(rsvd17),
1835 	&IDTVEC(rsvd18), &IDTVEC(rsvd19), &IDTVEC(rsvd1a), &IDTVEC(rsvd1b),
1836 	&IDTVEC(rsvd1c), &IDTVEC(rsvd1d), &IDTVEC(rsvd1e), &IDTVEC(rsvd1f),
1837 	&IDTVEC(rsvd20), &IDTVEC(rsvd21), &IDTVEC(rsvd22), &IDTVEC(rsvd23),
1838 	&IDTVEC(rsvd24), &IDTVEC(rsvd25), &IDTVEC(rsvd26), &IDTVEC(rsvd27),
1839 	&IDTVEC(rsvd28), &IDTVEC(rsvd29), &IDTVEC(rsvd2a), &IDTVEC(rsvd2b),
1840 	&IDTVEC(rsvd2c), &IDTVEC(rsvd2d), &IDTVEC(rsvd2e), &IDTVEC(rsvd2f),
1841 	&IDTVEC(rsvd30), &IDTVEC(rsvd31), &IDTVEC(rsvd32), &IDTVEC(rsvd33),
1842 	&IDTVEC(rsvd34), &IDTVEC(rsvd35), &IDTVEC(rsvd36), &IDTVEC(rsvd37),
1843 	&IDTVEC(rsvd38), &IDTVEC(rsvd39), &IDTVEC(rsvd3a), &IDTVEC(rsvd3b),
1844 	&IDTVEC(rsvd3c), &IDTVEC(rsvd3d), &IDTVEC(rsvd3e), &IDTVEC(rsvd3f),
1845 	&IDTVEC(rsvd40), &IDTVEC(rsvd41), &IDTVEC(rsvd42), &IDTVEC(rsvd43),
1846 	&IDTVEC(rsvd44), &IDTVEC(rsvd45), &IDTVEC(rsvd46), &IDTVEC(rsvd47),
1847 	&IDTVEC(rsvd48), &IDTVEC(rsvd49), &IDTVEC(rsvd4a), &IDTVEC(rsvd4b),
1848 	&IDTVEC(rsvd4c), &IDTVEC(rsvd4d), &IDTVEC(rsvd4e), &IDTVEC(rsvd4f),
1849 	&IDTVEC(rsvd50), &IDTVEC(rsvd51), &IDTVEC(rsvd52), &IDTVEC(rsvd53),
1850 	&IDTVEC(rsvd54), &IDTVEC(rsvd55), &IDTVEC(rsvd56), &IDTVEC(rsvd57),
1851 	&IDTVEC(rsvd58), &IDTVEC(rsvd59), &IDTVEC(rsvd5a), &IDTVEC(rsvd5b),
1852 	&IDTVEC(rsvd5c), &IDTVEC(rsvd5d), &IDTVEC(rsvd5e), &IDTVEC(rsvd5f),
1853 	&IDTVEC(rsvd60), &IDTVEC(rsvd61), &IDTVEC(rsvd62), &IDTVEC(rsvd63),
1854 	&IDTVEC(rsvd64), &IDTVEC(rsvd65), &IDTVEC(rsvd66), &IDTVEC(rsvd67),
1855 	&IDTVEC(rsvd68), &IDTVEC(rsvd69), &IDTVEC(rsvd6a), &IDTVEC(rsvd6b),
1856 	&IDTVEC(rsvd6c), &IDTVEC(rsvd6d), &IDTVEC(rsvd6e), &IDTVEC(rsvd6f),
1857 	&IDTVEC(rsvd70), &IDTVEC(rsvd71), &IDTVEC(rsvd72), &IDTVEC(rsvd73),
1858 	&IDTVEC(rsvd74), &IDTVEC(rsvd75), &IDTVEC(rsvd76), &IDTVEC(rsvd77),
1859 	&IDTVEC(rsvd78), &IDTVEC(rsvd79), &IDTVEC(rsvd7a), &IDTVEC(rsvd7b),
1860 	&IDTVEC(rsvd7c), &IDTVEC(rsvd7d), &IDTVEC(rsvd7e), &IDTVEC(rsvd7f),
1861 	&IDTVEC(rsvd80), &IDTVEC(rsvd81), &IDTVEC(rsvd82), &IDTVEC(rsvd83),
1862 	&IDTVEC(rsvd84), &IDTVEC(rsvd85), &IDTVEC(rsvd86), &IDTVEC(rsvd87),
1863 	&IDTVEC(rsvd88), &IDTVEC(rsvd89), &IDTVEC(rsvd8a), &IDTVEC(rsvd8b),
1864 	&IDTVEC(rsvd8c), &IDTVEC(rsvd8d), &IDTVEC(rsvd8e), &IDTVEC(rsvd8f),
1865 	&IDTVEC(rsvd90), &IDTVEC(rsvd91), &IDTVEC(rsvd92), &IDTVEC(rsvd93),
1866 	&IDTVEC(rsvd94), &IDTVEC(rsvd95), &IDTVEC(rsvd96), &IDTVEC(rsvd97),
1867 	&IDTVEC(rsvd98), &IDTVEC(rsvd99), &IDTVEC(rsvd9a), &IDTVEC(rsvd9b),
1868 	&IDTVEC(rsvd9c), &IDTVEC(rsvd9d), &IDTVEC(rsvd9e), &IDTVEC(rsvd9f),
1869 	&IDTVEC(rsvda0), &IDTVEC(rsvda1), &IDTVEC(rsvda2), &IDTVEC(rsvda3),
1870 	&IDTVEC(rsvda4), &IDTVEC(rsvda5), &IDTVEC(rsvda6), &IDTVEC(rsvda7),
1871 	&IDTVEC(rsvda8), &IDTVEC(rsvda9), &IDTVEC(rsvdaa), &IDTVEC(rsvdab),
1872 	&IDTVEC(rsvdac), &IDTVEC(rsvdad), &IDTVEC(rsvdae), &IDTVEC(rsvdaf),
1873 	&IDTVEC(rsvdb0), &IDTVEC(rsvdb1), &IDTVEC(rsvdb2), &IDTVEC(rsvdb3),
1874 	&IDTVEC(rsvdb4), &IDTVEC(rsvdb5), &IDTVEC(rsvdb6), &IDTVEC(rsvdb7),
1875 	&IDTVEC(rsvdb8), &IDTVEC(rsvdb9), &IDTVEC(rsvdba), &IDTVEC(rsvdbb),
1876 	&IDTVEC(rsvdbc), &IDTVEC(rsvdbd), &IDTVEC(rsvdbe), &IDTVEC(rsvdbf),
1877 	&IDTVEC(rsvdc0), &IDTVEC(rsvdc1), &IDTVEC(rsvdc2), &IDTVEC(rsvdc3),
1878 	&IDTVEC(rsvdc4), &IDTVEC(rsvdc5), &IDTVEC(rsvdc6), &IDTVEC(rsvdc7),
1879 	&IDTVEC(rsvdc8), &IDTVEC(rsvdc9), &IDTVEC(rsvdca), &IDTVEC(rsvdcb),
1880 	&IDTVEC(rsvdcc), &IDTVEC(rsvdcd), &IDTVEC(rsvdce), &IDTVEC(rsvdcf),
1881 	&IDTVEC(rsvdd0), &IDTVEC(rsvdd1), &IDTVEC(rsvdd2), &IDTVEC(rsvdd3),
1882 	&IDTVEC(rsvdd4), &IDTVEC(rsvdd5), &IDTVEC(rsvdd6), &IDTVEC(rsvdd7),
1883 	&IDTVEC(rsvdd8), &IDTVEC(rsvdd9), &IDTVEC(rsvdda), &IDTVEC(rsvddb),
1884 	&IDTVEC(rsvddc), &IDTVEC(rsvddd), &IDTVEC(rsvdde), &IDTVEC(rsvddf),
1885 	&IDTVEC(rsvde0), &IDTVEC(rsvde1), &IDTVEC(rsvde2), &IDTVEC(rsvde3),
1886 	&IDTVEC(rsvde4), &IDTVEC(rsvde5), &IDTVEC(rsvde6), &IDTVEC(rsvde7),
1887 	&IDTVEC(rsvde8), &IDTVEC(rsvde9), &IDTVEC(rsvdea), &IDTVEC(rsvdeb),
1888 	&IDTVEC(rsvdec), &IDTVEC(rsvded), &IDTVEC(rsvdee), &IDTVEC(rsvdef),
1889 	&IDTVEC(rsvdf0), &IDTVEC(rsvdf1), &IDTVEC(rsvdf2), &IDTVEC(rsvdf3),
1890 	&IDTVEC(rsvdf4), &IDTVEC(rsvdf5), &IDTVEC(rsvdf6), &IDTVEC(rsvdf7),
1891 	&IDTVEC(rsvdf8), &IDTVEC(rsvdf9), &IDTVEC(rsvdfa), &IDTVEC(rsvdfb),
1892 	&IDTVEC(rsvdfc), &IDTVEC(rsvdfd), &IDTVEC(rsvdfe), &IDTVEC(rsvdff)
1893 };
1894 
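/*
 * Converters between the "soft" segment descriptor format used by gdt_segs[]
 * and the packed hardware formats: the base is split into a 24-bit low field
 * plus an 8-bit high field (40 bits for system descriptors), and the 20-bit
 * limit into 16-bit low and 4-bit high fields.
 */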
1895 void
1896 sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
1897 {
1898 	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
1899 	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
1900 	ssd->ssd_type  = sd->sd_type;
1901 	ssd->ssd_dpl   = sd->sd_dpl;
1902 	ssd->ssd_p     = sd->sd_p;
1903 	ssd->ssd_def32 = sd->sd_def32;
1904 	ssd->ssd_gran  = sd->sd_gran;
1905 }
1906 
1907 void
1908 ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
1909 {
1910 
1911 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
1912 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
1913 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
1914 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
1915 	sd->sd_type  = ssd->ssd_type;
1916 	sd->sd_dpl   = ssd->ssd_dpl;
1917 	sd->sd_p     = ssd->ssd_p;
1918 	sd->sd_long  = ssd->ssd_long;
1919 	sd->sd_def32 = ssd->ssd_def32;
1920 	sd->sd_gran  = ssd->ssd_gran;
1921 }
1922 
1923 void
1924 ssdtosyssd(struct soft_segment_descriptor *ssd,
1925     struct system_segment_descriptor *sd)
1926 {
1927 
1928 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
1929 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
1930 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
1931 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
1932 	sd->sd_type  = ssd->ssd_type;
1933 	sd->sd_dpl   = ssd->ssd_dpl;
1934 	sd->sd_p     = ssd->ssd_p;
1935 	sd->sd_gran  = ssd->ssd_gran;
1936 }
1937 
1938 /*
1939  * Populate the (physmap) array with base/bound pairs describing the
1940  * available physical memory in the system, then test this memory and
1941  * build the phys_avail array describing the actually-available memory.
1942  *
1943  * If we cannot accurately determine the physical memory map, then use
1944  * the value from the 0xE801 call, and failing that, the RTC.
1945  *
1946  * Total memory size may be set by the kernel environment variable
1947  * hw.physmem or the compile-time define MAXMEM.
1948  *
1949  * Memory is aligned to PHYSMAP_ALIGN which must be a multiple
1950  * of PAGE_SIZE.  This also greatly reduces the memory test time
1951  * which would otherwise be excessive on machines with > 8G of ram.
1952  *
1953  * XXX first should be vm_paddr_t.
1954  */
1955 
1956 #define PHYSMAP_ALIGN		(vm_paddr_t)(128 * 1024)
1957 #define PHYSMAP_ALIGN_MASK	(vm_paddr_t)(PHYSMAP_ALIGN - 1)
1958 #define PHYSMAP_SIZE		VM_PHYSSEG_MAX
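/*
 * The usual power-of-two alignment idiom is used throughout getmemsize():
 * round up with (addr + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK and round
 * down with addr & ~PHYSMAP_ALIGN_MASK.  With the 128KB alignment above,
 * e.g. 0x1f000 rounds up to 0x20000.
 */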
1959 
1960 vm_paddr_t physmap[PHYSMAP_SIZE];
1961 struct bios_smap *smapbase, *smap, *smapend;
1962 struct efi_map_header *efihdrbase;
1963 u_int32_t smapsize;
1964 
1965 #define PHYSMAP_HANDWAVE	(vm_paddr_t)(2 * 1024 * 1024)
1966 #define PHYSMAP_HANDWAVE_MASK	(PHYSMAP_HANDWAVE - 1)
1967 
1968 static void
1969 add_smap_entries(int *physmap_idx)
1970 {
1971 	int i;
1972 
1973 	smapsize = *((u_int32_t *)smapbase - 1);
1974 	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
1975 
1976 	for (smap = smapbase; smap < smapend; smap++) {
1977 		if (boothowto & RB_VERBOSE)
1978 			kprintf("SMAP type=%02x base=%016lx len=%016lx\n",
1979 			    smap->type, smap->base, smap->length);
1980 
1981 		if (smap->type != SMAP_TYPE_MEMORY)
1982 			continue;
1983 
1984 		if (smap->length == 0)
1985 			continue;
1986 
1987 		for (i = 0; i <= *physmap_idx; i += 2) {
1988 			if (smap->base < physmap[i + 1]) {
1989 				if (boothowto & RB_VERBOSE) {
1990 					kprintf("Overlapping or non-monotonic "
1991 						"memory region, ignoring "
1992 						"second region\n");
1993 				}
1994 				break;
1995 			}
1996 		}
1997 		if (i <= *physmap_idx)
1998 			continue;
1999 
2000 		Realmem += smap->length;
2001 
2002 		if (smap->base == physmap[*physmap_idx + 1]) {
2003 			physmap[*physmap_idx + 1] += smap->length;
2004 			continue;
2005 		}
2006 
2007 		*physmap_idx += 2;
2008 		if (*physmap_idx == PHYSMAP_SIZE) {
2009 			kprintf("Too many segments in the physical "
2010 				"address map, giving up\n");
2011 			break;
2012 		}
2013 		physmap[*physmap_idx] = smap->base;
2014 		physmap[*physmap_idx + 1] = smap->base + smap->length;
2015 	}
2016 }
2017 
2018 static void
2019 add_efi_map_entries(int *physmap_idx)
2020 {
2021 	struct efi_md *map, *p;
2022 	const char *type;
2023 	size_t efisz;
2024 	int i, ndesc;
2025 
2026 	static const char *types[] = {
2027 		"Reserved",
2028 		"LoaderCode",
2029 		"LoaderData",
2030 		"BootServicesCode",
2031 		"BootServicesData",
2032 		"RuntimeServicesCode",
2033 		"RuntimeServicesData",
2034 		"ConventionalMemory",
2035 		"UnusableMemory",
2036 		"ACPIReclaimMemory",
2037 		"ACPIMemoryNVS",
2038 		"MemoryMappedIO",
2039 		"MemoryMappedIOPortSpace",
2040 		"PalCode"
2041 	 };
2042 
2043 	/*
2044 	 * Memory map data provided by UEFI via the GetMemoryMap
2045 	 * Boot Services API.
2046 	 */
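	/* The memory descriptors follow the header, rounded up to 16 bytes. */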
2047 	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
2048 	map = (struct efi_md *)((uint8_t *)efihdrbase + efisz);
2049 
2050 	if (efihdrbase->descriptor_size == 0)
2051 		return;
2052 	ndesc = efihdrbase->memory_size / efihdrbase->descriptor_size;
2053 
2054 	if (boothowto & RB_VERBOSE)
2055 		kprintf("%23s %12s %12s %8s %4s\n",
2056 		    "Type", "Physical", "Virtual", "#Pages", "Attr");
2057 
2058 	for (i = 0, p = map; i < ndesc; i++,
2059 	    p = efi_next_descriptor(p, efihdrbase->descriptor_size)) {
2060 		if (boothowto & RB_VERBOSE) {
2061 			if (p->md_type <= EFI_MD_TYPE_PALCODE)
2062 				type = types[p->md_type];
2063 			else
2064 				type = "<INVALID>";
2065 			kprintf("%23s %012lx %12p %08lx ", type, p->md_phys,
2066 			    p->md_virt, p->md_pages);
2067 			if (p->md_attr & EFI_MD_ATTR_UC)
2068 				kprintf("UC ");
2069 			if (p->md_attr & EFI_MD_ATTR_WC)
2070 				kprintf("WC ");
2071 			if (p->md_attr & EFI_MD_ATTR_WT)
2072 				kprintf("WT ");
2073 			if (p->md_attr & EFI_MD_ATTR_WB)
2074 				kprintf("WB ");
2075 			if (p->md_attr & EFI_MD_ATTR_UCE)
2076 				kprintf("UCE ");
2077 			if (p->md_attr & EFI_MD_ATTR_WP)
2078 				kprintf("WP ");
2079 			if (p->md_attr & EFI_MD_ATTR_RP)
2080 				kprintf("RP ");
2081 			if (p->md_attr & EFI_MD_ATTR_XP)
2082 				kprintf("XP ");
2083 			if (p->md_attr & EFI_MD_ATTR_RT)
2084 				kprintf("RUNTIME");
2085 			kprintf("\n");
2086 		}
2087 
2088 		switch (p->md_type) {
2089 		case EFI_MD_TYPE_CODE:
2090 		case EFI_MD_TYPE_DATA:
2091 		case EFI_MD_TYPE_BS_CODE:
2092 		case EFI_MD_TYPE_BS_DATA:
2093 		case EFI_MD_TYPE_FREE:
2094 			/*
2095 			 * We're allowed to use any entry with these types.
2096 			 */
2097 			break;
2098 		default:
2099 			continue;
2100 		}
2101 
2102 		Realmem += p->md_pages * PAGE_SIZE;
2103 
2104 		if (p->md_phys == physmap[*physmap_idx + 1]) {
2105 			physmap[*physmap_idx + 1] += p->md_pages * PAGE_SIZE;
2106 			continue;
2107 		}
2108 
2109 		*physmap_idx += 2;
2110 		if (*physmap_idx == PHYSMAP_SIZE) {
2111 			kprintf("Too many segments in the physical "
2112 				"address map, giving up\n");
2113 			break;
2114 		}
2115 		physmap[*physmap_idx] = p->md_phys;
2116 		physmap[*physmap_idx + 1] = p->md_phys + p->md_pages * PAGE_SIZE;
2117 	 }
2118 }
2119 
2120 struct fb_info efi_fb_info;
2121 static int have_efi_framebuffer = 0;
2122 
2123 static void
2124 efi_fb_init_vaddr(int direct_map)
2125 {
2126 	uint64_t sz;
2127 	vm_offset_t addr, v;
2128 
2129 	v = efi_fb_info.vaddr;
2130 	sz = efi_fb_info.stride * efi_fb_info.height;
2131 
2132 	if (direct_map) {
2133 		addr = PHYS_TO_DMAP(efi_fb_info.paddr);
2134 		if (addr >= DMAP_MIN_ADDRESS && addr + sz <= DMapMaxAddress)
2135 			efi_fb_info.vaddr = addr;
2136 	} else {
2137 		efi_fb_info.vaddr =
2138 			(vm_offset_t)pmap_mapdev_attr(efi_fb_info.paddr,
2139 						      sz,
2140 						      PAT_WRITE_COMBINING);
2141 	}
2142 }
2143 
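/*
 * The color depth is the 1-based position of the highest bit set in the
 * combined component masks.  For example, a 32-bit XRGB layout with masks
 * 0x00ff0000/0x0000ff00/0x000000ff/0xff000000 combines to 0xffffffff and
 * yields a depth of 32.
 */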
2144 static u_int
2145 efifb_color_depth(struct efi_fb *efifb)
2146 {
2147 	uint32_t mask;
2148 	u_int depth;
2149 
2150 	mask = efifb->fb_mask_red | efifb->fb_mask_green |
2151 	    efifb->fb_mask_blue | efifb->fb_mask_reserved;
2152 	if (mask == 0)
2153 		return (0);
2154 	for (depth = 1; mask != 1; depth++)
2155 		mask >>= 1;
2156 	return (depth);
2157 }
2158 
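/*
 * Locate the EFI framebuffer metadata supplied by the loader and record it
 * in efi_fb_info.  Returns 0 if a usable framebuffer is (or already was)
 * available, nonzero otherwise.
 */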
2159 int
2160 probe_efi_fb(int early)
2161 {
2162 	struct efi_fb	*efifb;
2163 	caddr_t		kmdp;
2164 	u_int		depth;
2165 
2166 	if (have_efi_framebuffer) {
2167 		if (!early &&
2168 		    (efi_fb_info.vaddr == 0 ||
2169 		     efi_fb_info.vaddr == PHYS_TO_DMAP(efi_fb_info.paddr)))
2170 			efi_fb_init_vaddr(0);
2171 		return 0;
2172 	}
2173 
2174 	kmdp = preload_search_by_type("elf kernel");
2175 	if (kmdp == NULL)
2176 		kmdp = preload_search_by_type("elf64 kernel");
2177 	efifb = (struct efi_fb *)preload_search_info(kmdp,
2178 	    MODINFO_METADATA | MODINFOMD_EFI_FB);
2179 	if (efifb == NULL)
2180 		return 1;
2181 
2182 	depth = efifb_color_depth(efifb);
2183 	/*
2184 	 * The bootloader should already have noticed if we will not be
2185 	 * able to use the UEFI framebuffer.
2186 	 */
2187 	if (depth != 24 && depth != 32)
2188 		return 1;
2189 
2190 	have_efi_framebuffer = 1;
2191 
2192 	efi_fb_info.is_vga_boot_display = 1;
2193 	efi_fb_info.width = efifb->fb_width;
2194 	efi_fb_info.height = efifb->fb_height;
2195 	efi_fb_info.depth = depth;
2196 	efi_fb_info.stride = efifb->fb_stride * (depth / 8);
2197 	efi_fb_info.paddr = efifb->fb_addr;
2198 	if (early) {
2199 		efi_fb_info.vaddr = 0;
2200 	} else {
2201 		efi_fb_init_vaddr(0);
2202 	}
2203 	efi_fb_info.fbops.fb_set_par = NULL;
2204 	efi_fb_info.fbops.fb_blank = NULL;
2205 	efi_fb_info.fbops.fb_debug_enter = NULL;
2206 	efi_fb_info.device = NULL;
2207 
2208 	return 0;
2209 }
2210 
2211 static void
2212 efifb_startup(void *arg)
2213 {
2214 	probe_efi_fb(0);
2215 }
2216 
2217 SYSINIT(efi_fb_info, SI_BOOT1_POST, SI_ORDER_FIRST, efifb_startup, NULL);
2218 
2219 static void
2220 getmemsize(caddr_t kmdp, u_int64_t first)
2221 {
2222 	int off, physmap_idx, pa_indx, da_indx;
2223 	int i, j;
2224 	vm_paddr_t pa;
2225 	vm_paddr_t msgbuf_size;
2226 	u_long physmem_tunable;
2227 	pt_entry_t *pte;
2228 	quad_t dcons_addr, dcons_size;
2229 
2230 	bzero(physmap, sizeof(physmap));
2231 	physmap_idx = 0;
2232 
2233 	/*
2234 	 * get memory map from INT 15:E820, kindly supplied by the loader.
2235 	 *
2236 	 * subr_module.c says:
2237 	 * "Consumer may safely assume that size value precedes data."
2238 	 * ie: an int32_t immediately precedes smap.
2239 	 */
2240 	efihdrbase = (struct efi_map_header *)preload_search_info(kmdp,
2241 		     MODINFO_METADATA | MODINFOMD_EFI_MAP);
2242 	smapbase = (struct bios_smap *)preload_search_info(kmdp,
2243 		   MODINFO_METADATA | MODINFOMD_SMAP);
2244 	if (smapbase == NULL && efihdrbase == NULL)
2245 		panic("No BIOS smap or EFI map info from loader!");
2246 
2247 	if (efihdrbase == NULL)
2248 		add_smap_entries(&physmap_idx);
2249 	else
2250 		add_efi_map_entries(&physmap_idx);
2251 
2252 	base_memory = physmap[1] / 1024;
2253 	/* make hole for AP bootstrap code */
2254 	physmap[1] = mp_bootaddress(base_memory);
2255 
2256 	/* Save EBDA address, if any */
2257 	ebda_addr = (u_long)(*(u_short *)(KERNBASE + 0x40e));
2258 	ebda_addr <<= 4;
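	/*
	 * The BIOS data area word at 0x40e holds the EBDA real-mode segment;
	 * shifting left by 4 converts the segment to a physical address.
	 */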
2259 
2260 	/*
2261 	 * Maxmem isn't the "maximum memory"; it's one larger than the
2262 	 * highest page of the physical address space.  It should be
2263 	 * called something like "Maxphyspage".  We may adjust this
2264 	 * based on ``hw.physmem'' and the results of the memory test.
2265 	 */
2266 	Maxmem = atop(physmap[physmap_idx + 1]);
2267 
2268 #ifdef MAXMEM
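	/* MAXMEM is given in kilobytes; Maxmem counts 4KB pages, hence /4. */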
2269 	Maxmem = MAXMEM / 4;
2270 #endif
2271 
2272 	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
2273 		Maxmem = atop(physmem_tunable);
2274 
2275 	/*
2276 	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
2277 	 * in the system.
2278 	 */
2279 	if (Maxmem > atop(physmap[physmap_idx + 1]))
2280 		Maxmem = atop(physmap[physmap_idx + 1]);
2281 
2282 	/*
2283 	 * Blowing out the DMAP will blow up the system.
2284 	 */
2285 	if (Maxmem > atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS)) {
2286 		kprintf("Limiting Maxmem due to DMAP size\n");
2287 		Maxmem = atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS);
2288 	}
2289 
2290 	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
2291 	    (boothowto & RB_VERBOSE)) {
2292 		kprintf("Physical memory use set to %ldK\n", Maxmem * 4);
2293 	}
2294 
2295 	/*
2296 	 * Call pmap initialization to make new kernel address space
2297 	 *
2298 	 * Mask off page 0.
2299 	 */
2300 	pmap_bootstrap(&first);
2301 	physmap[0] = PAGE_SIZE;
2302 
2303 	/*
2304 	 * Align the physmap to PHYSMAP_ALIGN and cut out anything
2305 	 * exceeding Maxmem.
2306 	 */
2307 	for (i = j = 0; i <= physmap_idx; i += 2) {
2308 		if (physmap[i+1] > ptoa(Maxmem))
2309 			physmap[i+1] = ptoa(Maxmem);
2310 		physmap[i] = (physmap[i] + PHYSMAP_ALIGN_MASK) &
2311 			     ~PHYSMAP_ALIGN_MASK;
2312 		physmap[i+1] = physmap[i+1] & ~PHYSMAP_ALIGN_MASK;
2313 
2314 		physmap[j] = physmap[i];
2315 		physmap[j+1] = physmap[i+1];
2316 
2317 		if (physmap[i] < physmap[i+1])
2318 			j += 2;
2319 	}
2320 	physmap_idx = j - 2;
2321 
2322 	/*
2323 	 * Align anything else used in the validation loop.
2324 	 *
2325 	 * Also make sure that our 2MB kernel text+data+bss mappings
2326 	 * do not overlap potentially allocatable space.
2327 	 */
2328 	first = (first + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;
2329 
2330 	/*
2331 	 * Size up each available chunk of physical memory.
2332 	 */
2333 	pa_indx = 0;
2334 	da_indx = 0;
2335 	phys_avail[pa_indx].phys_beg = physmap[0];
2336 	phys_avail[pa_indx].phys_end = physmap[0];
2337 	dump_avail[da_indx].phys_beg = 0;
2338 	dump_avail[da_indx].phys_end = physmap[0];
2339 	pte = CMAP1;
2340 
2341 	/*
2342 	 * Get dcons buffer address
2343 	 */
2344 	if (kgetenv_quad("dcons.addr", &dcons_addr) == 0 ||
2345 	    kgetenv_quad("dcons.size", &dcons_size) == 0)
2346 		dcons_addr = 0;
2347 
2348 	/*
2349 	 * Validate the physical memory.  The physical memory segments
2350 	 * have already been aligned to PHYSMAP_ALIGN which is a multiple
2351 	 * of PAGE_SIZE.
2352 	 *
2353 	 * We no longer perform an exhaustive memory test.  Instead we
2354 	 * simply test the first and last word in each physmap[]
2355 	 * segment.
2356 	 */
2357 	for (i = 0; i <= physmap_idx; i += 2) {
2358 		vm_paddr_t end;
2359 		vm_paddr_t incr;
2360 
2361 		end = physmap[i + 1];
2362 
2363 		for (pa = physmap[i]; pa < end; pa += incr) {
2364 			int page_bad, full;
2365 			volatile uint64_t *ptr = (uint64_t *)CADDR1;
2366 			uint64_t tmp;
2367 
2368 			full = FALSE;
2369 
2370 			/*
2371 			 * Calculate incr.  Just test the first and
2372 			 * last page in each physmap[] segment.
2373 			 */
2374 			if (pa == end - PAGE_SIZE)
2375 				incr = PAGE_SIZE;
2376 			else
2377 				incr = end - pa - PAGE_SIZE;
2378 
2379 			/*
2380 			 * Make sure we don't skip blacked out areas.
2381 			 */
2382 			if (pa < 0x200000 && 0x200000 < end) {
2383 				incr = 0x200000 - pa;
2384 			}
2385 			if (dcons_addr > 0 &&
2386 			    pa < dcons_addr &&
2387 			    dcons_addr < end) {
2388 				incr = dcons_addr - pa;
2389 			}
2390 
2391 			/*
2392 			 * Block out kernel memory as not available.
2393 			 */
2394 			if (pa >= 0x200000 && pa < first) {
2395 				incr = first - pa;
2396 				if (pa + incr > end)
2397 					incr = end - pa;
2398 				goto do_dump_avail;
2399 			}
2400 
2401 			/*
2402 			 * Block out the dcons buffer if it exists.
2403 			 */
2404 			if (dcons_addr > 0 &&
2405 			    pa >= trunc_page(dcons_addr) &&
2406 			    pa < dcons_addr + dcons_size) {
2407 				incr = dcons_addr + dcons_size - pa;
2408 				incr = (incr + PAGE_MASK) &
2409 				       ~(vm_paddr_t)PAGE_MASK;
2410 				if (pa + incr > end)
2411 					incr = end - pa;
2412 				goto do_dump_avail;
2413 			}
2414 
2415 			page_bad = FALSE;
2416 
2417 			/*
2418 			 * Map the page non-cacheable for the memory
2419 			 * test.
2420 			 */
2421 			*pte = pa |
2422 			    kernel_pmap.pmap_bits[PG_V_IDX] |
2423 			    kernel_pmap.pmap_bits[PG_RW_IDX] |
2424 			    kernel_pmap.pmap_bits[PG_N_IDX];
2425 			cpu_invlpg(__DEVOLATILE(void *, ptr));
2426 			cpu_mfence();
2427 
2428 			/*
2429 			 * Save original value for restoration later.
2430 			 */
2431 			tmp = *ptr;
2432 
2433 			/*
2434 			 * Test for alternating 1's and 0's
2435 			 */
2436 			*ptr = 0xaaaaaaaaaaaaaaaaLLU;
2437 			cpu_mfence();
2438 			if (*ptr != 0xaaaaaaaaaaaaaaaaLLU)
2439 				page_bad = TRUE;
2440 			/*
2441 			 * Test for alternating 0's and 1's
2442 			 */
2443 			*ptr = 0x5555555555555555LLU;
2444 			cpu_mfence();
2445 			if (*ptr != 0x5555555555555555LLU)
2446 				page_bad = TRUE;
2447 			/*
2448 			 * Test for all 1's
2449 			 */
2450 			*ptr = 0xffffffffffffffffLLU;
2451 			cpu_mfence();
2452 			if (*ptr != 0xffffffffffffffffLLU)
2453 				page_bad = TRUE;
2454 			/*
2455 			 * Test for all 0's
2456 			 */
2457 			*ptr = 0x0;
2458 			cpu_mfence();
2459 			if (*ptr != 0x0)
2460 				page_bad = TRUE;
2461 
2462 			/*
2463 			 * Restore original value.
2464 			 */
2465 			*ptr = tmp;
2466 
2467 			/*
2468 			 * Adjust array of valid/good pages.
2469 			 */
2470 			if (page_bad == TRUE) {
2471 				incr = PAGE_SIZE;
2472 				continue;
2473 			}
2474 
2475 			/*
2476 			 * Collapse page address into phys_avail[].  Do a
2477 			 * continuation of the current phys_avail[] index
2478 			 * when possible.
2479 			 */
2480 			if (phys_avail[pa_indx].phys_end == pa) {
2481 				/*
2482 				 * Continuation
2483 				 */
2484 				phys_avail[pa_indx].phys_end += incr;
2485 			} else if (phys_avail[pa_indx].phys_beg ==
2486 				   phys_avail[pa_indx].phys_end) {
2487 				/*
2488 				 * Current phys_avail is completely empty,
2489 				 * reuse the index.
2490 				 */
2491 				phys_avail[pa_indx].phys_beg = pa;
2492 				phys_avail[pa_indx].phys_end = pa + incr;
2493 			} else {
2494 				/*
2495 				 * Allocate next phys_avail index.
2496 				 */
2497 				++pa_indx;
2498 				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
2499 					kprintf(
2500 		"Too many holes in the physical address space, giving up\n");
2501 					--pa_indx;
2502 					full = TRUE;
2503 					goto do_dump_avail;
2504 				}
2505 				phys_avail[pa_indx].phys_beg = pa;
2506 				phys_avail[pa_indx].phys_end = pa + incr;
2507 			}
2508 			physmem += incr / PAGE_SIZE;
2509 
2510 			/*
2511 			 * pa available for dumping
2512 			 */
2513 do_dump_avail:
2514 			if (dump_avail[da_indx].phys_end == pa) {
2515 				dump_avail[da_indx].phys_end += incr;
2516 			} else {
2517 				++da_indx;
2518 				if (da_indx == DUMP_AVAIL_ARRAY_END) {
2519 					--da_indx;
2520 					goto do_next;
2521 				}
2522 				dump_avail[da_indx].phys_beg = pa;
2523 				dump_avail[da_indx].phys_end = pa + incr;
2524 			}
2525 do_next:
2526 			if (full)
2527 				break;
2528 		}
2529 	}
2530 	*pte = 0;
2531 	cpu_invltlb();
2532 	cpu_mfence();
2533 
2534 	/*
2535 	 * The last chunk must contain at least one page plus the message
2536 	 * buffer to avoid complicating other code (message buffer address
2537 	 * calculation, etc.).
2538 	 */
2539 	msgbuf_size = (MSGBUF_SIZE + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;
2540 
2541 	while (phys_avail[pa_indx].phys_beg + PHYSMAP_ALIGN + msgbuf_size >=
2542 	       phys_avail[pa_indx].phys_end) {
2543 		physmem -= atop(phys_avail[pa_indx].phys_end -
2544 				phys_avail[pa_indx].phys_beg);
2545 		phys_avail[pa_indx].phys_beg = 0;
2546 		phys_avail[pa_indx].phys_end = 0;
2547 		--pa_indx;
2548 	}
2549 
2550 	Maxmem = atop(phys_avail[pa_indx].phys_end);
2551 
2552 	/* Trim off space for the message buffer. */
2553 	phys_avail[pa_indx].phys_end -= msgbuf_size;
2554 
2555 	avail_end = phys_avail[pa_indx].phys_end;
2556 
2557 	/* Map the message buffer. */
2558 	for (off = 0; off < msgbuf_size; off += PAGE_SIZE) {
2559 		pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);
2560 	}
2561 
2562 	/*
2563 	 * Try to get EFI framebuffer working as early as possible.
2564 	 *
2565 	 * WARN: Some BIOSes do not list the EFI framebuffer memory, causing
2566 	 * the pmap probe code to create a DMAP that does not cover its
2567 	 * physical address space.  In that case efi_fb_init_vaddr(1) might
2568 	 * not produce an initialized framebuffer base pointer; the later
2569 	 * efi_fb_init_vaddr(0) call will deal with it.
2570 	 */
2571 	if (have_efi_framebuffer)
2572 		efi_fb_init_vaddr(1);
2573 }
2574 
2575 struct machintr_abi MachIntrABI;
2576 
2577 /*
2578  * IDT VECTORS:
2579  *	0	Divide by zero
2580  *	1	Debug
2581  *	2	NMI
2582  *	3	BreakPoint
2583  *	4	OverFlow
2584  *	5	Bound-Range
2585  *	6	Invalid OpCode
2586  *	7	Device Not Available (x87)
2587  *	8	Double-Fault
2588  *	9	Coprocessor Segment overrun (unsupported, reserved)
2589  *	10	Invalid-TSS
2590  *	11	Segment not present
2591  *	12	Stack
2592  *	13	General Protection
2593  *	14	Page Fault
2594  *	15	Reserved
2595  *	16	x87 FP Exception pending
2596  *	17	Alignment Check
2597  *	18	Machine Check
2598  *	19	SIMD floating point
2599  *	20-31	reserved
2600  *	32-255	INTn/external sources
2601  */
2602 u_int64_t
2603 hammer_time(u_int64_t modulep, u_int64_t physfree)
2604 {
2605 	caddr_t kmdp;
2606 	int gsel_tss, x, cpu;
2607 #if 0 /* JG */
2608 	int metadata_missing, off;
2609 #endif
2610 	struct mdglobaldata *gd;
2611 	struct privatespace *ps;
2612 	u_int64_t msr;
2613 
2614 	/*
2615 	 * Prevent lowering of the ipl if we call tsleep() early.
2616 	 */
2617 	gd = &CPU_prvspace[0]->mdglobaldata;
2618 	ps = (struct privatespace *)gd;
2619 	bzero(gd, sizeof(*gd));
2620 	bzero(&ps->common_tss, sizeof(ps->common_tss));
2621 
2622 	/*
2623 	 * Note: on both UP and SMP curthread must be set non-NULL
2624 	 * early in the boot sequence because the system assumes
2625 	 * that 'curthread' is never NULL.
2626 	 */
2627 
2628 	gd->mi.gd_curthread = &thread0;
2629 	thread0.td_gd = &gd->mi;
2630 
2631 	atdevbase = ISA_HOLE_START + PTOV_OFFSET;
2632 
2633 #if 0 /* JG */
2634 	metadata_missing = 0;
2635 	if (bootinfo.bi_modulep) {
2636 		preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
2637 		preload_bootstrap_relocate(KERNBASE);
2638 	} else {
2639 		metadata_missing = 1;
2640 	}
2641 	if (bootinfo.bi_envp)
2642 		kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;
2643 #endif
2644 
2645 	preload_metadata = (caddr_t)(uintptr_t)(modulep + PTOV_OFFSET);
2646 	preload_bootstrap_relocate(PTOV_OFFSET);
2647 	kmdp = preload_search_by_type("elf kernel");
2648 	if (kmdp == NULL)
2649 		kmdp = preload_search_by_type("elf64 kernel");
2650 	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
2651 	kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + PTOV_OFFSET;
2652 #ifdef DDB
2653 	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
2654 	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
2655 #endif
2656 	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
2657 
2658 	if (boothowto & RB_VERBOSE)
2659 		bootverbose++;
2660 
2661 	/*
2662 	 * Default MachIntrABI to ICU
2663 	 */
2664 	MachIntrABI = MachIntrABI_ICU;
2665 
2666 	/*
2667 	 * Start with one cpu.  Note: with one cpu, ncpus_fit_mask remains 0.
2668 	 */
2669 	ncpus = 1;
2670 	ncpus_fit = 1;
2671 	/* Init basic tunables, hz etc */
2672 	init_param1();
2673 
2674 	/*
2675 	 * make gdt memory segments
2676 	 */
2677 	gdt_segs[GPROC0_SEL].ssd_base =
2678 		(uintptr_t) &CPU_prvspace[0]->common_tss;
2679 
2680 	gd->mi.gd_prvspace = CPU_prvspace[0];
2681 
2682 	for (x = 0; x < NGDT; x++) {
2683 		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1))
2684 			ssdtosd(&gdt_segs[x], &gdt[x]);
2685 	}
2686 	ssdtosyssd(&gdt_segs[GPROC0_SEL],
2687 	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
2688 
2689 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
2690 	r_gdt.rd_base = (long) gdt;
2691 	lgdt(&r_gdt);
2692 
2693 	wrmsr(MSR_FSBASE, 0);		/* User value */
2694 	wrmsr(MSR_GSBASE, (u_int64_t)&gd->mi);
2695 	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
2696 
2697 	mi_gdinit(&gd->mi, 0);
2698 	cpu_gdinit(gd, 0);
2699 	proc0paddr = proc0paddr_buff;
2700 	mi_proc0init(&gd->mi, proc0paddr);
2701 	safepri = TDPRI_MAX;
2702 
2703 	/* spinlocks and the BGL */
2704 	init_locks();
2705 
2706 	/* exceptions */
2707 	for (x = 0; x < NIDT; x++)
2708 		setidt_global(x, rsvdary[x], SDT_SYSIGT, SEL_KPL, 0);
2709 	setidt_global(IDT_DE, &IDTVEC(div),  SDT_SYSIGT, SEL_KPL, 0);
2710 	setidt_global(IDT_DB, &IDTVEC(dbg),  SDT_SYSIGT, SEL_KPL, 2);
2711 	setidt_global(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 1);
2712 	setidt_global(IDT_BP, &IDTVEC(bpt),  SDT_SYSIGT, SEL_UPL, 0);
2713 	setidt_global(IDT_OF, &IDTVEC(ofl),  SDT_SYSIGT, SEL_KPL, 0);
2714 	setidt_global(IDT_BR, &IDTVEC(bnd),  SDT_SYSIGT, SEL_KPL, 0);
2715 	setidt_global(IDT_UD, &IDTVEC(ill),  SDT_SYSIGT, SEL_KPL, 0);
2716 	setidt_global(IDT_NM, &IDTVEC(dna),  SDT_SYSIGT, SEL_KPL, 0);
2717 	setidt_global(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
2718 	setidt_global(IDT_FPUGP, &IDTVEC(fpusegm),  SDT_SYSIGT, SEL_KPL, 0);
2719 	setidt_global(IDT_TS, &IDTVEC(tss),  SDT_SYSIGT, SEL_KPL, 0);
2720 	setidt_global(IDT_NP, &IDTVEC(missing),  SDT_SYSIGT, SEL_KPL, 0);
2721 	setidt_global(IDT_SS, &IDTVEC(stk),  SDT_SYSIGT, SEL_KPL, 0);
2722 	setidt_global(IDT_GP, &IDTVEC(prot),  SDT_SYSIGT, SEL_KPL, 0);
2723 	setidt_global(IDT_PF, &IDTVEC(page),  SDT_SYSIGT, SEL_KPL, 0);
2724 	setidt_global(IDT_MF, &IDTVEC(fpu),  SDT_SYSIGT, SEL_KPL, 0);
2725 	setidt_global(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
2726 	setidt_global(IDT_MC, &IDTVEC(mchk),  SDT_SYSIGT, SEL_KPL, 0);
2727 	setidt_global(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);
2728 
2729 	for (cpu = 0; cpu < MAXCPU; ++cpu) {
2730 		r_idt_arr[cpu].rd_limit = sizeof(idt_arr[cpu]) - 1;
2731 		r_idt_arr[cpu].rd_base = (long) &idt_arr[cpu][0];
2732 	}
2733 
2734 	lidt(&r_idt_arr[0]);
2735 
2736 	/*
2737 	 * Initialize the console before we print anything out.
2738 	 */
2739 	cninit();
2740 
2741 #if 0 /* JG */
2742 	if (metadata_missing)
2743 		kprintf("WARNING: loader(8) metadata is missing!\n");
2744 #endif
2745 
2746 #if	NISA > 0
2747 	elcr_probe();
2748 	isa_defaultirq();
2749 #endif
2750 	rand_initialize();
2751 
2752 	/*
2753 	 * Initialize IRQ mapping
2754 	 *
2755 	 * NOTE:
2756 	 * SHOULD be after elcr_probe()
2757 	 */
2758 	MachIntrABI_ICU.initmap();
2759 	MachIntrABI_IOAPIC.initmap();
2760 
2761 #ifdef DDB
2762 	kdb_init();
2763 	if (boothowto & RB_KDB)
2764 		Debugger("Boot flags requested debugger");
2765 #endif
2766 
2767 	identify_cpu();		/* Final stage of CPU initialization */
2768 	initializecpu(0);	/* Initialize CPU registers */
2769 
2770 	/*
2771 	 * On modern Intel cpus, Haswell or later, cpu_idle_hlt=1 is better
2772 	 * because the cpu does significant power management in MWAIT
2773 	 * (also suggested is to set sysctl machdep.mwait.CX.idle=AUTODEEP).
2774 	 *
2775 	 * On many AMD cpus cpu_idle_hlt=3 is better, because the cpu does
2776 	 * significant power management only when using ACPI halt mode.
2777 	 * (However, on Ryzen, mode 4 (HLT) also does power management).
2778 	 *
2779 	 * On older AMD or Intel cpus, cpu_idle_hlt=2 is better because ACPI
2780 	 * is needed to reduce power consumption, but wakeup times are often
2781 	 * too long.
2782 	 */
2783 	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
2784 	    CPUID_TO_MODEL(cpu_id) >= 0x3C) {	/* Haswell or later */
2785 		cpu_idle_hlt = 1;
2786 	}
2787 	if (cpu_vendor_id == CPU_VENDOR_AMD) {
2788 		if (CPUID_TO_FAMILY(cpu_id) >= 0x17) {
2789 			/* Ryzen or later */
2790 			cpu_idle_hlt = 3;
2791 		} else if (CPUID_TO_FAMILY(cpu_id) >= 0x14) {
2792 			/* Bobcat or later */
2793 			cpu_idle_hlt = 3;
2794 		}
2795 	}
2796 
2797 	TUNABLE_INT_FETCH("hw.apic_io_enable", &ioapic_enable); /* for compat */
2798 	TUNABLE_INT_FETCH("hw.ioapic_enable", &ioapic_enable);
2799 	TUNABLE_INT_FETCH("hw.lapic_enable", &lapic_enable);
2800 	TUNABLE_INT_FETCH("machdep.cpu_idle_hlt", &cpu_idle_hlt);
2801 
2802 	/*
2803 	 * Some virtual machines do not work with the I/O APIC
2804 	 * enabled.  If the user does not explicitly enable or
2805 	 * disable the I/O APIC (ioapic_enable < 0), then we
2806 	 * disable I/O APIC on all virtual machines.
2807 	 *
2808 	 * NOTE:
2809 	 * This must be done after identify_cpu(), which sets
2810 	 * 'cpu_feature2'
2811 	 */
2812 	if (ioapic_enable < 0) {
2813 		if (cpu_feature2 & CPUID2_VMM)
2814 			ioapic_enable = 0;
2815 		else
2816 			ioapic_enable = 1;
2817 	}
2818 
2819 	/*
2820 	 * TSS entry point for interrupts, traps, and exceptions
2821 	 * (sans NMI).  This will always go to near the top of the pcpu
2822 	 * trampoline area.  Hardware-pushed data will be copied into
2823 	 * the trap-frame on entry, and (if necessary) returned to the
2824 	 * trampoline on exit.
2825 	 *
2826 	 * We store some pcb data for the trampoline code above the stack
2827 	 * area the cpu hardware pushes into, and arrange things so the
2828 	 * address of tr_pcb_rsp is the same as the desired top of
2829 	 * stack.
2830 	 */
2831 	ps->common_tss.tss_rsp0 = (register_t)&ps->trampoline.tr_pcb_rsp;
2832 	ps->trampoline.tr_pcb_rsp = ps->common_tss.tss_rsp0;
2833 	ps->trampoline.tr_pcb_gs_kernel = (register_t)gd;
2834 	ps->trampoline.tr_pcb_cr3 = KPML4phys;	/* adj to user cr3 live */
2835 	ps->dbltramp.tr_pcb_gs_kernel = (register_t)gd;
2836 	ps->dbltramp.tr_pcb_cr3 = KPML4phys;
2837 	ps->dbgtramp.tr_pcb_gs_kernel = (register_t)gd;
2838 	ps->dbgtramp.tr_pcb_cr3 = KPML4phys;
2839 
2840 	/* double fault stack */
2841 	ps->common_tss.tss_ist1 = (register_t)&ps->dbltramp.tr_pcb_rsp;
2842 	/* #DB debugger needs its own stack */
2843 	ps->common_tss.tss_ist2 = (register_t)&ps->dbgtramp.tr_pcb_rsp;
2844 
2845 	/* Set the IO permission bitmap (empty due to tss seg limit) */
2846 	ps->common_tss.tss_iobase = sizeof(struct x86_64tss);
2847 
2848 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
2849 	gd->gd_tss_gdt = &gdt[GPROC0_SEL];
2850 	gd->gd_common_tssd = *gd->gd_tss_gdt;
2851 	ltr(gsel_tss);
2852 
2853 	/* Set up the fast syscall stuff */
2854 	msr = rdmsr(MSR_EFER) | EFER_SCE;
2855 	wrmsr(MSR_EFER, msr);
2856 	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
2857 	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
2858 	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
2859 	      ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
2860 	wrmsr(MSR_STAR, msr);
2861 	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_IOPL|PSL_AC);
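	/*
	 * MSR_STAR bits 47:32 hold the kernel CS (SS is implied at +8) loaded
	 * by SYSCALL, and bits 63:48 the user selector base used by SYSRET:
	 * 64-bit user CS is that base + 16 and SS is base + 8, matching the
	 * GUCODE32/GUDATA/GUCODE ordering of the GDT above.  MSR_SF_MASK
	 * lists the rflags bits cleared on SYSCALL entry.
	 */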
2862 
2863 	getmemsize(kmdp, physfree);
2864 	init_param2(physmem);
2865 
2866 	/* now running on new page tables, configured, and u/iom is accessible */
2867 
2868 	/* Map the message buffer. */
2869 #if 0 /* JG */
2870 	for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
2871 		pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);
2872 #endif
2873 
2874 	msgbufinit(msgbufp, MSGBUF_SIZE);
2875 
2876 
2877 	/* transfer to user mode */
2878 
2879 	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
2880 	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
2881 	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
2882 
2883 	load_ds(_udatasel);
2884 	load_es(_udatasel);
2885 	load_fs(_udatasel);
2886 
2887 	/* setup proc 0's pcb */
2888 	thread0.td_pcb->pcb_flags = 0;
2889 	thread0.td_pcb->pcb_cr3 = KPML4phys;
2890 	thread0.td_pcb->pcb_cr3_iso = 0;
2891 	thread0.td_pcb->pcb_ext = NULL;
2892 	lwp0.lwp_md.md_regs = &proc0_tf;	/* XXX needed? */
2893 
2894 	/* Location of kernel stack for locore */
2895 	return ((u_int64_t)thread0.td_pcb);
2896 }
2897 
2898 /*
2899  * Initialize machine-dependent portions of the global data structure.
2900  * Note that the global data area and cpu0's idlestack in the private
2901  * data space were allocated in locore.
2902  *
2903  * Note: the idlethread's cpl is 0
2904  *
2905  * WARNING!  Called from early boot, 'mycpu' may not work yet.
2906  */
2907 void
2908 cpu_gdinit(struct mdglobaldata *gd, int cpu)
2909 {
2910 	if (cpu)
2911 		gd->mi.gd_curthread = &gd->mi.gd_idlethread;
2912 
2913 	lwkt_init_thread(&gd->mi.gd_idlethread,
2914 			gd->mi.gd_prvspace->idlestack,
2915 			sizeof(gd->mi.gd_prvspace->idlestack),
2916 			0, &gd->mi);
2917 	lwkt_set_comm(&gd->mi.gd_idlethread, "idle_%d", cpu);
2918 	gd->mi.gd_idlethread.td_switch = cpu_lwkt_switch;
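	/*
	 * Pre-seed the idle thread's stack so that the first switch into it
	 * resumes execution at cpu_idle_restore.
	 */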
2919 	gd->mi.gd_idlethread.td_sp -= sizeof(void *);
2920 	*(void **)gd->mi.gd_idlethread.td_sp = cpu_idle_restore;
2921 }
2922 
2923 /*
2924  * We only have to check for DMAP bounds; the globaldata space is
2925  * actually part of the kernel_map so we don't have to waste time
2926  * checking CPU_prvspace[*].
2927  */
2928 int
2929 is_globaldata_space(vm_offset_t saddr, vm_offset_t eaddr)
2930 {
2931 #if 0
2932 	if (saddr >= (vm_offset_t)&CPU_prvspace[0] &&
2933 	    eaddr <= (vm_offset_t)&CPU_prvspace[MAXCPU]) {
2934 		return (TRUE);
2935 	}
2936 #endif
2937 	if (saddr >= DMAP_MIN_ADDRESS && eaddr <= DMAP_MAX_ADDRESS)
2938 		return (TRUE);
2939 	return (FALSE);
2940 }
2941 
2942 struct globaldata *
2943 globaldata_find(int cpu)
2944 {
2945 	KKASSERT(cpu >= 0 && cpu < ncpus);
2946 	return(&CPU_prvspace[cpu]->mdglobaldata.mi);
2947 }
2948 
2949 /*
2950  * This path should be safe from the SYSRET issue because only stopped threads
2951  * can have their %rip adjusted this way (and all heavy weight thread switches
2952  * clear QUICKREF and thus do not use SYSRET).  However, the code path is
2953  * convoluted, so add a safety check by forcing %rip to be canonical.
2954  */
2955 int
2956 ptrace_set_pc(struct lwp *lp, unsigned long addr)
2957 {
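	/*
	 * A canonical x86-64 address has bits 63:48 equal to bit 47, so
	 * either sign-extend or clear the upper bits based on bit 47.
	 */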
2958 	if (addr & 0x0000800000000000LLU)
2959 		lp->lwp_md.md_regs->tf_rip = addr | 0xFFFF000000000000LLU;
2960 	else
2961 		lp->lwp_md.md_regs->tf_rip = addr & 0x0000FFFFFFFFFFFFLLU;
2962 	return (0);
2963 }
2964 
2965 int
2966 ptrace_single_step(struct lwp *lp)
2967 {
2968 	lp->lwp_md.md_regs->tf_rflags |= PSL_T;
2969 	return (0);
2970 }
2971 
2972 int
2973 fill_regs(struct lwp *lp, struct reg *regs)
2974 {
2975 	struct trapframe *tp;
2976 
2977 	if ((tp = lp->lwp_md.md_regs) == NULL)
2978 		return EINVAL;
2979 	bcopy(&tp->tf_rdi, &regs->r_rdi, sizeof(*regs));
2980 	return (0);
2981 }
2982 
2983 int
2984 set_regs(struct lwp *lp, struct reg *regs)
2985 {
2986 	struct trapframe *tp;
2987 
2988 	tp = lp->lwp_md.md_regs;
2989 	if (!EFL_SECURE(regs->r_rflags, tp->tf_rflags) ||
2990 	    !CS_SECURE(regs->r_cs))
2991 		return (EINVAL);
2992 	bcopy(&regs->r_rdi, &tp->tf_rdi, sizeof(*regs));
2993 	clear_quickret();
2994 	return (0);
2995 }
2996 
2997 static void
2998 fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)
2999 {
3000 	struct env87 *penv_87 = &sv_87->sv_env;
3001 	struct envxmm *penv_xmm = &sv_xmm->sv_env;
3002 	int i;
3003 
3004 	/* FPU control/status */
3005 	penv_87->en_cw = penv_xmm->en_cw;
3006 	penv_87->en_sw = penv_xmm->en_sw;
3007 	penv_87->en_tw = penv_xmm->en_tw;
3008 	penv_87->en_fip = penv_xmm->en_fip;
3009 	penv_87->en_fcs = penv_xmm->en_fcs;
3010 	penv_87->en_opcode = penv_xmm->en_opcode;
3011 	penv_87->en_foo = penv_xmm->en_foo;
3012 	penv_87->en_fos = penv_xmm->en_fos;
3013 
3014 	/* FPU registers */
3015 	for (i = 0; i < 8; ++i)
3016 		sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
3017 }
3018 
3019 static void
3020 set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm)
3021 {
3022 	struct env87 *penv_87 = &sv_87->sv_env;
3023 	struct envxmm *penv_xmm = &sv_xmm->sv_env;
3024 	int i;
3025 
3026 	/* FPU control/status */
3027 	penv_xmm->en_cw = penv_87->en_cw;
3028 	penv_xmm->en_sw = penv_87->en_sw;
3029 	penv_xmm->en_tw = penv_87->en_tw;
3030 	penv_xmm->en_fip = penv_87->en_fip;
3031 	penv_xmm->en_fcs = penv_87->en_fcs;
3032 	penv_xmm->en_opcode = penv_87->en_opcode;
3033 	penv_xmm->en_foo = penv_87->en_foo;
3034 	penv_xmm->en_fos = penv_87->en_fos;
3035 
3036 	/* FPU registers */
3037 	for (i = 0; i < 8; ++i)
3038 		sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
3039 }
3040 
3041 int
3042 fill_fpregs(struct lwp *lp, struct fpreg *fpregs)
3043 {
3044 	if (lp->lwp_thread == NULL || lp->lwp_thread->td_pcb == NULL)
3045 		return EINVAL;
3046 	if (cpu_fxsr) {
3047 		fill_fpregs_xmm(&lp->lwp_thread->td_pcb->pcb_save.sv_xmm,
3048 				(struct save87 *)fpregs);
3049 		return (0);
3050 	}
3051 	bcopy(&lp->lwp_thread->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs);
3052 	return (0);
3053 }
3054 
3055 int
3056 set_fpregs(struct lwp *lp, struct fpreg *fpregs)
3057 {
3058 	if (cpu_fxsr) {
3059 		set_fpregs_xmm((struct save87 *)fpregs,
3060 			       &lp->lwp_thread->td_pcb->pcb_save.sv_xmm);
3061 		return (0);
3062 	}
3063 	bcopy(fpregs, &lp->lwp_thread->td_pcb->pcb_save.sv_87, sizeof *fpregs);
3064 	return (0);
3065 }
3066 
3067 int
3068 fill_dbregs(struct lwp *lp, struct dbreg *dbregs)
3069 {
3070 	struct pcb *pcb;
3071 
3072         if (lp == NULL) {
3073                 dbregs->dr[0] = rdr0();
3074                 dbregs->dr[1] = rdr1();
3075                 dbregs->dr[2] = rdr2();
3076                 dbregs->dr[3] = rdr3();
3077                 dbregs->dr[4] = rdr4();
3078                 dbregs->dr[5] = rdr5();
3079                 dbregs->dr[6] = rdr6();
3080                 dbregs->dr[7] = rdr7();
3081 		return (0);
3082         }
3083 	if (lp->lwp_thread == NULL || (pcb = lp->lwp_thread->td_pcb) == NULL)
3084 		return EINVAL;
3085 	dbregs->dr[0] = pcb->pcb_dr0;
3086 	dbregs->dr[1] = pcb->pcb_dr1;
3087 	dbregs->dr[2] = pcb->pcb_dr2;
3088 	dbregs->dr[3] = pcb->pcb_dr3;
3089 	dbregs->dr[4] = 0;
3090 	dbregs->dr[5] = 0;
3091 	dbregs->dr[6] = pcb->pcb_dr6;
3092 	dbregs->dr[7] = pcb->pcb_dr7;
3093 	return (0);
3094 }
3095 
3096 int
3097 set_dbregs(struct lwp *lp, struct dbreg *dbregs)
3098 {
3099 	if (lp == NULL) {
3100 		load_dr0(dbregs->dr[0]);
3101 		load_dr1(dbregs->dr[1]);
3102 		load_dr2(dbregs->dr[2]);
3103 		load_dr3(dbregs->dr[3]);
3104 		load_dr4(dbregs->dr[4]);
3105 		load_dr5(dbregs->dr[5]);
3106 		load_dr6(dbregs->dr[6]);
3107 		load_dr7(dbregs->dr[7]);
3108 	} else {
3109 		struct pcb *pcb;
3110 		struct ucred *ucred;
3111 		int i;
3112 		uint64_t mask1, mask2;
3113 
3114 		/*
3115 		 * Don't let an illegal value for dr7 get set.	Specifically,
3116 		 * check for undefined settings.  Setting these bit patterns
3117 		 * results in undefined behaviour and can lead to an unexpected
3118 		 * TRCTRAP.
3119 		 */
3120 		/* JG this loop looks unreadable */
3121 		/* Check 4 2-bit fields for invalid patterns.
3122 		 * These fields are R/Wi, for i = 0..3
3123 		 */
3124 		/* Is 10 in LENi allowed when running in compatibility mode? */
3125 		/* Pattern 10 in R/Wi might be used to indicate a
3126 		 * breakpoint on I/O.  Further analysis should be
3127 		 * carried out to decide if it is safe and useful to
3128 		 * provide access to that capability.
3129 		 */
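		/*
		 * In dr7 the R/Wi field occupies bits 16+4*i .. 17+4*i and
		 * LENi the two bits above it, hence the masks start at
		 * 0x3<<16 and step by 4 bits per breakpoint register.
		 */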
3130 		for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 4;
3131 		     i++, mask1 <<= 4, mask2 <<= 4)
3132 			if ((dbregs->dr[7] & mask1) == mask2)
3133 				return (EINVAL);
3134 
3135 		pcb = lp->lwp_thread->td_pcb;
3136 		ucred = lp->lwp_proc->p_ucred;
3137 
3138 		/*
3139 		 * Don't let a process set a breakpoint that is not within the
3140 		 * process's address space.  If a process could do this, it
3141 		 * could halt the system by setting a breakpoint in the kernel
3142 		 * (if ddb was enabled).  Thus, we need to check to make sure
3143 		 * that no breakpoints are being enabled for addresses outside
3144 		 * the process's address space, unless, perhaps, we were called by
3145 		 * uid 0.
3146 		 *
3147 		 * XXX - what about when the watched area of the user's
3148 		 * address space is written into from within the kernel
3149 		 * ... wouldn't that still cause a breakpoint to be generated
3150 		 * from within kernel mode?
3151 		 */
3152 
3153 		if (priv_check_cred(ucred, PRIV_ROOT, 0) != 0) {
3154 			if (dbregs->dr[7] & 0x3) {
3155 				/* dr0 is enabled */
3156 				if (dbregs->dr[0] >= VM_MAX_USER_ADDRESS)
3157 					return (EINVAL);
3158 			}
3159 
3160 			if (dbregs->dr[7] & (0x3<<2)) {
3161 				/* dr1 is enabled */
3162 				if (dbregs->dr[1] >= VM_MAX_USER_ADDRESS)
3163 					return (EINVAL);
3164 			}
3165 
3166 			if (dbregs->dr[7] & (0x3<<4)) {
3167 				/* dr2 is enabled */
3168 				if (dbregs->dr[2] >= VM_MAX_USER_ADDRESS)
3169 					return (EINVAL);
3170 			}
3171 
3172 			if (dbregs->dr[7] & (0x3<<6)) {
3173 				/* dr3 is enabled */
3174 				if (dbregs->dr[3] >= VM_MAX_USER_ADDRESS)
3175 					return (EINVAL);
3176 			}
3177 		}
3178 
3179 		pcb->pcb_dr0 = dbregs->dr[0];
3180 		pcb->pcb_dr1 = dbregs->dr[1];
3181 		pcb->pcb_dr2 = dbregs->dr[2];
3182 		pcb->pcb_dr3 = dbregs->dr[3];
3183 		pcb->pcb_dr6 = dbregs->dr[6];
3184 		pcb->pcb_dr7 = dbregs->dr[7];
3185 
3186 		pcb->pcb_flags |= PCB_DBREGS;
3187 	}
3188 
3189 	return (0);
3190 }
3191 
3192 /*
3193  * Return > 0 if a hardware breakpoint has been hit, and the
3194  * breakpoint was in user space.  Return 0 otherwise.
3195  */
3196 int
3197 user_dbreg_trap(void)
3198 {
3199         u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */
3200         u_int64_t bp;       /* breakpoint bits extracted from dr6 */
3201         int nbp;            /* number of breakpoints that triggered */
3202         caddr_t addr[4];    /* breakpoint addresses */
3203         int i;
3204 
3205         dr7 = rdr7();
3206         if ((dr7 & 0xff) == 0) {
3207                 /*
3208                  * none of the breakpoint enable bits (L0-L3/G0-G3) in dr7 are set,
3209                  * thus the trap couldn't have been caused by the
3210                  * hardware debug registers
3211                  */
3212                 return 0;
3213         }
3214 
3215         nbp = 0;
3216         dr6 = rdr6();
3217         bp = dr6 & 0xf;
3218 
3219         if (bp == 0) {
3220                 /*
3221                  * None of the breakpoint bits are set, meaning this
3222                  * trap was not caused by any of the debug registers
3223                  */
3224                 return 0;
3225         }
3226 
3227         /*
3228          * At least one of the breakpoints was hit; check to see
3229          * which ones and whether any of them are user space addresses.
3230          */
3231 
3232         if (bp & 0x01) {
3233                 addr[nbp++] = (caddr_t)rdr0();
3234         }
3235         if (bp & 0x02) {
3236                 addr[nbp++] = (caddr_t)rdr1();
3237         }
3238         if (bp & 0x04) {
3239                 addr[nbp++] = (caddr_t)rdr2();
3240         }
3241         if (bp & 0x08) {
3242                 addr[nbp++] = (caddr_t)rdr3();
3243         }
3244 
3245         for (i = 0; i < nbp; i++) {
3246                 if (addr[i] < (caddr_t)VM_MAX_USER_ADDRESS) {
3247                         /*
3248                          * addr[i] is in user space
3249                          */
3250                         return nbp;
3251                 }
3252         }
3253 
3254         /*
3255          * None of the breakpoints are in user space.
3256          */
3257         return 0;
3258 }
3259 
3260 
3261 #ifndef DDB
3262 void
3263 Debugger(const char *msg)
3264 {
3265 	kprintf("Debugger(\"%s\") called.\n", msg);
3266 }
3267 #endif /* no DDB */
3268 
3269 #ifdef DDB
3270 
3271 /*
3272  * Provide inb() and outb() as functions.  They are normally only
3273  * available as macros calling inlined functions, thus cannot be
3274  * called inside DDB.
3275  *
3276  * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
3277  */
3278 
3279 #undef inb
3280 #undef outb
3281 
3282 /* silence compiler warnings */
3283 u_char inb(u_int);
3284 void outb(u_int, u_char);
3285 
3286 u_char
3287 inb(u_int port)
3288 {
3289 	u_char	data;
3290 	/*
3291 	 * We use %%dx and not %1 here because i/o is done at %dx and not at
3292 	 * %edx, while gcc generates inferior code (movw instead of movl)
3293 	 * if we tell it to load (u_short) port.
3294 	 */
3295 	__asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port));
3296 	return (data);
3297 }
3298 
3299 void
3300 outb(u_int port, u_char data)
3301 {
3302 	u_char	al;
3303 	/*
3304 	 * Use an unnecessary assignment to help gcc's register allocator.
3305 	 * This makes a large difference for gcc-1.40 and a tiny difference
3306 	 * for gcc-2.6.0.  For gcc-1.40, al had to be ``asm("ax")'' for
3307 	 * best results.  gcc-2.6.0 can't handle this.
3308 	 */
3309 	al = data;
3310 	__asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port));
3311 }
3312 
3313 #endif /* DDB */
3314 
3315 
3316 
3317 /*
3318  * initialize all the SMP locks
3319  */
3320 
3321 /* critical region when masking or unmasking interrupts */
3322 struct spinlock_deprecated imen_spinlock;
3323 
3324 /* locks com (tty) data/hardware accesses: a FASTINTR() */
3325 struct spinlock_deprecated com_spinlock;
3326 
3327 /* lock regions around the clock hardware */
3328 struct spinlock_deprecated clock_spinlock;
3329 
3330 static void
3331 init_locks(void)
3332 {
3333 	/*
3334 	 * Get the initial mplock with a count of 1 for the BSP.
3335 	 * This uses a LOGICAL cpu ID, ie BSP == 0.
3336 	 */
3337 	cpu_get_initial_mplock();
3338 	/* DEPRECATED */
3339 	spin_init_deprecated(&imen_spinlock);
3340 	spin_init_deprecated(&com_spinlock);
3341 	spin_init_deprecated(&clock_spinlock);
3342 
3343 	/* our token pool needs to work early */
3344 	lwkt_token_pool_init();
3345 }
3346 
3347 boolean_t
3348 cpu_mwait_hint_valid(uint32_t hint)
3349 {
3350 	int cx_idx, sub;
3351 
3352 	cx_idx = MWAIT_EAX_TO_CX(hint);
3353 	if (cx_idx >= CPU_MWAIT_CX_MAX)
3354 		return FALSE;
3355 
3356 	sub = MWAIT_EAX_TO_CX_SUB(hint);
3357 	if (sub >= cpu_mwait_cx_info[cx_idx].subcnt)
3358 		return FALSE;
3359 
3360 	return TRUE;
3361 }
3362 
3363 void
3364 cpu_mwait_cx_no_bmsts(void)
3365 {
3366 	atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_STS);
3367 }
3368 
3369 void
3370 cpu_mwait_cx_no_bmarb(void)
3371 {
3372 	atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_ARB);
3373 }
3374 
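/*
 * Format an MWAIT hint as a human-readable C-state name ("C1/0", "AUTO",
 * "AUTODEEP", "NONE" or "INVALID") and return the corresponding Cx index.
 */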
3375 static int
3376 cpu_mwait_cx_hint2name(int hint, char *name, int namelen, boolean_t allow_auto)
3377 {
3378 	int old_cx_idx, sub = 0;
3379 
3380 	if (hint >= 0) {
3381 		old_cx_idx = MWAIT_EAX_TO_CX(hint);
3382 		sub = MWAIT_EAX_TO_CX_SUB(hint);
3383 	} else if (hint == CPU_MWAIT_HINT_AUTO) {
3384 		old_cx_idx = allow_auto ? CPU_MWAIT_C2 : CPU_MWAIT_CX_MAX;
3385 	} else if (hint == CPU_MWAIT_HINT_AUTODEEP) {
3386 		old_cx_idx = allow_auto ? CPU_MWAIT_C3 : CPU_MWAIT_CX_MAX;
3387 	} else {
3388 		old_cx_idx = CPU_MWAIT_CX_MAX;
3389 	}
3390 
3391 	if (!CPU_MWAIT_HAS_CX)
3392 		strlcpy(name, "NONE", namelen);
3393 	else if (allow_auto && hint == CPU_MWAIT_HINT_AUTO)
3394 		strlcpy(name, "AUTO", namelen);
3395 	else if (allow_auto && hint == CPU_MWAIT_HINT_AUTODEEP)
3396 		strlcpy(name, "AUTODEEP", namelen);
3397 	else if (old_cx_idx >= CPU_MWAIT_CX_MAX ||
3398 	    sub >= cpu_mwait_cx_info[old_cx_idx].subcnt)
3399 		strlcpy(name, "INVALID", namelen);
3400 	else
3401 		ksnprintf(name, namelen, "C%d/%d", old_cx_idx, sub);
3402 
3403 	return old_cx_idx;
3404 }
3405 
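/*
 * Parse a C-state name of the form "C<idx>/<sub>" (e.g. "C1/0" or
 * "C3/1"), or "AUTO"/"AUTODEEP" when allow_auto is TRUE, into an MWAIT
 * hint.  Returns the Cx index on success and -1 on a malformed or
 * out-of-range name.  Illustrative call (hypothetical caller):
 *
 *	char buf[CPU_MWAIT_CX_NAMELEN];
 *	int hint, cx_idx;
 *
 *	strlcpy(buf, "C1/0", sizeof(buf));
 *	cx_idx = cpu_mwait_cx_name2hint(buf, &hint, FALSE);
 */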
3406 static int
3407 cpu_mwait_cx_name2hint(char *name, int *hint0, boolean_t allow_auto)
3408 {
3409 	int cx_idx, sub, hint;
3410 	char *ptr, *start;
3411 
3412 	if (allow_auto && strcmp(name, "AUTO") == 0) {
3413 		hint = CPU_MWAIT_HINT_AUTO;
3414 		cx_idx = CPU_MWAIT_C2;
3415 		goto done;
3416 	}
3417 	if (allow_auto && strcmp(name, "AUTODEEP") == 0) {
3418 		hint = CPU_MWAIT_HINT_AUTODEEP;
3419 		cx_idx = CPU_MWAIT_C3;
3420 		goto done;
3421 	}
3422 
3423 	if (strlen(name) < 4 || toupper(name[0]) != 'C')
3424 		return -1;
3425 	start = &name[1];
3426 	ptr = NULL;
3427 
3428 	cx_idx = strtol(start, &ptr, 10);
3429 	if (ptr == start || *ptr != '/')
3430 		return -1;
3431 	if (cx_idx < 0 || cx_idx >= CPU_MWAIT_CX_MAX)
3432 		return -1;
3433 
3434 	start = ptr + 1;
3435 	ptr = NULL;
3436 
3437 	sub = strtol(start, &ptr, 10);
3438 	if (*ptr != '\0')
3439 		return -1;
3440 	if (sub < 0 || sub >= cpu_mwait_cx_info[cx_idx].subcnt)
3441 		return -1;
3442 
3443 	hint = MWAIT_EAX_HINT(cx_idx, sub);
3444 done:
3445 	*hint0 = hint;
3446 	return cx_idx;
3447 }
3448 
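/*
 * Handle the side effects of moving between shallow (< C3) and deep
 * (>= C3) target C-states.  Entering C3 or deeper is refused while the
 * C3 preamble work (bus-master handling) is still outstanding, and
 * takes a reference on the powersave cputimer interrupt mode; dropping
 * back below C3 releases that reference.  Presumably the powersave
 * mode is what keeps timer interrupts flowing when the local APIC
 * timer stops in deep C-states (assumption, not verified here).
 */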
3449 static int
3450 cpu_mwait_cx_transit(int old_cx_idx, int cx_idx)
3451 {
3452 	if (cx_idx >= CPU_MWAIT_C3 && cpu_mwait_c3_preamble)
3453 		return EOPNOTSUPP;
3454 	if (old_cx_idx < CPU_MWAIT_C3 && cx_idx >= CPU_MWAIT_C3) {
3455 		int error;
3456 
3457 		error = cputimer_intr_powersave_addreq();
3458 		if (error)
3459 			return error;
3460 	} else if (old_cx_idx >= CPU_MWAIT_C3 && cx_idx < CPU_MWAIT_C3) {
3461 		cputimer_intr_powersave_remreq();
3462 	}
3463 	return 0;
3464 }
3465 
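/*
 * Common sysctl handler backing the per-cpu idle and spin-loop CX
 * knobs: report the current hint as a C-state name and, on a write,
 * parse the new name, perform any C3 transition bookkeeping, and store
 * the new hint through hint0.
 */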
3466 static int
3467 cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS, int *hint0,
3468     boolean_t allow_auto)
3469 {
3470 	int error, cx_idx, old_cx_idx, hint;
3471 	char name[CPU_MWAIT_CX_NAMELEN];
3472 
3473 	hint = *hint0;
3474 	old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name),
3475 	    allow_auto);
3476 
3477 	error = sysctl_handle_string(oidp, name, sizeof(name), req);
3478 	if (error != 0 || req->newptr == NULL)
3479 		return error;
3480 
3481 	if (!CPU_MWAIT_HAS_CX)
3482 		return EOPNOTSUPP;
3483 
3484 	cx_idx = cpu_mwait_cx_name2hint(name, &hint, allow_auto);
3485 	if (cx_idx < 0)
3486 		return EINVAL;
3487 
3488 	error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
3489 	if (error)
3490 		return error;
3491 
3492 	*hint0 = hint;
3493 	return 0;
3494 }
3495 
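/*
 * Apply a C-state name (e.g. "C1/0") to a single cpu's idle state.
 * The caller must already know the MWAIT CX extension is present;
 * that is asserted rather than returned as an error.
 */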
3496 static int
3497 cpu_mwait_cx_setname(struct cpu_idle_stat *stat, const char *cx_name)
3498 {
3499 	int error, cx_idx, old_cx_idx, hint;
3500 	char name[CPU_MWAIT_CX_NAMELEN];
3501 
3502 	KASSERT(CPU_MWAIT_HAS_CX, ("cpu does not support mwait CX extension"));
3503 
3504 	hint = stat->hint;
3505 	old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);
3506 
3507 	strlcpy(name, cx_name, sizeof(name));
3508 	cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
3509 	if (cx_idx < 0)
3510 		return EINVAL;
3511 
3512 	error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
3513 	if (error)
3514 		return error;
3515 
3516 	stat->hint = hint;
3517 	return 0;
3518 }
3519 
3520 static int
3521 cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS)
3522 {
3523 	int hint = cpu_mwait_halt_global;
3524 	int error, cx_idx, cpu;
3525 	char name[CPU_MWAIT_CX_NAMELEN], cx_name[CPU_MWAIT_CX_NAMELEN];
3526 
3527 	cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);
3528 
3529 	error = sysctl_handle_string(oidp, name, sizeof(name), req);
3530 	if (error != 0 || req->newptr == NULL)
3531 		return error;
3532 
3533 	if (!CPU_MWAIT_HAS_CX)
3534 		return EOPNOTSUPP;
3535 
3536 	/* Save name for later per-cpu CX configuration */
3537 	strlcpy(cx_name, name, sizeof(cx_name));
3538 
3539 	cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
3540 	if (cx_idx < 0)
3541 		return EINVAL;
3542 
3543 	/* Change per-cpu CX configuration */
3544 	for (cpu = 0; cpu < ncpus; ++cpu) {
3545 		error = cpu_mwait_cx_setname(&cpu_idle_stats[cpu], cx_name);
3546 		if (error)
3547 			return error;
3548 	}
3549 
3550 	cpu_mwait_halt_global = hint;
3551 	return 0;
3552 }
3553 
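/*
 * Sysctl handler for the machine-wide idle C-state: parse the written
 * name once, apply it to every cpu's idle state, then record it in
 * cpu_mwait_halt_global.  A hypothetical administrative use (the exact
 * sysctl node name is registered elsewhere, likely something like
 * machdep.mwait.CX.idle) would be:
 *
 *	sysctl machdep.mwait.CX.idle=AUTODEEP
 */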
3554 static int
3555 cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
3556 {
3557 	struct cpu_idle_stat *stat = arg1;
3558 	int error;
3559 
3560 	error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
3561 	    &stat->hint, TRUE);
3562 	return error;
3563 }
3564 
3565 static int
3566 cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS)
3567 {
3568 	int error;
3569 
3570 	error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
3571 	    &cpu_mwait_spin, FALSE);
3572 	return error;
3573 }
3574 
3575 /*
3576  * This manual debugging code is called unconditionally from Xtimer
3577  * (the per-cpu timer interrupt), whether or not the current thread is
3578  * in a critical section, and can be useful in tracking down lockups.
3579  *
3580  * NOTE: MANUAL DEBUG CODE
3581  */
3582 #if 0
3583 static int saveticks[SMP_MAXCPU];
3584 static int savecounts[SMP_MAXCPU];
3585 #endif
3586 
3587 void
3588 pcpu_timer_always(struct intrframe *frame)
3589 {
3590 #if 0
3591 	globaldata_t gd = mycpu;
3592 	int cpu = gd->gd_cpuid;
3593 	char buf[64];
3594 	short *gptr;
3595 	int i;
3596 
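	/*
	 * Paint a per-cpu status line directly into the VGA text
	 * buffer (physical 0xb8000, mapped here at 0xFFFFFFFF800b8000),
	 * one 80-column row per cpu, attribute 0x07 (grey on black).
	 * The first cell is incremented on every call, so a cpu that
	 * stops taking timer interrupts shows up as a frozen character.
	 */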
3597 	if (cpu <= 20) {
3598 		gptr = (short *)0xFFFFFFFF800b8000 + 80 * cpu;
3599 		*gptr = ((*gptr + 1) & 0x00FF) | 0x0700;
3600 		++gptr;
3601 
3602 		ksnprintf(buf, sizeof(buf), " %p %16s %d %16s ",
3603 		    (void *)frame->if_rip, gd->gd_curthread->td_comm, ticks,
3604 		    gd->gd_infomsg);
3605 		for (i = 0; buf[i]; ++i) {
3606 			gptr[i] = 0x0700 | (unsigned char)buf[i];
3607 		}
3608 	}
3609 #if 0
3610 	if (saveticks[gd->gd_cpuid] != ticks) {
3611 		saveticks[gd->gd_cpuid] = ticks;
3612 		savecounts[gd->gd_cpuid] = 0;
3613 	}
3614 	++savecounts[gd->gd_cpuid];
3615 	if (savecounts[gd->gd_cpuid] > 2000 && panicstr == NULL) {
3616 		panic("cpu %d panicking on ticks failure",
3617 			gd->gd_cpuid);
3618 	}
3619 	for (i = 0; i < ncpus; ++i) {
3620 		int delta;
3621 		if (saveticks[i] && panicstr == NULL) {
3622 			delta = saveticks[i] - ticks;
3623 			if (delta < -10 || delta > 10) {
3624 				panic("cpu %d panicking on cpu %d watchdog",
3625 				      gd->gd_cpuid, i);
3626 			}
3627 		}
3628 	}
3629 #endif
3630 #endif
3631 }
3632 
3633 SET_DECLARE(smap_open, char);
3634 SET_DECLARE(smap_close, char);
3635 
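/*
 * Enable SMAP by patching the kernel text in place.  The smap_open and
 * smap_close linker sets collect the addresses of 3-byte placeholder
 * NOPs emitted around code that must touch user memory; when SMAP is
 * available each placeholder is rewritten to STAC (0F 01 CB) or CLAC
 * (0F 01 CA) respectively, so EFLAGS.AC gates those user accesses.
 */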
3636 static void
3637 cpu_implement_smap(void)
3638 {
3639 	char **scan;
3640 
3641 	for (scan = SET_BEGIN(smap_open);		/* nop -> stac */
3642 	     scan < SET_LIMIT(smap_open); ++scan) {
3643 		(*scan)[0] = 0x0F;
3644 		(*scan)[1] = 0x01;
3645 		(*scan)[2] = 0xCB;
3646 	}
3647 	for (scan = SET_BEGIN(smap_close);		/* nop -> clac */
3648 	     scan < SET_LIMIT(smap_close); ++scan) {
3649 		(*scan)[0] = 0x0F;
3650 		(*scan)[1] = 0x01;
3651 		(*scan)[2] = 0xCA;
3652 	}
3653 }
3654 
3655 /*
3656  * Called from a hard interrupt; returns non-zero if interrupt processing is active or pending on this cpu
3657  */
3658 int
3659 cpu_interrupt_running(struct thread *td)
3660 {
3661 	struct mdglobaldata *gd = mdcpu;
3662 
3663 	if (clock_debug1 > 0) {
3664 		--clock_debug1;
3665 		kprintf("%d %016lx %016lx %016lx\n",
3666 			((td->td_flags & TDF_INTTHREAD) != 0),
3667 			gd->gd_ipending[0],
3668 			gd->gd_ipending[1],
3669 			gd->gd_ipending[2]);
3670 		if (td->td_flags & TDF_CLKTHREAD) {
3671 			kprintf("CLKTD %s PREEMPT %s\n",
3672 				td->td_comm,
3673 				(td->td_preempted ?
3674 				 td->td_preempted->td_comm : ""));
3675 		} else {
3676 			kprintf("NORTD %s\n", td->td_comm);
3677 		}
3678 	}
3679 	if ((td->td_flags & TDF_INTTHREAD) ||
3680 	    gd->gd_ipending[0] ||
3681 	    gd->gd_ipending[1] ||
3682 	    gd->gd_ipending[2]) {
3683 		return 1;
3684 	} else {
3685 		return 0;
3686 	}
3687 }
3688