xref: /dragonfly/sys/platform/pc64/x86_64/machdep.c (revision a174495b)
1 /*-
2  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
3  * Copyright (c) 1992 Terrence R. Lambert.
4  * Copyright (c) 2003 Peter Wemm.
5  * Copyright (c) 2008-2017 The DragonFly Project.
6  * All rights reserved.
7  *
8  * This code is derived from software contributed to Berkeley by
9  * William Jolitz.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. All advertising materials mentioning features or use of this software
20  *    must display the following acknowledgement:
21  *	This product includes software developed by the University of
22  *	California, Berkeley and its contributors.
23  * 4. Neither the name of the University nor the names of its contributors
24  *    may be used to endorse or promote products derived from this software
25  *    without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37  * SUCH DAMAGE.
38  *
39  * from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
40  * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
41  */
42 
43 //#include "use_npx.h"
44 #include "use_isa.h"
45 #include "opt_cpu.h"
46 #include "opt_ddb.h"
47 #include "opt_inet.h"
48 #include "opt_msgbuf.h"
49 #include "opt_swap.h"
50 
51 #include <sys/param.h>
52 #include <sys/systm.h>
53 #include <sys/sysmsg.h>
54 #include <sys/signalvar.h>
55 #include <sys/kernel.h>
56 #include <sys/linker.h>
57 #include <sys/malloc.h>
58 #include <sys/proc.h>
59 #include <sys/priv.h>
60 #include <sys/buf.h>
61 #include <sys/reboot.h>
62 #include <sys/mbuf.h>
63 #include <sys/msgbuf.h>
64 #include <sys/sysent.h>
65 #include <sys/sysctl.h>
66 #include <sys/vmmeter.h>
67 #include <sys/bus.h>
68 #include <sys/usched.h>
69 #include <sys/reg.h>
70 #include <sys/sbuf.h>
71 #include <sys/ctype.h>
72 #include <sys/serialize.h>
73 #include <sys/systimer.h>
74 
75 #include <vm/vm.h>
76 #include <vm/vm_param.h>
77 #include <sys/lock.h>
78 #include <vm/vm_kern.h>
79 #include <vm/vm_object.h>
80 #include <vm/vm_page.h>
81 #include <vm/vm_map.h>
82 #include <vm/vm_pager.h>
83 #include <vm/vm_extern.h>
84 
85 #include <sys/thread2.h>
86 #include <sys/mplock2.h>
87 
88 #include <sys/exec.h>
89 #include <sys/cons.h>
90 
91 #include <sys/efi.h>
92 
93 #include <ddb/ddb.h>
94 
95 #include <machine/cpu.h>
96 #include <machine/clock.h>
97 #include <machine/specialreg.h>
98 #if 0 /* JG */
99 #include <machine/bootinfo.h>
100 #endif
101 #include <machine/md_var.h>
102 #include <machine/metadata.h>
103 #include <machine/pc/bios.h>
104 #include <machine/pcb_ext.h>
105 #include <machine/globaldata.h>		/* CPU_prvspace */
106 #include <machine/smp.h>
107 #include <machine/cputypes.h>
108 #include <machine/intr_machdep.h>
109 #include <machine/framebuffer.h>
110 
111 #ifdef OLD_BUS_ARCH
112 #include <bus/isa/isa_device.h>
113 #endif
114 #include <machine_base/isa/isa_intr.h>
115 #include <bus/isa/rtc.h>
116 #include <sys/random.h>
117 #include <sys/ptrace.h>
118 #include <machine/sigframe.h>
119 
120 #include <sys/machintr.h>
121 #include <machine_base/icu/icu_abi.h>
122 #include <machine_base/icu/elcr_var.h>
123 #include <machine_base/apic/lapic.h>
124 #include <machine_base/apic/ioapic.h>
125 #include <machine_base/apic/ioapic_abi.h>
126 #include <machine/mptable.h>
127 
128 #define PHYSMAP_ENTRIES		10
129 #define MAXBUFSTRUCTSIZE	((size_t)512 * 1024 * 1024)
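/*
 * NOTE: MAXBUFSTRUCTSIZE caps the size of the struct buf header array
 *	 itself (nbuf * sizeof(struct buf)), not the amount of buffer
 *	 cache data those buffers can map (see the nbuf sizing in
 *	 cpu_startup() below).
 */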
130 
131 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
132 
133 extern void printcpuinfo(void);	/* XXX header file */
134 extern void identify_cpu(void);
135 extern void panicifcpuunsupported(void);
136 
137 static void cpu_startup(void *);
138 static void pic_finish(void *);
139 static void cpu_finish(void *);
140 
141 static void set_fpregs_xmm(struct save87 *, struct savexmm *);
142 static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
143 static void init_locks(void);
144 
145 extern void pcpu_timer_always(struct intrframe *);
146 
147 SYSINIT(cpu, SI_BOOT2_START_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
148 SYSINIT(pic_finish, SI_BOOT2_FINISH_PIC, SI_ORDER_FIRST, pic_finish, NULL);
149 SYSINIT(cpu_finish, SI_BOOT2_FINISH_CPU, SI_ORDER_FIRST, cpu_finish, NULL);
150 
151 #ifdef DDB
152 extern vm_offset_t ksym_start, ksym_end;
153 #endif
154 
155 struct privatespace CPU_prvspace_bsp __aligned(4096);
156 struct privatespace *CPU_prvspace[MAXCPU] = { &CPU_prvspace_bsp };
157 
158 vm_paddr_t efi_systbl_phys;
159 int	_udatasel, _ucodesel, _ucode32sel;
160 u_long	atdevbase;
161 int64_t tsc_offsets[MAXCPU];
162 cpumask_t smp_idleinvl_mask;
163 cpumask_t smp_idleinvl_reqs;
164 
165  /* MWAIT hint (EAX) or CPU_MWAIT_HINT_ */
166 __read_mostly static int cpu_mwait_halt_global;
167 __read_mostly static int clock_debug1;
168 
169 #if defined(SWTCH_OPTIM_STATS)
170 extern int swtch_optim_stats;
171 SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats,
172 	CTLFLAG_RD, &swtch_optim_stats, 0, "");
173 SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count,
174 	CTLFLAG_RD, &tlb_flush_count, 0, "");
175 #endif
176 SYSCTL_INT(_debug, OID_AUTO, clock_debug1,
177 	CTLFLAG_RW, &clock_debug1, 0, "");
178 SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_halt,
179 	CTLFLAG_RD, &cpu_mwait_halt_global, 0, "");
180 SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_spin,
181 	CTLFLAG_RD, &cpu_mwait_spin, 0, "monitor/mwait target state");
182 
183 #define CPU_MWAIT_HAS_CX	\
184 	((cpu_feature2 & CPUID2_MON) && \
185 	 (cpu_mwait_feature & CPUID_MWAIT_EXT))
186 
187 #define CPU_MWAIT_CX_NAMELEN	16
188 
189 #define CPU_MWAIT_C1		1
190 #define CPU_MWAIT_C2		2
191 #define CPU_MWAIT_C3		3
192 #define CPU_MWAIT_CX_MAX	8
193 
194 #define CPU_MWAIT_HINT_AUTO	-1	/* C1 and C2 */
195 #define CPU_MWAIT_HINT_AUTODEEP	-2	/* C3+ */
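/*
 * The MWAIT_EAX_HINT()/MWAIT_EAX_TO_CX() macros used below presumably
 * follow the usual MWAIT hint layout: EAX[7:4] selects the target
 * C-state (0 = C1, 1 = C2, ...) and EAX[3:0] selects the sub C-state,
 * so e.g. C3 sub-state 1 would correspond to hint 0x0021.
 */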
196 
197 SYSCTL_NODE(_machdep, OID_AUTO, mwait, CTLFLAG_RW, 0, "MWAIT features");
198 SYSCTL_NODE(_machdep_mwait, OID_AUTO, CX, CTLFLAG_RW, 0, "MWAIT Cx settings");
199 
200 struct cpu_mwait_cx {
201 	int			subcnt;
202 	char			name[4];
203 	struct sysctl_ctx_list	sysctl_ctx;
204 	struct sysctl_oid	*sysctl_tree;
205 };
206 static struct cpu_mwait_cx	cpu_mwait_cx_info[CPU_MWAIT_CX_MAX];
207 static char			cpu_mwait_cx_supported[256];
208 
209 static int			cpu_mwait_c1_hints_cnt;
210 static int			cpu_mwait_hints_cnt;
211 static int			*cpu_mwait_hints;
212 
213 static int			cpu_mwait_deep_hints_cnt;
214 static int			*cpu_mwait_deep_hints;
215 
216 #define CPU_IDLE_REPEAT_DEFAULT	750
217 
218 static u_int			cpu_idle_repeat = CPU_IDLE_REPEAT_DEFAULT;
219 static u_long			cpu_idle_repeat_max = CPU_IDLE_REPEAT_DEFAULT;
220 static u_int			cpu_mwait_repeat_shift = 1;
221 
222 #define CPU_MWAIT_C3_PREAMBLE_BM_ARB	0x1
223 #define CPU_MWAIT_C3_PREAMBLE_BM_STS	0x2
224 
225 static int			cpu_mwait_c3_preamble =
226 				    CPU_MWAIT_C3_PREAMBLE_BM_ARB |
227 				    CPU_MWAIT_C3_PREAMBLE_BM_STS;
228 
229 SYSCTL_STRING(_machdep_mwait_CX, OID_AUTO, supported, CTLFLAG_RD,
230     cpu_mwait_cx_supported, 0, "MWAIT supported C states");
231 SYSCTL_INT(_machdep_mwait_CX, OID_AUTO, c3_preamble, CTLFLAG_RD,
232     &cpu_mwait_c3_preamble, 0, "C3+ preamble mask");
233 
234 static int	cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS,
235 		    int *, boolean_t);
236 static int	cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS);
237 static int	cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS);
238 static int	cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS);
239 
240 SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, idle, CTLTYPE_STRING|CTLFLAG_RW,
241     NULL, 0, cpu_mwait_cx_idle_sysctl, "A", "");
242 SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, spin, CTLTYPE_STRING|CTLFLAG_RW,
243     NULL, 0, cpu_mwait_cx_spin_sysctl, "A", "");
244 SYSCTL_UINT(_machdep_mwait_CX, OID_AUTO, repeat_shift, CTLFLAG_RW,
245     &cpu_mwait_repeat_shift, 0, "");
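/*
 * These nodes show up as machdep.mwait.CX.supported, .c3_preamble,
 * .idle, .spin and .repeat_shift; per-state machdep.mwait.C0..C7
 * subtrees with sub-state counts and usage counters are attached later
 * by cpu_mwait_attach().  The strings accepted by the idle/spin
 * handlers are parsed by cpu_mwait_cx_select_sysctl(), which is
 * declared above and defined later in this file.
 */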
246 
247 long physmem = 0;
248 
249 u_long ebda_addr = 0;
250 
251 int imcr_present = 0;
252 
253 int naps = 0; /* # of Application Processors (APs) */
254 
255 u_int base_memory;
256 
257 static int
258 sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
259 {
260 	u_long pmem = ctob(physmem);
261 	int error;
262 
263 	error = sysctl_handle_long(oidp, &pmem, 0, req);
264 
265 	return (error);
266 }
267 
268 SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_ULONG|CTLFLAG_RD,
269 	0, 0, sysctl_hw_physmem, "LU",
270 	"Total system memory in bytes (number of pages * page size)");
271 
272 static int
273 sysctl_hw_usermem(SYSCTL_HANDLER_ARGS)
274 {
275 	u_long usermem = ctob(physmem - vmstats.v_wire_count);
276 	int error;
277 
278 	error = sysctl_handle_long(oidp, &usermem, 0, req);
279 
280 	return (error);
281 }
282 
283 SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_ULONG|CTLFLAG_RD,
284 	0, 0, sysctl_hw_usermem, "LU", "");
285 
286 static int
287 sysctl_hw_availpages(SYSCTL_HANDLER_ARGS)
288 {
289 	int error;
290 	u_long availpages;
291 
292 	availpages = x86_64_btop(avail_end - avail_start);
293 	error = sysctl_handle_long(oidp, &availpages, 0, req);
294 
295 	return (error);
296 }
297 
298 SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_ULONG|CTLFLAG_RD,
299 	0, 0, sysctl_hw_availpages, "LU", "");
300 
301 vm_paddr_t Maxmem;
302 vm_paddr_t Realmem;
303 
304 /*
305  * The number of PHYSMAP entries must be one less than the number of
306  * PHYSSEG entries because the PHYSMAP entry that spans the largest
307  * physical address that is accessible by ISA DMA is split into two
308  * PHYSSEG entries.
309  */
310 vm_phystable_t phys_avail[VM_PHYSSEG_MAX + 1];
311 vm_phystable_t dump_avail[VM_PHYSSEG_MAX + 1];
312 
313 /* must be 1 less so 0 0 can signal end of chunks */
314 #define PHYS_AVAIL_ARRAY_END (NELEM(phys_avail) - 1)
315 #define DUMP_AVAIL_ARRAY_END (NELEM(dump_avail) - 1)
316 
317 static vm_offset_t buffer_sva, buffer_eva;
318 vm_offset_t clean_sva, clean_eva;
319 static vm_offset_t pager_sva, pager_eva;
320 static struct trapframe proc0_tf;
321 
322 static void cpu_implement_smap(void);
323 
324 static void
325 cpu_startup(void *dummy)
326 {
327 	caddr_t v;
328 	vm_size_t size = 0;
329 	vm_offset_t firstaddr;
330 
331 	/*
332 	 * Good {morning,afternoon,evening,night}.
333 	 */
334 	kprintf("%s", version);
335 	startrtclock();
336 	printcpuinfo();
337 	panicifcpuunsupported();
338 	if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
339 		cpu_implement_smap();
340 
341 	kprintf("real memory  = %ju (%ju MB)\n",
342 		(intmax_t)Realmem,
343 		(intmax_t)Realmem / 1024 / 1024);
344 	/*
345 	 * Display any holes after the first chunk of extended memory.
346 	 */
347 	if (bootverbose) {
348 		int indx;
349 
350 		kprintf("Physical memory chunk(s):\n");
351 		for (indx = 0; phys_avail[indx].phys_end != 0; ++indx) {
352 			vm_paddr_t size1;
353 
354 			size1 = phys_avail[indx].phys_end -
355 				phys_avail[indx].phys_beg;
356 
357 			kprintf("0x%08jx - 0x%08jx, %ju bytes (%ju pages)\n",
358 				(intmax_t)phys_avail[indx].phys_beg,
359 				(intmax_t)phys_avail[indx].phys_end - 1,
360 				(intmax_t)size1,
361 				(intmax_t)(size1 / PAGE_SIZE));
362 		}
363 	}
364 
365 	/*
366 	 * Allocate space for system data structures.
367 	 * The first available kernel virtual address is in "v".
368 	 * As pages of kernel virtual memory are allocated, "v" is incremented.
369 	 * As pages of memory are allocated and cleared,
370 	 * "firstaddr" is incremented.
373 	 */
374 
375 	/*
376 	 * Make two passes.  The first pass calculates how much memory is
377 	 * needed and allocates it.  The second pass assigns virtual
378 	 * addresses to the various data structures.
379 	 */
380 	firstaddr = 0;
381 again:
382 	v = (caddr_t)firstaddr;
383 
384 #define	valloc(name, type, num) \
385 	    (name) = (type *)v; v = (caddr_t)((name)+(num))
386 #define	valloclim(name, type, num, lim) \
387 	    (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num)))
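	/*
	 * For example, valloc(buf, struct buf, nbuf) expands to
	 *
	 *	buf = (struct buf *)v; v = (caddr_t)(buf + nbuf);
	 *
	 * so the first pass (firstaddr == 0) only advances "v" to measure
	 * how much space is needed, and the second pass hands out real
	 * addresses inside the chunk obtained from kmem_alloc().
	 */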
388 
389 	/*
390 	 * Calculate nbuf such that maxbufspace uses approximately 1/20
391 	 * of physical memory by default, with a minimum of 50 buffers.
392 	 *
393 	 * The calculation is made after discounting 128MB.
394 	 *
395 	 * NOTE: maxbufspace is (nbuf * NBUFCALCSIZE) (NBUFCALCSIZE ~= 16KB).
396 	 *	 nbuf = (kbytes / factor) would cover all of memory.
397 	 */
398 	if (nbuf == 0) {
399 		long factor = NBUFCALCSIZE / 1024;		/* KB/nbuf */
400 		long kbytes = physmem * (PAGE_SIZE / 1024);	/* physmem */
401 
402 		nbuf = 50;
403 		if (kbytes > 128 * 1024)
404 			nbuf += (kbytes - 128 * 1024) / (factor * 20);
405 		if (maxbcache && nbuf > maxbcache / NBUFCALCSIZE)
406 			nbuf = maxbcache / NBUFCALCSIZE;
407 		if ((size_t)nbuf * sizeof(struct buf) > MAXBUFSTRUCTSIZE) {
408 			kprintf("Warning: nbuf capped at %ld due to the "
409 				"reasonability limit\n", nbuf);
410 			nbuf = MAXBUFSTRUCTSIZE / sizeof(struct buf);
411 		}
412 	}
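	/*
	 * For example, with NBUFCALCSIZE at 16KB (factor = 16) and 8GB of
	 * RAM (kbytes = 8388608), the default works out to roughly
	 *
	 *	nbuf = 50 + (8388608 - 131072) / (16 * 20) = 25854
	 *
	 * i.e. about 1/20 of physical memory worth of 16KB buffers, before
	 * the kvm/physmem/valloc caps below are applied.
	 */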
413 
414 	/*
415 	 * Do not allow the buffer_map to be more than 1/2 the size of the
416 	 * kernel_map.
417 	 */
418 	if (nbuf > (virtual_end - virtual_start +
419 		    virtual2_end - virtual2_start) / (MAXBSIZE * 2)) {
420 		nbuf = (virtual_end - virtual_start +
421 			virtual2_end - virtual2_start) / (MAXBSIZE * 2);
422 		kprintf("Warning: nbufs capped at %ld due to kvm\n", nbuf);
423 	}
424 
425 	/*
426 	 * Do not allow the buffer_map to use more than 50% of available
427 	 * physical-equivalent memory.  Since the VM pages which back
428 	 * individual buffers are typically wired, having too many bufs
429 	 * can prevent the system from paging properly.
430 	 */
431 	if (nbuf > physmem * PAGE_SIZE / (NBUFCALCSIZE * 2)) {
432 		nbuf = physmem * PAGE_SIZE / (NBUFCALCSIZE * 2);
433 		kprintf("Warning: nbufs capped at %ld due to physmem\n", nbuf);
434 	}
435 
436 	/*
437 	 * Do not allow the sizeof(struct buf) * nbuf to exceed 1/4 of
438 	 * the valloc space which is just the virtual_end - virtual_start
439 	 * section.  This is typically ~2GB regardless of the amount of
440 	 * memory, so we use 500MB as a metric.
441 	 *
442 	 * This is because we use valloc() to allocate the buf header array.
443 	 *
444 	 * NOTE: buffer space in bytes is limited by vfs.*bufspace sysctls.
445 	 */
446 	if (nbuf > (virtual_end - virtual_start) / (sizeof(struct buf) * 4)) {
447 		nbuf = (virtual_end - virtual_start) /
448 		       (sizeof(struct buf) * 4);
449 		kprintf("Warning: nbufs capped at %ld due to "
450 			"valloc considerations\n",
451 			nbuf);
452 	}
453 
454 	nswbuf_mem = lmax(lmin(nbuf / 32, 512), 8);
455 #ifdef NSWBUF_MIN
456 	if (nswbuf_mem < NSWBUF_MIN)
457 		nswbuf_mem = NSWBUF_MIN;
458 #endif
459 	nswbuf_kva = lmax(lmin(nbuf / 4, 512), 16);
460 #ifdef NSWBUF_MIN
461 	if (nswbuf_kva < NSWBUF_MIN)
462 		nswbuf_kva = NSWBUF_MIN;
463 #endif
464 
465 	valloc(swbuf_mem, struct buf, nswbuf_mem);
466 	valloc(swbuf_kva, struct buf, nswbuf_kva);
467 	valloc(buf, struct buf, nbuf);
468 
469 	/*
470 	 * End of first pass, size has been calculated so allocate memory
471 	 */
472 	if (firstaddr == 0) {
473 		size = (vm_size_t)(v - firstaddr);
474 		firstaddr = kmem_alloc(&kernel_map, round_page(size),
475 				       VM_SUBSYS_BUF);
476 		if (firstaddr == 0)
477 			panic("startup: no room for tables");
478 		goto again;
479 	}
480 
481 	/*
482 	 * End of second pass, addresses have been assigned
483 	 *
484 	 * nbuf is an int, make sure we don't overflow the field.
485 	 *
486 	 * On 64-bit systems we always reserve maximal allocations for
487 	 * buffer cache buffers and there are no fragmentation issues,
488 	 * so the KVA segment does not have to be excessively oversized.
489 	 */
490 	if ((vm_size_t)(v - firstaddr) != size)
491 		panic("startup: table size inconsistency");
492 
493 	kmem_suballoc(&kernel_map, &clean_map, &clean_sva, &clean_eva,
494 		      ((vm_offset_t)(nbuf + 16) * MAXBSIZE) +
495 		      ((nswbuf_mem + nswbuf_kva) * MAXPHYS) + pager_map_size);
496 	kmem_suballoc(&clean_map, &buffer_map, &buffer_sva, &buffer_eva,
497 		      ((vm_offset_t)(nbuf + 16) * MAXBSIZE));
498 	buffer_map.system_map = 1;
499 	kmem_suballoc(&clean_map, &pager_map, &pager_sva, &pager_eva,
500 		      ((vm_offset_t)(nswbuf_mem + nswbuf_kva) * MAXPHYS) +
501 		      pager_map_size);
502 	pager_map.system_map = 1;
503 	kprintf("avail memory = %ju (%ju MB)\n",
504 		(uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages),
505 		(uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages) /
506 		1024 / 1024);
507 }
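/*
 * Note that cpu_startup() nests the buffer cache and pager submaps:
 * clean_map is carved out of kernel_map, and buffer_map ((nbuf + 16) *
 * MAXBSIZE) plus pager_map ((nswbuf_mem + nswbuf_kva) * MAXPHYS +
 * pager_map_size) are then carved out of clean_map, so all of that KVA
 * comes from one contiguous region.
 */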
508 
509 struct cpu_idle_stat {
510 	int	hint;
511 	int	reserved;
512 	u_long	halt;
513 	u_long	spin;
514 	u_long	repeat;
515 	u_long	repeat_last;
516 	u_long	repeat_delta;
517 	u_long	mwait_cx[CPU_MWAIT_CX_MAX];
518 } __cachealign;
519 
520 #define CPU_IDLE_STAT_HALT	-1
521 #define CPU_IDLE_STAT_SPIN	-2
522 
523 static struct cpu_idle_stat	cpu_idle_stats[MAXCPU];
524 
525 static int
526 sysctl_cpu_idle_cnt(SYSCTL_HANDLER_ARGS)
527 {
528 	int idx = arg2, cpu, error;
529 	u_long val = 0;
530 
531 	if (idx == CPU_IDLE_STAT_HALT) {
532 		for (cpu = 0; cpu < ncpus; ++cpu)
533 			val += cpu_idle_stats[cpu].halt;
534 	} else if (idx == CPU_IDLE_STAT_SPIN) {
535 		for (cpu = 0; cpu < ncpus; ++cpu)
536 			val += cpu_idle_stats[cpu].spin;
537 	} else {
538 		KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
539 		    ("invalid index %d", idx));
540 		for (cpu = 0; cpu < ncpus; ++cpu)
541 			val += cpu_idle_stats[cpu].mwait_cx[idx];
542 	}
543 
544 	error = sysctl_handle_quad(oidp, &val, 0, req);
545 	if (error || req->newptr == NULL)
546 		return error;
547 
548 	if (idx == CPU_IDLE_STAT_HALT) {
549 		for (cpu = 0; cpu < ncpus; ++cpu)
550 			cpu_idle_stats[cpu].halt = 0;
551 		cpu_idle_stats[0].halt = val;
552 	} else if (idx == CPU_IDLE_STAT_SPIN) {
553 		for (cpu = 0; cpu < ncpus; ++cpu)
554 			cpu_idle_stats[cpu].spin = 0;
555 		cpu_idle_stats[0].spin = val;
556 	} else {
557 		KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
558 		    ("invalid index %d", idx));
559 		for (cpu = 0; cpu < ncpus; ++cpu)
560 			cpu_idle_stats[cpu].mwait_cx[idx] = 0;
561 		cpu_idle_stats[0].mwait_cx[idx] = val;
562 	}
563 	return 0;
564 }
565 
566 static void
567 cpu_mwait_attach(void)
568 {
569 	struct sbuf sb;
570 	int hint_idx, i;
571 
572 	if (!CPU_MWAIT_HAS_CX)
573 		return;
574 
575 	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
576 	    (CPUID_TO_FAMILY(cpu_id) > 0xf ||
577 	     (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
578 	      CPUID_TO_MODEL(cpu_id) >= 0xf))) {
579 		int bm_sts = 1;
580 
581 		/*
582 		 * Pentium dual-core, Core 2 and beyond do not need any
583 		 * additional activities to enter deep C-state, i.e. C3(+).
584 		 */
585 		cpu_mwait_cx_no_bmarb();
586 
587 		TUNABLE_INT_FETCH("machdep.cpu.mwait.bm_sts", &bm_sts);
588 		if (!bm_sts)
589 			cpu_mwait_cx_no_bmsts();
590 	}
591 
592 	sbuf_new(&sb, cpu_mwait_cx_supported,
593 	    sizeof(cpu_mwait_cx_supported), SBUF_FIXEDLEN);
594 
595 	for (i = 0; i < CPU_MWAIT_CX_MAX; ++i) {
596 		struct cpu_mwait_cx *cx = &cpu_mwait_cx_info[i];
597 		int sub;
598 
599 		ksnprintf(cx->name, sizeof(cx->name), "C%d", i);
600 
601 		sysctl_ctx_init(&cx->sysctl_ctx);
602 		cx->sysctl_tree = SYSCTL_ADD_NODE(&cx->sysctl_ctx,
603 		    SYSCTL_STATIC_CHILDREN(_machdep_mwait), OID_AUTO,
604 		    cx->name, CTLFLAG_RW, NULL, "Cx control/info");
605 		if (cx->sysctl_tree == NULL)
606 			continue;
607 
608 		cx->subcnt = CPUID_MWAIT_CX_SUBCNT(cpu_mwait_extemu, i);
609 		SYSCTL_ADD_INT(&cx->sysctl_ctx,
610 		    SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
611 		    "subcnt", CTLFLAG_RD, &cx->subcnt, 0,
612 		    "sub-state count");
613 		SYSCTL_ADD_PROC(&cx->sysctl_ctx,
614 		    SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
615 		    "entered", (CTLTYPE_QUAD | CTLFLAG_RW), 0,
616 		    i, sysctl_cpu_idle_cnt, "Q", "# of times entered");
617 
618 		for (sub = 0; sub < cx->subcnt; ++sub)
619 			sbuf_printf(&sb, "C%d/%d ", i, sub);
620 	}
621 	sbuf_trim(&sb);
622 	sbuf_finish(&sb);
623 
624 	/*
625 	 * Non-deep C-states
626 	 */
627 	cpu_mwait_c1_hints_cnt = cpu_mwait_cx_info[CPU_MWAIT_C1].subcnt;
628 	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i)
629 		cpu_mwait_hints_cnt += cpu_mwait_cx_info[i].subcnt;
630 	cpu_mwait_hints = kmalloc(sizeof(int) * cpu_mwait_hints_cnt,
631 				  M_DEVBUF, M_WAITOK);
632 
633 	hint_idx = 0;
634 	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i) {
635 		int j, subcnt;
636 
637 		subcnt = cpu_mwait_cx_info[i].subcnt;
638 		for (j = 0; j < subcnt; ++j) {
639 			KASSERT(hint_idx < cpu_mwait_hints_cnt,
640 			    ("invalid mwait hint index %d", hint_idx));
641 			cpu_mwait_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
642 			++hint_idx;
643 		}
644 	}
645 	KASSERT(hint_idx == cpu_mwait_hints_cnt,
646 	    ("mwait hint count %d != index %d",
647 	     cpu_mwait_hints_cnt, hint_idx));
648 
649 	if (bootverbose) {
650 		kprintf("MWAIT hints (%d C1 hints):\n", cpu_mwait_c1_hints_cnt);
651 		for (i = 0; i < cpu_mwait_hints_cnt; ++i) {
652 			int hint = cpu_mwait_hints[i];
653 
654 			kprintf("  C%d/%d hint 0x%04x\n",
655 			    MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),
656 			    hint);
657 		}
658 	}
659 
660 	/*
661 	 * Deep C-states
662 	 */
663 	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i)
664 		cpu_mwait_deep_hints_cnt += cpu_mwait_cx_info[i].subcnt;
665 	cpu_mwait_deep_hints = kmalloc(sizeof(int) * cpu_mwait_deep_hints_cnt,
666 	    M_DEVBUF, M_WAITOK);
667 
668 	hint_idx = 0;
669 	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i) {
670 		int j, subcnt;
671 
672 		subcnt = cpu_mwait_cx_info[i].subcnt;
673 		for (j = 0; j < subcnt; ++j) {
674 			KASSERT(hint_idx < cpu_mwait_deep_hints_cnt,
675 			    ("invalid mwait deep hint index %d", hint_idx));
676 			cpu_mwait_deep_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
677 			++hint_idx;
678 		}
679 	}
680 	KASSERT(hint_idx == cpu_mwait_deep_hints_cnt,
681 	    ("mwait deep hint count %d != index %d",
682 	     cpu_mwait_deep_hints_cnt, hint_idx));
683 
684 	if (bootverbose) {
685 		kprintf("MWAIT deep hints:\n");
686 		for (i = 0; i < cpu_mwait_deep_hints_cnt; ++i) {
687 			int hint = cpu_mwait_deep_hints[i];
688 
689 			kprintf("  C%d/%d hint 0x%04x\n",
690 			    MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),
691 			    hint);
692 		}
693 	}
694 	cpu_idle_repeat_max = 256 * cpu_mwait_deep_hints_cnt;
695 
696 	for (i = 0; i < ncpus; ++i) {
697 		char name[16];
698 
699 		ksnprintf(name, sizeof(name), "idle%d", i);
700 		SYSCTL_ADD_PROC(NULL,
701 		    SYSCTL_STATIC_CHILDREN(_machdep_mwait_CX), OID_AUTO,
702 		    name, (CTLTYPE_STRING | CTLFLAG_RW), &cpu_idle_stats[i],
703 		    0, cpu_mwait_cx_pcpu_idle_sysctl, "A", "");
704 	}
705 }
706 
707 static void
708 cpu_finish(void *dummy __unused)
709 {
710 	cpu_setregs();
711 	cpu_mwait_attach();
712 }
713 
714 static void
715 pic_finish(void *dummy __unused)
716 {
717 	/* Log ELCR information */
718 	elcr_dump();
719 
720 	/* Log MPTABLE information */
721 	mptable_pci_int_dump();
722 
723 	/* Finalize PCI */
724 	MachIntrABI.finalize();
725 }
726 
727 /*
728  * Send an interrupt to process.
729  *
730  * Stack is set up to allow sigcode stored
731  * at top to call routine, followed by kcall
732  * to sigreturn routine below.  After sigreturn
733  * resets the signal mask, the stack, and the
734  * frame pointer, it returns to the user
735  * specified pc, psl.
736  */
737 void
738 sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
739 {
740 	struct lwp *lp = curthread->td_lwp;
741 	struct proc *p = lp->lwp_proc;
742 	struct trapframe *regs;
743 	struct sigacts *psp = p->p_sigacts;
744 	struct sigframe sf, *sfp;
745 	int oonstack;
746 	char *sp;
747 
748 	regs = lp->lwp_md.md_regs;
749 	oonstack = (lp->lwp_sigstk.ss_flags & SS_ONSTACK) ? 1 : 0;
750 
751 	/* Save user context */
752 	bzero(&sf, sizeof(struct sigframe));
753 	sf.sf_uc.uc_sigmask = *mask;
754 	sf.sf_uc.uc_stack = lp->lwp_sigstk;
755 	sf.sf_uc.uc_mcontext.mc_onstack = oonstack;
756 	KKASSERT(__offsetof(struct trapframe, tf_rdi) == 0);
757 	/* gcc errors out on optimized bcopy */
758 	_bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(struct trapframe));
759 
760 	/* Make the size of the saved context visible to userland */
761 	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext);
762 
763 	/* Allocate and validate space for the signal handler context. */
764 	if ((lp->lwp_flags & LWP_ALTSTACK) != 0 && !oonstack &&
765 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
766 		sp = (char *)lp->lwp_sigstk.ss_sp + lp->lwp_sigstk.ss_size -
767 		    sizeof(struct sigframe);
768 		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
769 	} else {
770 		/* We take red zone into account */
771 		sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128;
772 	}
773 
774 	/*
775 	 * XXX AVX needs 64-byte alignment but sigframe has other fields and
776 	 * the embedded ucontext is not at the front, so aligning this won't
777 	 * help us.  Fortunately we bcopy in/out of the sigframe, so the
778 	 * kernel is ok.
779 	 *
780 	 * The problem though is if userland winds up trying to use the
781 	 * context directly.
782 	 */
783 	sfp = (struct sigframe *)((intptr_t)sp & ~(intptr_t)0xF);
784 
785 	/* Translate the signal if appropriate */
786 	if (p->p_sysent->sv_sigtbl) {
787 		if (sig <= p->p_sysent->sv_sigsize)
788 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
789 	}
790 
791 	/*
792 	 * Build the argument list for the signal handler.
793 	 *
794 	 * Arguments are in registers (%rdi, %rsi, %rdx, %rcx)
795 	 */
796 	regs->tf_rdi = sig;				/* argument 1 */
797 	regs->tf_rdx = (register_t)&sfp->sf_uc;		/* argument 3 */
798 
799 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
800 		/*
801 		 * Signal handler installed with SA_SIGINFO.
802 		 *
803 		 * action(signo, siginfo, ucontext)
804 		 */
805 		regs->tf_rsi = (register_t)&sfp->sf_si;	/* argument 2 */
806 		regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
807 		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
808 
809 		/* fill siginfo structure */
810 		sf.sf_si.si_signo = sig;
811 		sf.sf_si.si_pid = psp->ps_frominfo[sig].pid;
812 		sf.sf_si.si_uid = psp->ps_frominfo[sig].uid;
813 		sf.sf_si.si_code = code;
814 		sf.sf_si.si_addr = (void *)regs->tf_addr;
815 	} else {
816 		/*
817 		 * Old FreeBSD-style arguments.
818 		 *
819 		 * handler (signo, code, [uc], addr)
820 		 */
821 		regs->tf_rsi = (register_t)code;	/* argument 2 */
822 		regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
823 		sf.sf_ahu.sf_handler = catcher;
824 	}
825 
826 	/*
827 	 * If we're a vm86 process, we want to save the segment registers.
828 	 * We also change eflags to be our emulated eflags, not the actual
829 	 * eflags.
830 	 */
831 #if 0 /* JG */
832 	if (regs->tf_eflags & PSL_VM) {
833 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
834 		struct vm86_kernel *vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;
835 
836 		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
837 		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
838 		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
839 		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;
840 
841 		if (vm86->vm86_has_vme == 0)
842 			sf.sf_uc.uc_mcontext.mc_eflags =
843 			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
844 			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
845 
846 		/*
847 		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
848 		 * syscalls made by the signal handler.  This just avoids
849 		 * wasting time for our lazy fixup of such faults.  PSL_NT
850 		 * does nothing in vm86 mode, but vm86 programs can set it
851 		 * almost legitimately in probes for old cpu types.
852 		 */
853 		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
854 	}
855 #endif
856 
857 	/*
858 	 * Save the FPU state and reinit the FP unit
859 	 */
860 	npxpush(&sf.sf_uc.uc_mcontext);
861 
862 	/*
863 	 * Copy the sigframe out to the user's stack.
864 	 */
865 	if (copyout(&sf, sfp, sizeof(struct sigframe)) != 0) {
866 		/*
867 		 * Something is wrong with the stack pointer.
868 		 * ...Kill the process.
869 		 */
870 		sigexit(lp, SIGILL);
871 	}
872 
873 	regs->tf_rsp = (register_t)sfp;
874 	regs->tf_rip = trunc_page64(PS_STRINGS - *(p->p_sysent->sv_szsigcode));
875 	regs->tf_rip -= SZSIGCODE_EXTRA_BYTES;
876 
877 	/*
878 	 * x86 abi specifies that the direction flag must be cleared
879 	 * on function entry
880 	 */
881 	regs->tf_rflags &= ~(PSL_T | PSL_D);
882 
883 	/*
884 	 * 64 bit mode has a code and stack selector but
885 	 * no data or extra selector.  %fs and %gs are not
886 	 * stored in-context.
887 	 */
888 	regs->tf_cs = _ucodesel;
889 	regs->tf_ss = _udatasel;
890 	clear_quickret();
891 }
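/*
 * To summarize sendsig(): the signal frame (siginfo + ucontext + saved
 * FPU state) is copied out to a 16-byte aligned sfp on the user stack
 * (or the alternate signal stack), the handler arguments are passed in
 * %rdi/%rsi/%rdx/%rcx per the x86_64 ABI, and %rip is pointed at the
 * signal trampoline copied out below PS_STRINGS so the handler
 * eventually returns through sys_sigreturn() below.
 */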
892 
893 /*
894  * Sanitize the trapframe for a virtual kernel passing control to a custom
895  * VM context.  Remove any items that would otherwise create a privilege
896  * issue.
897  *
898  * XXX at the moment we allow userland to set the resume flag.  Is this a
899  * bad idea?
900  */
901 int
902 cpu_sanitize_frame(struct trapframe *frame)
903 {
904 	frame->tf_cs = _ucodesel;
905 	frame->tf_ss = _udatasel;
906 	/* XXX VM (8086) mode not supported? */
907 	frame->tf_rflags &= (PSL_RF | PSL_USERCHANGE | PSL_VM_UNSUPP);
908 	frame->tf_rflags |= PSL_RESERVED_DEFAULT | PSL_I;
909 
910 	return(0);
911 }
912 
913 /*
914  * Sanitize the tls so loading the descriptor does not blow up
915  * on us.  For x86_64 we don't have to do anything.
916  */
917 int
918 cpu_sanitize_tls(struct savetls *tls)
919 {
920 	return(0);
921 }
922 
923 /*
924  * sigreturn(ucontext_t *sigcntxp)
925  *
926  * System call to cleanup state after a signal
927  * has been taken.  Reset signal mask and
928  * stack state from context left by sendsig (above).
929  * Return to previous pc and psl as specified by
930  * context left by sendsig. Check carefully to
931  * make sure that the user has not modified the
932  * state to gain improper privileges.
933  *
934  * MPSAFE
935  */
936 #define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
937 #define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
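/*
 * EFL_SECURE() only accepts an rflags image whose bits outside
 * PSL_USERCHANGE match the current trapframe, and CS_SECURE() only
 * accepts a user-privilege %cs, so (for example) an image that tries to
 * raise IOPL or load a kernel code selector is rejected before being
 * copied back into the trapframe.
 */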
938 
939 int
940 sys_sigreturn(struct sysmsg *sysmsg, const struct sigreturn_args *uap)
941 {
942 	struct lwp *lp = curthread->td_lwp;
943 	struct trapframe *regs;
944 	ucontext_t uc;
945 	ucontext_t *ucp;
946 	register_t rflags;
947 	int cs;
948 	int error;
949 
950 	/*
951 	 * We have to copy the information into kernel space so userland
952 	 * can't modify it while we are sniffing it.
953 	 */
954 	regs = lp->lwp_md.md_regs;
955 	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
956 	if (error)
957 		return (error);
958 	ucp = &uc;
959 	rflags = ucp->uc_mcontext.mc_rflags;
960 
961 	/* VM (8086) mode not supported */
962 	rflags &= ~PSL_VM_UNSUPP;
963 
964 #if 0 /* JG */
965 	if (eflags & PSL_VM) {
966 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
967 		struct vm86_kernel *vm86;
968 
969 		/*
970 		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
971 		 * set up the vm86 area, and we can't enter vm86 mode.
972 		 */
973 		if (lp->lwp_thread->td_pcb->pcb_ext == 0)
974 			return (EINVAL);
975 		vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;
976 		if (vm86->vm86_inited == 0)
977 			return (EINVAL);
978 
979 		/* go back to user mode if both flags are set */
980 		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
981 			trapsignal(lp, SIGBUS, 0);
982 
983 		if (vm86->vm86_has_vme) {
984 			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
985 			    (eflags & VME_USERCHANGE) | PSL_VM;
986 		} else {
987 			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
988 			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
989 			    (eflags & VM_USERCHANGE) | PSL_VM;
990 		}
991 		bcopy(&ucp->uc_mcontext.mc_gs, tf, sizeof(struct trapframe));
992 		tf->tf_eflags = eflags;
993 		tf->tf_vm86_ds = tf->tf_ds;
994 		tf->tf_vm86_es = tf->tf_es;
995 		tf->tf_vm86_fs = tf->tf_fs;
996 		tf->tf_vm86_gs = tf->tf_gs;
997 		tf->tf_ds = _udatasel;
998 		tf->tf_es = _udatasel;
999 		tf->tf_fs = _udatasel;
1000 		tf->tf_gs = _udatasel;
1001 	} else
1002 #endif
1003 	{
1004 		/*
1005 		 * Don't allow users to change privileged or reserved flags.
1006 		 */
1007 		/*
1008 		 * XXX do allow users to change the privileged flag PSL_RF.
1009 		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
1010 		 * should sometimes set it there too.  tf_eflags is kept in
1011 		 * the signal context during signal handling and there is no
1012 		 * other place to remember it, so the PSL_RF bit may be
1013 		 * corrupted by the signal handler without us knowing.
1014 		 * Corruption of the PSL_RF bit at worst causes one more or
1015 		 * one less debugger trap, so allowing it is fairly harmless.
1016 		 */
1017 		if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
1018 			kprintf("sigreturn: rflags = 0x%lx\n", (long)rflags);
1019 			return(EINVAL);
1020 		}
1021 
1022 		/*
1023 		 * Don't allow users to load a valid privileged %cs.  Let the
1024 		 * hardware check for invalid selectors, excess privilege in
1025 		 * other selectors, invalid %eip's and invalid %esp's.
1026 		 */
1027 		cs = ucp->uc_mcontext.mc_cs;
1028 		if (!CS_SECURE(cs)) {
1029 			kprintf("sigreturn: cs = 0x%x\n", cs);
1030 			trapsignal(lp, SIGBUS, T_PROTFLT);
1031 			return(EINVAL);
1032 		}
1033 		/* gcc errors out on optimized bcopy */
1034 		_bcopy(&ucp->uc_mcontext.mc_rdi, regs,
1035 		       sizeof(struct trapframe));
1036 	}
1037 
1038 	/*
1039 	 * Restore the FPU state from the frame
1040 	 */
1041 	crit_enter();
1042 	npxpop(&ucp->uc_mcontext);
1043 
1044 	if (ucp->uc_mcontext.mc_onstack & 1)
1045 		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
1046 	else
1047 		lp->lwp_sigstk.ss_flags &= ~SS_ONSTACK;
1048 
1049 	lp->lwp_sigmask = ucp->uc_sigmask;
1050 	SIG_CANTMASK(lp->lwp_sigmask);
1051 	clear_quickret();
1052 	crit_exit();
1053 	return(EJUSTRETURN);
1054 }
1055 
1056 /*
1057  * Machine dependent boot() routine
1058  *
1059  * I haven't seen anything to put here yet
1060  * Possibly some stuff might be grafted back here from boot()
1061  */
1062 void
1063 cpu_boot(int howto)
1064 {
1065 }
1066 
1067 /*
1068  * Shutdown the CPU as much as possible
1069  */
1070 void
1071 cpu_halt(void)
1072 {
1073 	for (;;)
1074 		__asm__ __volatile("hlt");
1075 }
1076 
1077 /*
1078  * cpu_idle() represents the idle LWKT.  You cannot return from this function
1079  * (unless you want to blow things up!).  Instead we look for runnable threads
1080  * and loop or halt as appropriate.  Giant is not held on entry to the thread.
1081  *
1082  * The main loop is entered with a critical section held, we must release
1083  * the critical section before doing anything else.  lwkt_switch() will
1084  * check for pending interrupts due to entering and exiting its own
1085  * critical section.
1086  *
1087  * NOTE: On an SMP system we rely on a scheduler IPI to wake a HLTed cpu up.
1088  *	 However, there are cases where the idlethread will be entered with
1089  *	 the possibility that no IPI will occur and in such cases
1090  *	 lwkt_switch() sets TDF_IDLE_NOHLT.
1091  *
1092  * NOTE: cpu_idle_repeat determines how many entries into the idle thread
1093  *	 must occur before it starts using ACPI halt.
1094  *
1095  * NOTE: Value overridden in hammer_time().
1096  */
1097 static int	cpu_idle_hlt = 2;
1098 SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
1099     &cpu_idle_hlt, 0, "Idle loop HLT enable");
1100 SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_repeat, CTLFLAG_RW,
1101     &cpu_idle_repeat, 0, "Idle entries before acpi hlt");
1102 
1103 SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_hltcnt, (CTLTYPE_QUAD | CTLFLAG_RW),
1104     0, CPU_IDLE_STAT_HALT, sysctl_cpu_idle_cnt, "Q", "Idle loop entry halts");
1105 SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_spincnt, (CTLTYPE_QUAD | CTLFLAG_RW),
1106     0, CPU_IDLE_STAT_SPIN, sysctl_cpu_idle_cnt, "Q", "Idle loop entry spins");
1107 
1108 static void
1109 cpu_idle_default_hook(void)
1110 {
1111 	/*
1112 	 * We must guarantee that hlt is exactly the instruction
1113 	 * following the sti.
1114 	 */
1115 	__asm __volatile("sti; hlt");
1116 }
1117 
1118 /* Other subsystems (e.g., ACPI) can hook this later. */
1119 void (*cpu_idle_hook)(void) = cpu_idle_default_hook;
1120 
1121 static __inline int
1122 cpu_mwait_cx_hint(struct cpu_idle_stat *stat)
1123 {
1124 	int hint, cx_idx;
1125 	u_int idx;
1126 
1127 	hint = stat->hint;
1128 	if (hint >= 0)
1129 		goto done;
1130 
1131 	idx = (stat->repeat + stat->repeat_last + stat->repeat_delta) >>
1132 	    cpu_mwait_repeat_shift;
1133 	if (idx >= cpu_mwait_c1_hints_cnt) {
1134 		/* Step up faster, once we have walked through all C1 states */
1135 		stat->repeat_delta += 1 << (cpu_mwait_repeat_shift + 1);
1136 	}
1137 	if (hint == CPU_MWAIT_HINT_AUTODEEP) {
1138 		if (idx >= cpu_mwait_deep_hints_cnt)
1139 			idx = cpu_mwait_deep_hints_cnt - 1;
1140 		hint = cpu_mwait_deep_hints[idx];
1141 	} else {
1142 		if (idx >= cpu_mwait_hints_cnt)
1143 			idx = cpu_mwait_hints_cnt - 1;
1144 		hint = cpu_mwait_hints[idx];
1145 	}
1146 done:
1147 	cx_idx = MWAIT_EAX_TO_CX(hint);
1148 	if (cx_idx >= 0 && cx_idx < CPU_MWAIT_CX_MAX)
1149 		stat->mwait_cx[cx_idx]++;
1150 	return hint;
1151 }
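/*
 * With the AUTO/AUTODEEP hints the index computed above grows with the
 * accumulated repeat counters, so a cpu that keeps re-entering the idle
 * loop walks from the shallow C1 sub-states toward the deepest available
 * hint, and repeat_delta speeds that walk up once the C1 hints have been
 * exhausted.  machdep.mwait.CX.repeat_shift scales the walk.
 */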
1152 
1153 void
1154 cpu_idle(void)
1155 {
1156 	globaldata_t gd = mycpu;
1157 	struct cpu_idle_stat *stat = &cpu_idle_stats[gd->gd_cpuid];
1158 	struct thread *td __debugvar = gd->gd_curthread;
1159 	int reqflags;
1160 
1161 	stat->repeat = stat->repeat_last = cpu_idle_repeat_max;
1162 
1163 	crit_exit();
1164 	KKASSERT(td->td_critcount == 0);
1165 
1166 	for (;;) {
1167 		/*
1168 		 * See if there are any LWKTs ready to go.
1169 		 */
1170 		lwkt_switch();
1171 
1172 		/*
1173 		 * When halting inside a cli we must check for reqflags
1174 		 * races, particularly [re]schedule requests.  Running
1175 		 * splz() does the job.
1176 		 *
1177 		 * cpu_idle_hlt:
1178 		 *	0	Never halt, just spin
1179 		 *
1180 		 *	1	Always use MONITOR/MWAIT if avail, HLT
1181 		 *		otherwise.
1182 		 *
1183 		 *		Better default for modern (Haswell+) Intel
1184 		 *		cpus.
1185 		 *
1186 		 *	2	Use HLT/MONITOR/MWAIT up to a point and then
1187 		 *		use the ACPI halt (default).  This is a hybrid
1188 		 *		approach.  See machdep.cpu_idle_repeat.
1189 		 *
1190 		 *		Better default for modern AMD cpus and older
1191 		 *		Intel cpus.
1192 		 *
1193 		 *	3	Always use the ACPI halt.  This typically
1194 		 *		eats the least amount of power but the cpu
1195 		 *		will be slow waking up.  Slows down e.g.
1196 		 *		compiles and other pipe/event oriented stuff.
1197 		 *
1198 		 *		Usually the best default for AMD cpus.
1199 		 *
1200 		 *	4	Always use HLT.
1201 		 *
1202 		 *	5	Always spin.
1203 		 *
1204 		 * NOTE: Interrupts are enabled and we are not in a critical
1205 		 *	 section.
1206 		 *
1207 		 * NOTE: Preemptions do not reset gd_idle_repeat.   Also we
1208 		 *	 don't bother capping gd_idle_repeat, it is ok if
1209 		 *	 it overflows (we do make it unsigned, however).
1210 		 *
1211 		 * Implement optimized invltlb operations when halted
1212 		 * in idle.  By setting the bit in smp_idleinvl_mask
1213 		 * we inform other cpus that they can set _reqs to
1214 		 * request an invltlb.  Currently the code doing so
1215 		 * sets the bits in _reqs anyway, but then checks _mask
1216 		 * to determine if they can assume the invltlb will execute.
1217 		 *
1218 		 * A critical section is required to ensure that interrupts
1219 		 * do not fully run until after we've had a chance to execute
1220 		 * the request.
1221 		 */
1222 		if (gd->gd_idle_repeat == 0) {
1223 			stat->repeat = (stat->repeat + stat->repeat_last) >> 1;
1224 			if (stat->repeat > cpu_idle_repeat_max)
1225 				stat->repeat = cpu_idle_repeat_max;
1226 			stat->repeat_last = 0;
1227 			stat->repeat_delta = 0;
1228 		}
1229 		++stat->repeat_last;
1230 
1231 		/*
1232 		 * General idle thread halt code
1233 		 *
1234 		 * IBRS NOTES - IBRS is a SPECTRE mitigation.  When going
1235 		 *		idle, disable IBRS to reduce hyperthread
1236 		 *		overhead.
1237 		 */
1238 		++gd->gd_idle_repeat;
1239 
1240 		switch(cpu_idle_hlt) {
1241 		default:
1242 		case 0:
1243 			/*
1244 			 * Always spin
1245 			 */
1246 			;
1247 do_spin:
1248 			splz();
1249 			__asm __volatile("sti");
1250 			stat->spin++;
1251 			crit_enter_gd(gd);
1252 			crit_exit_gd(gd);
1253 			break;
1254 		case 2:
1255 			/*
1256 			 * Use MONITOR/MWAIT (or HLT) for a few cycles,
1257 			 * then start using the ACPI halt code if we
1258 			 * continue to be idle.
1259 			 */
1260 			if (gd->gd_idle_repeat >= cpu_idle_repeat)
1261 				goto do_acpi;
1262 			/* FALL THROUGH */
1263 		case 1:
1264 			/*
1265 			 * Always use MONITOR/MWAIT (will use HLT if
1266 			 * MONITOR/MWAIT not available).
1267 			 */
1268 			if (cpu_mi_feature & CPU_MI_MONITOR) {
1269 				splz(); /* XXX */
1270 				reqflags = gd->gd_reqflags;
1271 				if (reqflags & RQF_IDLECHECK_WK_MASK)
1272 					goto do_spin;
1273 				crit_enter_gd(gd);
1274 				ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask, gd->gd_cpuid);
1275 				/*
1276 				 * IBRS/STIBP
1277 				 */
1278 				if (pscpu->trampoline.tr_pcb_spec_ctrl[1] &
1279 				    SPEC_CTRL_DUMMY_ENABLE) {
1280 					wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[1] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
1281 				}
1282 				cpu_mmw_pause_int(&gd->gd_reqflags, reqflags,
1283 						  cpu_mwait_cx_hint(stat), 0);
1284 				if (pscpu->trampoline.tr_pcb_spec_ctrl[0] &
1285 				    SPEC_CTRL_DUMMY_ENABLE) {
1286 					wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[0] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
1287 				}
1288 				stat->halt++;
1289 				ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask, gd->gd_cpuid);
1290 				if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
1291 							      gd->gd_cpuid)) {
1292 					cpu_invltlb();
1293 					cpu_mfence();
1294 				}
1295 				crit_exit_gd(gd);
1296 				break;
1297 			}
1298 			/* FALLTHROUGH */
1299 		case 4:
1300 			/*
1301 			 * Use HLT
1302 			 */
1303 			__asm __volatile("cli");
1304 			splz();
1305 			crit_enter_gd(gd);
1306 			if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
1307 				ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask,
1308 						     gd->gd_cpuid);
1309 				if (pscpu->trampoline.tr_pcb_spec_ctrl[1] &
1310 				    SPEC_CTRL_DUMMY_ENABLE) {
1311 					wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[1] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
1312 				}
1313 				cpu_idle_default_hook();
1314 				if (pscpu->trampoline.tr_pcb_spec_ctrl[0] &
1315 				    SPEC_CTRL_DUMMY_ENABLE) {
1316 					wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[0] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
1317 				}
1318 				ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask,
1319 						       gd->gd_cpuid);
1320 				if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
1321 							      gd->gd_cpuid)) {
1322 					cpu_invltlb();
1323 					cpu_mfence();
1324 				}
1325 			}
1326 			__asm __volatile("sti");
1327 			stat->halt++;
1328 			crit_exit_gd(gd);
1329 			break;
1330 		case 3:
1331 			/*
1332 			 * Use ACPI halt
1333 			 */
1334 			;
1335 do_acpi:
1336 			__asm __volatile("cli");
1337 			splz();
1338 			crit_enter_gd(gd);
1339 			if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
1340 				ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask,
1341 						     gd->gd_cpuid);
1342 				if (pscpu->trampoline.tr_pcb_spec_ctrl[1] &
1343 				    SPEC_CTRL_DUMMY_ENABLE) {
1344 					wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[1] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
1345 				}
1346 				cpu_idle_hook();
1347 				if (pscpu->trampoline.tr_pcb_spec_ctrl[0] &
1348 				    SPEC_CTRL_DUMMY_ENABLE) {
1349 					wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[0] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
1350 				}
1351 				ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask,
1352 						       gd->gd_cpuid);
1353 				if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
1354 							      gd->gd_cpuid)) {
1355 					cpu_invltlb();
1356 					cpu_mfence();
1357 				}
1358 			}
1359 			__asm __volatile("sti");
1360 			stat->halt++;
1361 			crit_exit_gd(gd);
1362 			break;
1363 		}
1364 	}
1365 }
1366 
1367 /*
1368  * Called from deep ACPI via cpu_idle_hook() (see above) to actually halt
1369  * the cpu in C1.  ACPI might use other halt methods for deeper states
1370  * and not reach here.
1371  *
1372  * For now we always use HLT as we are not sure what ACPI may have actually
1373  * done.  MONITOR/MWAIT might not be appropriate.
1374  *
1375  * NOTE: MONITOR/MWAIT does not appear to throttle AMD cpus, while HLT
1376  *	 does.  On Intel, MONITOR/MWAIT does appear to throttle the cpu.
1377  */
1378 void
1379 cpu_idle_halt(void)
1380 {
1381 	globaldata_t gd;
1382 
1383 	gd = mycpu;
1384 #if 0
1385 	/* DISABLED FOR NOW */
1386 	struct cpu_idle_stat *stat;
1387 	int reqflags;
1388 
1389 
1390 	if ((cpu_idle_hlt == 1 || cpu_idle_hlt == 2) &&
1391 	    (cpu_mi_feature & CPU_MI_MONITOR) &&
1392 	    cpu_vendor_id != CPU_VENDOR_AMD) {
1393 		/*
1394 		 * Use MONITOR/MWAIT
1395 		 *
1396 		 * (NOTE: On ryzen, MWAIT does not throttle clocks, so we
1397 		 *	  have to use HLT)
1398 		 */
1399 		stat = &cpu_idle_stats[gd->gd_cpuid];
1400 		reqflags = gd->gd_reqflags;
1401 		if ((reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
1402 			__asm __volatile("sti");
1403 			cpu_mmw_pause_int(&gd->gd_reqflags, reqflags,
1404 					  cpu_mwait_cx_hint(stat), 0);
1405 		} else {
1406 			__asm __volatile("sti; pause");
1407 		}
1408 	} else
1409 #endif
1410 	{
1411 		/*
1412 		 * Use HLT
1413 		 */
1414 		if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0)
1415 			__asm __volatile("sti; hlt");
1416 		else
1417 			__asm __volatile("sti; pause");
1418 	}
1419 }
1420 
1421 
1422 /*
1423  * Called in a loop indirectly via Xcpustop
1424  */
1425 void
1426 cpu_smp_stopped(void)
1427 {
1428 	globaldata_t gd = mycpu;
1429 	volatile __uint64_t *ptr;
1430 	__uint64_t ovalue;
1431 
1432 	ptr = CPUMASK_ADDR(started_cpus, gd->gd_cpuid);
1433 	ovalue = *ptr;
1434 	if ((ovalue & CPUMASK_SIMPLE(gd->gd_cpuid & 63)) == 0) {
1435 		if (cpu_mi_feature & CPU_MI_MONITOR) {
1436 			if (cpu_mwait_hints) {
1437 				cpu_mmw_pause_long(__DEVOLATILE(void *, ptr),
1438 					   ovalue,
1439 					   cpu_mwait_hints[
1440 						cpu_mwait_hints_cnt - 1], 0);
1441 			} else {
1442 				cpu_mmw_pause_long(__DEVOLATILE(void *, ptr),
1443 					   ovalue, 0, 0);
1444 			}
1445 		} else {
1446 			cpu_halt();	/* depend on lapic timer */
1447 		}
1448 	}
1449 }
1450 
1451 /*
1452  * This routine is called if a spinlock has been held through the
1453  * exponential backoff period and is seriously contested.  On a real cpu
1454  * we let it spin.
1455  */
1456 void
1457 cpu_spinlock_contested(void)
1458 {
1459 	cpu_pause();
1460 }
1461 
1462 /*
1463  * Clear registers on exec
1464  */
1465 void
1466 exec_setregs(u_long entry, u_long stack, u_long ps_strings)
1467 {
1468 	struct thread *td = curthread;
1469 	struct lwp *lp = td->td_lwp;
1470 	struct pcb *pcb = td->td_pcb;
1471 	struct trapframe *regs = lp->lwp_md.md_regs;
1472 
1473 	user_ldt_free(pcb);
1474 
1475 	clear_quickret();
1476 	bzero((char *)regs, sizeof(struct trapframe));
1477 	regs->tf_rip = entry;
1478 	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; /* align the stack */
1479 	regs->tf_rdi = stack;		/* argv */
1480 	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
1481 	regs->tf_ss = _udatasel;
1482 	regs->tf_cs = _ucodesel;
1483 	regs->tf_rbx = ps_strings;
1484 
1485 	/*
1486 	 * Reset the hardware debug registers if they were in use.
1487 	 * They won't have any meaning for the newly exec'd process.
1488 	 */
1489 	if (pcb->pcb_flags & PCB_DBREGS) {
1490 		pcb->pcb_dr0 = 0;
1491 		pcb->pcb_dr1 = 0;
1492 		pcb->pcb_dr2 = 0;
1493 		pcb->pcb_dr3 = 0;
1494 		pcb->pcb_dr6 = 0;
1495 		pcb->pcb_dr7 = 0; /* JG set bit 10? */
1496 		if (pcb == td->td_pcb) {
1497 			/*
1498 			 * Clear the debug registers on the running
1499 			 * CPU, otherwise they will end up affecting
1500 			 * the next process we switch to.
1501 			 */
1502 			reset_dbregs();
1503 		}
1504 		pcb->pcb_flags &= ~PCB_DBREGS;
1505 	}
1506 
1507 	/*
1508 	 * Initialize the math emulator (if any) for the current process.
1509 	 * Actually, just clear the bit that says that the emulator has
1510 	 * been initialized.  Initialization is delayed until the process
1511 	 * traps to the emulator (if it is done at all) mainly because
1512 	 * emulators don't provide an entry point for initialization.
1513 	 */
1514 	pcb->pcb_flags &= ~FP_SOFTFP;
1515 
1516 	/*
1517 	 * NOTE: do not set CR0_TS here.  npxinit() must do it after clearing
1518 	 *	 gd_npxthread.  Otherwise a preemptive interrupt thread
1519 	 *	 may panic in npxdna().
1520 	 */
1521 	crit_enter();
1522 	load_cr0(rcr0() | CR0_MP);
1523 
1524 	/*
1525 	 * NOTE: The MSR values must be correct so we can return to
1526 	 *	 userland.  gd_user_fs/gs must be correct so the switch
1527 	 *	 code knows what the current MSR values are.
1528 	 */
1529 	pcb->pcb_fsbase = 0;	/* Values loaded from PCB on switch */
1530 	pcb->pcb_gsbase = 0;
1531 	mdcpu->gd_user_fs = 0;	/* Cache of current MSR values */
1532 	mdcpu->gd_user_gs = 0;
1533 	wrmsr(MSR_FSBASE, 0);	/* Set MSR values for return to userland */
1534 	wrmsr(MSR_KGSBASE, 0);
1535 
1536 	/* Initialize the npx (if any) for the current process. */
1537 	npxinit();
1538 	crit_exit();
1539 
1540 	pcb->pcb_ds = _udatasel;
1541 	pcb->pcb_es = _udatasel;
1542 	pcb->pcb_fs = _udatasel;
1543 	pcb->pcb_gs = _udatasel;
1544 }
1545 
1546 void
1547 cpu_setregs(void)
1548 {
1549 	register_t cr0;
1550 
1551 	cr0 = rcr0();
1552 	cr0 |= CR0_NE;			/* Done by npxinit() */
1553 	cr0 |= CR0_MP | CR0_TS;		/* Done at every execve() too. */
1554 	cr0 |= CR0_WP | CR0_AM;
1555 	load_cr0(cr0);
1556 	load_gs(_udatasel);
1557 }
1558 
1559 static int
1560 sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
1561 {
1562 	int error;
1563 	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
1564 		req);
1565 	if (!error && req->newptr)
1566 		resettodr();
1567 	return (error);
1568 }
1569 
1570 SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
1571 	&adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");
1572 
1573 SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set,
1574 	CTLFLAG_RW, &disable_rtc_set, 0, "");
1575 
1576 #if 0 /* JG */
1577 SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo,
1578 	CTLFLAG_RD, &bootinfo, bootinfo, "");
1579 #endif
1580 
1581 SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock,
1582 	CTLFLAG_RW, &wall_cmos_clock, 0, "");
1583 
1584 static int
1585 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1586 {
1587 	struct efi_map_header *efihdr;
1588 	caddr_t kmdp;
1589 	uint32_t efisize;
1590 
1591 	kmdp = preload_search_by_type("elf kernel");
1592 	if (kmdp == NULL)
1593 		kmdp = preload_search_by_type("elf64 kernel");
1594 	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1595 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
1596 	if (efihdr == NULL)
1597 		return (0);
1598 	efisize = *((uint32_t *)efihdr - 1);
1599 	return (SYSCTL_OUT(req, efihdr, efisize));
1600 }
1601 SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
1602     efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
1603 
1604 /*
1605  * Initialize x86 and configure to run kernel
1606  */
1607 
1608 /*
1609  * Initialize segments & interrupt table
1610  */
1611 
1612 int _default_ldt;
1613 struct user_segment_descriptor gdt[NGDT * MAXCPU];	/* global descriptor table */
1614 struct gate_descriptor idt_arr[MAXCPU][NIDT];
1615 #if 0 /* JG */
1616 union descriptor ldt[NLDT];		/* local descriptor table */
1617 #endif
1618 
1619 /* table descriptors - used to load tables by cpu */
1620 struct region_descriptor r_gdt;
1621 struct region_descriptor r_idt_arr[MAXCPU];
1622 
1623 /* JG proc0paddr is a virtual address */
1624 void *proc0paddr;
1625 /* JG alignment? */
1626 char proc0paddr_buff[LWKT_THREAD_STACK];
1627 
1628 
1629 /* software prototypes -- in more palatable form */
1630 struct soft_segment_descriptor gdt_segs[] = {
1631 /* GNULL_SEL	0 Null Descriptor */
1632 {	0x0,			/* segment base address  */
1633 	0x0,			/* length */
1634 	0,			/* segment type */
1635 	0,			/* segment descriptor priority level */
1636 	0,			/* segment descriptor present */
1637 	0,			/* long */
1638 	0,			/* default 32 vs 16 bit size */
1639 	0			/* limit granularity (byte/page units)*/ },
1640 /* GCODE_SEL	1 Code Descriptor for kernel */
1641 {	0x0,			/* segment base address  */
1642 	0xfffff,		/* length - all address space */
1643 	SDT_MEMERA,		/* segment type */
1644 	SEL_KPL,		/* segment descriptor priority level */
1645 	1,			/* segment descriptor present */
1646 	1,			/* long */
1647 	0,			/* default 32 vs 16 bit size */
1648 	1			/* limit granularity (byte/page units)*/ },
1649 /* GDATA_SEL	2 Data Descriptor for kernel */
1650 {	0x0,			/* segment base address  */
1651 	0xfffff,		/* length - all address space */
1652 	SDT_MEMRWA,		/* segment type */
1653 	SEL_KPL,		/* segment descriptor priority level */
1654 	1,			/* segment descriptor present */
1655 	1,			/* long */
1656 	0,			/* default 32 vs 16 bit size */
1657 	1			/* limit granularity (byte/page units)*/ },
1658 /* GUCODE32_SEL	3 32 bit Code Descriptor for user */
1659 {	0x0,			/* segment base address  */
1660 	0xfffff,		/* length - all address space */
1661 	SDT_MEMERA,		/* segment type */
1662 	SEL_UPL,		/* segment descriptor priority level */
1663 	1,			/* segment descriptor present */
1664 	0,			/* long */
1665 	1,			/* default 32 vs 16 bit size */
1666 	1			/* limit granularity (byte/page units)*/ },
1667 /* GUDATA_SEL	4 32/64 bit Data Descriptor for user */
1668 {	0x0,			/* segment base address  */
1669 	0xfffff,		/* length - all address space */
1670 	SDT_MEMRWA,		/* segment type */
1671 	SEL_UPL,		/* segment descriptor priority level */
1672 	1,			/* segment descriptor present */
1673 	0,			/* long */
1674 	1,			/* default 32 vs 16 bit size */
1675 	1			/* limit granularity (byte/page units)*/ },
1676 /* GUCODE_SEL	5 64 bit Code Descriptor for user */
1677 {	0x0,			/* segment base address  */
1678 	0xfffff,		/* length - all address space */
1679 	SDT_MEMERA,		/* segment type */
1680 	SEL_UPL,		/* segment descriptor priority level */
1681 	1,			/* segment descriptor present */
1682 	1,			/* long */
1683 	0,			/* default 32 vs 16 bit size */
1684 	1			/* limit granularity (byte/page units)*/ },
1685 /* GPROC0_SEL	6 Proc 0 Tss Descriptor */
1686 {
1687 	0x0,			/* segment base address */
1688 	sizeof(struct x86_64tss)-1,/* length */
1689 	SDT_SYSTSS,		/* segment type */
1690 	SEL_KPL,		/* segment descriptor priority level */
1691 	1,			/* segment descriptor present */
1692 	0,			/* long */
1693 	0,			/* unused - default 32 vs 16 bit size */
1694 	0			/* limit granularity (byte/page units)*/ },
1695 /* Actually, the TSS is a system descriptor which is double size */
1696 {	0x0,			/* segment base address  */
1697 	0x0,			/* length */
1698 	0,			/* segment type */
1699 	0,			/* segment descriptor priority level */
1700 	0,			/* segment descriptor present */
1701 	0,			/* long */
1702 	0,			/* default 32 vs 16 bit size */
1703 	0			/* limit granularity (byte/page units)*/ },
1704 /* GUGS32_SEL	8 32 bit GS Descriptor for user */
1705 {	0x0,			/* segment base address  */
1706 	0xfffff,		/* length - all address space */
1707 	SDT_MEMRWA,		/* segment type */
1708 	SEL_UPL,		/* segment descriptor priority level */
1709 	1,			/* segment descriptor present */
1710 	0,			/* long */
1711 	1,			/* default 32 vs 16 bit size */
1712 	1			/* limit granularity (byte/page units)*/ },
1713 };
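/*
 * The all-zero descriptor following GPROC0_SEL above is the upper half
 * of the 16-byte long mode TSS descriptor, which is why GUGS32_SEL ends
 * up at selector index 8 rather than 7.
 */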
1714 
1715 void
1716 setidt_global(int idx, inthand_t *func, int typ, int dpl, int ist)
1717 {
1718 	int cpu;
1719 
1720 	for (cpu = 0; cpu < MAXCPU; ++cpu) {
1721 		struct gate_descriptor *ip = &idt_arr[cpu][idx];
1722 
1723 		ip->gd_looffset = (uintptr_t)func;
1724 		ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
1725 		ip->gd_ist = ist;
1726 		ip->gd_xx = 0;
1727 		ip->gd_type = typ;
1728 		ip->gd_dpl = dpl;
1729 		ip->gd_p = 1;
1730 		ip->gd_hioffset = ((uintptr_t)func)>>16 ;
1731 	}
1732 }
1733 
1734 void
1735 setidt(int idx, inthand_t *func, int typ, int dpl, int ist, int cpu)
1736 {
1737 	struct gate_descriptor *ip;
1738 
1739 	KASSERT(cpu >= 0 && cpu < ncpus, ("invalid cpu %d", cpu));
1740 
1741 	ip = &idt_arr[cpu][idx];
1742 	ip->gd_looffset = (uintptr_t)func;
1743 	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
1744 	ip->gd_ist = ist;
1745 	ip->gd_xx = 0;
1746 	ip->gd_type = typ;
1747 	ip->gd_dpl = dpl;
1748 	ip->gd_p = 1;
1749 	ip->gd_hioffset = ((uintptr_t)func)>>16 ;
1750 }
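/*
 * Illustrative usage (hypothetical call): installing a page-fault handler
 * on the boot cpu would look like
 *
 *	setidt(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0, 0);
 *
 * The 64-bit handler address is split across the gate descriptor: the low
 * 16 bits go into gd_looffset and the remaining upper bits into
 * gd_hioffset, hence the ">>16" above.
 */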
1751 
1752 #define	IDTVEC(name)	__CONCAT(X,name)
1753 
1754 extern inthand_t
1755 	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
1756 	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
1757 	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
1758 	IDTVEC(page), IDTVEC(mchk), IDTVEC(fpu), IDTVEC(align),
1759 	IDTVEC(xmm), IDTVEC(dblfault),
1760 	IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
1761 
1762 extern inthand_t
1763 	IDTVEC(rsvd00), IDTVEC(rsvd01), IDTVEC(rsvd02), IDTVEC(rsvd03),
1764 	IDTVEC(rsvd04), IDTVEC(rsvd05), IDTVEC(rsvd06), IDTVEC(rsvd07),
1765 	IDTVEC(rsvd08), IDTVEC(rsvd09), IDTVEC(rsvd0a), IDTVEC(rsvd0b),
1766 	IDTVEC(rsvd0c), IDTVEC(rsvd0d), IDTVEC(rsvd0e), IDTVEC(rsvd0f),
1767 	IDTVEC(rsvd10), IDTVEC(rsvd11), IDTVEC(rsvd12), IDTVEC(rsvd13),
1768 	IDTVEC(rsvd14), IDTVEC(rsvd15), IDTVEC(rsvd16), IDTVEC(rsvd17),
1769 	IDTVEC(rsvd18), IDTVEC(rsvd19), IDTVEC(rsvd1a), IDTVEC(rsvd1b),
1770 	IDTVEC(rsvd1c), IDTVEC(rsvd1d), IDTVEC(rsvd1e), IDTVEC(rsvd1f),
1771 	IDTVEC(rsvd20), IDTVEC(rsvd21), IDTVEC(rsvd22), IDTVEC(rsvd23),
1772 	IDTVEC(rsvd24), IDTVEC(rsvd25), IDTVEC(rsvd26), IDTVEC(rsvd27),
1773 	IDTVEC(rsvd28), IDTVEC(rsvd29), IDTVEC(rsvd2a), IDTVEC(rsvd2b),
1774 	IDTVEC(rsvd2c), IDTVEC(rsvd2d), IDTVEC(rsvd2e), IDTVEC(rsvd2f),
1775 	IDTVEC(rsvd30), IDTVEC(rsvd31), IDTVEC(rsvd32), IDTVEC(rsvd33),
1776 	IDTVEC(rsvd34), IDTVEC(rsvd35), IDTVEC(rsvd36), IDTVEC(rsvd37),
1777 	IDTVEC(rsvd38), IDTVEC(rsvd39), IDTVEC(rsvd3a), IDTVEC(rsvd3b),
1778 	IDTVEC(rsvd3c), IDTVEC(rsvd3d), IDTVEC(rsvd3e), IDTVEC(rsvd3f),
1779 	IDTVEC(rsvd40), IDTVEC(rsvd41), IDTVEC(rsvd42), IDTVEC(rsvd43),
1780 	IDTVEC(rsvd44), IDTVEC(rsvd45), IDTVEC(rsvd46), IDTVEC(rsvd47),
1781 	IDTVEC(rsvd48), IDTVEC(rsvd49), IDTVEC(rsvd4a), IDTVEC(rsvd4b),
1782 	IDTVEC(rsvd4c), IDTVEC(rsvd4d), IDTVEC(rsvd4e), IDTVEC(rsvd4f),
1783 	IDTVEC(rsvd50), IDTVEC(rsvd51), IDTVEC(rsvd52), IDTVEC(rsvd53),
1784 	IDTVEC(rsvd54), IDTVEC(rsvd55), IDTVEC(rsvd56), IDTVEC(rsvd57),
1785 	IDTVEC(rsvd58), IDTVEC(rsvd59), IDTVEC(rsvd5a), IDTVEC(rsvd5b),
1786 	IDTVEC(rsvd5c), IDTVEC(rsvd5d), IDTVEC(rsvd5e), IDTVEC(rsvd5f),
1787 	IDTVEC(rsvd60), IDTVEC(rsvd61), IDTVEC(rsvd62), IDTVEC(rsvd63),
1788 	IDTVEC(rsvd64), IDTVEC(rsvd65), IDTVEC(rsvd66), IDTVEC(rsvd67),
1789 	IDTVEC(rsvd68), IDTVEC(rsvd69), IDTVEC(rsvd6a), IDTVEC(rsvd6b),
1790 	IDTVEC(rsvd6c), IDTVEC(rsvd6d), IDTVEC(rsvd6e), IDTVEC(rsvd6f),
1791 	IDTVEC(rsvd70), IDTVEC(rsvd71), IDTVEC(rsvd72), IDTVEC(rsvd73),
1792 	IDTVEC(rsvd74), IDTVEC(rsvd75), IDTVEC(rsvd76), IDTVEC(rsvd77),
1793 	IDTVEC(rsvd78), IDTVEC(rsvd79), IDTVEC(rsvd7a), IDTVEC(rsvd7b),
1794 	IDTVEC(rsvd7c), IDTVEC(rsvd7d), IDTVEC(rsvd7e), IDTVEC(rsvd7f),
1795 	IDTVEC(rsvd80), IDTVEC(rsvd81), IDTVEC(rsvd82), IDTVEC(rsvd83),
1796 	IDTVEC(rsvd84), IDTVEC(rsvd85), IDTVEC(rsvd86), IDTVEC(rsvd87),
1797 	IDTVEC(rsvd88), IDTVEC(rsvd89), IDTVEC(rsvd8a), IDTVEC(rsvd8b),
1798 	IDTVEC(rsvd8c), IDTVEC(rsvd8d), IDTVEC(rsvd8e), IDTVEC(rsvd8f),
1799 	IDTVEC(rsvd90), IDTVEC(rsvd91), IDTVEC(rsvd92), IDTVEC(rsvd93),
1800 	IDTVEC(rsvd94), IDTVEC(rsvd95), IDTVEC(rsvd96), IDTVEC(rsvd97),
1801 	IDTVEC(rsvd98), IDTVEC(rsvd99), IDTVEC(rsvd9a), IDTVEC(rsvd9b),
1802 	IDTVEC(rsvd9c), IDTVEC(rsvd9d), IDTVEC(rsvd9e), IDTVEC(rsvd9f),
1803 	IDTVEC(rsvda0), IDTVEC(rsvda1), IDTVEC(rsvda2), IDTVEC(rsvda3),
1804 	IDTVEC(rsvda4), IDTVEC(rsvda5), IDTVEC(rsvda6), IDTVEC(rsvda7),
1805 	IDTVEC(rsvda8), IDTVEC(rsvda9), IDTVEC(rsvdaa), IDTVEC(rsvdab),
1806 	IDTVEC(rsvdac), IDTVEC(rsvdad), IDTVEC(rsvdae), IDTVEC(rsvdaf),
1807 	IDTVEC(rsvdb0), IDTVEC(rsvdb1), IDTVEC(rsvdb2), IDTVEC(rsvdb3),
1808 	IDTVEC(rsvdb4), IDTVEC(rsvdb5), IDTVEC(rsvdb6), IDTVEC(rsvdb7),
1809 	IDTVEC(rsvdb8), IDTVEC(rsvdb9), IDTVEC(rsvdba), IDTVEC(rsvdbb),
1810 	IDTVEC(rsvdbc), IDTVEC(rsvdbd), IDTVEC(rsvdbe), IDTVEC(rsvdbf),
1811 	IDTVEC(rsvdc0), IDTVEC(rsvdc1), IDTVEC(rsvdc2), IDTVEC(rsvdc3),
1812 	IDTVEC(rsvdc4), IDTVEC(rsvdc5), IDTVEC(rsvdc6), IDTVEC(rsvdc7),
1813 	IDTVEC(rsvdc8), IDTVEC(rsvdc9), IDTVEC(rsvdca), IDTVEC(rsvdcb),
1814 	IDTVEC(rsvdcc), IDTVEC(rsvdcd), IDTVEC(rsvdce), IDTVEC(rsvdcf),
1815 	IDTVEC(rsvdd0), IDTVEC(rsvdd1), IDTVEC(rsvdd2), IDTVEC(rsvdd3),
1816 	IDTVEC(rsvdd4), IDTVEC(rsvdd5), IDTVEC(rsvdd6), IDTVEC(rsvdd7),
1817 	IDTVEC(rsvdd8), IDTVEC(rsvdd9), IDTVEC(rsvdda), IDTVEC(rsvddb),
1818 	IDTVEC(rsvddc), IDTVEC(rsvddd), IDTVEC(rsvdde), IDTVEC(rsvddf),
1819 	IDTVEC(rsvde0), IDTVEC(rsvde1), IDTVEC(rsvde2), IDTVEC(rsvde3),
1820 	IDTVEC(rsvde4), IDTVEC(rsvde5), IDTVEC(rsvde6), IDTVEC(rsvde7),
1821 	IDTVEC(rsvde8), IDTVEC(rsvde9), IDTVEC(rsvdea), IDTVEC(rsvdeb),
1822 	IDTVEC(rsvdec), IDTVEC(rsvded), IDTVEC(rsvdee), IDTVEC(rsvdef),
1823 	IDTVEC(rsvdf0), IDTVEC(rsvdf1), IDTVEC(rsvdf2), IDTVEC(rsvdf3),
1824 	IDTVEC(rsvdf4), IDTVEC(rsvdf5), IDTVEC(rsvdf6), IDTVEC(rsvdf7),
1825 	IDTVEC(rsvdf8), IDTVEC(rsvdf9), IDTVEC(rsvdfa), IDTVEC(rsvdfb),
1826 	IDTVEC(rsvdfc), IDTVEC(rsvdfd), IDTVEC(rsvdfe), IDTVEC(rsvdff);
1827 
1828 inthand_t *rsvdary[NIDT] = {
1829 	&IDTVEC(rsvd00), &IDTVEC(rsvd01), &IDTVEC(rsvd02), &IDTVEC(rsvd03),
1830 	&IDTVEC(rsvd04), &IDTVEC(rsvd05), &IDTVEC(rsvd06), &IDTVEC(rsvd07),
1831 	&IDTVEC(rsvd08), &IDTVEC(rsvd09), &IDTVEC(rsvd0a), &IDTVEC(rsvd0b),
1832 	&IDTVEC(rsvd0c), &IDTVEC(rsvd0d), &IDTVEC(rsvd0e), &IDTVEC(rsvd0f),
1833 	&IDTVEC(rsvd10), &IDTVEC(rsvd11), &IDTVEC(rsvd12), &IDTVEC(rsvd13),
1834 	&IDTVEC(rsvd14), &IDTVEC(rsvd15), &IDTVEC(rsvd16), &IDTVEC(rsvd17),
1835 	&IDTVEC(rsvd18), &IDTVEC(rsvd19), &IDTVEC(rsvd1a), &IDTVEC(rsvd1b),
1836 	&IDTVEC(rsvd1c), &IDTVEC(rsvd1d), &IDTVEC(rsvd1e), &IDTVEC(rsvd1f),
1837 	&IDTVEC(rsvd20), &IDTVEC(rsvd21), &IDTVEC(rsvd22), &IDTVEC(rsvd23),
1838 	&IDTVEC(rsvd24), &IDTVEC(rsvd25), &IDTVEC(rsvd26), &IDTVEC(rsvd27),
1839 	&IDTVEC(rsvd28), &IDTVEC(rsvd29), &IDTVEC(rsvd2a), &IDTVEC(rsvd2b),
1840 	&IDTVEC(rsvd2c), &IDTVEC(rsvd2d), &IDTVEC(rsvd2e), &IDTVEC(rsvd2f),
1841 	&IDTVEC(rsvd30), &IDTVEC(rsvd31), &IDTVEC(rsvd32), &IDTVEC(rsvd33),
1842 	&IDTVEC(rsvd34), &IDTVEC(rsvd35), &IDTVEC(rsvd36), &IDTVEC(rsvd37),
1843 	&IDTVEC(rsvd38), &IDTVEC(rsvd39), &IDTVEC(rsvd3a), &IDTVEC(rsvd3b),
1844 	&IDTVEC(rsvd3c), &IDTVEC(rsvd3d), &IDTVEC(rsvd3e), &IDTVEC(rsvd3f),
1845 	&IDTVEC(rsvd40), &IDTVEC(rsvd41), &IDTVEC(rsvd42), &IDTVEC(rsvd43),
1846 	&IDTVEC(rsvd44), &IDTVEC(rsvd45), &IDTVEC(rsvd46), &IDTVEC(rsvd47),
1847 	&IDTVEC(rsvd48), &IDTVEC(rsvd49), &IDTVEC(rsvd4a), &IDTVEC(rsvd4b),
1848 	&IDTVEC(rsvd4c), &IDTVEC(rsvd4d), &IDTVEC(rsvd4e), &IDTVEC(rsvd4f),
1849 	&IDTVEC(rsvd50), &IDTVEC(rsvd51), &IDTVEC(rsvd52), &IDTVEC(rsvd53),
1850 	&IDTVEC(rsvd54), &IDTVEC(rsvd55), &IDTVEC(rsvd56), &IDTVEC(rsvd57),
1851 	&IDTVEC(rsvd58), &IDTVEC(rsvd59), &IDTVEC(rsvd5a), &IDTVEC(rsvd5b),
1852 	&IDTVEC(rsvd5c), &IDTVEC(rsvd5d), &IDTVEC(rsvd5e), &IDTVEC(rsvd5f),
1853 	&IDTVEC(rsvd60), &IDTVEC(rsvd61), &IDTVEC(rsvd62), &IDTVEC(rsvd63),
1854 	&IDTVEC(rsvd64), &IDTVEC(rsvd65), &IDTVEC(rsvd66), &IDTVEC(rsvd67),
1855 	&IDTVEC(rsvd68), &IDTVEC(rsvd69), &IDTVEC(rsvd6a), &IDTVEC(rsvd6b),
1856 	&IDTVEC(rsvd6c), &IDTVEC(rsvd6d), &IDTVEC(rsvd6e), &IDTVEC(rsvd6f),
1857 	&IDTVEC(rsvd70), &IDTVEC(rsvd71), &IDTVEC(rsvd72), &IDTVEC(rsvd73),
1858 	&IDTVEC(rsvd74), &IDTVEC(rsvd75), &IDTVEC(rsvd76), &IDTVEC(rsvd77),
1859 	&IDTVEC(rsvd78), &IDTVEC(rsvd79), &IDTVEC(rsvd7a), &IDTVEC(rsvd7b),
1860 	&IDTVEC(rsvd7c), &IDTVEC(rsvd7d), &IDTVEC(rsvd7e), &IDTVEC(rsvd7f),
1861 	&IDTVEC(rsvd80), &IDTVEC(rsvd81), &IDTVEC(rsvd82), &IDTVEC(rsvd83),
1862 	&IDTVEC(rsvd84), &IDTVEC(rsvd85), &IDTVEC(rsvd86), &IDTVEC(rsvd87),
1863 	&IDTVEC(rsvd88), &IDTVEC(rsvd89), &IDTVEC(rsvd8a), &IDTVEC(rsvd8b),
1864 	&IDTVEC(rsvd8c), &IDTVEC(rsvd8d), &IDTVEC(rsvd8e), &IDTVEC(rsvd8f),
1865 	&IDTVEC(rsvd90), &IDTVEC(rsvd91), &IDTVEC(rsvd92), &IDTVEC(rsvd93),
1866 	&IDTVEC(rsvd94), &IDTVEC(rsvd95), &IDTVEC(rsvd96), &IDTVEC(rsvd97),
1867 	&IDTVEC(rsvd98), &IDTVEC(rsvd99), &IDTVEC(rsvd9a), &IDTVEC(rsvd9b),
1868 	&IDTVEC(rsvd9c), &IDTVEC(rsvd9d), &IDTVEC(rsvd9e), &IDTVEC(rsvd9f),
1869 	&IDTVEC(rsvda0), &IDTVEC(rsvda1), &IDTVEC(rsvda2), &IDTVEC(rsvda3),
1870 	&IDTVEC(rsvda4), &IDTVEC(rsvda5), &IDTVEC(rsvda6), &IDTVEC(rsvda7),
1871 	&IDTVEC(rsvda8), &IDTVEC(rsvda9), &IDTVEC(rsvdaa), &IDTVEC(rsvdab),
1872 	&IDTVEC(rsvdac), &IDTVEC(rsvdad), &IDTVEC(rsvdae), &IDTVEC(rsvdaf),
1873 	&IDTVEC(rsvdb0), &IDTVEC(rsvdb1), &IDTVEC(rsvdb2), &IDTVEC(rsvdb3),
1874 	&IDTVEC(rsvdb4), &IDTVEC(rsvdb5), &IDTVEC(rsvdb6), &IDTVEC(rsvdb7),
1875 	&IDTVEC(rsvdb8), &IDTVEC(rsvdb9), &IDTVEC(rsvdba), &IDTVEC(rsvdbb),
1876 	&IDTVEC(rsvdbc), &IDTVEC(rsvdbd), &IDTVEC(rsvdbe), &IDTVEC(rsvdbf),
1877 	&IDTVEC(rsvdc0), &IDTVEC(rsvdc1), &IDTVEC(rsvdc2), &IDTVEC(rsvdc3),
1878 	&IDTVEC(rsvdc4), &IDTVEC(rsvdc5), &IDTVEC(rsvdc6), &IDTVEC(rsvdc7),
1879 	&IDTVEC(rsvdc8), &IDTVEC(rsvdc9), &IDTVEC(rsvdca), &IDTVEC(rsvdcb),
1880 	&IDTVEC(rsvdcc), &IDTVEC(rsvdcd), &IDTVEC(rsvdce), &IDTVEC(rsvdcf),
1881 	&IDTVEC(rsvdd0), &IDTVEC(rsvdd1), &IDTVEC(rsvdd2), &IDTVEC(rsvdd3),
1882 	&IDTVEC(rsvdd4), &IDTVEC(rsvdd5), &IDTVEC(rsvdd6), &IDTVEC(rsvdd7),
1883 	&IDTVEC(rsvdd8), &IDTVEC(rsvdd9), &IDTVEC(rsvdda), &IDTVEC(rsvddb),
1884 	&IDTVEC(rsvddc), &IDTVEC(rsvddd), &IDTVEC(rsvdde), &IDTVEC(rsvddf),
1885 	&IDTVEC(rsvde0), &IDTVEC(rsvde1), &IDTVEC(rsvde2), &IDTVEC(rsvde3),
1886 	&IDTVEC(rsvde4), &IDTVEC(rsvde5), &IDTVEC(rsvde6), &IDTVEC(rsvde7),
1887 	&IDTVEC(rsvde8), &IDTVEC(rsvde9), &IDTVEC(rsvdea), &IDTVEC(rsvdeb),
1888 	&IDTVEC(rsvdec), &IDTVEC(rsvded), &IDTVEC(rsvdee), &IDTVEC(rsvdef),
1889 	&IDTVEC(rsvdf0), &IDTVEC(rsvdf1), &IDTVEC(rsvdf2), &IDTVEC(rsvdf3),
1890 	&IDTVEC(rsvdf4), &IDTVEC(rsvdf5), &IDTVEC(rsvdf6), &IDTVEC(rsvdf7),
1891 	&IDTVEC(rsvdf8), &IDTVEC(rsvdf9), &IDTVEC(rsvdfa), &IDTVEC(rsvdfb),
1892 	&IDTVEC(rsvdfc), &IDTVEC(rsvdfd), &IDTVEC(rsvdfe), &IDTVEC(rsvdff)
1893 };
1894 
1895 void
1896 sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
1897 {
1898 	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
1899 	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
1900 	ssd->ssd_type  = sd->sd_type;
1901 	ssd->ssd_dpl   = sd->sd_dpl;
1902 	ssd->ssd_p     = sd->sd_p;
1903 	ssd->ssd_def32 = sd->sd_def32;
1904 	ssd->ssd_gran  = sd->sd_gran;
1905 }
1906 
1907 void
1908 ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
1909 {
1910 
1911 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
1912 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
1913 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
1914 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
1915 	sd->sd_type  = ssd->ssd_type;
1916 	sd->sd_dpl   = ssd->ssd_dpl;
1917 	sd->sd_p     = ssd->ssd_p;
1918 	sd->sd_long  = ssd->ssd_long;
1919 	sd->sd_def32 = ssd->ssd_def32;
1920 	sd->sd_gran  = ssd->ssd_gran;
1921 }
1922 
1923 void
1924 ssdtosyssd(struct soft_segment_descriptor *ssd,
1925     struct system_segment_descriptor *sd)
1926 {
1927 
1928 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
1929 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
1930 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
1931 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
1932 	sd->sd_type  = ssd->ssd_type;
1933 	sd->sd_dpl   = ssd->ssd_dpl;
1934 	sd->sd_p     = ssd->ssd_p;
1935 	sd->sd_gran  = ssd->ssd_gran;
1936 }
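/*
 * Worked example (hypothetical base address): for ssd_base =
 * 0xffff800012345678 the system-descriptor conversion above yields
 *
 *	sd_lobase = 0x345678		(low 24 bits)
 *	sd_hibase = 0xffff800012	(upper 40 bits)
 *
 * whereas ssdtosd() keeps only 8 bits of hibase, which is why the TSS
 * descriptor is converted with ssdtosyssd() in hammer_time().
 */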
1937 
1938 /*
1939  * Populate the (physmap) array with base/bound pairs describing the
1940  * available physical memory in the system, then test this memory and
1941  * build the phys_avail array describing the actually-available memory.
1942  *
1943  * If we cannot accurately determine the physical memory map, then use
1944  * the value from the 0xE801 call, and failing that, the RTC.
1945  *
1946  * Total memory size may be set by the kernel environment variable
1947  * hw.physmem or the compile-time define MAXMEM.
1948  *
1949  * Memory is aligned to PHYSMAP_ALIGN which must be a multiple
1950  * of PAGE_SIZE.  This also greatly reduces the memory test time
1951  * which would otherwise be excessive on machines with > 8G of ram.
1952  *
1953  * XXX first should be vm_paddr_t.
1954  */
1955 
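/*
 * Illustrative sketch (hypothetical values): physmap[] holds base/bound
 * pairs, so after parsing a typical BIOS/EFI map it might contain
 *
 *	physmap[0] = 0x0000000000001000   physmap[1] = 0x000000000009f000
 *	physmap[2] = 0x0000000000100000   physmap[3] = 0x00000000bfee0000
 *	physmap[4] = 0x0000000100000000   physmap[5] = 0x0000000440000000
 *
 * phys_avail[] and dump_avail[] are later derived from these ranges.
 */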
1956 #define PHYSMAP_ALIGN		(vm_paddr_t)(128 * 1024)
1957 #define PHYSMAP_ALIGN_MASK	(vm_paddr_t)(PHYSMAP_ALIGN - 1)
1958 #define PHYSMAP_SIZE		VM_PHYSSEG_MAX
1959 
1960 vm_paddr_t physmap[PHYSMAP_SIZE];
1961 struct bios_smap *smapbase, *smap, *smapend;
1962 struct efi_map_header *efihdrbase;
1963 u_int32_t smapsize;
1964 
1965 #define PHYSMAP_HANDWAVE	(vm_paddr_t)(2 * 1024 * 1024)
1966 #define PHYSMAP_HANDWAVE_MASK	(PHYSMAP_HANDWAVE - 1)
1967 
1968 static void
1969 add_smap_entries(int *physmap_idx)
1970 {
1971 	int i;
1972 
1973 	smapsize = *((u_int32_t *)smapbase - 1);
1974 	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
1975 
1976 	for (smap = smapbase; smap < smapend; smap++) {
1977 		if (boothowto & RB_VERBOSE)
1978 			kprintf("SMAP type=%02x base=%016lx len=%016lx\n",
1979 			    smap->type, smap->base, smap->length);
1980 
1981 		if (smap->type != SMAP_TYPE_MEMORY)
1982 			continue;
1983 
1984 		if (smap->length == 0)
1985 			continue;
1986 
1987 		for (i = 0; i <= *physmap_idx; i += 2) {
1988 			if (smap->base < physmap[i + 1]) {
1989 				if (boothowto & RB_VERBOSE) {
1990 					kprintf("Overlapping or non-monotonic "
1991 						"memory region, ignoring "
1992 						"second region\n");
1993 				}
1994 				break;
1995 			}
1996 		}
1997 		if (i <= *physmap_idx)
1998 			continue;
1999 
2000 		Realmem += smap->length;
2001 
2002 		/*
2003 		 * NOTE: This little bit of code initially expands
2004 		 *	 physmap[1] as well as later entries.
2005 		 */
2006 		if (smap->base == physmap[*physmap_idx + 1]) {
2007 			physmap[*physmap_idx + 1] += smap->length;
2008 			continue;
2009 		}
2010 
2011 		*physmap_idx += 2;
2012 		if (*physmap_idx == PHYSMAP_SIZE) {
2013 			kprintf("Too many segments in the physical "
2014 				"address map, giving up\n");
2015 			break;
2016 		}
2017 		physmap[*physmap_idx] = smap->base;
2018 		physmap[*physmap_idx + 1] = smap->base + smap->length;
2019 	}
2020 }
2021 
2022 static void
2023 add_efi_map_entries(int *physmap_idx)
2024 {
2025 	struct efi_md *map, *p;
2026 	const char *type;
2027 	size_t efisz;
2028 	int i, ndesc;
2029 
2030 	static const char *types[] = {
2031 		"Reserved",
2032 		"LoaderCode",
2033 		"LoaderData",
2034 		"BootServicesCode",
2035 		"BootServicesData",
2036 		"RuntimeServicesCode",
2037 		"RuntimeServicesData",
2038 		"ConventionalMemory",
2039 		"UnusableMemory",
2040 		"ACPIReclaimMemory",
2041 		"ACPIMemoryNVS",
2042 		"MemoryMappedIO",
2043 		"MemoryMappedIOPortSpace",
2044 		"PalCode"
2045 	 };
2046 
2047 	/*
2048 	 * Memory map data provided by UEFI via the GetMemoryMap
2049 	 * Boot Services API.
2050 	 */
2051 	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
2052 	map = (struct efi_md *)((uint8_t *)efihdrbase + efisz);
2053 
2054 	if (efihdrbase->descriptor_size == 0)
2055 		return;
2056 	ndesc = efihdrbase->memory_size / efihdrbase->descriptor_size;
2057 
2058 	if (boothowto & RB_VERBOSE)
2059 		kprintf("%23s %12s %12s %8s %4s\n",
2060 		    "Type", "Physical", "Virtual", "#Pages", "Attr");
2061 
2062 	for (i = 0, p = map; i < ndesc; i++,
2063 	    p = efi_next_descriptor(p, efihdrbase->descriptor_size)) {
2064 		if (boothowto & RB_VERBOSE) {
2065 			if (p->md_type <= EFI_MD_TYPE_PALCODE)
2066 				type = types[p->md_type];
2067 			else
2068 				type = "<INVALID>";
2069 			kprintf("%23s %012lx %12p %08lx ", type, p->md_phys,
2070 			    p->md_virt, p->md_pages);
2071 			if (p->md_attr & EFI_MD_ATTR_UC)
2072 				kprintf("UC ");
2073 			if (p->md_attr & EFI_MD_ATTR_WC)
2074 				kprintf("WC ");
2075 			if (p->md_attr & EFI_MD_ATTR_WT)
2076 				kprintf("WT ");
2077 			if (p->md_attr & EFI_MD_ATTR_WB)
2078 				kprintf("WB ");
2079 			if (p->md_attr & EFI_MD_ATTR_UCE)
2080 				kprintf("UCE ");
2081 			if (p->md_attr & EFI_MD_ATTR_WP)
2082 				kprintf("WP ");
2083 			if (p->md_attr & EFI_MD_ATTR_RP)
2084 				kprintf("RP ");
2085 			if (p->md_attr & EFI_MD_ATTR_XP)
2086 				kprintf("XP ");
2087 			if (p->md_attr & EFI_MD_ATTR_RT)
2088 				kprintf("RUNTIME");
2089 			kprintf("\n");
2090 		}
2091 
2092 		switch (p->md_type) {
2093 		case EFI_MD_TYPE_CODE:
2094 		case EFI_MD_TYPE_DATA:
2095 		case EFI_MD_TYPE_BS_CODE:
2096 		case EFI_MD_TYPE_BS_DATA:
2097 		case EFI_MD_TYPE_FREE:
2098 			/*
2099 			 * We're allowed to use any entry with these types.
2100 			 */
2101 			break;
2102 		default:
2103 			continue;
2104 		}
2105 
2106 		Realmem += p->md_pages * PAGE_SIZE;
2107 
2108 		/*
2109 		 * NOTE: This little bit of code initially expands
2110 		 *	 physmap[1] as well as later entries.
2111 		 */
2112 		if (p->md_phys == physmap[*physmap_idx + 1]) {
2113 			physmap[*physmap_idx + 1] += p->md_pages * PAGE_SIZE;
2114 			continue;
2115 		}
2116 
2117 		*physmap_idx += 2;
2118 		if (*physmap_idx == PHYSMAP_SIZE) {
2119 			kprintf("Too many segments in the physical "
2120 				"address map, giving up\n");
2121 			break;
2122 		}
2123 		physmap[*physmap_idx] = p->md_phys;
2124 		physmap[*physmap_idx + 1] = p->md_phys + p->md_pages * PAGE_SIZE;
2125 	 }
2126 }
2127 
2128 struct fb_info efi_fb_info;
2129 static int have_efi_framebuffer = 0;
2130 
2131 static void
2132 efi_fb_init_vaddr(int direct_map)
2133 {
2134 	uint64_t sz;
2135 	vm_offset_t addr, v;
2136 
2137 	v = efi_fb_info.vaddr;
2138 	sz = efi_fb_info.stride * efi_fb_info.height;
2139 
2140 	if (direct_map) {
2141 		addr = PHYS_TO_DMAP(efi_fb_info.paddr);
2142 		if (addr >= DMAP_MIN_ADDRESS && addr + sz <= DMapMaxAddress)
2143 			efi_fb_info.vaddr = addr;
2144 	} else {
2145 		efi_fb_info.vaddr =
2146 			(vm_offset_t)pmap_mapdev_attr(efi_fb_info.paddr,
2147 						      sz,
2148 						      PAT_WRITE_COMBINING);
2149 	}
2150 }
2151 
2152 static u_int
2153 efifb_color_depth(struct efi_fb *efifb)
2154 {
2155 	uint32_t mask;
2156 	u_int depth;
2157 
2158 	mask = efifb->fb_mask_red | efifb->fb_mask_green |
2159 	    efifb->fb_mask_blue | efifb->fb_mask_reserved;
2160 	if (mask == 0)
2161 		return (0);
2162 	for (depth = 1; mask != 1; depth++)
2163 		mask >>= 1;
2164 	return (depth);
2165 }
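/*
 * Worked example (hypothetical masks): a common 32-bit BGRX framebuffer
 * reports
 *
 *	fb_mask_red      = 0x00ff0000
 *	fb_mask_green    = 0x0000ff00
 *	fb_mask_blue     = 0x000000ff
 *	fb_mask_reserved = 0xff000000
 *
 * so the combined mask is 0xffffffff and the loop above counts a depth of
 * 32; a 24-bit format with no reserved byte yields 24.
 */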
2166 
2167 int
2168 probe_efi_fb(int early)
2169 {
2170 	struct efi_fb	*efifb;
2171 	caddr_t		kmdp;
2172 	u_int		depth;
2173 
2174 	if (have_efi_framebuffer) {
2175 		if (!early &&
2176 		    (efi_fb_info.vaddr == 0 ||
2177 		     efi_fb_info.vaddr == PHYS_TO_DMAP(efi_fb_info.paddr)))
2178 			efi_fb_init_vaddr(0);
2179 		return 0;
2180 	}
2181 
2182 	kmdp = preload_search_by_type("elf kernel");
2183 	if (kmdp == NULL)
2184 		kmdp = preload_search_by_type("elf64 kernel");
2185 	efifb = (struct efi_fb *)preload_search_info(kmdp,
2186 	    MODINFO_METADATA | MODINFOMD_EFI_FB);
2187 	if (efifb == NULL)
2188 		return 1;
2189 
2190 	depth = efifb_color_depth(efifb);
2191 	/*
2192 	 * Our bootloader should already notice, when we won't be able to
2193 	 * The bootloader should already have noticed if we won't be able
2194 	 * to use the UEFI framebuffer (unsupported color depth).
2195 	if (depth != 24 && depth != 32)
2196 		return 1;
2197 
2198 	have_efi_framebuffer = 1;
2199 
2200 	efi_fb_info.is_vga_boot_display = 1;
2201 	efi_fb_info.width = efifb->fb_width;
2202 	efi_fb_info.height = efifb->fb_height;
2203 	efi_fb_info.depth = depth;
2204 	efi_fb_info.stride = efifb->fb_stride * (depth / 8);
2205 	efi_fb_info.paddr = efifb->fb_addr;
2206 	if (early) {
2207 		efi_fb_info.vaddr = 0;
2208 	} else {
2209 		efi_fb_init_vaddr(0);
2210 	}
2211 	efi_fb_info.fbops.fb_set_par = NULL;
2212 	efi_fb_info.fbops.fb_blank = NULL;
2213 	efi_fb_info.fbops.fb_debug_enter = NULL;
2214 	efi_fb_info.device = NULL;
2215 
2216 	return 0;
2217 }
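/*
 * Note (assumes fb_stride is in pixels per scanline, as implied by the
 * depth/8 scaling above): a hypothetical 1920-pixel stride at depth 32
 * becomes efi_fb_info.stride = 1920 * (32 / 8) = 7680 bytes.
 */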
2218 
2219 static void
2220 efifb_startup(void *arg)
2221 {
2222 	probe_efi_fb(0);
2223 }
2224 
2225 SYSINIT(efi_fb_info, SI_BOOT1_POST, SI_ORDER_FIRST, efifb_startup, NULL);
2226 
2227 static void
2228 getmemsize(caddr_t kmdp, u_int64_t first)
2229 {
2230 	int off, physmap_idx, pa_indx, da_indx;
2231 	int i, j;
2232 	vm_paddr_t pa;
2233 	vm_paddr_t msgbuf_size;
2234 	u_long physmem_tunable;
2235 	pt_entry_t *pte;
2236 	quad_t dcons_addr, dcons_size;
2237 
2238 	bzero(physmap, sizeof(physmap));
2239 	physmap_idx = 0;
2240 
2241 	/*
2242 	 * get memory map from INT 15:E820, kindly supplied by the loader.
2243 	 *
2244 	 * subr_module.c says:
2245 	 * "Consumer may safely assume that size value precedes data."
2246 	 * ie: an int32_t immediately precedes smap.
2247 	 */
2248 	efihdrbase = (struct efi_map_header *)preload_search_info(kmdp,
2249 		     MODINFO_METADATA | MODINFOMD_EFI_MAP);
2250 	smapbase = (struct bios_smap *)preload_search_info(kmdp,
2251 		   MODINFO_METADATA | MODINFOMD_SMAP);
2252 	if (smapbase == NULL && efihdrbase == NULL)
2253 		panic("No BIOS smap or EFI map info from loader!");
2254 
2255 	if (efihdrbase == NULL)
2256 		add_smap_entries(&physmap_idx);
2257 	else
2258 		add_efi_map_entries(&physmap_idx);
2259 
2260 	base_memory = physmap[1] / 1024;
2261 	/* make hole for AP bootstrap code */
2262 	physmap[1] = mp_bootaddress(base_memory);
2263 
2264 	/* Save EBDA address, if any */
2265 	ebda_addr = (u_long)(*(u_short *)(KERNBASE + 0x40e));
2266 	ebda_addr <<= 4;
2267 
2268 	/*
2269 	 * Maxmem isn't the "maximum memory", it's one larger than the
2270 	 * highest page of the physical address space.  It should be
2271 	 * called something like "Maxphyspage".  We may adjust this
2272 	 * based on ``hw.physmem'' and the results of the memory test.
2273 	 */
2274 	Maxmem = atop(physmap[physmap_idx + 1]);
2275 
2276 #ifdef MAXMEM
2277 	Maxmem = MAXMEM / 4;
2278 #endif
2279 
2280 	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
2281 		Maxmem = atop(physmem_tunable);
2282 
2283 	/*
2284 	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
2285 	 * in the system.
2286 	 */
2287 	if (Maxmem > atop(physmap[physmap_idx + 1]))
2288 		Maxmem = atop(physmap[physmap_idx + 1]);
2289 
2290 	/*
2291 	 * Blowing out the DMAP will blow up the system.
2292 	 */
2293 	if (Maxmem > atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS)) {
2294 		kprintf("Limiting Maxmem due to DMAP size\n");
2295 		Maxmem = atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS);
2296 	}
2297 
2298 	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
2299 	    (boothowto & RB_VERBOSE)) {
2300 		kprintf("Physical memory use set to %ldK\n", Maxmem * 4);
2301 	}
2302 
2303 	/*
2304 	 * Call pmap initialization to make new kernel address space
2305 	 *
2306 	 * Mask off page 0.
2307 	 */
2308 	pmap_bootstrap(&first);
2309 	physmap[0] = PAGE_SIZE;
2310 
2311 	/*
2312 	 * Align the physmap to PHYSMAP_ALIGN and cut out anything
2313 	 * exceeding Maxmem.
2314 	 */
2315 	for (i = j = 0; i <= physmap_idx; i += 2) {
2316 		if (physmap[i+1] > ptoa(Maxmem))
2317 			physmap[i+1] = ptoa(Maxmem);
2318 		physmap[i] = (physmap[i] + PHYSMAP_ALIGN_MASK) &
2319 			     ~PHYSMAP_ALIGN_MASK;
2320 		physmap[i+1] = physmap[i+1] & ~PHYSMAP_ALIGN_MASK;
2321 
2322 		physmap[j] = physmap[i];
2323 		physmap[j+1] = physmap[i+1];
2324 
2325 		if (physmap[i] < physmap[i+1])
2326 			j += 2;
2327 	}
2328 	physmap_idx = j - 2;
2329 
2330 	/*
2331 	 * Align anything else used in the validation loop.
2332 	 *
2333 	 * Also make sure that our 2MB kernel text+data+bss mappings
2334 	 * do not overlap potentially allocatable space.
2335 	 */
2336 	first = (first + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;
2337 
2338 	/*
2339 	 * Size up each available chunk of physical memory.
2340 	 */
2341 	pa_indx = 0;
2342 	da_indx = 0;
2343 	phys_avail[pa_indx].phys_beg = physmap[0];
2344 	phys_avail[pa_indx].phys_end = physmap[0];
2345 	dump_avail[da_indx].phys_beg = 0;
2346 	dump_avail[da_indx].phys_end = physmap[0];
2347 	pte = CMAP1;
2348 
2349 	/*
2350 	 * Get dcons buffer address
2351 	 */
2352 	if (kgetenv_quad("dcons.addr", &dcons_addr) == 0 ||
2353 	    kgetenv_quad("dcons.size", &dcons_size) == 0)
2354 		dcons_addr = 0;
2355 
2356 	/*
2357 	 * Validate the physical memory.  The physical memory segments
2358 	 * have already been aligned to PHYSMAP_ALIGN which is a multiple
2359 	 * of PAGE_SIZE.
2360 	 *
2361 	 * We no longer perform an exhaustive memory test.  Instead we
2362 	 * simply test the first and last word in each physmap[]
2363 	 * segment.
2364 	 */
2365 	for (i = 0; i <= physmap_idx; i += 2) {
2366 		vm_paddr_t end;
2367 		vm_paddr_t incr;
2368 
2369 		end = physmap[i + 1];
2370 
2371 		for (pa = physmap[i]; pa < end; pa += incr) {
2372 			int page_bad, full;
2373 			volatile uint64_t *ptr = (uint64_t *)CADDR1;
2374 			uint64_t tmp;
2375 
2376 			full = FALSE;
2377 
2378 			/*
2379 			 * Calculate incr.  Just test the first and
2380 			 * last page in each physmap[] segment.
2381 			 */
2382 			if (pa == end - PAGE_SIZE)
2383 				incr = PAGE_SIZE;
2384 			else
2385 				incr = end - pa - PAGE_SIZE;
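			/*
			 * Illustrative walk-through: for a segment
			 * [base, end) the first pass tests the page at
			 * 'base' and then jumps straight to end - PAGE_SIZE,
			 * so only the first and last page of each segment
			 * are touched (barring the blacked-out ranges
			 * handled below).
			 */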
2386 
2387 			/*
2388 			 * Make sure we don't skip blacked out areas.
2389 			 */
2390 			if (pa < 0x200000 && 0x200000 < end) {
2391 				incr = 0x200000 - pa;
2392 			}
2393 			if (dcons_addr > 0 &&
2394 			    pa < dcons_addr &&
2395 			    dcons_addr < end) {
2396 				incr = dcons_addr - pa;
2397 			}
2398 
2399 			/*
2400 			 * Block out kernel memory as not available.
2401 			 */
2402 			if (pa >= 0x200000 && pa < first) {
2403 				incr = first - pa;
2404 				if (pa + incr > end)
2405 					incr = end - pa;
2406 				goto do_dump_avail;
2407 			}
2408 
2409 			/*
2410 			 * Block out the dcons buffer if it exists.
2411 			 */
2412 			if (dcons_addr > 0 &&
2413 			    pa >= trunc_page(dcons_addr) &&
2414 			    pa < dcons_addr + dcons_size) {
2415 				incr = dcons_addr + dcons_size - pa;
2416 				incr = (incr + PAGE_MASK) &
2417 				       ~(vm_paddr_t)PAGE_MASK;
2418 				if (pa + incr > end)
2419 					incr = end - pa;
2420 				goto do_dump_avail;
2421 			}
2422 
2423 			page_bad = FALSE;
2424 
2425 			/*
2426 			 * Map the page non-cacheable for the memory
2427 			 * test.
2428 			 */
2429 			*pte = pa |
2430 			    kernel_pmap.pmap_bits[PG_V_IDX] |
2431 			    kernel_pmap.pmap_bits[PG_RW_IDX] |
2432 			    kernel_pmap.pmap_bits[PG_N_IDX];
2433 			cpu_invlpg(__DEVOLATILE(void *, ptr));
2434 			cpu_mfence();
2435 
2436 			/*
2437 			 * Save original value for restoration later.
2438 			 */
2439 			tmp = *ptr;
2440 
2441 			/*
2442 			 * Test for alternating 1's and 0's
2443 			 */
2444 			*ptr = 0xaaaaaaaaaaaaaaaaLLU;
2445 			cpu_mfence();
2446 			if (*ptr != 0xaaaaaaaaaaaaaaaaLLU)
2447 				page_bad = TRUE;
2448 			/*
2449 			 * Test for alternating 0's and 1's
2450 			 */
2451 			*ptr = 0x5555555555555555LLU;
2452 			cpu_mfence();
2453 			if (*ptr != 0x5555555555555555LLU)
2454 				page_bad = TRUE;
2455 			/*
2456 			 * Test for all 1's
2457 			 */
2458 			*ptr = 0xffffffffffffffffLLU;
2459 			cpu_mfence();
2460 			if (*ptr != 0xffffffffffffffffLLU)
2461 				page_bad = TRUE;
2462 			/*
2463 			 * Test for all 0's
2464 			 */
2465 			*ptr = 0x0;
2466 			cpu_mfence();
2467 			if (*ptr != 0x0)
2468 				page_bad = TRUE;
2469 
2470 			/*
2471 			 * Restore original value.
2472 			 */
2473 			*ptr = tmp;
2474 
2475 			/*
2476 			 * Adjust array of valid/good pages.
2477 			 */
2478 			if (page_bad == TRUE) {
2479 				incr = PAGE_SIZE;
2480 				continue;
2481 			}
2482 
2483 			/*
2484 			 * Collapse page address into phys_avail[].  Do a
2485 			 * continuation of the current phys_avail[] index
2486 			 * when possible.
2487 			 */
2488 			if (phys_avail[pa_indx].phys_end == pa) {
2489 				/*
2490 				 * Continuation
2491 				 */
2492 				phys_avail[pa_indx].phys_end += incr;
2493 			} else if (phys_avail[pa_indx].phys_beg ==
2494 				   phys_avail[pa_indx].phys_end) {
2495 				/*
2496 				 * Current phys_avail is completely empty,
2497 				 * reuse the index.
2498 				 */
2499 				phys_avail[pa_indx].phys_beg = pa;
2500 				phys_avail[pa_indx].phys_end = pa + incr;
2501 			} else {
2502 				/*
2503 				 * Allocate next phys_avail index.
2504 				 */
2505 				++pa_indx;
2506 				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
2507 					kprintf(
2508 		"Too many holes in the physical address space, giving up\n");
2509 					--pa_indx;
2510 					full = TRUE;
2511 					goto do_dump_avail;
2512 				}
2513 				phys_avail[pa_indx].phys_beg = pa;
2514 				phys_avail[pa_indx].phys_end = pa + incr;
2515 			}
2516 			physmem += incr / PAGE_SIZE;
2517 
2518 			/*
2519 			 * pa available for dumping
2520 			 */
2521 do_dump_avail:
2522 			if (dump_avail[da_indx].phys_end == pa) {
2523 				dump_avail[da_indx].phys_end += incr;
2524 			} else {
2525 				++da_indx;
2526 				if (da_indx == DUMP_AVAIL_ARRAY_END) {
2527 					--da_indx;
2528 					goto do_next;
2529 				}
2530 				dump_avail[da_indx].phys_beg = pa;
2531 				dump_avail[da_indx].phys_end = pa + incr;
2532 			}
2533 do_next:
2534 			if (full)
2535 				break;
2536 		}
2537 	}
2538 	*pte = 0;
2539 	cpu_invltlb();
2540 	cpu_mfence();
2541 
2542 	/*
2543 	 * The last chunk must contain at least one page plus the message
2544 	 * buffer to avoid complicating other code (message buffer address
2545 	 * calculation, etc.).
2546 	 */
2547 	msgbuf_size = (MSGBUF_SIZE + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;
2548 
2549 	while (phys_avail[pa_indx].phys_beg + PHYSMAP_ALIGN + msgbuf_size >=
2550 	       phys_avail[pa_indx].phys_end) {
2551 		physmem -= atop(phys_avail[pa_indx].phys_end -
2552 				phys_avail[pa_indx].phys_beg);
2553 		phys_avail[pa_indx].phys_beg = 0;
2554 		phys_avail[pa_indx].phys_end = 0;
2555 		--pa_indx;
2556 	}
2557 
2558 	Maxmem = atop(phys_avail[pa_indx].phys_end);
2559 
2560 	/* Trim off space for the message buffer. */
2561 	phys_avail[pa_indx].phys_end -= msgbuf_size;
2562 
2563 	avail_end = phys_avail[pa_indx].phys_end;
2564 
2565 	/* Map the message buffer. */
2566 	for (off = 0; off < msgbuf_size; off += PAGE_SIZE) {
2567 		pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);
2568 	}
2569 
2570 	/*
2571 	 * Try to get EFI framebuffer working as early as possible.
2572 	 *
2573 	 * WARN: Some BIOSes do not list the EFI framebuffer memory, causing
2574 	 * the pmap probe code to create a DMAP that does not cover its
2575 	 * physical address space.  In that case efi_fb_init_vaddr(1) might
2576 	 * not leave an initialized framebuffer base pointer, and the later
2577 	 * efi_fb_init_vaddr(0) call will deal with it.
2578 	 */
2579 	if (have_efi_framebuffer)
2580 		efi_fb_init_vaddr(1);
2581 }
2582 
2583 struct machintr_abi MachIntrABI;
2584 
2585 /*
2586  * IDT VECTORS:
2587  *	0	Divide by zero
2588  *	1	Debug
2589  *	2	NMI
2590  *	3	BreakPoint
2591  *	4	OverFlow
2592  *	5	Bound-Range
2593  *	6	Invalid OpCode
2594  *	7	Device Not Available (x87)
2595  *	8	Double-Fault
2596  *	9	Coprocessor Segment overrun (unsupported, reserved)
2597  *	10	Invalid-TSS
2598  *	11	Segment not present
2599  *	12	Stack
2600  *	13	General Protection
2601  *	14	Page Fault
2602  *	15	Reserved
2603  *	16	x87 FP Exception pending
2604  *	17	Alignment Check
2605  *	18	Machine Check
2606  *	19	SIMD floating point
2607  *	20-31	reserved
2608  *	32-255	INTn/external sources
2609  */
2610 u_int64_t
2611 hammer_time(u_int64_t modulep, u_int64_t physfree)
2612 {
2613 	caddr_t kmdp;
2614 	int gsel_tss, x, cpu;
2615 #if 0 /* JG */
2616 	int metadata_missing, off;
2617 #endif
2618 	struct mdglobaldata *gd;
2619 	struct privatespace *ps;
2620 	u_int64_t msr;
2621 
2622 	/*
2623 	 * Prevent lowering of the ipl if we call tsleep() early.
2624 	 */
2625 	gd = &CPU_prvspace[0]->mdglobaldata;
2626 	ps = (struct privatespace *)gd;
2627 	bzero(gd, sizeof(*gd));
2628 	bzero(&ps->common_tss, sizeof(ps->common_tss));
2629 
2630 	/*
2631 	 * Note: on both UP and SMP curthread must be set non-NULL
2632 	 * early in the boot sequence because the system assumes
2633 	 * that 'curthread' is never NULL.
2634 	 */
2635 
2636 	gd->mi.gd_curthread = &thread0;
2637 	thread0.td_gd = &gd->mi;
2638 
2639 	atdevbase = ISA_HOLE_START + PTOV_OFFSET;
2640 
2641 #if 0 /* JG */
2642 	metadata_missing = 0;
2643 	if (bootinfo.bi_modulep) {
2644 		preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
2645 		preload_bootstrap_relocate(KERNBASE);
2646 	} else {
2647 		metadata_missing = 1;
2648 	}
2649 	if (bootinfo.bi_envp)
2650 		kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;
2651 #endif
2652 
2653 	preload_metadata = (caddr_t)(uintptr_t)(modulep + PTOV_OFFSET);
2654 	preload_bootstrap_relocate(PTOV_OFFSET);
2655 	kmdp = preload_search_by_type("elf kernel");
2656 	if (kmdp == NULL)
2657 		kmdp = preload_search_by_type("elf64 kernel");
2658 	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
2659 	kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + PTOV_OFFSET;
2660 #ifdef DDB
2661 	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
2662 	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
2663 #endif
2664 	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
2665 
2666 	if (boothowto & RB_VERBOSE)
2667 		bootverbose++;
2668 
2669 	/*
2670 	 * Default MachIntrABI to ICU
2671 	 */
2672 	MachIntrABI = MachIntrABI_ICU;
2673 
2674 	/*
2675 	 * Start with one cpu.  Note: with one cpu, ncpus_fit_mask remains 0.
2676 	 */
2677 	ncpus = 1;
2678 	ncpus_fit = 1;
2679 	/* Init basic tunables, hz etc */
2680 	init_param1();
2681 
2682 	/*
2683 	 * make gdt memory segments
2684 	 */
2685 	gdt_segs[GPROC0_SEL].ssd_base =
2686 		(uintptr_t) &CPU_prvspace[0]->common_tss;
2687 
2688 	gd->mi.gd_prvspace = CPU_prvspace[0];
2689 
2690 	for (x = 0; x < NGDT; x++) {
2691 		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1))
2692 			ssdtosd(&gdt_segs[x], &gdt[x]);
2693 	}
2694 	ssdtosyssd(&gdt_segs[GPROC0_SEL],
2695 	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
2696 
2697 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
2698 	r_gdt.rd_base =  (long) gdt;
2699 	lgdt(&r_gdt);
2700 
2701 	wrmsr(MSR_FSBASE, 0);		/* User value */
2702 	wrmsr(MSR_GSBASE, (u_int64_t)&gd->mi);
2703 	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
2704 
2705 	mi_gdinit(&gd->mi, 0);
2706 	cpu_gdinit(gd, 0);
2707 	proc0paddr = proc0paddr_buff;
2708 	mi_proc0init(&gd->mi, proc0paddr);
2709 	safepri = TDPRI_MAX;
2710 
2711 	/* spinlocks and the BGL */
2712 	init_locks();
2713 
2714 	/* exceptions */
2715 	for (x = 0; x < NIDT; x++)
2716 		setidt_global(x, rsvdary[x], SDT_SYSIGT, SEL_KPL, 0);
2717 	setidt_global(IDT_DE, &IDTVEC(div),  SDT_SYSIGT, SEL_KPL, 0);
2718 	setidt_global(IDT_DB, &IDTVEC(dbg),  SDT_SYSIGT, SEL_KPL, 2);
2719 	setidt_global(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 1);
2720 	setidt_global(IDT_BP, &IDTVEC(bpt),  SDT_SYSIGT, SEL_UPL, 0);
2721 	setidt_global(IDT_OF, &IDTVEC(ofl),  SDT_SYSIGT, SEL_KPL, 0);
2722 	setidt_global(IDT_BR, &IDTVEC(bnd),  SDT_SYSIGT, SEL_KPL, 0);
2723 	setidt_global(IDT_UD, &IDTVEC(ill),  SDT_SYSIGT, SEL_KPL, 0);
2724 	setidt_global(IDT_NM, &IDTVEC(dna),  SDT_SYSIGT, SEL_KPL, 0);
2725 	setidt_global(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
2726 	setidt_global(IDT_FPUGP, &IDTVEC(fpusegm),  SDT_SYSIGT, SEL_KPL, 0);
2727 	setidt_global(IDT_TS, &IDTVEC(tss),  SDT_SYSIGT, SEL_KPL, 0);
2728 	setidt_global(IDT_NP, &IDTVEC(missing),  SDT_SYSIGT, SEL_KPL, 0);
2729 	setidt_global(IDT_SS, &IDTVEC(stk),  SDT_SYSIGT, SEL_KPL, 0);
2730 	setidt_global(IDT_GP, &IDTVEC(prot),  SDT_SYSIGT, SEL_KPL, 0);
2731 	setidt_global(IDT_PF, &IDTVEC(page),  SDT_SYSIGT, SEL_KPL, 0);
2732 	setidt_global(IDT_MF, &IDTVEC(fpu),  SDT_SYSIGT, SEL_KPL, 0);
2733 	setidt_global(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
2734 	setidt_global(IDT_MC, &IDTVEC(mchk),  SDT_SYSIGT, SEL_KPL, 0);
2735 	setidt_global(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);
2736 
2737 	for (cpu = 0; cpu < MAXCPU; ++cpu) {
2738 		r_idt_arr[cpu].rd_limit = sizeof(idt_arr[cpu]) - 1;
2739 		r_idt_arr[cpu].rd_base = (long) &idt_arr[cpu][0];
2740 	}
2741 
2742 	lidt(&r_idt_arr[0]);
2743 
2744 	/*
2745 	 * Initialize the console before we print anything out.
2746 	 */
2747 	cninit();
2748 
2749 #if 0 /* JG */
2750 	if (metadata_missing)
2751 		kprintf("WARNING: loader(8) metadata is missing!\n");
2752 #endif
2753 
2754 #if	NISA >0
2755 	elcr_probe();
2756 	isa_defaultirq();
2757 #endif
2758 	rand_initialize();
2759 
2760 	/*
2761 	 * Initialize IRQ mapping
2762 	 *
2763 	 * NOTE:
2764 	 * SHOULD be after elcr_probe()
2765 	 */
2766 	MachIntrABI_ICU.initmap();
2767 	MachIntrABI_IOAPIC.initmap();
2768 
2769 #ifdef DDB
2770 	kdb_init();
2771 	if (boothowto & RB_KDB)
2772 		Debugger("Boot flags requested debugger");
2773 #endif
2774 
2775 	identify_cpu();		/* Final stage of CPU initialization */
2776 	initializecpu(0);	/* Initialize CPU registers */
2777 
2778 	/*
2779 	 * On modern Intel cpus, Haswell or later, cpu_idle_hlt=1 is better
2780 	 * because the cpu does significant power management in MWAIT
2781 	 * (also suggested is to set sysctl machdep.mwait.CX.idle=AUTODEEP).
2782 	 *
2783 	 * On many AMD cpus cpu_idle_hlt=3 is better, because the cpu does
2784 	 * significant power management only when using ACPI halt mode.
2785 	 * (However, on Ryzen, mode 4 (HLT) also does power management).
2786 	 *
2787 	 * On older AMD or Intel cpus, cpu_idle_hlt=2 is better because ACPI
2788 	 * is needed to reduce power consumption, but wakeup times are often
2789 	 * too long.
2790 	 */
2791 	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
2792 	    CPUID_TO_MODEL(cpu_id) >= 0x3C) {	/* Haswell or later */
2793 		cpu_idle_hlt = 1;
2794 	}
2795 	if (cpu_vendor_id == CPU_VENDOR_AMD) {
2796 		if (CPUID_TO_FAMILY(cpu_id) >= 0x17) {
2797 			/* Ryzen or later */
2798 			cpu_idle_hlt = 3;
2799 		} else if (CPUID_TO_FAMILY(cpu_id) >= 0x14) {
2800 			/* Bobcat or later */
2801 			cpu_idle_hlt = 3;
2802 		}
2803 	}
2804 
2805 	TUNABLE_INT_FETCH("hw.apic_io_enable", &ioapic_enable); /* for compat */
2806 	TUNABLE_INT_FETCH("hw.ioapic_enable", &ioapic_enable);
2807 	TUNABLE_INT_FETCH("hw.lapic_enable", &lapic_enable);
2808 	TUNABLE_INT_FETCH("machdep.cpu_idle_hlt", &cpu_idle_hlt);
2809 
2810 	/*
2811 	 * Some virtual machines do not work with the I/O APIC
2812 	 * enabled.  If the user does not explicitly enable or
2813 	 * disable the I/O APIC (ioapic_enable < 0), then we
2814 	 * disable the I/O APIC on all virtual machines.
2815 	 *
2816 	 * NOTE:
2817 	 * This must be done after identify_cpu(), which sets
2818 	 * 'cpu_feature2'
2819 	 */
2820 	if (ioapic_enable < 0) {
2821 		if (cpu_feature2 & CPUID2_VMM)
2822 			ioapic_enable = 0;
2823 		else
2824 			ioapic_enable = 1;
2825 	}
2826 
2827 	/*
2828 	 * TSS entry point for interrupts, traps, and exceptions
2829 	 * (sans NMI).  This will always go to near the top of the pcpu
2830 	 * trampoline area.  Hardware-pushed data will be copied into
2831 	 * the trap-frame on entry, and (if necessary) returned to the
2832 	 * trampoline on exit.
2833 	 *
2834 	 * We store some pcb data for the trampoline code above the
2835 	 * stack the cpu hw pushes into, and arrange things so the
2836 	 * address of tr_pcb_rsp is the same as the desired top of
2837 	 * stack.
2838 	 */
2839 	ps->common_tss.tss_rsp0 = (register_t)&ps->trampoline.tr_pcb_rsp;
2840 	ps->trampoline.tr_pcb_rsp = ps->common_tss.tss_rsp0;
2841 	ps->trampoline.tr_pcb_gs_kernel = (register_t)gd;
2842 	ps->trampoline.tr_pcb_cr3 = KPML4phys;	/* adj to user cr3 live */
2843 	ps->dbltramp.tr_pcb_gs_kernel = (register_t)gd;
2844 	ps->dbltramp.tr_pcb_cr3 = KPML4phys;
2845 	ps->dbgtramp.tr_pcb_gs_kernel = (register_t)gd;
2846 	ps->dbgtramp.tr_pcb_cr3 = KPML4phys;
2847 
2848 	/* double fault stack */
2849 	ps->common_tss.tss_ist1 = (register_t)&ps->dbltramp.tr_pcb_rsp;
2850 	/* #DB debugger needs its own stack */
2851 	ps->common_tss.tss_ist2 = (register_t)&ps->dbgtramp.tr_pcb_rsp;
2852 
2853 	/* Set the IO permission bitmap (empty due to tss seg limit) */
2854 	ps->common_tss.tss_iobase = sizeof(struct x86_64tss);
2855 
2856 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
2857 	gd->gd_tss_gdt = &gdt[GPROC0_SEL];
2858 	gd->gd_common_tssd = *gd->gd_tss_gdt;
2859 	ltr(gsel_tss);
2860 
2861 	/* Set up the fast syscall stuff */
2862 	msr = rdmsr(MSR_EFER) | EFER_SCE;
2863 	wrmsr(MSR_EFER, msr);
2864 	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
2865 	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
2866 	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
2867 	      ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
2868 	wrmsr(MSR_STAR, msr);
2869 	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_IOPL|PSL_AC);
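	/*
	 * Illustrative note on the MSR_STAR layout (architectural, not
	 * DragonFly-specific): bits 47:32 (GCODE_SEL here) supply %cs on
	 * SYSCALL with %ss = base + 8, while bits 63:48 (GUCODE32_SEL) seed
	 * SYSRET: base for the 32-bit %cs, base + 8 for %ss, base + 16 for
	 * the 64-bit %cs.  That is why GUDATA_SEL and GUCODE_SEL immediately
	 * follow GUCODE32_SEL in the GDT above.
	 */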
2870 
2871 	getmemsize(kmdp, physfree);
2872 	init_param2(physmem);
2873 
2874 	/* now running on new page tables, configured, and u/iom is accessible */
2875 
2876 	/* Map the message buffer. */
2877 #if 0 /* JG */
2878 	for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
2879 		pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);
2880 #endif
2881 
2882 	msgbufinit(msgbufp, MSGBUF_SIZE);
2883 
2884 
2885 	/* transfer to user mode */
2886 
2887 	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
2888 	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
2889 	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
2890 
2891 	load_ds(_udatasel);
2892 	load_es(_udatasel);
2893 	load_fs(_udatasel);
2894 
2895 	/* setup proc 0's pcb */
2896 	thread0.td_pcb->pcb_flags = 0;
2897 	thread0.td_pcb->pcb_cr3 = KPML4phys;
2898 	thread0.td_pcb->pcb_cr3_iso = 0;
2899 	thread0.td_pcb->pcb_ext = NULL;
2900 	lwp0.lwp_md.md_regs = &proc0_tf;	/* XXX needed? */
2901 
2902 	/* Location of kernel stack for locore */
2903 	return ((u_int64_t)thread0.td_pcb);
2904 }
2905 
2906 /*
2907  * Initialize machine-dependent portions of the global data structure.
2908  * Note that the global data area and cpu0's idlestack in the private
2909  * data space were allocated in locore.
2910  *
2911  * Note: the idlethread's cpl is 0
2912  *
2913  * WARNING!  Called from early boot, 'mycpu' may not work yet.
2914  */
2915 void
2916 cpu_gdinit(struct mdglobaldata *gd, int cpu)
2917 {
2918 	if (cpu)
2919 		gd->mi.gd_curthread = &gd->mi.gd_idlethread;
2920 
2921 	lwkt_init_thread(&gd->mi.gd_idlethread,
2922 			gd->mi.gd_prvspace->idlestack,
2923 			sizeof(gd->mi.gd_prvspace->idlestack),
2924 			0, &gd->mi);
2925 	lwkt_set_comm(&gd->mi.gd_idlethread, "idle_%d", cpu);
2926 	gd->mi.gd_idlethread.td_switch = cpu_lwkt_switch;
2927 	gd->mi.gd_idlethread.td_sp -= sizeof(void *);
2928 	*(void **)gd->mi.gd_idlethread.td_sp = cpu_idle_restore;
2929 }
2930 
2931 /*
2932  * We only have to check the DMAP bounds; the globaldata space is
2933  * actually part of the kernel_map, so we don't have to waste time
2934  * checking CPU_prvspace[*].
2935  */
2936 int
2937 is_globaldata_space(vm_offset_t saddr, vm_offset_t eaddr)
2938 {
2939 #if 0
2940 	if (saddr >= (vm_offset_t)&CPU_prvspace[0] &&
2941 	    eaddr <= (vm_offset_t)&CPU_prvspace[MAXCPU]) {
2942 		return (TRUE);
2943 	}
2944 #endif
2945 	if (saddr >= DMAP_MIN_ADDRESS && eaddr <= DMAP_MAX_ADDRESS)
2946 		return (TRUE);
2947 	return (FALSE);
2948 }
2949 
2950 struct globaldata *
2951 globaldata_find(int cpu)
2952 {
2953 	KKASSERT(cpu >= 0 && cpu < ncpus);
2954 	return(&CPU_prvspace[cpu]->mdglobaldata.mi);
2955 }
2956 
2957 /*
2958  * This path should be safe from the SYSRET issue because only stopped threads
2959  * can have their %rip adjusted this way (and all heavy-weight thread switches
2960  * clear QUICKREF and thus do not use SYSRET).  However, the code path is
2961  * convoluted, so add a safety net by forcing %rip to be canonical.
2962  */
2963 int
2964 ptrace_set_pc(struct lwp *lp, unsigned long addr)
2965 {
2966 	if (addr & 0x0000800000000000LLU)
2967 		lp->lwp_md.md_regs->tf_rip = addr | 0xFFFF000000000000LLU;
2968 	else
2969 		lp->lwp_md.md_regs->tf_rip = addr & 0x0000FFFFFFFFFFFFLLU;
2970 	return (0);
2971 }
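/*
 * Worked example (hypothetical addresses): with bit 47 set,
 * 0x0000800000001000 is forced to the canonical 0xffff800000001000, while
 * a user address such as 0x00007fffffffe000 passes through unchanged.
 */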
2972 
2973 int
2974 ptrace_single_step(struct lwp *lp)
2975 {
2976 	lp->lwp_md.md_regs->tf_rflags |= PSL_T;
2977 	return (0);
2978 }
2979 
2980 int
2981 fill_regs(struct lwp *lp, struct reg *regs)
2982 {
2983 	struct trapframe *tp;
2984 
2985 	if ((tp = lp->lwp_md.md_regs) == NULL)
2986 		return EINVAL;
2987 	bcopy(&tp->tf_rdi, &regs->r_rdi, sizeof(*regs));
2988 	return (0);
2989 }
2990 
2991 int
2992 set_regs(struct lwp *lp, struct reg *regs)
2993 {
2994 	struct trapframe *tp;
2995 
2996 	tp = lp->lwp_md.md_regs;
2997 	if (!EFL_SECURE(regs->r_rflags, tp->tf_rflags) ||
2998 	    !CS_SECURE(regs->r_cs))
2999 		return (EINVAL);
3000 	bcopy(&regs->r_rdi, &tp->tf_rdi, sizeof(*regs));
3001 	clear_quickret();
3002 	return (0);
3003 }
3004 
3005 static void
3006 fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)
3007 {
3008 	struct env87 *penv_87 = &sv_87->sv_env;
3009 	struct envxmm *penv_xmm = &sv_xmm->sv_env;
3010 	int i;
3011 
3012 	/* FPU control/status */
3013 	penv_87->en_cw = penv_xmm->en_cw;
3014 	penv_87->en_sw = penv_xmm->en_sw;
3015 	penv_87->en_tw = penv_xmm->en_tw;
3016 	penv_87->en_fip = penv_xmm->en_fip;
3017 	penv_87->en_fcs = penv_xmm->en_fcs;
3018 	penv_87->en_opcode = penv_xmm->en_opcode;
3019 	penv_87->en_foo = penv_xmm->en_foo;
3020 	penv_87->en_fos = penv_xmm->en_fos;
3021 
3022 	/* FPU registers */
3023 	for (i = 0; i < 8; ++i)
3024 		sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
3025 }
3026 
3027 static void
3028 set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm)
3029 {
3030 	struct env87 *penv_87 = &sv_87->sv_env;
3031 	struct envxmm *penv_xmm = &sv_xmm->sv_env;
3032 	int i;
3033 
3034 	/* FPU control/status */
3035 	penv_xmm->en_cw = penv_87->en_cw;
3036 	penv_xmm->en_sw = penv_87->en_sw;
3037 	penv_xmm->en_tw = penv_87->en_tw;
3038 	penv_xmm->en_fip = penv_87->en_fip;
3039 	penv_xmm->en_fcs = penv_87->en_fcs;
3040 	penv_xmm->en_opcode = penv_87->en_opcode;
3041 	penv_xmm->en_foo = penv_87->en_foo;
3042 	penv_xmm->en_fos = penv_87->en_fos;
3043 
3044 	/* FPU registers */
3045 	for (i = 0; i < 8; ++i)
3046 		sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
3047 }
3048 
3049 int
3050 fill_fpregs(struct lwp *lp, struct fpreg *fpregs)
3051 {
3052 	if (lp->lwp_thread == NULL || lp->lwp_thread->td_pcb == NULL)
3053 		return EINVAL;
3054 	if (cpu_fxsr) {
3055 		fill_fpregs_xmm(&lp->lwp_thread->td_pcb->pcb_save.sv_xmm,
3056 				(struct save87 *)fpregs);
3057 		return (0);
3058 	}
3059 	bcopy(&lp->lwp_thread->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs);
3060 	return (0);
3061 }
3062 
3063 int
3064 set_fpregs(struct lwp *lp, struct fpreg *fpregs)
3065 {
3066 	if (cpu_fxsr) {
3067 		set_fpregs_xmm((struct save87 *)fpregs,
3068 			       &lp->lwp_thread->td_pcb->pcb_save.sv_xmm);
3069 		return (0);
3070 	}
3071 	bcopy(fpregs, &lp->lwp_thread->td_pcb->pcb_save.sv_87, sizeof *fpregs);
3072 	return (0);
3073 }
3074 
3075 int
3076 fill_dbregs(struct lwp *lp, struct dbreg *dbregs)
3077 {
3078 	struct pcb *pcb;
3079 
3080         if (lp == NULL) {
3081                 dbregs->dr[0] = rdr0();
3082                 dbregs->dr[1] = rdr1();
3083                 dbregs->dr[2] = rdr2();
3084                 dbregs->dr[3] = rdr3();
3085                 dbregs->dr[4] = rdr4();
3086                 dbregs->dr[5] = rdr5();
3087                 dbregs->dr[6] = rdr6();
3088                 dbregs->dr[7] = rdr7();
3089 		return (0);
3090         }
3091 	if (lp->lwp_thread == NULL || (pcb = lp->lwp_thread->td_pcb) == NULL)
3092 		return EINVAL;
3093 	dbregs->dr[0] = pcb->pcb_dr0;
3094 	dbregs->dr[1] = pcb->pcb_dr1;
3095 	dbregs->dr[2] = pcb->pcb_dr2;
3096 	dbregs->dr[3] = pcb->pcb_dr3;
3097 	dbregs->dr[4] = 0;
3098 	dbregs->dr[5] = 0;
3099 	dbregs->dr[6] = pcb->pcb_dr6;
3100 	dbregs->dr[7] = pcb->pcb_dr7;
3101 	return (0);
3102 }
3103 
3104 int
3105 set_dbregs(struct lwp *lp, struct dbreg *dbregs)
3106 {
3107 	if (lp == NULL) {
3108 		load_dr0(dbregs->dr[0]);
3109 		load_dr1(dbregs->dr[1]);
3110 		load_dr2(dbregs->dr[2]);
3111 		load_dr3(dbregs->dr[3]);
3112 		load_dr4(dbregs->dr[4]);
3113 		load_dr5(dbregs->dr[5]);
3114 		load_dr6(dbregs->dr[6]);
3115 		load_dr7(dbregs->dr[7]);
3116 	} else {
3117 		struct pcb *pcb;
3118 		struct ucred *ucred;
3119 		int i;
3120 		uint64_t mask1, mask2;
3121 
3122 		/*
3123 		 * Don't let an illegal value for dr7 get set.	Specifically,
3124 		 * check for undefined settings.  Setting these bit patterns
3125 		 * results in undefined behaviour and can lead to an unexpected
3126 		 * TRCTRAP.
3127 		 */
3128 		/* JG this loop looks unreadable */
3129 		/*
3130 		 * Check the four 2-bit R/Wi fields (i = 0..3) for invalid
3131 		 * patterns.  Is 10 in LENi allowed when running in
3132 		 * compatibility mode?
3133 		 * Pattern 10 in R/Wi might be used to indicate a breakpoint
3134 		 * on I/O.  Further analysis should be carried out to decide
3135 		 * if it is safe and useful to provide access to that
3136 		 * capability.
3137 		 */
3138 		for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 4;
3139 		     i++, mask1 <<= 4, mask2 <<= 4)
3140 			if ((dbregs->dr[7] & mask1) == mask2)
3141 				return (EINVAL);
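		/*
		 * Illustrative check (hypothetical dr7 value): the R/Wi
		 * fields sit at bits 16-17, 20-21, 24-25 and 28-29, so a
		 * dr7 of 0x00020001 (R/W0 == 10, L0 enabled) is rejected
		 * here with EINVAL.
		 */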
3142 
3143 		pcb = lp->lwp_thread->td_pcb;
3144 		ucred = lp->lwp_proc->p_ucred;
3145 
3146 		/*
3147 		 * Don't let a process set a breakpoint that is not within the
3148 		 * process's address space.  If a process could do this, it
3149 		 * could halt the system by setting a breakpoint in the kernel
3150 		 * (if ddb was enabled).  Thus, we need to check to make sure
3151 		 * that no breakpoints are being enabled for addresses outside
3152 		 * process's address space, unless, perhaps, we were called by
3153 		 * uid 0.
3154 		 *
3155 		 * XXX - what about when the watched area of the user's
3156 		 * address space is written into from within the kernel
3157 		 * ... wouldn't that still cause a breakpoint to be generated
3158 		 * from within kernel mode?
3159 		 */
3160 
3161 		if (priv_check_cred(ucred, PRIV_ROOT, 0) != 0) {
3162 			if (dbregs->dr[7] & 0x3) {
3163 				/* dr0 is enabled */
3164 				if (dbregs->dr[0] >= VM_MAX_USER_ADDRESS)
3165 					return (EINVAL);
3166 			}
3167 
3168 			if (dbregs->dr[7] & (0x3<<2)) {
3169 				/* dr1 is enabled */
3170 				if (dbregs->dr[1] >= VM_MAX_USER_ADDRESS)
3171 					return (EINVAL);
3172 			}
3173 
3174 			if (dbregs->dr[7] & (0x3<<4)) {
3175 				/* dr2 is enabled */
3176 				if (dbregs->dr[2] >= VM_MAX_USER_ADDRESS)
3177 					return (EINVAL);
3178 			}
3179 
3180 			if (dbregs->dr[7] & (0x3<<6)) {
3181 				/* dr3 is enabled */
3182 				if (dbregs->dr[3] >= VM_MAX_USER_ADDRESS)
3183 					return (EINVAL);
3184 			}
3185 		}
3186 
3187 		pcb->pcb_dr0 = dbregs->dr[0];
3188 		pcb->pcb_dr1 = dbregs->dr[1];
3189 		pcb->pcb_dr2 = dbregs->dr[2];
3190 		pcb->pcb_dr3 = dbregs->dr[3];
3191 		pcb->pcb_dr6 = dbregs->dr[6];
3192 		pcb->pcb_dr7 = dbregs->dr[7];
3193 
3194 		pcb->pcb_flags |= PCB_DBREGS;
3195 	}
3196 
3197 	return (0);
3198 }
3199 
3200 /*
3201  * Return > 0 if a hardware breakpoint has been hit, and the
3202  * breakpoint was in user space.  Return 0, otherwise.
3203  */
3204 int
3205 user_dbreg_trap(void)
3206 {
3207         u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */
3208         u_int64_t bp;       /* breakpoint bits extracted from dr6 */
3209         int nbp;            /* number of breakpoints that triggered */
3210         caddr_t addr[4];    /* breakpoint addresses */
3211         int i;
3212 
3213         dr7 = rdr7();
3214         if ((dr7 & 0xff) == 0) {
3215                 /*
3216                  * All of the Ln/Gn breakpoint-enable bits in the dr7
3217                  * register are zero, thus the trap couldn't have been
3218                  * caused by the hardware debug registers.
3219                  */
3220                 return 0;
3221         }
3222 
3223         nbp = 0;
3224         dr6 = rdr6();
3225         bp = dr6 & 0xf;
3226 
3227         if (bp == 0) {
3228                 /*
3229                  * None of the breakpoint bits are set, meaning this
3230                  * trap was not caused by any of the debug registers.
3231                  */
3232                 return 0;
3233         }
3234 
3235         /*
3236          * At least one of the breakpoints was hit; check to see
3237          * which ones and whether any of them are user space addresses.
3238          */
3239 
3240         if (bp & 0x01) {
3241                 addr[nbp++] = (caddr_t)rdr0();
3242         }
3243         if (bp & 0x02) {
3244                 addr[nbp++] = (caddr_t)rdr1();
3245         }
3246         if (bp & 0x04) {
3247                 addr[nbp++] = (caddr_t)rdr2();
3248         }
3249         if (bp & 0x08) {
3250                 addr[nbp++] = (caddr_t)rdr3();
3251         }
3252 
3253         for (i = 0; i < nbp; i++) {
3254                 if (addr[i] < (caddr_t)VM_MAX_USER_ADDRESS) {
3255                         /*
3256                          * addr[i] is in user space
3257                          */
3258                         return nbp;
3259                 }
3260         }
3261 
3262         /*
3263          * None of the breakpoints are in user space.
3264          */
3265         return 0;
3266 }
3267 
3268 
3269 #ifndef DDB
3270 void
3271 Debugger(const char *msg)
3272 {
3273 	kprintf("Debugger(\"%s\") called.\n", msg);
3274 }
3275 #endif /* no DDB */
3276 
3277 #ifdef DDB
3278 
3279 /*
3280  * Provide inb() and outb() as functions.  They are normally only
3281  * available as macros calling inlined functions, thus cannot be
3282  * called inside DDB.
3283  *
3284  * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
3285  */
3286 
3287 #undef inb
3288 #undef outb
3289 
3290 /* silence compiler warnings */
3291 u_char inb(u_int);
3292 void outb(u_int, u_char);
3293 
3294 u_char
3295 inb(u_int port)
3296 {
3297 	u_char	data;
3298 	/*
3299 	 * We use %%dx and not %1 here because i/o is done at %dx and not at
3300 	 * %edx, while gcc generates inferior code (movw instead of movl)
3301 	 * if we tell it to load (u_short) port.
3302 	 */
3303 	__asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port));
3304 	return (data);
3305 }
3306 
3307 void
3308 outb(u_int port, u_char data)
3309 {
3310 	u_char	al;
3311 	/*
3312 	 * Use an unnecessary assignment to help gcc's register allocator.
3313 	 * This makes a large difference for gcc-1.40 and a tiny difference
3314 	 * for gcc-2.6.0.  For gcc-1.40, al had to be ``asm("ax")'' for
3315 	 * best results.  gcc-2.6.0 can't handle this.
3316 	 */
3317 	al = data;
3318 	__asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port));
3319 }
3320 
3321 #endif /* DDB */
3322 
3323 
3324 
3325 /*
3326  * initialize all the SMP locks
3327  */
3328 
3329 /* critical region when masking or unmasking interrupts */
3330 struct spinlock_deprecated imen_spinlock;
3331 
3332 /* locks com (tty) data/hardware accesses: a FASTINTR() */
3333 struct spinlock_deprecated com_spinlock;
3334 
3335 /* lock regions around the clock hardware */
3336 struct spinlock_deprecated clock_spinlock;
3337 
3338 static void
3339 init_locks(void)
3340 {
3341 	/*
3342 	 * Get the initial mplock with a count of 1 for the BSP.
3343 	 * This uses a LOGICAL cpu ID, ie BSP == 0.
3344 	 */
3345 	cpu_get_initial_mplock();
3346 	/* DEPRECATED */
3347 	spin_init_deprecated(&imen_spinlock);
3348 	spin_init_deprecated(&com_spinlock);
3349 	spin_init_deprecated(&clock_spinlock);
3350 
3351 	/* our token pool needs to work early */
3352 	lwkt_token_pool_init();
3353 }
3354 
3355 boolean_t
3356 cpu_mwait_hint_valid(uint32_t hint)
3357 {
3358 	int cx_idx, sub;
3359 
3360 	cx_idx = MWAIT_EAX_TO_CX(hint);
3361 	if (cx_idx >= CPU_MWAIT_CX_MAX)
3362 		return FALSE;
3363 
3364 	sub = MWAIT_EAX_TO_CX_SUB(hint);
3365 	if (sub >= cpu_mwait_cx_info[cx_idx].subcnt)
3366 		return FALSE;
3367 
3368 	return TRUE;
3369 }
3370 
3371 void
3372 cpu_mwait_cx_no_bmsts(void)
3373 {
3374 	atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_STS);
3375 }
3376 
3377 void
3378 cpu_mwait_cx_no_bmarb(void)
3379 {
3380 	atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_ARB);
3381 }
3382 
3383 static int
3384 cpu_mwait_cx_hint2name(int hint, char *name, int namelen, boolean_t allow_auto)
3385 {
3386 	int old_cx_idx, sub = 0;
3387 
3388 	if (hint >= 0) {
3389 		old_cx_idx = MWAIT_EAX_TO_CX(hint);
3390 		sub = MWAIT_EAX_TO_CX_SUB(hint);
3391 	} else if (hint == CPU_MWAIT_HINT_AUTO) {
3392 		old_cx_idx = allow_auto ? CPU_MWAIT_C2 : CPU_MWAIT_CX_MAX;
3393 	} else if (hint == CPU_MWAIT_HINT_AUTODEEP) {
3394 		old_cx_idx = allow_auto ? CPU_MWAIT_C3 : CPU_MWAIT_CX_MAX;
3395 	} else {
3396 		old_cx_idx = CPU_MWAIT_CX_MAX;
3397 	}
3398 
3399 	if (!CPU_MWAIT_HAS_CX)
3400 		strlcpy(name, "NONE", namelen);
3401 	else if (allow_auto && hint == CPU_MWAIT_HINT_AUTO)
3402 		strlcpy(name, "AUTO", namelen);
3403 	else if (allow_auto && hint == CPU_MWAIT_HINT_AUTODEEP)
3404 		strlcpy(name, "AUTODEEP", namelen);
3405 	else if (old_cx_idx >= CPU_MWAIT_CX_MAX ||
3406 	    sub >= cpu_mwait_cx_info[old_cx_idx].subcnt)
3407 		strlcpy(name, "INVALID", namelen);
3408 	else
3409 		ksnprintf(name, namelen, "C%d/%d", old_cx_idx, sub);
3410 
3411 	return old_cx_idx;
3412 }
3413 
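/*
 * Parse a Cx state name of the form "Cn/m" (e.g. "C1/0"), or "AUTO" /
 * "AUTODEEP" when allow_auto is set, into an MWAIT hint.  Returns the
 * Cx index on success or -1 on error.
 */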
3414 static int
3415 cpu_mwait_cx_name2hint(char *name, int *hint0, boolean_t allow_auto)
3416 {
3417 	int cx_idx, sub, hint;
3418 	char *ptr, *start;
3419 
3420 	if (allow_auto && strcmp(name, "AUTO") == 0) {
3421 		hint = CPU_MWAIT_HINT_AUTO;
3422 		cx_idx = CPU_MWAIT_C2;
3423 		goto done;
3424 	}
3425 	if (allow_auto && strcmp(name, "AUTODEEP") == 0) {
3426 		hint = CPU_MWAIT_HINT_AUTODEEP;
3427 		cx_idx = CPU_MWAIT_C3;
3428 		goto done;
3429 	}
3430 
3431 	if (strlen(name) < 4 || toupper(name[0]) != 'C')
3432 		return -1;
3433 	start = &name[1];
3434 	ptr = NULL;
3435 
3436 	cx_idx = strtol(start, &ptr, 10);
3437 	if (ptr == start || *ptr != '/')
3438 		return -1;
3439 	if (cx_idx < 0 || cx_idx >= CPU_MWAIT_CX_MAX)
3440 		return -1;
3441 
3442 	start = ptr + 1;
3443 	ptr = NULL;
3444 
3445 	sub = strtol(start, &ptr, 10);
3446 	if (*ptr != '\0')
3447 		return -1;
3448 	if (sub < 0 || sub >= cpu_mwait_cx_info[cx_idx].subcnt)
3449 		return -1;
3450 
3451 	hint = MWAIT_EAX_HINT(cx_idx, sub);
3452 done:
3453 	*hint0 = hint;
3454 	return cx_idx;
3455 }
3456 
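/*
 * Handle the side effects of moving between a shallow (< C3) and a deep
 * (>= C3) target state.  Deep states require the C3 preamble to have been
 * cleared and an extra power-save request on the cputimer interrupt.
 */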
3457 static int
3458 cpu_mwait_cx_transit(int old_cx_idx, int cx_idx)
3459 {
3460 	if (cx_idx >= CPU_MWAIT_C3 && cpu_mwait_c3_preamble)
3461 		return EOPNOTSUPP;
3462 	if (old_cx_idx < CPU_MWAIT_C3 && cx_idx >= CPU_MWAIT_C3) {
3463 		int error;
3464 
3465 		error = cputimer_intr_powersave_addreq();
3466 		if (error)
3467 			return error;
3468 	} else if (old_cx_idx >= CPU_MWAIT_C3 && cx_idx < CPU_MWAIT_C3) {
3469 		cputimer_intr_powersave_remreq();
3470 	}
3471 	return 0;
3472 }
3473 
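/*
 * Common sysctl backend for Cx selection: report the current hint as a
 * Cx name and, on a write, parse the new name, perform the C3 transition
 * bookkeeping and update *hint0.
 */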
3474 static int
3475 cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS, int *hint0,
3476     boolean_t allow_auto)
3477 {
3478 	int error, cx_idx, old_cx_idx, hint;
3479 	char name[CPU_MWAIT_CX_NAMELEN];
3480 
3481 	hint = *hint0;
3482 	old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name),
3483 	    allow_auto);
3484 
3485 	error = sysctl_handle_string(oidp, name, sizeof(name), req);
3486 	if (error != 0 || req->newptr == NULL)
3487 		return error;
3488 
3489 	if (!CPU_MWAIT_HAS_CX)
3490 		return EOPNOTSUPP;
3491 
3492 	cx_idx = cpu_mwait_cx_name2hint(name, &hint, allow_auto);
3493 	if (cx_idx < 0)
3494 		return EINVAL;
3495 
3496 	error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
3497 	if (error)
3498 		return error;
3499 
3500 	*hint0 = hint;
3501 	return 0;
3502 }
3503 
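/*
 * Set the idle Cx state of a single cpu from a Cx name.  The caller must
 * have verified that the MWAIT Cx extensions are supported.
 */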
3504 static int
3505 cpu_mwait_cx_setname(struct cpu_idle_stat *stat, const char *cx_name)
3506 {
3507 	int error, cx_idx, old_cx_idx, hint;
3508 	char name[CPU_MWAIT_CX_NAMELEN];
3509 
3510 	KASSERT(CPU_MWAIT_HAS_CX, ("cpu does not support mwait CX extension"));
3511 
3512 	hint = stat->hint;
3513 	old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);
3514 
3515 	strlcpy(name, cx_name, sizeof(name));
3516 	cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
3517 	if (cx_idx < 0)
3518 		return EINVAL;
3519 
3520 	error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
3521 	if (error)
3522 		return error;
3523 
3524 	stat->hint = hint;
3525 	return 0;
3526 }
3527 
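/*
 * Machine-wide idle Cx sysctl handler: applies the new Cx name to every
 * cpu's idle configuration and then updates cpu_mwait_halt_global.
 */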
3528 static int
3529 cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS)
3530 {
3531 	int hint = cpu_mwait_halt_global;
3532 	int error, cx_idx, cpu;
3533 	char name[CPU_MWAIT_CX_NAMELEN], cx_name[CPU_MWAIT_CX_NAMELEN];
3534 
3535 	cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);
3536 
3537 	error = sysctl_handle_string(oidp, name, sizeof(name), req);
3538 	if (error != 0 || req->newptr == NULL)
3539 		return error;
3540 
3541 	if (!CPU_MWAIT_HAS_CX)
3542 		return EOPNOTSUPP;
3543 
3544 	/* Save name for later per-cpu CX configuration */
3545 	strlcpy(cx_name, name, sizeof(cx_name));
3546 
3547 	cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
3548 	if (cx_idx < 0)
3549 		return EINVAL;
3550 
3551 	/* Change per-cpu CX configuration */
3552 	for (cpu = 0; cpu < ncpus; ++cpu) {
3553 		error = cpu_mwait_cx_setname(&cpu_idle_stats[cpu], cx_name);
3554 		if (error)
3555 			return error;
3556 	}
3557 
3558 	cpu_mwait_halt_global = hint;
3559 	return 0;
3560 }
3561 
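/*
 * Per-cpu idle Cx sysctl handler.
 */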
3562 static int
3563 cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
3564 {
3565 	struct cpu_idle_stat *stat = arg1;
3566 	int error;
3567 
3568 	error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
3569 	    &stat->hint, TRUE);
3570 	return error;
3571 }
3572 
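/*
 * Sysctl handler for cpu_mwait_spin; AUTO/AUTODEEP are not accepted here.
 */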
3573 static int
3574 cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS)
3575 {
3576 	int error;
3577 
3578 	error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
3579 	    &cpu_mwait_spin, FALSE);
3580 	return error;
3581 }
3582 
3583 /*
3584  * This manual debugging code is called unconditionally from Xtimer
3585  * (the per-cpu timer interrupt), whether the current thread is in a
3586  * critical section or not, and can be useful in tracking down lockups.
3587  *
3588  * NOTE: MANUAL DEBUG CODE
3589  */
3590 #if 0
3591 static int saveticks[SMP_MAXCPU];
3592 static int savecounts[SMP_MAXCPU];
3593 #endif
3594 
3595 void
3596 pcpu_timer_always(struct intrframe *frame)
3597 {
3598 #if 0
3599 	globaldata_t gd = mycpu;
3600 	int cpu = gd->gd_cpuid;
3601 	char buf[64];
3602 	short *gptr;
3603 	int i;
3604 
3605 	if (cpu <= 20) {
3606 		gptr = (short *)0xFFFFFFFF800b8000 + 80 * cpu;
3607 		*gptr = ((*gptr + 1) & 0x00FF) | 0x0700;
3608 		++gptr;
3609 
3610 		ksnprintf(buf, sizeof(buf), " %p %16s %d %16s ",
3611 		    (void *)frame->if_rip, gd->gd_curthread->td_comm, ticks,
3612 		    gd->gd_infomsg);
3613 		for (i = 0; buf[i]; ++i) {
3614 			gptr[i] = 0x0700 | (unsigned char)buf[i];
3615 		}
3616 	}
3617 #if 0
3618 	if (saveticks[gd->gd_cpuid] != ticks) {
3619 		saveticks[gd->gd_cpuid] = ticks;
3620 		savecounts[gd->gd_cpuid] = 0;
3621 	}
3622 	++savecounts[gd->gd_cpuid];
3623 	if (savecounts[gd->gd_cpuid] > 2000 && panicstr == NULL) {
3624 		panic("cpud %d panicing on ticks failure",
3625 			gd->gd_cpuid);
3626 	}
3627 	for (i = 0; i < ncpus; ++i) {
3628 		int delta;
3629 		if (saveticks[i] && panicstr == NULL) {
3630 			delta = saveticks[i] - ticks;
3631 			if (delta < -10 || delta > 10) {
3632 				panic("cpu %d panicing on cpu %d watchdog",
3633 				      gd->gd_cpuid, i);
3634 			}
3635 		}
3636 	}
3637 #endif
3638 #endif
3639 }
3640 
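/*
 * SMAP support: the smap_open/smap_close linker sets collect the addresses
 * of 3-byte nop placeholders compiled into the kernel.  cpu_implement_smap()
 * patches them in place into STAC (0F 01 CB) and CLAC (0F 01 CA)
 * instructions.
 */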
3641 SET_DECLARE(smap_open, char);
3642 SET_DECLARE(smap_close, char);
3643 
3644 static void
3645 cpu_implement_smap(void)
3646 {
3647 	char **scan;
3648 
3649 	for (scan = SET_BEGIN(smap_open);		/* nop -> stac */
3650 	     scan < SET_LIMIT(smap_open); ++scan) {
3651 		(*scan)[0] = 0x0F;
3652 		(*scan)[1] = 0x01;
3653 		(*scan)[2] = 0xCB;
3654 	}
3655 	for (scan = SET_BEGIN(smap_close);		/* nop -> clac */
3656 	     scan < SET_LIMIT(smap_close); ++scan) {
3657 		(*scan)[0] = 0x0F;
3658 		(*scan)[1] = 0x01;
3659 		(*scan)[2] = 0xCA;
3660 	}
3661 }
3662 
3663 /*
3664  * From a hard interrupt: non-zero if the thread is an interrupt thread or interrupts are pending.
3665  */
3666 int
3667 cpu_interrupt_running(struct thread *td)
3668 {
3669 	struct mdglobaldata *gd = mdcpu;
3670 
3671 	if (clock_debug1 > 0) {
3672 		--clock_debug1;
3673 		kprintf("%d %016lx %016lx %016lx\n",
3674 			((td->td_flags & TDF_INTTHREAD) != 0),
3675 			gd->gd_ipending[0],
3676 			gd->gd_ipending[1],
3677 			gd->gd_ipending[2]);
3678 		if (td->td_flags & TDF_CLKTHREAD) {
3679 			kprintf("CLKTD %s PREEMPT %s\n",
3680 				td->td_comm,
3681 				(td->td_preempted ?
3682 				 td->td_preempted->td_comm : ""));
3683 		} else {
3684 			kprintf("NORTD %s\n", td->td_comm);
3685 		}
3686 	}
3687 	if ((td->td_flags & TDF_INTTHREAD) ||
3688 	    gd->gd_ipending[0] ||
3689 	    gd->gd_ipending[1] ||
3690 	    gd->gd_ipending[2]) {
3691 		return 1;
3692 	} else {
3693 		return 0;
3694 	}
3695 }
3696