/*	$OpenBSD: machdep.c,v 1.273 2021/03/11 11:16:55 jsg Exp $	*/
/*	$NetBSD: machdep.c,v 1.3 2003/05/07 22:58:18 fvdl Exp $	*/

/*-
 * Copyright (c) 1996, 1997, 1998, 2000 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace
 * Simulation Facility, NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/user.h>
#include <sys/exec.h>
#include <sys/buf.h>
#include <sys/reboot.h>
#include <sys/conf.h>
#include <sys/msgbuf.h>
#include <sys/mount.h>
#include <sys/extent.h>
#include <sys/core.h>
#include <sys/kcore.h>
#include <sys/syscallargs.h>

#include <dev/cons.h>
#include <stand/boot/bootarg.h>

#include <net/if.h>
#include <uvm/uvm_extern.h>

#include <sys/sysctl.h>

#include <machine/cpu_full.h>
#include <machine/cpufunc.h>
#include <machine/pio.h>
#include <machine/psl.h>
#include <machine/reg.h>
#include <machine/fpu.h>
#include <machine/biosvar.h>
#include <machine/mpbiosvar.h>
#include <machine/kcore.h>
#include <machine/tss.h>

#include <dev/isa/isareg.h>
#include <dev/ic/i8042reg.h>

#ifdef DDB
#include <machine/db_machdep.h>
#include <ddb/db_extern.h>
extern int db_console;
#endif

#include "isa.h"
#include "isadma.h"
#include "ksyms.h"

#include "acpi.h"
#if NACPI > 0
#include <dev/acpi/acpivar.h>
#endif

#include "com.h"
#if NCOM > 0
#include <sys/tty.h>
#include <dev/ic/comvar.h>
#include <dev/ic/comreg.h>
#endif

#include "softraid.h"
#if NSOFTRAID > 0
#include <dev/softraidvar.h>
#endif

#ifdef HIBERNATE
#include <machine/hibernate_var.h>
#endif /* HIBERNATE */

#include "ukbd.h"
#include "pckbc.h"
#if NPCKBC > 0 && NUKBD > 0
#include <dev/ic/pckbcvar.h>
#endif

/* #define MACHDEP_DEBUG */

#ifdef MACHDEP_DEBUG
#define DPRINTF(x...)	do { printf(x); } while(0)
#else
#define DPRINTF(x...)
#endif /* MACHDEP_DEBUG */

/* the following is used externally (sysctl_hw) */
char machine[] = MACHINE;

/*
 * switchto vectors
 */
void (*cpu_idle_cycle_fcn)(void) = NULL;

/* the following is used externally for concurrent handlers */
int setperf_prio = 0;

#ifdef CPURESET_DELAY
int	cpureset_delay = CPURESET_DELAY;
#else
int	cpureset_delay = 0;
#endif

int	physmem;
u_int64_t	dumpmem_low;
u_int64_t	dumpmem_high;
extern int	boothowto;
int	cpu_class;

paddr_t	dumpmem_paddr;
vaddr_t	dumpmem_vaddr;
psize_t	dumpmem_sz;

vaddr_t kern_end;

vaddr_t	msgbuf_vaddr;
paddr_t msgbuf_paddr;

vaddr_t	idt_vaddr;
paddr_t	idt_paddr;

vaddr_t lo32_vaddr;
paddr_t lo32_paddr;
paddr_t tramp_pdirpa;

int kbd_reset;
int lid_action = 1;
int pwr_action = 1;
int forceukbd;

/*
 * safepri is a safe priority for sleep to set for a spin-wait
 * during autoconfiguration or after a panic.
 */
int	safepri = 0;

struct vm_map *exec_map = NULL;
struct vm_map *phys_map = NULL;

/* UVM constraint ranges. */
struct uvm_constraint_range  isa_constraint = { 0x0, 0x00ffffffUL };
struct uvm_constraint_range  dma_constraint = { 0x0, 0xffffffffUL };
struct uvm_constraint_range *uvm_md_constraints[] = {
    &isa_constraint,
    &dma_constraint,
    NULL,
};

paddr_t avail_start;
paddr_t avail_end;

void (*delay_func)(int) = i8254_delay;
void (*initclock_func)(void) = i8254_initclocks;

/*
 * Format of boot information passed to us by 32-bit /boot
 */
typedef struct _boot_args32 {
	int	ba_type;
	int	ba_size;
	int	ba_nextX;	/* a ptr in 32-bit world, but not here */
	char	ba_arg[1];
} bootarg32_t;
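
/*
 * A sketch of how this list is consumed (illustrative only, not
 * compiled code): records are packed back to back and walked by
 * advancing ba_size bytes per record until ba_type == BOOTARG_END,
 * which is what getbootinfo() below does:
 *
 *	for (q = (bootarg32_t *)bootinfo; q->ba_type != BOOTARG_END;
 *	    q = (bootarg32_t *)((char *)q + q->ba_size))
 *		... dispatch on q->ba_type, payload at q->ba_arg ...
 */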

#define BOOTARGC_MAX	NBPG	/* one page */

bios_bootmac_t *bios_bootmac;

/* locore copies the arguments from /boot to here for us */
char bootinfo[BOOTARGC_MAX];
int bootinfo_size = BOOTARGC_MAX;

void getbootinfo(char *, int);

/* Data passed to us by /boot, filled in by getbootinfo() */
bios_diskinfo_t	*bios_diskinfo;
bios_memmap_t	*bios_memmap;
u_int32_t	bios_cksumlen;
bios_efiinfo_t	*bios_efiinfo;
bios_ucode_t	*bios_ucode;

/*
 * Size of memory segments, before any memory is stolen.
 */
phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX];
int	mem_cluster_cnt;

int	cpu_dump(void);
int	cpu_dumpsize(void);
u_long	cpu_dump_mempagecnt(void);
void	dumpsys(void);
void	cpu_init_extents(void);
void	map_tramps(void);
void	init_x86_64(paddr_t);
void	(*cpuresetfn)(void);
void	enter_shared_special_pages(void);

#ifdef APERTURE
int allowaperture = 0;
#endif

/*
 * Machine-dependent startup code
 */
void
cpu_startup(void)
{
	vaddr_t minaddr, maxaddr;

	msgbuf_vaddr = PMAP_DIRECT_MAP(msgbuf_paddr);
	initmsgbuf((caddr_t)msgbuf_vaddr, round_page(MSGBUFSIZE));

	printf("%s", version);
	startclocks();
	rtcinit();

	printf("real mem = %lu (%luMB)\n", ptoa((psize_t)physmem),
	    ptoa((psize_t)physmem)/1024/1024);

	/*
	 * Allocate a submap for exec arguments.  This map effectively
	 * limits the number of processes exec'ing at any time.
	 */
	minaddr = vm_map_min(kernel_map);
	exec_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
				   16*NCARGS, VM_MAP_PAGEABLE, FALSE, NULL);

	/*
	 * Allocate a submap for physio
	 */
	minaddr = vm_map_min(kernel_map);
	phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
				   VM_PHYS_SIZE, 0, FALSE, NULL);

	printf("avail mem = %lu (%luMB)\n", ptoa((psize_t)uvmexp.free),
	    ptoa((psize_t)uvmexp.free)/1024/1024);

	bufinit();

	if (boothowto & RB_CONFIG) {
#ifdef BOOT_CONFIG
		user_config();
#else
		printf("kernel does not support -c; continuing..\n");
#endif
	}

	/* Safe for i/o port / memory space allocation to use malloc now. */
	x86_bus_space_mallocok();

#ifndef SMALL_KERNEL
	cpu_ucode_setup();
	cpu_ucode_apply(&cpu_info_primary);
#endif
	cpu_tsx_disable(&cpu_info_primary);

	/* enter the IDT and trampoline code in the u-k maps */
	enter_shared_special_pages();

	/* initialize CPU0's TSS and GDT and put them in the u-k maps */
	cpu_enter_pages(&cpu_info_full_primary);
}

/*
 * enter_shared_special_pages
 *
 * Requests mapping of various special pages required in the Intel Meltdown
 * case (to be entered into the U-K page table):
 *
 *  1 IDT page
 *  Various number of pages covering the U-K ".kutext" section. This section
 *   contains code needed during trampoline operation
 *  Various number of pages covering the U-K ".kudata" section. This section
 *   contains data accessed by the trampoline, before switching to U+K
 *   (for example, various shared global variables used by IPIs, etc)
 *
 * The linker script places the required symbols in the sections above.
 *
 * On CPUs not affected by Meltdown, the calls to pmap_enter_special below
 * become no-ops.
 */
void
enter_shared_special_pages(void)
{
	extern char __kutext_start[], __kutext_end[], __kernel_kutext_phys[];
	extern char __text_page_start[], __text_page_end[];
	extern char __kernel_kutext_page_phys[];
	extern char __kudata_start[], __kudata_end[], __kernel_kudata_phys[];
	vaddr_t va;
	paddr_t pa;

	/* idt */
	pmap_enter_special(idt_vaddr, idt_paddr, PROT_READ);
	DPRINTF("%s: entered idt page va 0x%llx pa 0x%llx\n", __func__,
	    (uint64_t)idt_vaddr, (uint64_t)idt_paddr);

	/* .kutext section */
	va = (vaddr_t)__kutext_start;
	pa = (paddr_t)__kernel_kutext_phys;
	while (va < (vaddr_t)__kutext_end) {
		pmap_enter_special(va, pa, PROT_READ | PROT_EXEC);
		DPRINTF("%s: entered kutext page va 0x%llx pa 0x%llx\n",
		    __func__, (uint64_t)va, (uint64_t)pa);
		va += PAGE_SIZE;
		pa += PAGE_SIZE;
	}

	/* .kutext.page section */
	va = (vaddr_t)__text_page_start;
	pa = (paddr_t)__kernel_kutext_page_phys;
	while (va < (vaddr_t)__text_page_end) {
		pmap_enter_special(va, pa, PROT_READ | PROT_EXEC);
		DPRINTF("%s: entered kutext.page va 0x%llx pa 0x%llx\n",
		    __func__, (uint64_t)va, (uint64_t)pa);
		va += PAGE_SIZE;
		pa += PAGE_SIZE;
	}

	/* .kudata section */
	va = (vaddr_t)__kudata_start;
	pa = (paddr_t)__kernel_kudata_phys;
	while (va < (vaddr_t)__kudata_end) {
		pmap_enter_special(va, pa, PROT_READ | PROT_WRITE);
		DPRINTF("%s: entered kudata page va 0x%llx pa 0x%llx\n",
		    __func__, (uint64_t)va, (uint64_t)pa);
		va += PAGE_SIZE;
		pa += PAGE_SIZE;
	}
}

/*
 * Set up proc0's PCB and the cpu's TSS.
 */
void
x86_64_proc0_tss_ldt_init(void)
{
	struct pcb *pcb;

	cpu_info_primary.ci_curpcb = pcb = &proc0.p_addr->u_pcb;
	pcb->pcb_fsbase = 0;
	pcb->pcb_kstack = (u_int64_t)proc0.p_addr + USPACE - 16;
	proc0.p_md.md_regs = (struct trapframe *)pcb->pcb_kstack - 1;

	ltr(GSYSSEL(GPROC0_SEL, SEL_KPL));
	lldt(0);
}

bios_diskinfo_t *
bios_getdiskinfo(dev_t dev)
{
	bios_diskinfo_t *pdi;

	if (bios_diskinfo == NULL)
		return NULL;

	for (pdi = bios_diskinfo; pdi->bios_number != -1; pdi++) {
		if ((dev & B_MAGICMASK) == B_DEVMAGIC) { /* search by bootdev */
			if (pdi->bsd_dev == dev)
				break;
		} else {
			if (pdi->bios_number == dev)
				break;
		}
	}

	if (pdi->bios_number == -1)
		return NULL;
	else
		return pdi;
}

int
bios_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
    size_t newlen, struct proc *p)
{
	bios_diskinfo_t *pdi;
	extern dev_t bootdev;
	int biosdev;

	/* all sysctl names at this level except diskinfo are terminal */
	if (namelen != 1 && name[0] != BIOS_DISKINFO)
		return (ENOTDIR);	       /* overloaded */

	if (!(bootapiver & BAPIV_VECTOR))
		return EOPNOTSUPP;

	switch (name[0]) {
	case BIOS_DEV:
		if ((pdi = bios_getdiskinfo(bootdev)) == NULL)
			return ENXIO;
		biosdev = pdi->bios_number;
		return sysctl_rdint(oldp, oldlenp, newp, biosdev);
	case BIOS_DISKINFO:
		if (namelen != 2)
			return ENOTDIR;
		if ((pdi = bios_getdiskinfo(name[1])) == NULL)
			return ENXIO;
		return sysctl_rdstruct(oldp, oldlenp, newp, pdi, sizeof(*pdi));
	case BIOS_CKSUMLEN:
		return sysctl_rdint(oldp, oldlenp, newp, bios_cksumlen);
	default:
		return EOPNOTSUPP;
	}
	/* NOTREACHED */
}

extern int tsc_is_invariant;
extern int amd64_has_xcrypt;

const struct sysctl_bounded_args cpuctl_vars[] = {
	{ CPU_LIDACTION, &lid_action, 0, 2 },
	{ CPU_PWRACTION, &pwr_action, 0, 2 },
	{ CPU_CPUID, &cpu_id, 1, 0 },
	{ CPU_CPUFEATURE, &cpu_feature, 1, 0 },
	{ CPU_XCRYPT, &amd64_has_xcrypt, 1, 0 },
	{ CPU_INVARIANTTSC, &tsc_is_invariant, 1, 0 },
};

/*
 * machine dependent system variables.
 */
int
cpu_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
    size_t newlen, struct proc *p)
{
	extern uint64_t tsc_frequency;
	dev_t consdev;
	dev_t dev;

	switch (name[0]) {
	case CPU_CONSDEV:
		if (namelen != 1)
			return (ENOTDIR);		/* overloaded */
		if (cn_tab != NULL)
			consdev = cn_tab->cn_dev;
		else
			consdev = NODEV;
		return (sysctl_rdstruct(oldp, oldlenp, newp, &consdev,
		    sizeof consdev));
	case CPU_CHR2BLK:
		if (namelen != 2)
			return (ENOTDIR);		/* overloaded */
		dev = chrtoblk((dev_t)name[1]);
		return sysctl_rdstruct(oldp, oldlenp, newp, &dev, sizeof(dev));
	case CPU_BIOS:
		return bios_sysctl(name + 1, namelen - 1, oldp, oldlenp,
		    newp, newlen, p);
	case CPU_CPUVENDOR:
		return (sysctl_rdstring(oldp, oldlenp, newp, cpu_vendor));
	case CPU_KBDRESET:
		if (securelevel > 0)
			return (sysctl_rdint(oldp, oldlenp, newp,
			    kbd_reset));
		else
			return (sysctl_int(oldp, oldlenp, newp, newlen,
			    &kbd_reset));
	case CPU_ALLOWAPERTURE:
		if (namelen != 1)
			return (ENOTDIR);		/* overloaded */
#ifdef APERTURE
		if (securelevel > 0)
			return (sysctl_int_lower(oldp, oldlenp, newp, newlen,
			    &allowaperture));
		else
			return (sysctl_int(oldp, oldlenp, newp, newlen,
			    &allowaperture));
#else
		return (sysctl_rdint(oldp, oldlenp, newp, 0));
#endif
#if NPCKBC > 0 && NUKBD > 0
	case CPU_FORCEUKBD:
		{
		int error;

		if (forceukbd)
			return (sysctl_rdint(oldp, oldlenp, newp, forceukbd));

		error = sysctl_int(oldp, oldlenp, newp, newlen, &forceukbd);
		if (forceukbd)
			pckbc_release_console();
		return (error);
		}
#endif
	case CPU_TSCFREQ:
		return (sysctl_rdquad(oldp, oldlenp, newp, tsc_frequency));
	default:
		return (sysctl_bounded_arr(cpuctl_vars, nitems(cpuctl_vars),
		    name, namelen, oldp, oldlenp, newp, newlen));
	}
	/* NOTREACHED */
}

/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode to call routine, followed by
 * syscall to sigreturn routine below.  After sigreturn resets the
 * signal mask, the stack, and the frame pointer, it returns to the
 * user specified pc.
 */
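/*
 * A sketch of the frame built on the user stack below (addresses grow
 * down; sizes are rounded up to 16 bytes and fpu_save_len is
 * CPU-dependent):
 *
 *	original rsp minus a 128-byte red zone (or the sigaltstack top)
 *	FPU save area (fpu_save_len bytes)	<- ksc.sc_fpstate
 *	siginfo_t, if SA_SIGINFO was requested	<- sip, handler arg 2
 *	struct sigcontext			<- scp, the new tf_rsp
 */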
int
sendsig(sig_t catcher, int sig, sigset_t mask, const siginfo_t *ksip)
{
	struct proc *p = curproc;
	struct trapframe *tf = p->p_md.md_regs;
	struct sigacts *psp = p->p_p->ps_sigacts;
	struct sigcontext ksc;
	struct savefpu *sfp = &p->p_addr->u_pcb.pcb_savefpu;
	register_t sp, scp, sip;
	u_long sss;

	memset(&ksc, 0, sizeof ksc);
	ksc.sc_rdi = tf->tf_rdi;
	ksc.sc_rsi = tf->tf_rsi;
	ksc.sc_rdx = tf->tf_rdx;
	ksc.sc_rcx = tf->tf_rcx;
	ksc.sc_r8  = tf->tf_r8;
	ksc.sc_r9  = tf->tf_r9;
	ksc.sc_r10 = tf->tf_r10;
	ksc.sc_r11 = tf->tf_r11;
	ksc.sc_r12 = tf->tf_r12;
	ksc.sc_r13 = tf->tf_r13;
	ksc.sc_r14 = tf->tf_r14;
	ksc.sc_r15 = tf->tf_r15;
	ksc.sc_rbx = tf->tf_rbx;
	ksc.sc_rax = tf->tf_rax;
	ksc.sc_rbp = tf->tf_rbp;
	ksc.sc_rip = tf->tf_rip;
	ksc.sc_cs  = tf->tf_cs;
	ksc.sc_rflags = tf->tf_rflags;
	ksc.sc_rsp = tf->tf_rsp;
	ksc.sc_ss  = tf->tf_ss;
	ksc.sc_mask = mask;

	/* Allocate space for the signal handler context. */
	if ((p->p_sigstk.ss_flags & SS_DISABLE) == 0 &&
	    !sigonstack(tf->tf_rsp) && (psp->ps_sigonstack & sigmask(sig)))
		sp = trunc_page((vaddr_t)p->p_sigstk.ss_sp + p->p_sigstk.ss_size);
	else
		sp = tf->tf_rsp - 128;

	sp &= ~15ULL;	/* just in case */
	sss = (sizeof(ksc) + 15) & ~15;

	/* Save FPU state to PCB if necessary, then copy it out */
	if (curcpu()->ci_flags & CPUF_USERXSTATE) {
		curcpu()->ci_flags &= ~CPUF_USERXSTATE;
		fpusavereset(&p->p_addr->u_pcb.pcb_savefpu);
	}
	sp -= fpu_save_len;
	ksc.sc_fpstate = (struct fxsave64 *)sp;
	if (copyout(sfp, (void *)sp, fpu_save_len))
		return 1;

	/* Now reset the FPU state in PCB */
	memcpy(&p->p_addr->u_pcb.pcb_savefpu,
	    &proc0.p_addr->u_pcb.pcb_savefpu, fpu_save_len);

	sip = 0;
	if (psp->ps_siginfo & sigmask(sig)) {
		sip = sp - ((sizeof(*ksip) + 15) & ~15);
		sss += (sizeof(*ksip) + 15) & ~15;

		if (copyout(ksip, (void *)sip, sizeof(*ksip)))
			return 1;
	}
	scp = sp - sss;

	ksc.sc_cookie = (long)scp ^ p->p_p->ps_sigcookie;
	if (copyout(&ksc, (void *)scp, sizeof(ksc)))
		return 1;

	/*
	 * Build context to run handler in.
	 */
	tf->tf_rax = (u_int64_t)catcher;
	tf->tf_rdi = sig;
	tf->tf_rsi = sip;
	tf->tf_rdx = scp;

	tf->tf_rip = (u_int64_t)p->p_p->ps_sigcode;
	tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
	tf->tf_rflags &= ~(PSL_T|PSL_D|PSL_VM|PSL_AC);
	tf->tf_rsp = scp;
	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);

	/* The reset state _is_ the userspace state for this thread now */
	curcpu()->ci_flags |= CPUF_USERXSTATE;

	return 0;
}

/*
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig. Check carefully to
 * make sure that the user has not modified the
 * psl to gain improper privileges or to cause
 * a machine fault.
 */
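/*
 * The sc_cookie check below ties a context both to this process and to
 * the exact address sendsig() wrote it at: sendsig() stored
 * (long)scp ^ ps_sigcookie, so a context that is forged, replayed from
 * a different address, or copied from another process fails the
 * comparison and the process is killed with SIGILL.
 */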
int
sys_sigreturn(struct proc *p, void *v, register_t *retval)
{
	struct sys_sigreturn_args /* {
		syscallarg(struct sigcontext *) sigcntxp;
	} */ *uap = v;
	struct sigcontext ksc, *scp = SCARG(uap, sigcntxp);
	struct trapframe *tf = p->p_md.md_regs;
	int error;

	if (PROC_PC(p) != p->p_p->ps_sigcoderet) {
		sigexit(p, SIGILL);
		return (EPERM);
	}

	if ((error = copyin((caddr_t)scp, &ksc, sizeof ksc)))
		return (error);

	if (ksc.sc_cookie != ((long)scp ^ p->p_p->ps_sigcookie)) {
		sigexit(p, SIGILL);
		return (EFAULT);
	}

	/* Prevent reuse of the sigcontext cookie */
	ksc.sc_cookie = 0;
	(void)copyout(&ksc.sc_cookie, (caddr_t)scp +
	    offsetof(struct sigcontext, sc_cookie), sizeof (ksc.sc_cookie));

	if (((ksc.sc_rflags ^ tf->tf_rflags) & PSL_USERSTATIC) != 0 ||
	    !USERMODE(ksc.sc_cs, ksc.sc_rflags))
		return (EINVAL);

	/* Current state is obsolete; toss it and force a reload */
	if (curcpu()->ci_flags & CPUF_USERXSTATE) {
		curcpu()->ci_flags &= ~CPUF_USERXSTATE;
		fpureset();
	}

	/* Copy in the FPU state to restore */
	if (__predict_true(ksc.sc_fpstate != NULL)) {
		struct fxsave64 *fx = &p->p_addr->u_pcb.pcb_savefpu.fp_fxsave;

		if ((error = copyin(ksc.sc_fpstate, fx, fpu_save_len)))
			return (error);
		fx->fx_mxcsr &= fpu_mxcsr_mask;
	} else {
		/* shouldn't happen, but handle it */
		memcpy(&p->p_addr->u_pcb.pcb_savefpu,
		    &proc0.p_addr->u_pcb.pcb_savefpu, fpu_save_len);
	}

	tf->tf_rdi = ksc.sc_rdi;
	tf->tf_rsi = ksc.sc_rsi;
	tf->tf_rdx = ksc.sc_rdx;
	tf->tf_rcx = ksc.sc_rcx;
	tf->tf_r8  = ksc.sc_r8;
	tf->tf_r9  = ksc.sc_r9;
	tf->tf_r10 = ksc.sc_r10;
	tf->tf_r11 = ksc.sc_r11;
	tf->tf_r12 = ksc.sc_r12;
	tf->tf_r13 = ksc.sc_r13;
	tf->tf_r14 = ksc.sc_r14;
	tf->tf_r15 = ksc.sc_r15;
	tf->tf_rbx = ksc.sc_rbx;
	tf->tf_rax = ksc.sc_rax;
	tf->tf_rbp = ksc.sc_rbp;
	tf->tf_rip = ksc.sc_rip;
	tf->tf_cs  = ksc.sc_cs;
	tf->tf_rflags = ksc.sc_rflags;
	tf->tf_rsp = ksc.sc_rsp;
	tf->tf_ss  = ksc.sc_ss;

	/* Restore signal mask. */
	p->p_sigmask = ksc.sc_mask & ~sigcantmask;

	/*
	 * sigreturn() needs to return to userspace via the 'iretq'
	 * method, so that if the process was interrupted (by tick,
	 * an IPI, whatever) as opposed to already being in the kernel
	 * when a signal was being delivered, the process will be
	 * completely restored, including the userland %rcx and %r11
	 * registers which the 'sysretq' instruction cannot restore.
	 * Also need to make sure we can handle faulting on xrstor.
	 */
	p->p_md.md_flags |= MDP_IRET;

	return (EJUSTRETURN);
}

#ifdef MULTIPROCESSOR
/* force a CPU into the kernel, whether or not it's idle */
void
cpu_kick(struct cpu_info *ci)
{
	/* only need to kick other CPUs */
	if (ci != curcpu()) {
		if (cpu_mwait_size > 0) {
			/*
			 * If not idling, then send an IPI, else
			 * just clear the "keep idling" bit.
			 */
			if ((ci->ci_mwait & MWAIT_IN_IDLE) == 0)
				x86_send_ipi(ci, X86_IPI_NOP);
			else
				atomic_clearbits_int(&ci->ci_mwait,
				    MWAIT_KEEP_IDLING);
		} else {
			/* no mwait, so need an IPI */
			x86_send_ipi(ci, X86_IPI_NOP);
		}
	}
}
#endif
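
/*
 * For reference, a simplified sketch (an assumption about the mwait
 * idle loop, which lives elsewhere) of the other side of the protocol
 * used above: the idle CPU arms MONITOR on its own ci_mwait word, so
 * atomically clearing MWAIT_KEEP_IDLING is enough to wake it without
 * an IPI:
 *
 *	atomic_setbits_int(&ci->ci_mwait,
 *	    MWAIT_IN_IDLE | MWAIT_KEEP_IDLING);
 *	while (ci->ci_mwait & MWAIT_KEEP_IDLING) {
 *		monitor(&ci->ci_mwait, 0, 0);
 *		if (ci->ci_mwait & MWAIT_KEEP_IDLING)
 *			mwait(0, 0);
 *	}
 *	atomic_clearbits_int(&ci->ci_mwait, MWAIT_IN_IDLE);
 */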

/*
 * Notify the current process (p) that it has a signal pending,
 * process as soon as possible.
 */
void
signotify(struct proc *p)
{
	aston(p);
	cpu_kick(p->p_cpu);
}

#ifdef MULTIPROCESSOR
void
cpu_unidle(struct cpu_info *ci)
{
	if (cpu_mwait_size > 0 && (ci->ci_mwait & MWAIT_ONLY)) {
		/*
		 * Just clear the "keep idling" bit; if it wasn't
		 * idling then we didn't need to do anything anyway.
		 */
		atomic_clearbits_int(&ci->ci_mwait, MWAIT_KEEP_IDLING);
		return;
	}

	if (ci != curcpu())
		x86_send_ipi(ci, X86_IPI_NOP);
}
#endif

int	waittime = -1;
struct pcb dumppcb;

__dead void
boot(int howto)
{
	if ((howto & RB_POWERDOWN) != 0)
		lid_action = 0;

	if ((howto & RB_RESET) != 0)
		goto doreset;

	if (cold) {
		if ((howto & RB_USERREQ) == 0)
			howto |= RB_HALT;
		goto haltsys;
	}

	boothowto = howto;
	if ((howto & RB_NOSYNC) == 0 && waittime < 0) {
		waittime = 0;
		vfs_shutdown(curproc);

		if ((howto & RB_TIMEBAD) == 0) {
			resettodr();
		} else {
			printf("WARNING: not updating battery clock\n");
		}
	}
	if_downall();

	uvm_shutdown();
	splhigh();
	cold = 1;

	if ((howto & RB_DUMP) != 0)
		dumpsys();

haltsys:
	config_suspend_all(DVACT_POWERDOWN);

#ifdef MULTIPROCESSOR
	x86_broadcast_ipi(X86_IPI_HALT);
#endif

	if ((howto & RB_HALT) != 0) {
#if NACPI > 0 && !defined(SMALL_KERNEL)
		extern int acpi_enabled;

		if (acpi_enabled) {
			delay(500000);
			if ((howto & RB_POWERDOWN) != 0)
				acpi_powerdown();
		}
#endif
		printf("\n");
		printf("The operating system has halted.\n");
		printf("Please press any key to reboot.\n\n");
		cnpollc(1);	/* for proper keyboard command handling */
		cngetc();
		cnpollc(0);
	}

doreset:
	printf("rebooting...\n");
	if (cpureset_delay > 0)
		delay(cpureset_delay * 1000);
	cpu_reset();
	for (;;)
		continue;
	/* NOTREACHED */
}

/*
 * These variables are needed by /sbin/savecore
 */
u_long	dumpmag = 0x8fca0101;	/* magic number */
int 	dumpsize = 0;		/* pages */
long	dumplo = 0; 		/* blocks */

/*
 * cpu_dump: dump the machine-dependent kernel core dump headers.
 */
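/*
 * The headers all share the first disk block of the dump; the buffer
 * built below is laid out as (with ALIGN() padding between fields):
 *
 *	+----------------------+
 *	| kcore_seg_t          |  CORE_CPU magic, size of what follows
 *	| cpu_kcore_hdr_t      |  ptdpaddr (PML4 PA), nmemsegs
 *	| phys_ram_seg_t[n]    |  start/size of each RAM cluster
 *	| zero padding         |  out to dbtob(1) bytes
 *	+----------------------+
 */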
int
cpu_dump(void)
{
	int (*dump)(dev_t, daddr_t, caddr_t, size_t);
	char buf[dbtob(1)];
	kcore_seg_t *segp;
	cpu_kcore_hdr_t *cpuhdrp;
	phys_ram_seg_t *memsegp;
	caddr_t va;
	int i;

	dump = bdevsw[major(dumpdev)].d_dump;

	memset(buf, 0, sizeof buf);
	segp = (kcore_seg_t *)buf;
	cpuhdrp = (cpu_kcore_hdr_t *)&buf[ALIGN(sizeof(*segp))];
	memsegp = (phys_ram_seg_t *)&buf[ALIGN(sizeof(*segp)) +
	    ALIGN(sizeof(*cpuhdrp))];

	/*
	 * Generate a segment header.
	 */
	CORE_SETMAGIC(*segp, KCORE_MAGIC, MID_MACHINE, CORE_CPU);
	segp->c_size = dbtob(1) - ALIGN(sizeof(*segp));

	/*
	 * Add the machine-dependent header info.
	 */
	cpuhdrp->ptdpaddr = proc0.p_addr->u_pcb.pcb_cr3;
	cpuhdrp->nmemsegs = mem_cluster_cnt;

	/*
	 * Fill in the memory segment descriptors.
	 */
	for (i = 0; i < mem_cluster_cnt; i++) {
		memsegp[i].start = mem_clusters[i].start;
		memsegp[i].size = mem_clusters[i].size & ~PAGE_MASK;
	}

	/*
	 * If we have dump memory then assume the kernel stack is in high
	 * memory and bounce
	 */
	if (dumpmem_vaddr != 0) {
		memcpy((char *)dumpmem_vaddr, buf, sizeof(buf));
		va = (caddr_t)dumpmem_vaddr;
	} else {
		va = (caddr_t)buf;
	}
	return (dump(dumpdev, dumplo, va, dbtob(1)));
}

/*
 * This is called by main to set dumplo and dumpsize.
 * Dumps always skip the first PAGE_SIZE of disk space
 * in case there might be a disk label stored there.
 * If there is extra space, put dump at the end to
 * reduce the chance that swapping trashes it.
 */
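/*
 * Worked example with hypothetical numbers: 4 GB of RAM in one cluster
 * is 1M 4 KB pages, so dumpblks is ctod(1M) disk blocks plus the one
 * header block from cpu_dumpsize(); on a 16M-block (8 GB) swap
 * partition, dumplo lands roughly halfway in, leaving the label area
 * at the front of the partition untouched.
 */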
void
dumpconf(void)
{
	int nblks, dumpblks;	/* size of dump area */

	if (dumpdev == NODEV ||
	    (nblks = (bdevsw[major(dumpdev)].d_psize)(dumpdev)) == 0)
		return;
	if (nblks <= ctod(1))
		return;

	dumpblks = cpu_dumpsize();
	if (dumpblks < 0)
		return;
	dumpblks += ctod(cpu_dump_mempagecnt());

	/* If dump won't fit (incl. room for possible label), punt. */
	if (dumpblks > (nblks - ctod(1)))
		return;

	/* Put dump at end of partition */
	dumplo = nblks - dumpblks;

	/* dumpsize is in page units, and doesn't include headers. */
	dumpsize = cpu_dump_mempagecnt();
}

/*
 * Doadump comes here after turning off memory management and
 * getting on the dump stack, either when called above, or by
 * the auto-restart code.
 */
#define BYTES_PER_DUMP  MAXPHYS /* must be a multiple of pagesize */

void
dumpsys(void)
{
	u_long totalbytesleft, bytes, i, n, memseg;
	u_long maddr;
	daddr_t blkno;
	void *va;
	int (*dump)(dev_t, daddr_t, caddr_t, size_t);
	int error;

	/* Save registers. */
	savectx(&dumppcb);

	if (dumpdev == NODEV)
		return;

	/*
	 * For dumps during autoconfiguration,
	 * if dump device has already configured...
	 */
	if (dumpsize == 0)
		dumpconf();
	if (dumplo <= 0 || dumpsize == 0) {
		printf("\ndump to dev %u,%u not possible\n", major(dumpdev),
		    minor(dumpdev));
		return;
	}
	printf("\ndumping to dev %u,%u offset %ld\n", major(dumpdev),
	    minor(dumpdev), dumplo);

	error = (*bdevsw[major(dumpdev)].d_psize)(dumpdev);
	printf("dump ");
	if (error == -1) {
		printf("area unavailable\n");
		return;
	}

	if ((error = cpu_dump()) != 0)
		goto err;

	totalbytesleft = ptoa(cpu_dump_mempagecnt());
	blkno = dumplo + cpu_dumpsize();
	dump = bdevsw[major(dumpdev)].d_dump;
	error = 0;

	for (memseg = 0; memseg < mem_cluster_cnt; memseg++) {
		maddr = mem_clusters[memseg].start;
		bytes = mem_clusters[memseg].size;

		for (i = 0; i < bytes; i += n, totalbytesleft -= n) {
			/* Print out how many MBs we have left to go. */
			if ((totalbytesleft % (1024*1024)) < BYTES_PER_DUMP)
				printf("%ld ", totalbytesleft / (1024 * 1024));

			/* Limit size for next transfer. */
			n = bytes - i;
			if (n > BYTES_PER_DUMP)
				n = BYTES_PER_DUMP;
			if (maddr > 0xffffffff) {
				va = (void *)dumpmem_vaddr;
				if (n > dumpmem_sz)
					n = dumpmem_sz;
				memcpy(va, (void *)PMAP_DIRECT_MAP(maddr), n);
			} else {
				va = (void *)PMAP_DIRECT_MAP(maddr);
			}

			error = (*dump)(dumpdev, blkno, va, n);
			if (error)
				goto err;
			maddr += n;
			blkno += btodb(n);		/* XXX? */

#if 0	/* XXX this doesn't work.  grr. */
			/* operator aborting dump? */
			if (sget() != NULL) {
				error = EINTR;
				break;
			}
#endif
		}
	}

 err:
	switch (error) {

	case ENXIO:
		printf("device bad\n");
		break;

	case EFAULT:
		printf("device not ready\n");
		break;

	case EINVAL:
		printf("area improper\n");
		break;

	case EIO:
		printf("i/o error\n");
		break;

	case EINTR:
		printf("aborted from console\n");
		break;

	case 0:
		printf("succeeded\n");
		break;

	default:
		printf("error %d\n", error);
		break;
	}
	printf("\n\n");
	delay(5000000);		/* 5 seconds */
}

/*
 * Force the userspace FS.base to be reloaded from the PCB on return from
 * the kernel, and reset the segment registers (%ds, %es, %fs, and %gs)
 * to their expected userspace value.
 */
void
reset_segs(void)
{
	/*
	 * This operates like the cpu_switchto() sequence: if we
	 * haven't reset %[defg]s already, do so now.
	 */
	if (curcpu()->ci_flags & CPUF_USERSEGS) {
		curcpu()->ci_flags &= ~CPUF_USERSEGS;
		__asm volatile(
		    "movw %%ax,%%ds\n\t"
		    "movw %%ax,%%es\n\t"
		    "movw %%ax,%%fs\n\t"
		    "cli\n\t"		/* block intr when on user GS.base */
		    "swapgs\n\t"	/* swap from kernel to user GS.base */
		    "movw %%ax,%%gs\n\t"/* set %gs to UDATA and GS.base to 0 */
		    "swapgs\n\t"	/* back to kernel GS.base */
		    "sti" : : "a"(GSEL(GUDATA_SEL, SEL_UPL)));
	}
}

/*
 * Clear registers on exec
 */
void
setregs(struct proc *p, struct exec_package *pack, u_long stack,
    register_t *retval)
{
	struct trapframe *tf;

	/* Reset FPU state in PCB */
	memcpy(&p->p_addr->u_pcb.pcb_savefpu,
	    &proc0.p_addr->u_pcb.pcb_savefpu, fpu_save_len);

	if (curcpu()->ci_flags & CPUF_USERXSTATE) {
		/* state in CPU is obsolete; reset it */
		fpureset();
	} else {
		/* the reset state _is_ the userspace state now */
		curcpu()->ci_flags |= CPUF_USERXSTATE;
	}

	/* To reset all registers we have to return via iretq */
	p->p_md.md_flags |= MDP_IRET;

	reset_segs();
	p->p_addr->u_pcb.pcb_fsbase = 0;

	tf = p->p_md.md_regs;
	tf->tf_rdi = 0;
	tf->tf_rsi = 0;
	tf->tf_rbp = 0;
	tf->tf_rbx = 0;
	tf->tf_rdx = 0;
	tf->tf_rcx = 0;
	tf->tf_rax = 0;
	tf->tf_r8 = 0;
	tf->tf_r9 = 0;
	tf->tf_r10 = 0;
	tf->tf_r11 = 0;
	tf->tf_r12 = 0;
	tf->tf_r13 = 0;
	tf->tf_r14 = 0;
	tf->tf_r15 = 0;
	tf->tf_rip = pack->ep_entry;
	tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
	tf->tf_rflags = PSL_USERSET;
	tf->tf_rsp = stack;
	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);

	retval[1] = 0;
}

/*
 * Initialize segments and descriptor tables
 */

struct gate_descriptor *idt;
char idt_allocmap[NIDT];
extern  struct user *proc0paddr;

void
setgate(struct gate_descriptor *gd, void *func, int ist, int type, int dpl,
    int sel)
{
	gd->gd_looffset = (u_int64_t)func & 0xffff;
	gd->gd_selector = sel;
	gd->gd_ist = ist;
	gd->gd_type = type;
	gd->gd_dpl = dpl;
	gd->gd_p = 1;
	gd->gd_hioffset = (u_int64_t)func >> 16;
	gd->gd_zero = 0;
	gd->gd_xx1 = 0;
	gd->gd_xx2 = 0;
	gd->gd_xx3 = 0;
}
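
/*
 * For reference, the architectural layout of the 16-byte long-mode
 * gate descriptor filled in above:
 *
 *	bytes  0-1	handler offset 15:0	(gd_looffset)
 *	bytes  2-3	code segment selector	(gd_selector)
 *	byte   4	IST index in bits 0-2	(gd_ist)
 *	byte   5	type, DPL, present bit
 *	bytes  6-11	handler offset 63:16	(gd_hioffset)
 *	bytes 12-15	reserved, must be zero
 */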

void
unsetgate(struct gate_descriptor *gd)
{
	memset(gd, 0, sizeof (*gd));
}

void
setregion(struct region_descriptor *rd, void *base, u_int16_t limit)
{
	rd->rd_limit = limit;
	rd->rd_base = (u_int64_t)base;
}

/*
 * Note that the base and limit fields are ignored in long mode.
 */
void
set_mem_segment(struct mem_segment_descriptor *sd, void *base, size_t limit,
    int type, int dpl, int gran, int def32, int is64)
{
	sd->sd_lolimit = (unsigned)limit;
	sd->sd_lobase = (unsigned long)base;
	sd->sd_type = type;
	sd->sd_dpl = dpl;
	sd->sd_p = 1;
	sd->sd_hilimit = (unsigned)limit >> 16;
	sd->sd_avl = 0;
	sd->sd_long = is64;
	sd->sd_def32 = def32;
	sd->sd_gran = gran;
	sd->sd_hibase = (unsigned long)base >> 24;
}

void
set_sys_segment(struct sys_segment_descriptor *sd, void *base, size_t limit,
    int type, int dpl, int gran)
{
	memset(sd, 0, sizeof *sd);
	sd->sd_lolimit = (unsigned)limit;
	sd->sd_lobase = (u_int64_t)base;
	sd->sd_type = type;
	sd->sd_dpl = dpl;
	sd->sd_p = 1;
	sd->sd_hilimit = (unsigned)limit >> 16;
	sd->sd_gran = gran;
	sd->sd_hibase = (u_int64_t)base >> 24;
}

void cpu_init_idt(void)
{
	struct region_descriptor region;

	setregion(&region, idt, NIDT * sizeof(idt[0]) - 1);
	lidt(&region);
}

void
cpu_init_extents(void)
{
	extern struct extent *iomem_ex;
	static int already_done;
	int i;

	/* We get called for each CPU, only first should do this */
	if (already_done)
		return;

	/*
	 * Allocate the physical addresses used by RAM from the iomem
	 * extent map.
	 */
	for (i = 0; i < mem_cluster_cnt; i++) {
		if (extent_alloc_region(iomem_ex, mem_clusters[i].start,
		    mem_clusters[i].size, EX_NOWAIT)) {
			/* XXX What should we do? */
			printf("WARNING: CAN'T ALLOCATE RAM (%llx-%llx)"
			    " FROM IOMEM EXTENT MAP!\n", mem_clusters[i].start,
			    mem_clusters[i].start + mem_clusters[i].size - 1);
		}
	}

	already_done = 1;
}

void
map_tramps(void)
{
#if defined(MULTIPROCESSOR) || \
    (NACPI > 0 && !defined(SMALL_KERNEL))
	struct pmap *kmp = pmap_kernel();
	extern paddr_t tramp_pdirpa;
#ifdef MULTIPROCESSOR
	extern u_char cpu_spinup_trampoline[];
	extern u_char cpu_spinup_trampoline_end[];
	extern u_char mp_tramp_data_start[];
	extern u_char mp_tramp_data_end[];
	extern u_int32_t mp_pdirpa;
#endif

	/*
	 * The initial PML4 pointer must be below 4G, so if the
	 * current one isn't, use a "bounce buffer" and save it
	 * for tramps to use.
	 */
	if (kmp->pm_pdirpa > 0xffffffff) {
		pmap_kenter_pa(lo32_vaddr, lo32_paddr, PROT_READ | PROT_WRITE);
		memcpy((void *)lo32_vaddr, kmp->pm_pdir, PAGE_SIZE);
		tramp_pdirpa = lo32_paddr;
		pmap_kremove(lo32_vaddr, PAGE_SIZE);
	} else
		tramp_pdirpa = kmp->pm_pdirpa;


#ifdef MULTIPROCESSOR
	/* Map MP tramp code and data pages RW for copy */
	pmap_kenter_pa(MP_TRAMPOLINE, MP_TRAMPOLINE,
	    PROT_READ | PROT_WRITE);

	pmap_kenter_pa(MP_TRAMP_DATA, MP_TRAMP_DATA,
	    PROT_READ | PROT_WRITE);

	memset((caddr_t)MP_TRAMPOLINE, 0xcc, PAGE_SIZE);
	memset((caddr_t)MP_TRAMP_DATA, 0xcc, PAGE_SIZE);

	memcpy((caddr_t)MP_TRAMPOLINE,
	    cpu_spinup_trampoline,
	    cpu_spinup_trampoline_end-cpu_spinup_trampoline);

	memcpy((caddr_t)MP_TRAMP_DATA,
		mp_tramp_data_start,
		mp_tramp_data_end - mp_tramp_data_start);

	/*
	 * We need to patch this after we copy the tramp data,
	 * the symbol points into the copied tramp data page.
	 */
	mp_pdirpa = tramp_pdirpa;

	/* Unmap, will be remapped in cpu_start_secondary */
	pmap_kremove(MP_TRAMPOLINE, PAGE_SIZE);
	pmap_kremove(MP_TRAMP_DATA, PAGE_SIZE);
#endif /* MULTIPROCESSOR */
#endif
}

#define	IDTVEC(name)	__CONCAT(X, name)
typedef void (vector)(void);
extern vector *IDTVEC(exceptions)[];

paddr_t early_pte_pages;

void
init_x86_64(paddr_t first_avail)
{
	struct region_descriptor region;
	bios_memmap_t *bmp;
	int x, ist;
	uint64_t max_dm_size = ((uint64_t)512 * NUM_L4_SLOT_DIRECT) << 30;

	/*
	 * locore0 mapped 3 pages for use before the pmap is initialized
	 * starting at first_avail. These pages are currently used by
	 * efifb to create early-use VAs for the framebuffer before efifb
	 * is attached.
	 */
	early_pte_pages = first_avail;
	first_avail += 3 * NBPG;

	cpu_init_msrs(&cpu_info_primary);

	proc0.p_addr = proc0paddr;
	cpu_info_primary.ci_curpcb = &proc0.p_addr->u_pcb;

	x86_bus_space_init();

	i8254_startclock();

	/*
	 * Initialize PAGE_SIZE-dependent variables.
	 */
	uvm_setpagesize();

	/*
	 * Boot arguments are in a single page specified by /boot.
	 *
	 * We require the "new" vector form, as well as memory ranges
	 * to be given in bytes rather than KB.
	 *
	 * locore copies the data into bootinfo[] for us.
	 */
	if ((bootapiver & (BAPIV_VECTOR | BAPIV_BMEMMAP)) ==
	    (BAPIV_VECTOR | BAPIV_BMEMMAP)) {
		if (bootinfo_size >= sizeof(bootinfo))
			panic("boot args too big");

		getbootinfo(bootinfo, bootinfo_size);
	} else
		panic("invalid /boot");

	cninit();

/*
 * Memory on the AMD64 port is described by three different things.
 *
 * 1. biosbasemem - This is outdated, and should really only be used to
 *    sanitize the other values. This is what we get back from the BIOS
 *    using the legacy routines, describing memory below 640KB.
 *
 * 2. bios_memmap[] - This is the memory map as the bios has returned
 *    it to us.  It includes memory the kernel occupies, etc.
 *
 * 3. mem_cluster[] - This is the massaged free memory segments after
 *    taking into account the contents of bios_memmap, biosbasemem,
 *    and locore/machdep/pmap kernel allocations of physical
 *    pages.
 *
 * The other thing is that the physical page *RANGE* is described by
 * three more variables:
 *
 * avail_start - This is a physical address of the start of available
 *               pages, until IOM_BEGIN.  This is basically the start
 *               of the UVM managed range of memory, with some holes...
 *
 * avail_end - This is the end of physical pages.  All physical pages
 *             that UVM manages are between avail_start and avail_end.
 *             There are holes...
 *
 * first_avail - This is the first available physical page after the
 *               kernel, page tables, etc.
 *
 * We skip the first few pages for trampolines, hibernate, and to avoid
 * buggy SMI implementations that could corrupt the first 64KB.
 */
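/*
 * Hypothetical example: on a 4 GB machine the BIOS map might offer
 * free ranges 0x1000-0x9fc00, 0x100000-0xcfee0000 and a range above
 * 4 GB; the loop below would clip the first to start at avail_start,
 * split the second at the 16 MB boundary, and record the results in
 * mem_clusters[], leaving avail_end at the top of the highest range.
 */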
	avail_start = 16*PAGE_SIZE;

#ifdef MULTIPROCESSOR
	if (avail_start < MP_TRAMPOLINE + PAGE_SIZE)
		avail_start = MP_TRAMPOLINE + PAGE_SIZE;
	if (avail_start < MP_TRAMP_DATA + PAGE_SIZE)
		avail_start = MP_TRAMP_DATA + PAGE_SIZE;
#endif

#if (NACPI > 0 && !defined(SMALL_KERNEL))
	if (avail_start < ACPI_TRAMPOLINE + PAGE_SIZE)
		avail_start = ACPI_TRAMPOLINE + PAGE_SIZE;
	if (avail_start < ACPI_TRAMP_DATA + PAGE_SIZE)
		avail_start = ACPI_TRAMP_DATA + PAGE_SIZE;
#endif

#ifdef HIBERNATE
	if (avail_start < HIBERNATE_HIBALLOC_PAGE + PAGE_SIZE)
		avail_start = HIBERNATE_HIBALLOC_PAGE + PAGE_SIZE;
#endif /* HIBERNATE */

	/*
	 * We need to go through the BIOS memory map given, and
	 * fill out mem_clusters and mem_cluster_cnt stuff, taking
	 * into account all the points listed above.
	 */
	avail_end = mem_cluster_cnt = 0;
	for (bmp = bios_memmap; bmp->type != BIOS_MAP_END; bmp++) {
		paddr_t s1, s2, e1, e2;

		/* Ignore non-free memory */
		if (bmp->type != BIOS_MAP_FREE)
			continue;
		if (bmp->size < PAGE_SIZE)
			continue;

		/* Init our segment(s), round/trunc to pages */
		s1 = round_page(bmp->addr);
		e1 = trunc_page(bmp->addr + bmp->size);
		s2 = e2 = 0;

		/*
		 * XXX Some buggy ACPI BIOSes use memory that they
		 * declare as free.  Typically the affected memory
		 * areas are small blocks between areas reserved for
		 * ACPI and other BIOS goo.  So skip areas smaller
		 * than 1 MB above the 16 MB boundary (to avoid
		 * affecting legacy stuff).
		 */
		if (s1 > 16*1024*1024 && (e1 - s1) < 1*1024*1024)
			continue;

		/* Check and adjust our segment(s) */
		/* Nuke low pages */
		if (s1 < avail_start) {
			s1 = avail_start;
			if (s1 > e1)
				continue;
		}

		/*
		 * The direct map is limited to 512GB * NUM_L4_SLOT_DIRECT of
		 * memory, so discard anything above that.
		 */
		if (e1 >= max_dm_size) {
			e1 = max_dm_size;
			if (s1 > e1)
				continue;
		}

		/* Crop stuff into "640K hole" */
		if (s1 < IOM_BEGIN && e1 > IOM_BEGIN)
			e1 = IOM_BEGIN;
		if (s1 < biosbasemem && e1 > biosbasemem)
			e1 = biosbasemem;

		/* Split any segments straddling the 16MB boundary */
		if (s1 < 16*1024*1024 && e1 > 16*1024*1024) {
			e2 = e1;
			s2 = e1 = 16*1024*1024;
		}

		/* Store segment(s) */
		if (e1 - s1 >= PAGE_SIZE) {
			mem_clusters[mem_cluster_cnt].start = s1;
			mem_clusters[mem_cluster_cnt].size = e1 - s1;
			mem_cluster_cnt++;
		}
		if (e2 - s2 >= PAGE_SIZE) {
			mem_clusters[mem_cluster_cnt].start = s2;
			mem_clusters[mem_cluster_cnt].size = e2 - s2;
			mem_cluster_cnt++;
		}
		if (avail_end < e1) avail_end = e1;
		if (avail_end < e2) avail_end = e2;
	}

	/*
	 * Call pmap initialization to make new kernel address space.
	 * We must do this before loading pages into the VM system.
	 */
	first_avail = pmap_bootstrap(first_avail, trunc_page(avail_end));

	/* Allocate these out of the 640KB base memory */
	if (avail_start != PAGE_SIZE)
		avail_start = pmap_prealloc_lowmem_ptps(avail_start);

	cpu_init_extents();

	/* Make sure the end of the space used by the kernel is rounded. */
	first_avail = round_page(first_avail);
	kern_end = KERNBASE + first_avail;

	/*
	 * Now, load the memory clusters (which have already been
	 * flensed) into the VM system.
	 */
	for (x = 0; x < mem_cluster_cnt; x++) {
		paddr_t seg_start = mem_clusters[x].start;
		paddr_t seg_end = seg_start + mem_clusters[x].size;

		if (seg_start < first_avail) seg_start = first_avail;
		if (seg_start > seg_end) continue;
		if (seg_end - seg_start < PAGE_SIZE) continue;

		physmem += atop(mem_clusters[x].size);

#if DEBUG_MEMLOAD
		printf("loading 0x%lx-0x%lx (0x%lx-0x%lx)\n",
		    seg_start, seg_end, atop(seg_start), atop(seg_end));
#endif
		uvm_page_physload(atop(seg_start), atop(seg_end),
		    atop(seg_start), atop(seg_end), 0);
	}

	/*
	 * Now, load the memory between the end of I/O memory "hole"
	 * and the kernel.
	 */
	{
		paddr_t seg_start = round_page(IOM_END);
		paddr_t seg_end = trunc_page(KERNTEXTOFF - KERNBASE);

		if (seg_start < seg_end) {
#if DEBUG_MEMLOAD
			printf("loading 0x%lx-0x%lx\n", seg_start, seg_end);
#endif
			uvm_page_physload(atop(seg_start), atop(seg_end),
			    atop(seg_start), atop(seg_end), 0);
		}
	}

#if DEBUG_MEMLOAD
	printf("avail_start = 0x%lx\n", avail_start);
	printf("avail_end = 0x%lx\n", avail_end);
	printf("first_avail = 0x%lx\n", first_avail);
#endif

	/*
	 * Steal memory for the message buffer (at end of core).
	 */
	{
		struct vm_physseg *vps = NULL;
		psize_t sz = round_page(MSGBUFSIZE);
		psize_t reqsz = sz;

		for (x = 0; x < vm_nphysseg; x++) {
			vps = &vm_physmem[x];
			if (ptoa(vps->avail_end) == avail_end)
				break;
		}
		if (x == vm_nphysseg)
			panic("init_x86_64: can't find end of memory");

		/* Shrink so it'll fit in the last segment. */
		if ((vps->avail_end - vps->avail_start) < atop(sz))
			sz = ptoa(vps->avail_end - vps->avail_start);

		vps->avail_end -= atop(sz);
		vps->end -= atop(sz);
		msgbuf_paddr = ptoa(vps->avail_end);

		/* Remove the last segment if it now has no pages. */
		if (vps->start == vps->end) {
			for (vm_nphysseg--; x < vm_nphysseg; x++)
				vm_physmem[x] = vm_physmem[x + 1];
		}

		/* Now find where the new avail_end is. */
		for (avail_end = 0, x = 0; x < vm_nphysseg; x++)
			if (vm_physmem[x].avail_end > avail_end)
				avail_end = vm_physmem[x].avail_end;
		avail_end = ptoa(avail_end);

		/* Warn if the message buffer had to be shrunk. */
		if (sz != reqsz)
			printf("WARNING: %ld bytes not available for msgbuf "
			    "in last cluster (%ld used)\n", reqsz, sz);
	}

	/*
	 * Steal some memory for a dump bouncebuffer if we have memory over
	 * the 32-bit barrier.
	 */
	if (avail_end > 0xffffffff) {
		struct vm_physseg *vps = NULL;
		psize_t sz = round_page(MAX(BYTES_PER_DUMP, dbtob(1)));

		/* XXX assumes segments are ordered */
		for (x = 0; x < vm_nphysseg; x++) {
			vps = &vm_physmem[x];
			/* Find something between 16meg and 4gig */
			if (ptoa(vps->avail_end) <= 0xffffffff &&
			    ptoa(vps->avail_start) >= 0xffffff)
				break;
		}
		if (x == vm_nphysseg)
			panic("init_x86_64: no memory between "
			    "0xffffff-0xffffffff");

		/* Shrink so it'll fit in the segment. */
		if ((vps->avail_end - vps->avail_start) < atop(sz))
			sz = ptoa(vps->avail_end - vps->avail_start);

		vps->avail_end -= atop(sz);
		vps->end -= atop(sz);
		dumpmem_paddr = ptoa(vps->avail_end);
		dumpmem_vaddr = PMAP_DIRECT_MAP(dumpmem_paddr);
		dumpmem_sz = sz;

		/* Remove the last segment if it now has no pages. */
		if (vps->start == vps->end) {
			for (vm_nphysseg--; x < vm_nphysseg; x++)
				vm_physmem[x] = vm_physmem[x + 1];
		}
	}

	pmap_growkernel(VM_MIN_KERNEL_ADDRESS + 32 * 1024 * 1024);

	pmap_kenter_pa(idt_vaddr, idt_paddr, PROT_READ | PROT_WRITE);

	idt = (struct gate_descriptor *)idt_vaddr;
	cpu_info_primary.ci_tss = &cpu_info_full_primary.cif_tss;
	cpu_info_primary.ci_gdt = &cpu_info_full_primary.cif_gdt;

	/* make gdt gates and memory segments */
	set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GCODE_SEL), 0,
	    0xfffff, SDT_MEMERA, SEL_KPL, 1, 0, 1);

	set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GDATA_SEL), 0,
	    0xfffff, SDT_MEMRWA, SEL_KPL, 1, 0, 1);

	set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GUCODE32_SEL), 0,
	    atop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMERA, SEL_UPL, 1, 1, 0);

	set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GUDATA_SEL), 0,
	    atop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMRWA, SEL_UPL, 1, 0, 1);

	set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GUCODE_SEL), 0,
	    atop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMERA, SEL_UPL, 1, 0, 1);

	set_sys_segment(GDT_ADDR_SYS(cpu_info_primary.ci_gdt, GPROC0_SEL),
	    cpu_info_primary.ci_tss, sizeof (struct x86_64_tss)-1,
	    SDT_SYS386TSS, SEL_KPL, 0);

	/* exceptions */
	for (x = 0; x < 32; x++) {
		/* trap2 == NMI, trap8 == double fault */
		ist = (x == 2) ? 2 : (x == 8) ? 1 : 0;
		setgate(&idt[x], IDTVEC(exceptions)[x], ist, SDT_SYS386IGT,
		    (x == 3) ? SEL_UPL : SEL_KPL,
		    GSEL(GCODE_SEL, SEL_KPL));
		idt_allocmap[x] = 1;
	}

	setregion(&region, cpu_info_primary.ci_gdt, GDT_SIZE - 1);
	lgdt(&region);

	cpu_init_idt();

	intr_default_setup();

	fpuinit(&cpu_info_primary);

	softintr_init();
	splraise(IPL_IPI);
	intr_enable();

#ifdef DDB
	db_machine_init();
	ddb_init();
	if (boothowto & RB_KDB)
		db_enter();
#endif
}

void
cpu_reset(void)
{
	intr_disable();

	if (cpuresetfn)
		(*cpuresetfn)();

	/*
	 * The keyboard controller has 4 random output pins, one of which is
	 * connected to the RESET pin on the CPU in many PCs.  We tell the
	 * keyboard controller to pulse this line a couple of times.
	 */
	outb(IO_KBD + KBCMDP, KBC_PULSE0);
	delay(100000);
	outb(IO_KBD + KBCMDP, KBC_PULSE0);
	delay(100000);

	/*
	 * Try to cause a triple fault and watchdog reset by making the IDT
	 * invalid and causing a fault.
	 */
	memset((caddr_t)idt, 0, NIDT * sizeof(idt[0]));
	__asm volatile("divl %0,%1" : : "q" (0), "a" (0));

	for (;;)
		continue;
	/* NOTREACHED */
}

/*
 * cpu_dumpsize: calculate size of machine-dependent kernel core dump headers.
 */
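/*
 * cpu_dump() above writes its headers into a single dbtob(1)-sized
 * buffer, so the headers must fit in one disk block; the -1 return
 * below signals that they would not.
 */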
int
cpu_dumpsize(void)
{
	int size;

	size = ALIGN(sizeof(kcore_seg_t)) +
	    ALIGN(mem_cluster_cnt * sizeof(phys_ram_seg_t));
	if (roundup(size, dbtob(1)) != dbtob(1))
		return (-1);

	return (1);
}

/*
 * cpu_dump_mempagecnt: calculate the size of RAM (in pages) to be dumped.
 */
u_long
cpu_dump_mempagecnt(void)
{
	u_long i, n;

	n = 0;
	for (i = 0; i < mem_cluster_cnt; i++)
		n += atop(mem_clusters[i].size);
	return (n);
}

/*
 * Figure out which portions of memory are used by the kernel/system.
 */
int
amd64_pa_used(paddr_t addr)
{
	struct vm_page	*pg;

	/* Kernel manages these */
	if ((pg = PHYS_TO_VM_PAGE(addr)) && (pg->pg_flags & PG_DEV) == 0)
		return 1;

	/* Kernel is loaded here */
	if (addr > IOM_END && addr < (kern_end - KERNBASE))
		return 1;

	/* Low memory used for various bootstrap things */
	if (addr < avail_start)
		return 1;

	/*
	 * The only regions I can think of that are left are the things
	 * we steal away from UVM.  The message buffer?
	 * XXX - ignore these for now.
	 */

	return 0;
}

void
cpu_initclocks(void)
{
	(*initclock_func)();
}

void
need_resched(struct cpu_info *ci)
{
	ci->ci_want_resched = 1;

	/* There's a risk we'll be called before the idle threads start */
	if (ci->ci_curproc) {
		aston(ci->ci_curproc);
		cpu_kick(ci);
	}
}

/*
 * Allocate an IDT vector slot within the given range.
 * XXX needs locking to avoid MP allocation races.
 */

int
idt_vec_alloc(int low, int high)
{
	int vec;

	for (vec = low; vec <= high; vec++) {
		if (idt_allocmap[vec] == 0) {
			idt_allocmap[vec] = 1;
			return vec;
		}
	}
	return 0;
}

void
idt_vec_set(int vec, void (*function)(void))
{
	/*
	 * Vector should be allocated, so no locking needed.
	 */
	KASSERT(idt_allocmap[vec] == 1);
	setgate(&idt[vec], function, 0, SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
}

void
idt_vec_free(int vec)
{
	unsetgate(&idt[vec]);
	idt_allocmap[vec] = 0;
}

#ifdef DIAGNOSTIC
void
splassert_check(int wantipl, const char *func)
{
	int cpl = curcpu()->ci_ilevel;
	int floor = curcpu()->ci_handled_intr_level;

	if (cpl < wantipl) {
		splassert_fail(wantipl, cpl, func);
	}
	if (floor > wantipl) {
		splassert_fail(wantipl, floor, func);
	}

}
#endif

int
copyin32(const uint32_t *uaddr, uint32_t *kaddr)
{
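	/*
	 * Reject misaligned pointers up front: an unaligned 32-bit
	 * access could straddle a page boundary, and then the copy
	 * could no longer be performed as a single atomic load.
	 */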
	if ((vaddr_t)uaddr & 0x3)
		return EFAULT;

	/* copyin(9) is atomic */
	return copyin(uaddr, kaddr, sizeof(uint32_t));
}

void
getbootinfo(char *bootinfo, int bootinfo_size)
{
	bootarg32_t *q;
	bios_ddb_t *bios_ddb;
	bios_bootduid_t *bios_bootduid;
	bios_bootsr_t *bios_bootsr;
#undef BOOTINFO_DEBUG
#ifdef BOOTINFO_DEBUG
	printf("bootargv:");
#endif

	for (q = (bootarg32_t *)bootinfo;
	    (q->ba_type != BOOTARG_END) &&
	    ((((char *)q) - bootinfo) < bootinfo_size);
	    q = (bootarg32_t *)(((char *)q) + q->ba_size)) {

		switch (q->ba_type) {
		case BOOTARG_MEMMAP:
			bios_memmap = (bios_memmap_t *)q->ba_arg;
#ifdef BOOTINFO_DEBUG
			printf(" memmap %p", bios_memmap);
#endif
			break;
		case BOOTARG_DISKINFO:
			bios_diskinfo = (bios_diskinfo_t *)q->ba_arg;
#ifdef BOOTINFO_DEBUG
			printf(" diskinfo %p", bios_diskinfo);
#endif
			break;
		case BOOTARG_APMINFO:
			/* generated by i386 boot loader */
			break;
		case BOOTARG_CKSUMLEN:
			bios_cksumlen = *(u_int32_t *)q->ba_arg;
#ifdef BOOTINFO_DEBUG
			printf(" cksumlen %d", bios_cksumlen);
#endif
			break;
		case BOOTARG_PCIINFO:
			/* generated by i386 boot loader */
			break;
		case BOOTARG_CONSDEV:
			if (q->ba_size >= sizeof(bios_consdev_t) +
			    offsetof(struct _boot_args32, ba_arg)) {
#if NCOM > 0
				bios_consdev_t *cdp =
				    (bios_consdev_t*)q->ba_arg;
				static const int ports[] =
				    { 0x3f8, 0x2f8, 0x3e8, 0x2e8 };
				int unit = minor(cdp->consdev);
				int consaddr = cdp->consaddr;
				if (consaddr == -1 && unit >= 0 &&
				    unit < nitems(ports))
					consaddr = ports[unit];
				if (major(cdp->consdev) == 8 &&
				    consaddr != -1) {
					comconsunit = unit;
					comconsaddr = consaddr;
					comconsrate = cdp->conspeed;
					comconsiot = X86_BUS_SPACE_IO;
				}
#endif
#ifdef BOOTINFO_DEBUG
				printf(" console 0x%x:%d",
				    cdp->consdev, cdp->conspeed);
#endif
			}
			break;
		case BOOTARG_BOOTMAC:
			bios_bootmac = (bios_bootmac_t *)q->ba_arg;
			break;

		case BOOTARG_DDB:
			bios_ddb = (bios_ddb_t *)q->ba_arg;
#ifdef DDB
			db_console = bios_ddb->db_console;
#endif
			break;

		case BOOTARG_BOOTDUID:
			bios_bootduid = (bios_bootduid_t *)q->ba_arg;
			memcpy(bootduid, bios_bootduid, sizeof(bootduid));
			break;

		case BOOTARG_BOOTSR:
			bios_bootsr = (bios_bootsr_t *)q->ba_arg;
#if NSOFTRAID > 0
			memcpy(&sr_bootuuid, &bios_bootsr->uuid,
			    sizeof(sr_bootuuid));
			memcpy(&sr_bootkey, &bios_bootsr->maskkey,
			    sizeof(sr_bootkey));
#endif
			explicit_bzero(bios_bootsr, sizeof(bios_bootsr_t));
			break;

		case BOOTARG_EFIINFO:
			bios_efiinfo = (bios_efiinfo_t *)q->ba_arg;
			break;

		case BOOTARG_UCODE:
			bios_ucode = (bios_ucode_t *)q->ba_arg;
			break;

		default:
#ifdef BOOTINFO_DEBUG
			printf(" unsupported arg (%d) %p", q->ba_type,
			    q->ba_arg);
#endif
			break;
		}
	}
#ifdef BOOTINFO_DEBUG
	printf("\n");
#endif
}

int
check_context(const struct reg *regs, struct trapframe *tf)
{
	uint16_t sel;

	if (((regs->r_rflags ^ tf->tf_rflags) & PSL_USERSTATIC) != 0)
		return EINVAL;

	sel = regs->r_ss & 0xffff;
	if (!VALID_USER_DSEL(sel))
		return EINVAL;

	sel = regs->r_cs & 0xffff;
	if (!VALID_USER_CSEL(sel))
		return EINVAL;

	if (regs->r_rip >= VM_MAXUSER_ADDRESS)
		return EINVAL;

	return 0;
}
2046