/*	$OpenBSD: machdep.c,v 1.297 2024/09/21 19:06:07 deraadt Exp $	*/
/*	$NetBSD: machdep.c,v 1.3 2003/05/07 22:58:18 fvdl Exp $	*/

/*-
 * Copyright (c) 1996, 1997, 1998, 2000 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace
 * Simulation Facility, NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/user.h>
#include <sys/exec.h>
#include <sys/buf.h>
#include <sys/reboot.h>
#include <sys/conf.h>
#include <sys/msgbuf.h>
#include <sys/mount.h>
#include <sys/extent.h>
#include <sys/core.h>
#include <sys/kcore.h>
#include <sys/syscallargs.h>

#include <dev/cons.h>
#include <stand/boot/bootarg.h>

#include <net/if.h>
#include <uvm/uvm_extern.h>

#include <sys/sysctl.h>

#include <machine/cpu_full.h>
#include <machine/cpufunc.h>
#include <machine/pio.h>
#include <machine/psl.h>
#include <machine/reg.h>
#include <machine/fpu.h>
#include <machine/biosvar.h>
#include <machine/mpbiosvar.h>
#include <machine/kcore.h>
#include <machine/tss.h>

#include <dev/isa/isareg.h>
#include <dev/ic/i8042reg.h>

#ifdef DDB
#include <machine/db_machdep.h>
#include <ddb/db_extern.h>
extern int db_console;
#endif

#include "isa.h"
#include "isadma.h"
#include "ksyms.h"

#include "acpi.h"
#if NACPI > 0
#include <dev/acpi/acpireg.h>
#include <dev/acpi/acpivar.h>
#endif

#include "com.h"
#if NCOM > 0
#include <sys/tty.h>
#include <dev/ic/comvar.h>
#include <dev/ic/comreg.h>
#endif

#include "efi.h"
#if NEFI > 0
#include <dev/efi/efi.h>
#endif

#include "softraid.h"
#if NSOFTRAID > 0
#include <dev/softraidvar.h>
#endif

#ifdef HIBERNATE
#include <machine/hibernate_var.h>
#endif /* HIBERNATE */

#include "ukbd.h"
#include "pckbc.h"
#if NPCKBC > 0 && NUKBD > 0
#include <dev/ic/pckbcvar.h>
#endif

/* #define MACHDEP_DEBUG */

#ifdef MACHDEP_DEBUG
#define DPRINTF(x...)	do { printf(x); } while(0)
#else
#define DPRINTF(x...)
#endif /* MACHDEP_DEBUG */

/* the following is used externally (sysctl_hw) */
char machine[] = MACHINE;

/*
 * switchto vectors
 */
void cpu_idle_cycle_hlt(void);
void (*cpu_idle_cycle_fcn)(void) = &cpu_idle_cycle_hlt;
void (*cpu_suspend_cycle_fcn)(void);

/* the following is used externally for concurrent handlers */
int setperf_prio = 0;

#ifdef CPURESET_DELAY
int	cpureset_delay = CPURESET_DELAY;
#else
int	cpureset_delay = 0;
#endif

char *ssym = 0, *esym = 0;	/* start and end of symbol table */
dev_t bootdev = 0;		/* device we booted from */
int biosbasemem = 0;		/* base memory reported by BIOS */
u_int bootapiver = 0;		/* /boot API version */

int	physmem;
extern int	boothowto;

paddr_t	dumpmem_paddr;
vaddr_t	dumpmem_vaddr;
psize_t	dumpmem_sz;

vaddr_t kern_end;

vaddr_t	msgbuf_vaddr;
paddr_t msgbuf_paddr;

vaddr_t	idt_vaddr;
paddr_t	idt_paddr;

vaddr_t lo32_vaddr;
paddr_t lo32_paddr;
paddr_t tramp_pdirpa;

int kbd_reset;
int lid_action = 1;
int pwr_action = 1;
int forceukbd;

/*
 * safepri is a safe priority for sleep to set for a spin-wait
 * during autoconfiguration or after a panic.
 */
int	safepri = 0;

struct vm_map *exec_map = NULL;
struct vm_map *phys_map = NULL;

/* UVM constraint ranges. */
struct uvm_constraint_range  isa_constraint = { 0x0, 0x00ffffffUL };
struct uvm_constraint_range  dma_constraint = { 0x0, 0xffffffffUL };
struct uvm_constraint_range *uvm_md_constraints[] = {
    &isa_constraint,
    &dma_constraint,
    NULL,
};

paddr_t avail_start;
paddr_t avail_end;

void (*delay_func)(int) = i8254_delay;
void (*initclock_func)(void) = i8254_initclocks;
void (*startclock_func)(void) = i8254_start_both_clocks;
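/*
 * (The i8254 defaults above are normally repointed once better time
 * hardware, e.g. the local APIC timer or the TSC, attaches.)
 */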

/*
 * Format of boot information passed to us by 32-bit /boot
 */
typedef struct _boot_args32 {
	int	ba_type;
	int	ba_size;
	int	ba_nextX;	/* a ptr in 32-bit world, but not here */
	char	ba_arg[1];
} bootarg32_t;
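
/*
 * Illustrative sketch only (the real walk lives in getbootinfo()):
 * records are chained by ba_size until a BOOTARG_END terminator,
 * roughly
 *
 *	for (q = (bootarg32_t *)bootinfo; q->ba_type != BOOTARG_END;
 *	    q = (bootarg32_t *)((caddr_t)q + q->ba_size))
 *		...dispatch on q->ba_type...
 *
 * (BOOTARG_END is assumed from <stand/boot/bootarg.h>.)
 */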

#define BOOTARGC_MAX	NBPG	/* one page */

bios_bootmac_t *bios_bootmac;

/* locore copies the arguments from /boot to here for us */
char bootinfo[BOOTARGC_MAX];
int bootinfo_size = BOOTARGC_MAX;

void getbootinfo(char *, int);

/* Data passed to us by /boot, filled in by getbootinfo() */
bios_diskinfo_t	*bios_diskinfo;
bios_memmap_t	*bios_memmap;
u_int32_t	bios_cksumlen;
bios_efiinfo_t	*bios_efiinfo;
bios_ucode_t	*bios_ucode;

#if NEFI > 0
EFI_MEMORY_DESCRIPTOR *mmap;
#endif

/*
 * Size of memory segments, before any memory is stolen.
 */
phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX];
int	mem_cluster_cnt;

int	cpu_dump(void);
int	cpu_dumpsize(void);
u_long	cpu_dump_mempagecnt(void);
void	dumpsys(void);
void	cpu_init_extents(void);
void	map_tramps(void);
void	init_x86_64(paddr_t);
void	(*cpuresetfn)(void);
void	enter_shared_special_pages(void);

#ifdef APERTURE
int allowaperture = 0;
#endif

/*
 * Machine-dependent startup code
 */
void
cpu_startup(void)
{
	vaddr_t minaddr, maxaddr;

	msgbuf_vaddr = PMAP_DIRECT_MAP(msgbuf_paddr);
	initmsgbuf((caddr_t)msgbuf_vaddr, round_page(MSGBUFSIZE));

	printf("%s", version);
	startclocks();
	rtcinit();

	printf("real mem = %lu (%luMB)\n", ptoa((psize_t)physmem),
	    ptoa((psize_t)physmem)/1024/1024);

	/*
	 * Allocate a submap for exec arguments.  This map effectively
	 * limits the number of processes exec'ing at any time.
	 */
	minaddr = vm_map_min(kernel_map);
	exec_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
				   16*NCARGS, VM_MAP_PAGEABLE, FALSE, NULL);

	/*
	 * Allocate a submap for physio
	 */
	minaddr = vm_map_min(kernel_map);
	phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
				   VM_PHYS_SIZE, 0, FALSE, NULL);

	printf("avail mem = %lu (%luMB)\n", ptoa((psize_t)uvmexp.free),
	    ptoa((psize_t)uvmexp.free)/1024/1024);

	bufinit();

	if (boothowto & RB_CONFIG) {
#ifdef BOOT_CONFIG
		user_config();
#else
		printf("kernel does not support -c; continuing..\n");
#endif
	}

	/* Safe for i/o port / memory space allocation to use malloc now. */
	x86_bus_space_mallocok();

#ifndef SMALL_KERNEL
	cpu_ucode_setup();
	cpu_ucode_apply(&cpu_info_primary);
#endif
	cpu_tsx_disable(&cpu_info_primary);

	/* enter the IDT and trampoline code in the u-k maps */
	enter_shared_special_pages();

	/* initialize CPU0's TSS and GDT and put them in the u-k maps */
	cpu_enter_pages(&cpu_info_full_primary);
}

/*
 * enter_shared_special_pages
 *
 * Requests mapping of various special pages required in the Intel Meltdown
 * case (to be entered into the U-K page table):
 *
 *  1 IDT page
 *  Various number of pages covering the U-K ".kutext" section. This section
 *   contains code needed during trampoline operation
 *  Various number of pages covering the U-K ".kudata" section. This section
 *   contains data accessed by the trampoline, before switching to U+K
 *   (for example, various shared global variables used by IPIs, etc)
 *
 * The linker script places the required symbols in the sections above.
 *
 * On CPUs not affected by Meltdown, the calls to pmap_enter_special below
 * become no-ops.
 */
void
enter_shared_special_pages(void)
{
	extern char __kutext_start[], __kutext_end[], __kernel_kutext_phys[];
	extern char __text_page_start[], __text_page_end[];
	extern char __kernel_kutext_page_phys[];
	extern char __kudata_start[], __kudata_end[], __kernel_kudata_phys[];
	vaddr_t va;
	paddr_t pa;

	/* idt */
	pmap_enter_special(idt_vaddr, idt_paddr, PROT_READ);
	DPRINTF("%s: entered idt page va 0x%llx pa 0x%llx\n", __func__,
	    (uint64_t)idt_vaddr, (uint64_t)idt_paddr);

	/* .kutext section */
	va = (vaddr_t)__kutext_start;
	pa = (paddr_t)__kernel_kutext_phys;
	while (va < (vaddr_t)__kutext_end) {
		pmap_enter_special(va, pa, PROT_READ | PROT_EXEC);
		DPRINTF("%s: entered kutext page va 0x%llx pa 0x%llx\n",
		    __func__, (uint64_t)va, (uint64_t)pa);
		va += PAGE_SIZE;
		pa += PAGE_SIZE;
	}

	/* .kutext.page section */
	va = (vaddr_t)__text_page_start;
	pa = (paddr_t)__kernel_kutext_page_phys;
	while (va < (vaddr_t)__text_page_end) {
		pmap_enter_special(va, pa, PROT_READ | PROT_EXEC);
		DPRINTF("%s: entered kutext.page va 0x%llx pa 0x%llx\n",
		    __func__, (uint64_t)va, (uint64_t)pa);
		va += PAGE_SIZE;
		pa += PAGE_SIZE;
	}

	/* .kudata section */
	va = (vaddr_t)__kudata_start;
	pa = (paddr_t)__kernel_kudata_phys;
	while (va < (vaddr_t)__kudata_end) {
		pmap_enter_special(va, pa, PROT_READ | PROT_WRITE);
		DPRINTF("%s: entered kudata page va 0x%llx pa 0x%llx\n",
		    __func__, (uint64_t)va, (uint64_t)pa);
		va += PAGE_SIZE;
		pa += PAGE_SIZE;
	}
}

/*
 * Set up proc0's PCB and the cpu's TSS.
 */
void
x86_64_proc0_tss_ldt_init(void)
{
	struct pcb *pcb;

	cpu_info_primary.ci_curpcb = pcb = &proc0.p_addr->u_pcb;
	pcb->pcb_fsbase = 0;
	pcb->pcb_kstack = (u_int64_t)proc0.p_addr + USPACE - 16;
	proc0.p_md.md_regs = (struct trapframe *)pcb->pcb_kstack - 1;

	ltr(GSYSSEL(GPROC0_SEL, SEL_KPL));
	lldt(0);
}

bios_diskinfo_t *
bios_getdiskinfo(dev_t dev)
{
	bios_diskinfo_t *pdi;

	if (bios_diskinfo == NULL)
		return NULL;

	for (pdi = bios_diskinfo; pdi->bios_number != -1; pdi++) {
		if ((dev & B_MAGICMASK) == B_DEVMAGIC) { /* search by bootdev */
			if (pdi->bsd_dev == dev)
				break;
		} else {
			if (pdi->bios_number == dev)
				break;
		}
	}

	if (pdi->bios_number == -1)
		return NULL;
	else
		return pdi;
}

int
bios_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
    size_t newlen, struct proc *p)
{
	bios_diskinfo_t *pdi;
	int biosdev;

	/* all sysctl names at this level except diskinfo are terminal */
	if (namelen != 1 && name[0] != BIOS_DISKINFO)
		return (ENOTDIR);	       /* overloaded */

	if (!(bootapiver & BAPIV_VECTOR))
		return EOPNOTSUPP;

	switch (name[0]) {
	case BIOS_DEV:
		if ((pdi = bios_getdiskinfo(bootdev)) == NULL)
			return ENXIO;
		biosdev = pdi->bios_number;
		return sysctl_rdint(oldp, oldlenp, newp, biosdev);
	case BIOS_DISKINFO:
		if (namelen != 2)
			return ENOTDIR;
		if ((pdi = bios_getdiskinfo(name[1])) == NULL)
			return ENXIO;
		return sysctl_rdstruct(oldp, oldlenp, newp, pdi, sizeof(*pdi));
	case BIOS_CKSUMLEN:
		return sysctl_rdint(oldp, oldlenp, newp, bios_cksumlen);
	default:
		return EOPNOTSUPP;
	}
	/* NOTREACHED */
}

extern int tsc_is_invariant;
extern int amd64_has_xcrypt;
extern int need_retpoline;

const struct sysctl_bounded_args cpuctl_vars[] = {
	{ CPU_LIDACTION, &lid_action, -1, 2 },
	{ CPU_PWRACTION, &pwr_action, 0, 2 },
	{ CPU_CPUID, &cpu_id, SYSCTL_INT_READONLY },
	{ CPU_CPUFEATURE, &cpu_feature, SYSCTL_INT_READONLY },
	{ CPU_XCRYPT, &amd64_has_xcrypt, SYSCTL_INT_READONLY },
	{ CPU_INVARIANTTSC, &tsc_is_invariant, SYSCTL_INT_READONLY },
	{ CPU_RETPOLINE, &need_retpoline, SYSCTL_INT_READONLY },
};
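
/*
 * Illustrative only: the entries above surface as machdep.* nodes for
 * sysctl(8), e.g. "sysctl machdep.lidaction=1" (assuming the usual
 * machdep MIB names from machine/cpu.h).
 */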

/*
 * machine dependent system variables.
 */
int
cpu_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
    size_t newlen, struct proc *p)
{
	extern uint64_t tsc_frequency;
	dev_t consdev;
	dev_t dev;

	switch (name[0]) {
	case CPU_CONSDEV:
		if (namelen != 1)
			return (ENOTDIR);		/* overloaded */
		if (cn_tab != NULL)
			consdev = cn_tab->cn_dev;
		else
			consdev = NODEV;
		return (sysctl_rdstruct(oldp, oldlenp, newp, &consdev,
		    sizeof consdev));
	case CPU_CHR2BLK:
		if (namelen != 2)
			return (ENOTDIR);		/* overloaded */
		dev = chrtoblk((dev_t)name[1]);
		return sysctl_rdstruct(oldp, oldlenp, newp, &dev, sizeof(dev));
	case CPU_BIOS:
		return bios_sysctl(name + 1, namelen - 1, oldp, oldlenp,
		    newp, newlen, p);
	case CPU_CPUVENDOR:
		return (sysctl_rdstring(oldp, oldlenp, newp, cpu_vendor));
	case CPU_KBDRESET:
		return (sysctl_securelevel_int(oldp, oldlenp, newp, newlen,
		    &kbd_reset));
	case CPU_ALLOWAPERTURE:
		if (namelen != 1)
			return (ENOTDIR);		/* overloaded */
#ifdef APERTURE
		if (securelevel > 0)
			return (sysctl_int_lower(oldp, oldlenp, newp, newlen,
			    &allowaperture));
		else
			return (sysctl_int(oldp, oldlenp, newp, newlen,
			    &allowaperture));
#else
		return (sysctl_rdint(oldp, oldlenp, newp, 0));
#endif
#if NPCKBC > 0 && NUKBD > 0
	case CPU_FORCEUKBD:
		{
		int error;

		if (forceukbd)
			return (sysctl_rdint(oldp, oldlenp, newp, forceukbd));

		error = sysctl_int(oldp, oldlenp, newp, newlen, &forceukbd);
		if (forceukbd)
			pckbc_release_console();
		return (error);
		}
#endif
	case CPU_TSCFREQ:
		return (sysctl_rdquad(oldp, oldlenp, newp, tsc_frequency));
	default:
		return (sysctl_bounded_arr(cpuctl_vars, nitems(cpuctl_vars),
		    name, namelen, oldp, oldlenp, newp, newlen));
	}
	/* NOTREACHED */
}

static inline void
maybe_enable_user_cet(struct proc *p)
{
#ifndef SMALL_KERNEL
	/* Enable indirect-branch tracking if present and not disabled */
	if ((xsave_mask & XFEATURE_CET_U) &&
	    (p->p_p->ps_flags & PS_NOBTCFI) == 0) {
		uint64_t msr = rdmsr(MSR_U_CET);
		wrmsr(MSR_U_CET, msr | MSR_CET_ENDBR_EN | MSR_CET_NO_TRACK_EN);
	}
#endif
}

static inline void
initialize_thread_xstate(struct proc *p)
{
	if (cpu_use_xsaves) {
		xrstors(fpu_cleandata, xsave_mask);
		maybe_enable_user_cet(p);
	} else {
		/* Reset FPU state in PCB */
		memcpy(&p->p_addr->u_pcb.pcb_savefpu, fpu_cleandata,
		    fpu_save_len);

		if (curcpu()->ci_pflags & CPUPF_USERXSTATE) {
			/* state in CPU is obsolete; reset it */
			fpureset();
		}
	}

	/* The reset state _is_ the userspace state for this thread now */
	curcpu()->ci_pflags |= CPUPF_USERXSTATE;
}

/*
 * Copy out the FPU state, massaging it to be usable from userspace
 * and acceptable to xrstor_user()
 */
static inline int
copyoutfpu(struct savefpu *sfp, char *sp, size_t len)
{
	uint64_t bvs[2];

	if (copyout(sfp, sp, len))
		return 1;
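	/*
	 * Overwrite the copied-out xstate_bv/xstate_xcomp_bv words with
	 * sanitized values: only XCR0-enabled feature bits (plus the
	 * compressed-format flag in xcomp_bv) may remain set, or a later
	 * xrstor_user() of this buffer could fault.
	 */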
	if (len > offsetof(struct savefpu, fp_xstate.xstate_bv)) {
		sp  += offsetof(struct savefpu, fp_xstate.xstate_bv);
		len -= offsetof(struct savefpu, fp_xstate.xstate_bv);
		bvs[0] = sfp->fp_xstate.xstate_bv & XFEATURE_XCR0_MASK;
		bvs[1] = sfp->fp_xstate.xstate_xcomp_bv &
		    (XFEATURE_XCR0_MASK | XFEATURE_COMPRESSED);
		if (copyout(bvs, sp, min(len, sizeof bvs)))
			return 1;
	}
	return 0;
}

/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode to call routine, followed by
 * syscall to sigreturn routine below.  After sigreturn resets the
 * signal mask, the stack, and the frame pointer, it returns to the
 * user specified pc.
 */
int
sendsig(sig_t catcher, int sig, sigset_t mask, const siginfo_t *ksip,
    int info, int onstack)
{
	struct proc *p = curproc;
	struct trapframe *tf = p->p_md.md_regs;
	struct sigcontext ksc;
	struct savefpu *sfp = &p->p_addr->u_pcb.pcb_savefpu;
	register_t sp, scp, sip;
	u_long sss;

	memset(&ksc, 0, sizeof ksc);
	ksc.sc_rdi = tf->tf_rdi;
	ksc.sc_rsi = tf->tf_rsi;
	ksc.sc_rdx = tf->tf_rdx;
	ksc.sc_rcx = tf->tf_rcx;
	ksc.sc_r8  = tf->tf_r8;
	ksc.sc_r9  = tf->tf_r9;
	ksc.sc_r10 = tf->tf_r10;
	ksc.sc_r11 = tf->tf_r11;
	ksc.sc_r12 = tf->tf_r12;
	ksc.sc_r13 = tf->tf_r13;
	ksc.sc_r14 = tf->tf_r14;
	ksc.sc_r15 = tf->tf_r15;
	ksc.sc_rbx = tf->tf_rbx;
	ksc.sc_rax = tf->tf_rax;
	ksc.sc_rbp = tf->tf_rbp;
	ksc.sc_rip = tf->tf_rip;
	ksc.sc_cs  = tf->tf_cs;
	ksc.sc_rflags = tf->tf_rflags;
	ksc.sc_rsp = tf->tf_rsp;
	ksc.sc_ss  = tf->tf_ss;
	ksc.sc_mask = mask;

	/* Allocate space for the signal handler context. */
	if ((p->p_sigstk.ss_flags & SS_DISABLE) == 0 &&
	    !sigonstack(tf->tf_rsp) && onstack)
		sp = trunc_page((vaddr_t)p->p_sigstk.ss_sp + p->p_sigstk.ss_size);
	else
		sp = tf->tf_rsp - 128;
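	/* the 128 bytes skipped above preserve the amd64 ABI red zone */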

	sp -= fpu_save_len;
	if (cpu_use_xsaves)
		sp &= ~63ULL;	/* just in case */
	else
		sp &= ~15ULL;	/* just in case */

	/* Save FPU state to PCB if necessary, then copy it out */
	if (curcpu()->ci_pflags & CPUPF_USERXSTATE)
		fpusave(&p->p_addr->u_pcb.pcb_savefpu);
	if (copyoutfpu(sfp, (void *)sp, fpu_save_len))
		return 1;

	initialize_thread_xstate(p);

	ksc.sc_fpstate = (struct fxsave64 *)sp;
	sss = (sizeof(ksc) + 15) & ~15;
	sip = 0;
	if (info) {
		sip = sp - ((sizeof(*ksip) + 15) & ~15);
		sss += (sizeof(*ksip) + 15) & ~15;

		if (copyout(ksip, (void *)sip, sizeof(*ksip)))
			return 1;
	}
	scp = sp - sss;

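	/*
	 * Bind this context to its stack location with the per-process
	 * signal cookie, so sys_sigreturn() can reject forged or replayed
	 * sigcontext structures.
	 */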
	ksc.sc_cookie = (long)scp ^ p->p_p->ps_sigcookie;
	if (copyout(&ksc, (void *)scp, sizeof(ksc)))
		return 1;

	/*
	 * Build context to run handler in.
	 */
	tf->tf_rax = (u_int64_t)catcher;
	tf->tf_rdi = sig;
	tf->tf_rsi = sip;
	tf->tf_rdx = scp;

	tf->tf_rip = (u_int64_t)p->p_p->ps_sigcode;
	tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
	tf->tf_rflags &= ~(PSL_T|PSL_D|PSL_VM|PSL_AC);
	tf->tf_rsp = scp;
	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);

	return 0;
}

/*
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig. Check carefully to
 * make sure that the user has not modified the
 * psl to gain improper privileges or to cause
 * a machine fault.
 */
int
sys_sigreturn(struct proc *p, void *v, register_t *retval)
{
	struct sys_sigreturn_args /* {
		syscallarg(struct sigcontext *) sigcntxp;
	} */ *uap = v;
	struct sigcontext ksc, *scp = SCARG(uap, sigcntxp);
	struct trapframe *tf = p->p_md.md_regs;
	struct savefpu *sfp = &p->p_addr->u_pcb.pcb_savefpu;
	int error;

	if (PROC_PC(p) != p->p_p->ps_sigcoderet) {
		sigexit(p, SIGILL);
		return (EPERM);
	}

	if ((error = copyin((caddr_t)scp, &ksc, sizeof ksc)))
		return (error);

	if (ksc.sc_cookie != ((long)scp ^ p->p_p->ps_sigcookie)) {
		sigexit(p, SIGILL);
		return (EFAULT);
	}

	/* Prevent reuse of the sigcontext cookie */
	ksc.sc_cookie = 0;
	(void)copyout(&ksc.sc_cookie, (caddr_t)scp +
	    offsetof(struct sigcontext, sc_cookie), sizeof (ksc.sc_cookie));

	if (((ksc.sc_rflags ^ tf->tf_rflags) & PSL_USERSTATIC) != 0 ||
	    !USERMODE(ksc.sc_cs, ksc.sc_rflags))
		return (EINVAL);

	/* Current FPU state is obsolete; toss it and force a reload */
	if (curcpu()->ci_pflags & CPUPF_USERXSTATE) {
		curcpu()->ci_pflags &= ~CPUPF_USERXSTATE;
		fpureset();
	}

	/* Copy in the FPU state to restore */
	if (__predict_true(ksc.sc_fpstate != NULL)) {
		if ((error = copyin(ksc.sc_fpstate, sfp, fpu_save_len)))
			return error;
		if (xrstor_user(sfp, xsave_mask)) {
			memcpy(sfp, fpu_cleandata, fpu_save_len);
			return EINVAL;
		}
		maybe_enable_user_cet(p);
		curcpu()->ci_pflags |= CPUPF_USERXSTATE;
	} else {
		/* shouldn't happen, but handle it */
		initialize_thread_xstate(p);
	}

	tf->tf_rdi = ksc.sc_rdi;
	tf->tf_rsi = ksc.sc_rsi;
	tf->tf_rdx = ksc.sc_rdx;
	tf->tf_rcx = ksc.sc_rcx;
	tf->tf_r8  = ksc.sc_r8;
	tf->tf_r9  = ksc.sc_r9;
	tf->tf_r10 = ksc.sc_r10;
	tf->tf_r11 = ksc.sc_r11;
	tf->tf_r12 = ksc.sc_r12;
	tf->tf_r13 = ksc.sc_r13;
	tf->tf_r14 = ksc.sc_r14;
	tf->tf_r15 = ksc.sc_r15;
	tf->tf_rbx = ksc.sc_rbx;
	tf->tf_rax = ksc.sc_rax;
	tf->tf_rbp = ksc.sc_rbp;
	tf->tf_rip = ksc.sc_rip;
	tf->tf_cs  = ksc.sc_cs;
	tf->tf_rflags = ksc.sc_rflags;
	tf->tf_rsp = ksc.sc_rsp;
	tf->tf_ss  = ksc.sc_ss;

	/* Restore signal mask. */
	p->p_sigmask = ksc.sc_mask & ~sigcantmask;

	/*
	 * sigreturn() needs to return to userspace via the 'iretq'
	 * method, so that if the process was interrupted (by tick,
	 * an IPI, whatever) as opposed to already being in the kernel
	 * when a signal was being delivered, the process will be
	 * completely restored, including the userland %rcx and %r11
	 * registers which the 'sysretq' instruction cannot restore.
	 * Also need to make sure we can handle faulting on xrstor.
	 */
	p->p_md.md_flags |= MDP_IRET;

	return (EJUSTRETURN);
}

#ifdef MULTIPROCESSOR
/* force a CPU into the kernel, whether or not it's idle */
void
cpu_kick(struct cpu_info *ci)
{
	/* only need to kick other CPUs */
	if (ci != curcpu()) {
		if (cpu_mwait_size > 0) {
			/*
			 * If not idling, then send an IPI, else
			 * just clear the "keep idling" bit.
			 */
			if ((ci->ci_mwait & MWAIT_IN_IDLE) == 0)
				x86_send_ipi(ci, X86_IPI_NOP);
			else
				atomic_clearbits_int(&ci->ci_mwait,
				    MWAIT_KEEP_IDLING);
		} else {
			/* no mwait, so need an IPI */
			x86_send_ipi(ci, X86_IPI_NOP);
		}
	}
}
#endif

/*
 * Notify the current process (p) that it has a signal pending,
 * process as soon as possible.
 */
void
signotify(struct proc *p)
{
	aston(p);
	cpu_kick(p->p_cpu);
}

#ifdef MULTIPROCESSOR
void
cpu_unidle(struct cpu_info *ci)
{
	if (cpu_mwait_size > 0 && (ci->ci_mwait & MWAIT_ONLY)) {
		/*
		 * Just clear the "keep idling" bit; if it wasn't
		 * idling then we didn't need to do anything anyway.
		 */
		atomic_clearbits_int(&ci->ci_mwait, MWAIT_KEEP_IDLING);
		return;
	}

	if (ci != curcpu())
		x86_send_ipi(ci, X86_IPI_NOP);
}
#endif

int	waittime = -1;
struct pcb dumppcb;

__dead void
boot(int howto)
{
#if NACPI > 0
	if ((howto & RB_POWERDOWN) != 0 && acpi_softc)
		acpi_softc->sc_state = ACPI_STATE_S5;
#endif

	if ((howto & RB_POWERDOWN) != 0)
		lid_action = 0;
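	/* (cleared so a lid close during powerdown can't trigger a suspend) */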

	if ((howto & RB_RESET) != 0)
		goto doreset;

	if (cold) {
		if ((howto & RB_USERREQ) == 0)
			howto |= RB_HALT;
		goto haltsys;
	}

	boothowto = howto;
	if ((howto & RB_NOSYNC) == 0 && waittime < 0) {
		waittime = 0;
		vfs_shutdown(curproc);

		if ((howto & RB_TIMEBAD) == 0) {
			resettodr();
		} else {
			printf("WARNING: not updating battery clock\n");
		}
	}
	if_downall();

	uvm_shutdown();
	splhigh();
	cold = 1;

	if ((howto & RB_DUMP) != 0)
		dumpsys();

haltsys:
	config_suspend_all(DVACT_POWERDOWN);

#ifdef MULTIPROCESSOR
	x86_broadcast_ipi(X86_IPI_HALT);
#endif

	if ((howto & RB_HALT) != 0) {
#if NACPI > 0 && !defined(SMALL_KERNEL)
		extern int acpi_enabled;

		if (acpi_enabled) {
			delay(500000);
			if ((howto & RB_POWERDOWN) != 0)
				acpi_powerdown();
		}
#endif
		printf("\n");
		printf("The operating system has halted.\n");
		printf("Please press any key to reboot.\n\n");
		cnpollc(1);	/* for proper keyboard command handling */
		cngetc();
		cnpollc(0);
	}

doreset:
	printf("rebooting...\n");
	if (cpureset_delay > 0)
		delay(cpureset_delay * 1000);
	cpu_reset();
	for (;;)
		continue;
	/* NOTREACHED */
}

/*
 * These variables are needed by /sbin/savecore
 */
u_long	dumpmag = 0x8fca0101;	/* magic number */
int	dumpsize = 0;		/* pages */
long	dumplo = 0;		/* blocks */

/*
 * cpu_dump: dump the machine-dependent kernel core dump headers.
 */
int
cpu_dump(void)
{
	int (*dump)(dev_t, daddr_t, caddr_t, size_t);
	char buf[dbtob(1)];
	kcore_seg_t *segp;
	cpu_kcore_hdr_t *cpuhdrp;
	phys_ram_seg_t *memsegp;
	caddr_t va;
	int i;

	dump = bdevsw[major(dumpdev)].d_dump;

	memset(buf, 0, sizeof buf);
	segp = (kcore_seg_t *)buf;
	cpuhdrp = (cpu_kcore_hdr_t *)&buf[ALIGN(sizeof(*segp))];
	memsegp = (phys_ram_seg_t *)&buf[ALIGN(sizeof(*segp)) +
	    ALIGN(sizeof(*cpuhdrp))];

	/*
	 * Generate a segment header.
	 */
	CORE_SETMAGIC(*segp, KCORE_MAGIC, MID_MACHINE, CORE_CPU);
	segp->c_size = dbtob(1) - ALIGN(sizeof(*segp));

	/*
	 * Add the machine-dependent header info.
	 */
	cpuhdrp->ptdpaddr = proc0.p_addr->u_pcb.pcb_cr3;
	cpuhdrp->nmemsegs = mem_cluster_cnt;

	/*
	 * Fill in the memory segment descriptors.
	 */
	for (i = 0; i < mem_cluster_cnt; i++) {
		memsegp[i].start = mem_clusters[i].start;
		memsegp[i].size = mem_clusters[i].size & ~PAGE_MASK;
	}

	/*
	 * If we have a dump bounce buffer, assume the kernel stack is in
	 * high memory and bounce the headers through it.
	 */
	if (dumpmem_vaddr != 0) {
		memcpy((char *)dumpmem_vaddr, buf, sizeof(buf));
		va = (caddr_t)dumpmem_vaddr;
	} else {
		va = (caddr_t)buf;
	}
	return (dump(dumpdev, dumplo, va, dbtob(1)));
}

/*
 * This is called by main to set dumplo and dumpsize.
 * Dumps always skip the first PAGE_SIZE of disk space
 * in case there might be a disk label stored there.
 * If there is extra space, put dump at the end to
 * reduce the chance that swapping trashes it.
 */
void
dumpconf(void)
{
	int nblks, dumpblks;	/* size of dump area */

	if (dumpdev == NODEV ||
	    (nblks = (bdevsw[major(dumpdev)].d_psize)(dumpdev)) == 0)
		return;
	if (nblks <= ctod(1))
		return;

	dumpblks = cpu_dumpsize();
	if (dumpblks < 0)
		return;
	dumpblks += ctod(cpu_dump_mempagecnt());

	/* If dump won't fit (incl. room for possible label), punt. */
	if (dumpblks > (nblks - ctod(1)))
		return;

	/* Put dump at end of partition */
	dumplo = nblks - dumpblks;

	/* dumpsize is in page units, and doesn't include headers. */
	dumpsize = cpu_dump_mempagecnt();
}

/*
 * Doadump comes here after turning off memory management and
 * getting on the dump stack, either when called above, or by
 * the auto-restart code.
 */
#define BYTES_PER_DUMP  MAXPHYS /* must be a multiple of pagesize */

void
dumpsys(void)
{
	u_long totalbytesleft, bytes, i, n, memseg;
	u_long maddr;
	daddr_t blkno;
	void *va;
	int (*dump)(dev_t, daddr_t, caddr_t, size_t);
	int error;

	/* Save registers. */
	savectx(&dumppcb);

	if (dumpdev == NODEV)
		return;

	/*
	 * For dumps during autoconfiguration, dumpconf() may not have
	 * run yet even though the dump device is configured; run it now.
	 */
	if (dumpsize == 0)
		dumpconf();
	if (dumplo <= 0 || dumpsize == 0) {
		printf("\ndump to dev %u,%u not possible\n", major(dumpdev),
		    minor(dumpdev));
		return;
	}
	printf("\ndumping to dev %u,%u offset %ld\n", major(dumpdev),
	    minor(dumpdev), dumplo);

	error = (*bdevsw[major(dumpdev)].d_psize)(dumpdev);
	printf("dump ");
	if (error == -1) {
		printf("area unavailable\n");
		return;
	}

	if ((error = cpu_dump()) != 0)
		goto err;

	totalbytesleft = ptoa(cpu_dump_mempagecnt());
	blkno = dumplo + cpu_dumpsize();
	dump = bdevsw[major(dumpdev)].d_dump;
	error = 0;

	for (memseg = 0; memseg < mem_cluster_cnt; memseg++) {
		maddr = mem_clusters[memseg].start;
		bytes = mem_clusters[memseg].size;

		for (i = 0; i < bytes; i += n, totalbytesleft -= n) {
			/* Print out how many MBs we have left to go. */
			if ((totalbytesleft % (1024*1024)) < BYTES_PER_DUMP)
				printf("%ld ", totalbytesleft / (1024 * 1024));

			/* Limit size for next transfer. */
			n = bytes - i;
			if (n > BYTES_PER_DUMP)
				n = BYTES_PER_DUMP;
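			/*
			 * Pages above 4GB are staged through the low
			 * bounce buffer reserved in init_x86_64(), as
			 * the dump device may not be able to address
			 * memory that high.
			 */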
			if (maddr > 0xffffffff) {
				va = (void *)dumpmem_vaddr;
				if (n > dumpmem_sz)
					n = dumpmem_sz;
				memcpy(va, (void *)PMAP_DIRECT_MAP(maddr), n);
			} else {
				va = (void *)PMAP_DIRECT_MAP(maddr);
			}

			error = (*dump)(dumpdev, blkno, va, n);
			if (error)
				goto err;
			maddr += n;
			blkno += btodb(n);		/* XXX? */

#if 0	/* XXX this doesn't work.  grr. */
			/* operator aborting dump? */
			if (sget() != NULL) {
				error = EINTR;
				break;
			}
#endif
		}
	}

 err:
	switch (error) {

	case ENXIO:
		printf("device bad\n");
		break;

	case EFAULT:
		printf("device not ready\n");
		break;

	case EINVAL:
		printf("area improper\n");
		break;

	case EIO:
		printf("i/o error\n");
		break;

	case EINTR:
		printf("aborted from console\n");
		break;

	case 0:
		printf("succeeded\n");
		break;

	default:
		printf("error %d\n", error);
		break;
	}
	printf("\n\n");
	delay(5000000);		/* 5 seconds */
}

/*
 * Force the userspace FS.base to be reloaded from the PCB on return from
 * the kernel, and reset the segment registers (%ds, %es, %fs, and %gs)
 * to their expected userspace value.
 */
void
reset_segs(void)
{
	/*
	 * This operates like the cpu_switchto() sequence: if we
	 * haven't reset %[defg]s already, do so now.
	 */
	if (curcpu()->ci_pflags & CPUPF_USERSEGS) {
		curcpu()->ci_pflags &= ~CPUPF_USERSEGS;
		__asm volatile(
		    "movw %%ax,%%ds\n\t"
		    "movw %%ax,%%es\n\t"
		    "movw %%ax,%%fs\n\t"
		    "cli\n\t"		/* block intr when on user GS.base */
		    "swapgs\n\t"	/* swap from kernel to user GS.base */
		    "movw %%ax,%%gs\n\t"/* set %gs to UDATA and GS.base to 0 */
		    "swapgs\n\t"	/* back to kernel GS.base */
		    "sti" : : "a"(GSEL(GUDATA_SEL, SEL_UPL)));
	}
}

/*
 * Clear registers on exec
 */
void
setregs(struct proc *p, struct exec_package *pack, u_long stack,
    struct ps_strings *arginfo)
{
	struct trapframe *tf;

	initialize_thread_xstate(p);

	/* To reset all registers we have to return via iretq */
	p->p_md.md_flags |= MDP_IRET;

	reset_segs();
	p->p_addr->u_pcb.pcb_fsbase = 0;

	tf = p->p_md.md_regs;
	memset(tf, 0, sizeof *tf);
	tf->tf_rip = pack->ep_entry;
	tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
	tf->tf_rflags = PSL_USERSET;
	tf->tf_rsp = stack;
	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
}

/*
 * Initialize segments and descriptor tables
 */

struct gate_descriptor *idt;
char idt_allocmap[NIDT];
struct user *proc0paddr = NULL;

void
setgate(struct gate_descriptor *gd, void *func, int ist, int type, int dpl,
    int sel)
{
	gd->gd_looffset = (u_int64_t)func & 0xffff;
	gd->gd_selector = sel;
	gd->gd_ist = ist;
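	/* a nonzero ist selects a known-good TSS stack (used for NMI, #DF) */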
	gd->gd_type = type;
	gd->gd_dpl = dpl;
	gd->gd_p = 1;
	gd->gd_hioffset = (u_int64_t)func >> 16;
	gd->gd_zero = 0;
	gd->gd_xx1 = 0;
	gd->gd_xx2 = 0;
	gd->gd_xx3 = 0;
}

void
unsetgate(struct gate_descriptor *gd)
{
	memset(gd, 0, sizeof (*gd));
}

void
setregion(struct region_descriptor *rd, void *base, u_int16_t limit)
{
	rd->rd_limit = limit;
	rd->rd_base = (u_int64_t)base;
}

/*
 * Note that the base and limit fields are ignored in long mode.
 */
void
set_mem_segment(struct mem_segment_descriptor *sd, void *base, size_t limit,
    int type, int dpl, int gran, int def32, int is64)
{
	sd->sd_lolimit = (unsigned)limit;
	sd->sd_lobase = (unsigned long)base;
	sd->sd_type = type;
	sd->sd_dpl = dpl;
	sd->sd_p = 1;
	sd->sd_hilimit = (unsigned)limit >> 16;
	sd->sd_avl = 0;
	sd->sd_long = is64;
	sd->sd_def32 = def32;
	sd->sd_gran = gran;
	sd->sd_hibase = (unsigned long)base >> 24;
}

void
set_sys_segment(struct sys_segment_descriptor *sd, void *base, size_t limit,
    int type, int dpl, int gran)
{
	memset(sd, 0, sizeof *sd);
	sd->sd_lolimit = (unsigned)limit;
	sd->sd_lobase = (u_int64_t)base;
	sd->sd_type = type;
	sd->sd_dpl = dpl;
	sd->sd_p = 1;
	sd->sd_hilimit = (unsigned)limit >> 16;
	sd->sd_gran = gran;
	sd->sd_hibase = (u_int64_t)base >> 24;
}

void
cpu_init_idt(void)
{
	struct region_descriptor region;

	setregion(&region, idt, NIDT * sizeof(idt[0]) - 1);
	lidt(&region);
}

void
cpu_init_extents(void)
{
	extern struct extent *iomem_ex;
	static int already_done;
	int i;

	/* We get called for each CPU, only first should do this */
	if (already_done)
		return;

	/*
	 * Allocate the physical addresses used by RAM from the iomem
	 * extent map.
	 */
	for (i = 0; i < mem_cluster_cnt; i++) {
		if (extent_alloc_region(iomem_ex, mem_clusters[i].start,
		    mem_clusters[i].size, EX_NOWAIT)) {
			/* XXX What should we do? */
			printf("WARNING: CAN'T ALLOCATE RAM (%llx-%llx)"
			    " FROM IOMEM EXTENT MAP!\n", mem_clusters[i].start,
			    mem_clusters[i].start + mem_clusters[i].size - 1);
		}
	}

	already_done = 1;
}

void
map_tramps(void)
{
#if defined(MULTIPROCESSOR) || \
    (NACPI > 0 && !defined(SMALL_KERNEL))
	struct pmap *kmp = pmap_kernel();
	extern paddr_t tramp_pdirpa;
#ifdef MULTIPROCESSOR
	extern u_char cpu_spinup_trampoline[];
	extern u_char cpu_spinup_trampoline_end[];
	extern u_char mp_tramp_data_start[];
	extern u_char mp_tramp_data_end[];
	extern u_int32_t mp_pdirpa;
#endif

	/*
	 * The initial PML4 pointer must be below 4G, so if the
	 * current one isn't, use a "bounce buffer" and save it
	 * for tramps to use.
	 */
	if (kmp->pm_pdirpa > 0xffffffff) {
		pmap_kenter_pa(lo32_vaddr, lo32_paddr, PROT_READ | PROT_WRITE);
		memcpy((void *)lo32_vaddr, kmp->pm_pdir, PAGE_SIZE);
		tramp_pdirpa = lo32_paddr;
		pmap_kremove(lo32_vaddr, PAGE_SIZE);
	} else
		tramp_pdirpa = kmp->pm_pdirpa;

#ifdef MULTIPROCESSOR
	/* Map MP tramp code and data pages RW for copy */
	pmap_kenter_pa(MP_TRAMPOLINE, MP_TRAMPOLINE,
	    PROT_READ | PROT_WRITE);

	pmap_kenter_pa(MP_TRAMP_DATA, MP_TRAMP_DATA,
	    PROT_READ | PROT_WRITE);

	memset((caddr_t)MP_TRAMPOLINE, 0xcc, PAGE_SIZE);
	memset((caddr_t)MP_TRAMP_DATA, 0xcc, PAGE_SIZE);

	memcpy((caddr_t)MP_TRAMPOLINE,
	    cpu_spinup_trampoline,
	    cpu_spinup_trampoline_end - cpu_spinup_trampoline);

	memcpy((caddr_t)MP_TRAMP_DATA,
	    mp_tramp_data_start,
	    mp_tramp_data_end - mp_tramp_data_start);

	/*
	 * We need to patch this after we copy the tramp data,
	 * the symbol points into the copied tramp data page.
	 */
	mp_pdirpa = tramp_pdirpa;

	/* Unmap, will be remapped in cpu_start_secondary */
	pmap_kremove(MP_TRAMPOLINE, PAGE_SIZE);
	pmap_kremove(MP_TRAMP_DATA, PAGE_SIZE);
#endif /* MULTIPROCESSOR */
#endif
}

void
cpu_set_vendor(struct cpu_info *ci, int level, const char *vendor)
{
	ci->ci_cpuid_level = level;
	cpuid_level = MIN(cpuid_level, level);

	/* map the vendor string to an integer */
	if (strcmp(vendor, "AuthenticAMD") == 0)
		ci->ci_vendor = CPUV_AMD;
	else if (strcmp(vendor, "GenuineIntel") == 0)
		ci->ci_vendor = CPUV_INTEL;
	else if (strcmp(vendor, "CentaurHauls") == 0)
		ci->ci_vendor = CPUV_VIA;
	else
		ci->ci_vendor = CPUV_UNKNOWN;
}

#define	IDTVEC(name)	__CONCAT(X, name)
typedef void (vector)(void);
extern vector *IDTVEC(exceptions)[];

paddr_t early_pte_pages;

void
init_x86_64(paddr_t first_avail)
{
	struct region_descriptor region;
	bios_memmap_t *bmp;
	int x, ist;
	uint64_t max_dm_size = ((uint64_t)512 * NUM_L4_SLOT_DIRECT) << 30;

	/*
	 * locore0 mapped 3 pages for use before the pmap is initialized
	 * starting at first_avail. These pages are currently used by
	 * efifb to create early-use VAs for the framebuffer before efifb
	 * is attached.
	 */
	early_pte_pages = first_avail;
	first_avail += 3 * NBPG;

	cpu_set_vendor(&cpu_info_primary, cpuid_level, cpu_vendor);
	cpu_init_msrs(&cpu_info_primary);

	proc0.p_addr = proc0paddr;
	cpu_info_primary.ci_curpcb = &proc0.p_addr->u_pcb;

	x86_bus_space_init();

	i8254_startclock();

	/*
	 * Initialize PAGE_SIZE-dependent variables.
	 */
	uvm_setpagesize();

	/*
	 * Boot arguments are in a single page specified by /boot.
	 *
	 * We require the "new" vector form, as well as memory ranges
	 * to be given in bytes rather than KB.
	 *
	 * locore copies the data into bootinfo[] for us.
	 */
	if ((bootapiver & (BAPIV_VECTOR | BAPIV_BMEMMAP)) ==
	    (BAPIV_VECTOR | BAPIV_BMEMMAP)) {
		if (bootinfo_size >= sizeof(bootinfo))
			panic("boot args too big");

		getbootinfo(bootinfo, bootinfo_size);
	} else
		panic("invalid /boot");

	cninit();

/*
 * Memory on the AMD64 port is described by three different things.
 *
 * 1. biosbasemem - This is outdated, and should really only be used to
 *    sanitize the other values. This is what we get back from the BIOS
 *    using the legacy routines, describing memory below 640KB.
 *
 * 2. bios_memmap[] - This is the memory map as the bios has returned
 *    it to us.  It includes memory the kernel occupies, etc.
 *
 * 3. mem_cluster[] - This is the massaged free memory segments after
 *    taking into account the contents of bios_memmap, biosbasemem,
 *    and locore/machdep/pmap kernel allocations of physical
 *    pages.
 *
 * The other thing is that the physical page *RANGE* is described by
 * three more variables:
 *
 * avail_start - This is a physical address of the start of available
 *               pages, until IOM_BEGIN.  This is basically the start
 *               of the UVM managed range of memory, with some holes...
 *
 * avail_end - This is the end of physical pages.  All physical pages
 *             that UVM manages are between avail_start and avail_end.
 *             There are holes...
 *
 * first_avail - This is the first available physical page after the
 *               kernel, page tables, etc.
 *
 * We skip the first few pages for trampolines, hibernate, and to avoid
 * buggy SMI implementations that could corrupt the first 64KB.
 */
	avail_start = 16*PAGE_SIZE;

#ifdef MULTIPROCESSOR
	if (avail_start < MP_TRAMPOLINE + PAGE_SIZE)
		avail_start = MP_TRAMPOLINE + PAGE_SIZE;
	if (avail_start < MP_TRAMP_DATA + PAGE_SIZE)
		avail_start = MP_TRAMP_DATA + PAGE_SIZE;
#endif

#if (NACPI > 0 && !defined(SMALL_KERNEL))
	if (avail_start < ACPI_TRAMPOLINE + PAGE_SIZE)
		avail_start = ACPI_TRAMPOLINE + PAGE_SIZE;
	if (avail_start < ACPI_TRAMP_DATA + PAGE_SIZE)
		avail_start = ACPI_TRAMP_DATA + PAGE_SIZE;
#endif

#ifdef HIBERNATE
	if (avail_start < HIBERNATE_HIBALLOC_PAGE + PAGE_SIZE)
		avail_start = HIBERNATE_HIBALLOC_PAGE + PAGE_SIZE;
#endif /* HIBERNATE */

	/*
	 * We need to go through the BIOS memory map given, and
	 * fill out mem_clusters and mem_cluster_cnt stuff, taking
	 * into account all the points listed above.
	 */
	avail_end = mem_cluster_cnt = 0;
	for (bmp = bios_memmap; bmp->type != BIOS_MAP_END; bmp++) {
		paddr_t s1, s2, e1, e2;

		/* Ignore non-free memory */
		if (bmp->type != BIOS_MAP_FREE)
			continue;
		if (bmp->size < PAGE_SIZE)
			continue;

		/* Init our segment(s), round/trunc to pages */
		s1 = round_page(bmp->addr);
		e1 = trunc_page(bmp->addr + bmp->size);
		s2 = e2 = 0;

		/*
		 * XXX Some buggy ACPI BIOSes use memory that they
		 * declare as free.  Current worst offender is
		 * Supermicro 5019D-FTN4.  Typically the affected memory
		 * areas are small blocks between areas reserved for
		 * ACPI and other BIOS goo.  So skip areas smaller
		 * than 32 MB above the 16 MB boundary (to avoid
		 * affecting legacy stuff).
		 */
		if (s1 > 16*1024*1024 && (e1 - s1) < 32*1024*1024)
			continue;

		/* Check and adjust our segment(s) */
		/* Nuke low pages */
		if (s1 < avail_start) {
			s1 = avail_start;
			if (s1 > e1)
				continue;
		}

		/*
		 * The direct map is limited to 512GB * NUM_L4_SLOT_DIRECT of
		 * memory, so discard anything above that.
		 */
		if (e1 >= max_dm_size) {
			e1 = max_dm_size;
			if (s1 > e1)
				continue;
		}

		/* Crop stuff into "640K hole" */
		if (s1 < IOM_BEGIN && e1 > IOM_BEGIN)
			e1 = IOM_BEGIN;
		if (s1 < biosbasemem && e1 > biosbasemem)
			e1 = biosbasemem;

		/* Split any segments straddling the 16MB boundary */
		if (s1 < 16*1024*1024 && e1 > 16*1024*1024) {
			e2 = e1;
			s2 = e1 = 16*1024*1024;
		}
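		/* (16MB is the ISA DMA limit; cf. isa_constraint above) */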
1589 
1590 		/* Store segment(s) */
1591 		if (e1 - s1 >= PAGE_SIZE) {
1592 			mem_clusters[mem_cluster_cnt].start = s1;
1593 			mem_clusters[mem_cluster_cnt].size = e1 - s1;
1594 			mem_cluster_cnt++;
1595 		}
1596 		if (e2 - s2 >= PAGE_SIZE) {
1597 			mem_clusters[mem_cluster_cnt].start = s2;
1598 			mem_clusters[mem_cluster_cnt].size = e2 - s2;
1599 			mem_cluster_cnt++;
1600 		}
1601 		if (avail_end < e1) avail_end = e1;
1602 		if (avail_end < e2) avail_end = e2;
1603 	}
1604 
1605 	/*
1606 	 * Call pmap initialization to make new kernel address space.
1607 	 * We must do this before loading pages into the VM system.
1608 	 */
1609 	first_avail = pmap_bootstrap(first_avail, trunc_page(avail_end));
1610 
1611 #if NEFI > 0
1612 	/* Relocate the EFI memory map. */
1613 	if (bios_efiinfo && bios_efiinfo->mmap_start) {
1614 		mmap = (EFI_MEMORY_DESCRIPTOR *)PMAP_DIRECT_MAP(first_avail);
1615 		memcpy(mmap, (void *)PMAP_DIRECT_MAP(bios_efiinfo->mmap_start),
1616 		    bios_efiinfo->mmap_size);
1617 		first_avail += round_page(bios_efiinfo->mmap_size);
1618 	}
1619 #endif
1620 
1621 	/* Allocate these out of the 640KB base memory */
1622 	if (avail_start != PAGE_SIZE)
1623 		avail_start = pmap_prealloc_lowmem_ptps(avail_start);
1624 
1625 	cpu_init_extents();
1626 
1627 	/* Make sure the end of the space used by the kernel is rounded. */
1628 	first_avail = round_page(first_avail);
1629 	kern_end = KERNBASE + first_avail;
1630 
1631 	/*
1632 	 * Now, load the memory clusters (which have already been
1633 	 * flensed) into the VM system.
1634 	 */
1635 	for (x = 0; x < mem_cluster_cnt; x++) {
1636 		paddr_t seg_start = mem_clusters[x].start;
1637 		paddr_t seg_end = seg_start + mem_clusters[x].size;
1638 
1639 		if (seg_start < first_avail) seg_start = first_avail;
1640 		if (seg_start > seg_end) continue;
1641 		if (seg_end - seg_start < PAGE_SIZE) continue;
1642 
1643 		physmem += atop(mem_clusters[x].size);
1644 
1645 #if DEBUG_MEMLOAD
1646 		printf("loading 0x%lx-0x%lx (0x%lx-0x%lx)\n",
1647 		    seg_start, seg_end, atop(seg_start), atop(seg_end));
1648 #endif
1649 		uvm_page_physload(atop(seg_start), atop(seg_end),
1650 		    atop(seg_start), atop(seg_end), 0);
1651 	}
1652 
1653 	/*
1654          * Now, load the memory between the end of I/O memory "hole"
1655          * and the kernel.
1656 	 */
1657 	{
1658 		paddr_t seg_start = round_page(IOM_END);
1659 		paddr_t seg_end = trunc_page(KERNTEXTOFF - KERNBASE);
1660 
1661 		if (seg_start < seg_end) {
1662 #if DEBUG_MEMLOAD
1663 			printf("loading 0x%lx-0x%lx\n", seg_start, seg_end);
1664 #endif
1665 			uvm_page_physload(atop(seg_start), atop(seg_end),
1666 			    atop(seg_start), atop(seg_end), 0);
1667 		}
1668 	}
1669 
1670 #if DEBUG_MEMLOAD
1671 	printf("avail_start = 0x%lx\n", avail_start);
1672 	printf("avail_end = 0x%lx\n", avail_end);
1673 	printf("first_avail = 0x%lx\n", first_avail);
1674 #endif
1675 
1676 	/*
1677 	 * Steal memory for the message buffer (at end of core).
1678 	 */
1679 	{
1680 		struct vm_physseg *vps = NULL;
1681 		psize_t sz = round_page(MSGBUFSIZE);
1682 		psize_t reqsz = sz;
1683 
1684 		for (x = 0; x < vm_nphysseg; x++) {
1685 			vps = &vm_physmem[x];
1686 			if (ptoa(vps->avail_end) == avail_end)
1687 				break;
1688 		}
1689 		if (x == vm_nphysseg)
1690 			panic("init_x86_64: can't find end of memory");
1691 
1692 		/* Shrink so it'll fit in the last segment. */
1693 		if ((vps->avail_end - vps->avail_start) < atop(sz))
1694 			sz = ptoa(vps->avail_end - vps->avail_start);
1695 
1696 		vps->avail_end -= atop(sz);
1697 		vps->end -= atop(sz);
1698 		msgbuf_paddr = ptoa(vps->avail_end);
1699 
1700 		/* Remove the last segment if it now has no pages. */
1701 		if (vps->start == vps->end) {
1702 			for (vm_nphysseg--; x < vm_nphysseg; x++)
1703 				vm_physmem[x] = vm_physmem[x + 1];
1704 		}
1705 
1706 		/* Now find where the new avail_end is. */
1707 		for (avail_end = 0, x = 0; x < vm_nphysseg; x++)
1708 			if (vm_physmem[x].avail_end > avail_end)
1709 				avail_end = vm_physmem[x].avail_end;
1710 		avail_end = ptoa(avail_end);
1711 
1712 		/* Warn if the message buffer had to be shrunk. */
1713 		if (sz != reqsz)
1714 			printf("WARNING: %ld bytes not available for msgbuf "
1715 			    "in last cluster (%ld used)\n", reqsz, sz);
1716 	}
1717 
1718 	/*
1719 	 * Steal some memory for a dump bouncebuffer if we have memory over
1720 	 * the 32-bit barrier.
1721 	 */
1722 	if (avail_end > 0xffffffff) {
1723 		struct vm_physseg *vps = NULL;
1724 		psize_t sz = round_page(MAX(BYTES_PER_DUMP, dbtob(1)));
1725 
1726 		/* XXX assumes segments are ordered */
1727 		for (x = 0; x < vm_nphysseg; x++) {
1728 			vps = &vm_physmem[x];
1729 			/* Find something between 16meg and 4gig */
1730 			if (ptoa(vps->avail_end) <= 0xffffffff &&
1731 			    ptoa(vps->avail_start) >= 0xffffff)
1732 				break;
1733 		}
1734 		if (x == vm_nphysseg)
1735 			panic("init_x86_64: no memory between "
1736 			    "0xffffff-0xffffffff");
1737 
1738 		/* Shrink so it'll fit in the segment. */
1739 		if ((vps->avail_end - vps->avail_start) < atop(sz))
1740 			sz = ptoa(vps->avail_end - vps->avail_start);
1741 
1742 		vps->avail_end -= atop(sz);
1743 		vps->end -= atop(sz);
1744 		dumpmem_paddr = ptoa(vps->avail_end);
1745 		dumpmem_vaddr = PMAP_DIRECT_MAP(dumpmem_paddr);
1746 		dumpmem_sz = sz;
1747 
1748 		/* Remove the last segment if it now has no pages. */
1749 		if (vps->start == vps->end) {
1750 			for (vm_nphysseg--; x < vm_nphysseg; x++)
1751 				vm_physmem[x] = vm_physmem[x + 1];
1752 		}
1753 	}
1754 
1755 	pmap_growkernel(VM_MIN_KERNEL_ADDRESS + 32 * 1024 * 1024);
1756 
1757 	pmap_kenter_pa(idt_vaddr, idt_paddr, PROT_READ | PROT_WRITE);
1758 
1759 	idt = (struct gate_descriptor *)idt_vaddr;
1760 	cpu_info_primary.ci_tss = &cpu_info_full_primary.cif_tss;
1761 	cpu_info_primary.ci_gdt = &cpu_info_full_primary.cif_gdt;
1762 
1763 	/* make gdt gates and memory segments */
1764 	set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GCODE_SEL), 0,
1765 	    0xfffff, SDT_MEMERA, SEL_KPL, 1, 0, 1);
1766 
1767 	set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GDATA_SEL), 0,
1768 	    0xfffff, SDT_MEMRWA, SEL_KPL, 1, 0, 1);
1769 
1770 	set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GUDATA_SEL), 0,
1771 	    atop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMRWA, SEL_UPL, 1, 0, 1);
1772 
1773 	set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GUCODE_SEL), 0,
1774 	    atop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMERA, SEL_UPL, 1, 0, 1);
1775 
1776 	set_sys_segment(GDT_ADDR_SYS(cpu_info_primary.ci_gdt, GPROC0_SEL),
1777 	    cpu_info_primary.ci_tss, sizeof (struct x86_64_tss)-1,
1778 	    SDT_SYS386TSS, SEL_KPL, 0);
1779 
1780 	/* exceptions */
1781 	for (x = 0; x < 32; x++) {
1782 		/* trap2 == NMI, trap8 == double fault */
1783 		ist = (x == 2) ? 2 : (x == 8) ? 1 : 0;
1784 		setgate(&idt[x], IDTVEC(exceptions)[x], ist, SDT_SYS386IGT,
1785 		    (x == 3) ? SEL_UPL : SEL_KPL,
1786 		    GSEL(GCODE_SEL, SEL_KPL));
1787 		idt_allocmap[x] = 1;
1788 	}
1789 
1790 	setregion(&region, cpu_info_primary.ci_gdt, GDT_SIZE - 1);
1791 	lgdt(&region);
1792 
1793 	cpu_init_idt();
1794 
1795 	intr_default_setup();
1796 
1797 	fpuinit(&cpu_info_primary);
1798 
1799 	softintr_init();
1800 	splraise(IPL_IPI);
1801 	intr_enable();
1802 
1803 #ifdef DDB
1804 	db_machine_init();
1805 	ddb_init();
1806 	if (boothowto & RB_KDB)
1807 		db_enter();
1808 #endif
1809 }
1810 
1811 void
cpu_reset(void)1812 cpu_reset(void)
1813 {
1814 	intr_disable();
1815 
1816 	if (cpuresetfn)
1817 		(*cpuresetfn)();
1818 
1819 	/*
1820 	 * The keyboard controller has 4 random output pins, one of which is
1821 	 * connected to the RESET pin on the CPU in many PCs.  We tell the
1822 	 * keyboard controller to pulse this line a couple of times.
1823 	 */
1824 	outb(IO_KBD + KBCMDP, KBC_PULSE0);
1825 	delay(100000);
1826 	outb(IO_KBD + KBCMDP, KBC_PULSE0);
1827 	delay(100000);
1828 
1829 	/*
1830 	 * Try to cause a triple fault and watchdog reset by making the IDT
1831 	 * invalid and causing a fault.
1832 	 */
1833 	memset((caddr_t)idt, 0, NIDT * sizeof(idt[0]));
1834 	__asm volatile("divl %0,%1" : : "q" (0), "a" (0));
1835 
1836 	for (;;)
1837 		continue;
1838 	/* NOTREACHED */
1839 }
1840 
1841 /*
1842  * cpu_dumpsize: calculate size of machine-dependent kernel core dump headers.
1843  */
1844 int
1845 cpu_dumpsize(void)
1846 {
1847 	int size;
1848 
1849 	size = ALIGN(sizeof(kcore_seg_t)) +
1850 	    ALIGN(mem_cluster_cnt * sizeof(phys_ram_seg_t));
1851 	if (roundup(size, dbtob(1)) != dbtob(1))
1852 		return (-1);
1853 
1854 	return (1);
1855 }
1856 
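/*
 * Editor's note: a standalone sketch of the check above, using
 * made-up struct sizes and assuming dbtob(1) is 512 bytes: the
 * machine-dependent dump headers must round up to exactly one
 * disk block, otherwise cpu_dumpsize() reports failure.
 */
#include <stdio.h>

#define DEV_BSIZE	512			/* assumed dbtob(1) */
#define ALIGN(x)	(((x) + sizeof(long) - 1) & ~(sizeof(long) - 1))
#define roundup(x, y)	((((x) + ((y) - 1)) / (y)) * (y))

int
main(void)
{
	size_t hdr = ALIGN(24);		/* stand-in kcore_seg_t */
	size_t clusters = 4;		/* stand-in mem_cluster_cnt */
	size_t size = hdr + ALIGN(clusters * 16);

	if (roundup(size, DEV_BSIZE) != DEV_BSIZE)
		printf("%zu-byte headers overflow one block\n", size);
	else
		printf("%zu-byte headers fit one %d-byte block\n",
		    size, DEV_BSIZE);
	return 0;
}
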
1857 /*
1858  * cpu_dump_mempagecnt: calculate the size of RAM (in pages) to be dumped.
1859  */
1860 u_long
1861 cpu_dump_mempagecnt(void)
1862 {
1863 	u_long i, n;
1864 
1865 	n = 0;
1866 	for (i = 0; i < mem_cluster_cnt; i++)
1867 		n += atop(mem_clusters[i].size);
1868 	return (n);
1869 }
1870 
1871 /*
1872  * Figure out which portions of memory are used by the kernel/system.
1873  */
1874 int
1875 amd64_pa_used(paddr_t addr)
1876 {
1877 	struct vm_page	*pg;
1878 
1879 	/* Kernel manages these */
1880 	if ((pg = PHYS_TO_VM_PAGE(addr)) && (pg->pg_flags & PG_DEV) == 0)
1881 		return 1;
1882 
1883 	/* Kernel is loaded here */
1884 	if (addr > IOM_END && addr < (kern_end - KERNBASE))
1885 		return 1;
1886 
1887 	/* Low memory used for various bootstrap things */
1888 	if (addr < avail_start)
1889 		return 1;
1890 
1891 	/*
1892 	 * The only regions I can think of that are left are the things
1893 	 * we steal away from UVM.  The message buffer?
1894 	 * XXX - ignore these for now.
1895 	 */
1896 
1897 	return 0;
1898 }
1899 
1900 void
1901 cpu_initclocks(void)
1902 {
1903 	(*initclock_func)();
1904 }
1905 
1906 void
1907 cpu_startclock(void)
1908 {
1909 	(*startclock_func)();
1910 }
1911 
1912 void
1913 need_resched(struct cpu_info *ci)
1914 {
1915 	ci->ci_want_resched = 1;
1916 
1917 	/* There's a risk we'll be called before the idle threads start */
1918 	if (ci->ci_curproc) {
1919 		aston(ci->ci_curproc);
1920 		cpu_kick(ci);
1921 	}
1922 }
1923 
1924 /*
1925  * Allocate an IDT vector slot within the given range.
1926  * XXX needs locking to avoid MP allocation races.
1927  */
1928 
1929 int
1930 idt_vec_alloc(int low, int high)
1931 {
1932 	int vec;
1933 
1934 	for (vec = low; vec <= high; vec++) {
1935 		if (idt_allocmap[vec] == 0) {
1936 			idt_allocmap[vec] = 1;
1937 			return vec;
1938 		}
1939 	}
1940 	return 0;
1941 }
1942 
1943 int
1944 idt_vec_alloc_range(int low, int high, int num)
1945 {
1946 	int i, vec;
1947 
1948 	KASSERT(powerof2(num));
1949 	low = (low + num - 1) & ~(num - 1);
1950 	high = ((high + 1) & ~(num - 1)) - 1;
1951 
1952 	for (vec = low; vec <= high; vec += num) {
1953 		for (i = 0; i < num; i++) {
1954 			if (idt_allocmap[vec + i] != 0)
1955 				break;
1956 		}
1957 		if (i == num) {
1958 			for (i = 0; i < num; i++)
1959 				idt_allocmap[vec + i] = 1;
1960 			return vec;
1961 		}
1962 	}
1963 	return 0;
1964 }
1965 
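/*
 * Editor's note: a quick standalone demonstration of the two
 * alignment lines above.  With num a power of two (the KASSERT),
 * low is rounded up and high rounded down so every candidate
 * block of num vectors starts on a num-aligned boundary.
 */
#include <stdio.h>

int
main(void)
{
	int low = 50, high = 120, num = 16;

	low = (low + num - 1) & ~(num - 1);	/* 50 -> 64 */
	high = ((high + 1) & ~(num - 1)) - 1;	/* 120 -> 111 */
	printf("aligned search window: %d..%d, step %d\n", low, high, num);
	return 0;
}
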
1966 void
1967 idt_vec_set(int vec, void (*function)(void))
1968 {
1969 	/*
1970 	 * Vector should be allocated, so no locking needed.
1971 	 */
1972 	KASSERT(idt_allocmap[vec] == 1);
1973 	setgate(&idt[vec], function, 0, SDT_SYS386IGT, SEL_KPL,
1974 	    GSEL(GCODE_SEL, SEL_KPL));
1975 }
1976 
1977 void
1978 idt_vec_free(int vec)
1979 {
1980 	unsetgate(&idt[vec]);
1981 	idt_allocmap[vec] = 0;
1982 }
1983 
1984 #ifdef DIAGNOSTIC
1985 void
1986 splassert_check(int wantipl, const char *func)
1987 {
1988 	int cpl = curcpu()->ci_ilevel;
1989 	int floor = curcpu()->ci_handled_intr_level;
1990 
1991 	if (cpl < wantipl) {
1992 		splassert_fail(wantipl, cpl, func);
1993 	}
1994 	if (floor > wantipl) {
1995 		splassert_fail(wantipl, floor, func);
1996 	}
1997 
1998 }
1999 #endif
2000 
2001 int
2002 copyin32(const uint32_t *uaddr, uint32_t *kaddr)
2003 {
2004 	if ((vaddr_t)uaddr & 0x3)
2005 		return EFAULT;
2006 
2007 	/* copyin(9) is atomic */
2008 	return copyin(uaddr, kaddr, sizeof(uint32_t));
2009 }
2010 
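/*
 * Editor's note: a minimal illustration of the alignment test above.
 * A naturally aligned 32-bit access cannot straddle a page boundary
 * and is performed as a single load, which is why misaligned user
 * pointers are rejected with EFAULT before the copy is attempted.
 */
#include <stdio.h>
#include <stdint.h>

static int
aligned4(uintptr_t addr)
{
	return (addr & 0x3) == 0;	/* same test as copyin32() */
}

int
main(void)
{
	printf("0x1000: %s\n", aligned4(0x1000) ? "ok" : "EFAULT");
	printf("0x1002: %s\n", aligned4(0x1002) ? "ok" : "EFAULT");
	return 0;
}
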
2011 void
2012 getbootinfo(char *bootinfo, int bootinfo_size)
2013 {
2014 	bootarg32_t *q;
2015 	bios_ddb_t *bios_ddb;
2016 	bios_bootduid_t *bios_bootduid;
2017 	bios_bootsr_t *bios_bootsr;
2018 #undef BOOTINFO_DEBUG
2019 #ifdef BOOTINFO_DEBUG
2020 	printf("bootargv:");
2021 #endif
2022 
2023 	for (q = (bootarg32_t *)bootinfo;
2024 	    (q->ba_type != BOOTARG_END) &&
2025 	    ((((char *)q) - bootinfo) < bootinfo_size);
2026 	    q = (bootarg32_t *)(((char *)q) + q->ba_size)) {
2027 
2028 		switch (q->ba_type) {
2029 		case BOOTARG_MEMMAP:
2030 			bios_memmap = (bios_memmap_t *)q->ba_arg;
2031 #ifdef BOOTINFO_DEBUG
2032 			printf(" memmap %p", bios_memmap);
2033 #endif
2034 			break;
2035 		case BOOTARG_DISKINFO:
2036 			bios_diskinfo = (bios_diskinfo_t *)q->ba_arg;
2037 #ifdef BOOTINFO_DEBUG
2038 			printf(" diskinfo %p", bios_diskinfo);
2039 #endif
2040 			break;
2041 		case BOOTARG_APMINFO:
2042 			/* generated by i386 boot loader */
2043 			break;
2044 		case BOOTARG_CKSUMLEN:
2045 			bios_cksumlen = *(u_int32_t *)q->ba_arg;
2046 #ifdef BOOTINFO_DEBUG
2047 			printf(" cksumlen %d", bios_cksumlen);
2048 #endif
2049 			break;
2050 		case BOOTARG_PCIINFO:
2051 			/* generated by i386 boot loader */
2052 			break;
2053 		case BOOTARG_CONSDEV: {
2054 #if NCOM > 0
2055 			bios_consdev_t *cdp = (bios_consdev_t*)q->ba_arg;
2056 			static const int ports[] =
2057 			    { 0x3f8, 0x2f8, 0x3e8, 0x2e8 };
2058 			int unit = minor(cdp->consdev);
2059 			uint64_t consaddr = cdp->consaddr;
2060 			if (consaddr == -1 && unit >= 0 && unit < nitems(ports))
2061 				consaddr = ports[unit];
2062 			if (major(cdp->consdev) == 8 && consaddr != -1) {
2063 				comconsunit = unit;
2064 				comconsaddr = consaddr;
2065 				comconsrate = cdp->conspeed;
2066 				comconsfreq = cdp->consfreq;
2067 				comcons_reg_width = cdp->reg_width;
2068 				comcons_reg_shift = cdp->reg_shift;
2069 				if (cdp->flags & BCD_MMIO)
2070 					comconsiot = X86_BUS_SPACE_MEM;
2071 				else
2072 					comconsiot = X86_BUS_SPACE_IO;
2073 			}
2074 #endif
2075 #ifdef BOOTINFO_DEBUG
2076 			printf(" console 0x%x:%d", cdp->consdev, cdp->conspeed);
2077 #endif
2078 			break;
2079 		}
2080 		case BOOTARG_BOOTMAC:
2081 			bios_bootmac = (bios_bootmac_t *)q->ba_arg;
2082 			break;
2083 
2084 		case BOOTARG_DDB:
2085 			bios_ddb = (bios_ddb_t *)q->ba_arg;
2086 #ifdef DDB
2087 			db_console = bios_ddb->db_console;
2088 #endif
2089 			break;
2090 
2091 		case BOOTARG_BOOTDUID:
2092 			bios_bootduid = (bios_bootduid_t *)q->ba_arg;
2093 			memcpy(bootduid, bios_bootduid, sizeof(bootduid));
2094 			break;
2095 
2096 		case BOOTARG_BOOTSR:
2097 			bios_bootsr = (bios_bootsr_t *)q->ba_arg;
2098 #if NSOFTRAID > 0
2099 			memcpy(&sr_bootuuid, &bios_bootsr->uuid,
2100 			    sizeof(sr_bootuuid));
2101 			memcpy(&sr_bootkey, &bios_bootsr->maskkey,
2102 			    sizeof(sr_bootkey));
2103 #endif
2104 			explicit_bzero(bios_bootsr, sizeof(bios_bootsr_t));
2105 			break;
2106 
2107 		case BOOTARG_EFIINFO:
2108 			bios_efiinfo = (bios_efiinfo_t *)q->ba_arg;
2109 			break;
2110 
2111 		case BOOTARG_UCODE:
2112 			bios_ucode = (bios_ucode_t *)q->ba_arg;
2113 			break;
2114 
2115 		default:
2116 #ifdef BOOTINFO_DEBUG
2117 			printf(" unsupported arg (%d) %p", q->ba_type,
2118 			    q->ba_arg);
2119 #endif
2120 			break;
2121 		}
2122 	}
2123 #ifdef BOOTINFO_DEBUG
2124 	printf("\n");
2125 #endif
2126 }
2127 
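/*
 * Editor's note: a self-contained model (stand-in types and values,
 * not the real bootarg32_t) of the walk in getbootinfo() above.
 * Each record carries its own size, so the loop advances by ba_size
 * until it hits the end sentinel or runs out of buffer.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define DEMO_END	-1		/* stand-in for BOOTARG_END */

struct demo_bootarg {
	int32_t	ba_type;
	int32_t	ba_size;		/* size of the whole record */
	/* payload follows */
};

int
main(void)
{
	char buf[64];
	struct demo_bootarg *q;
	struct demo_bootarg a = { 7, (int32_t)(sizeof(a) + 8) };
	struct demo_bootarg end = { DEMO_END, (int32_t)sizeof(end) };

	memset(buf, 0, sizeof(buf));
	memcpy(buf, &a, sizeof(a));
	memcpy(buf + a.ba_size, &end, sizeof(end));

	for (q = (struct demo_bootarg *)buf;
	    q->ba_type != DEMO_END &&
	    ((char *)q - buf) < (long)sizeof(buf);
	    q = (struct demo_bootarg *)((char *)q + q->ba_size))
		printf("arg type %d, %d bytes\n", q->ba_type, q->ba_size);
	return 0;
}
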
2128 int
2129 check_context(const struct reg *regs, struct trapframe *tf)
2130 {
2131 	uint16_t sel;
2132 
2133 	if (((regs->r_rflags ^ tf->tf_rflags) & PSL_USERSTATIC) != 0)
2134 		return EINVAL;
2135 
2136 	sel = regs->r_ss & 0xffff;
2137 	if (!VALID_USER_DSEL(sel))
2138 		return EINVAL;
2139 
2140 	sel = regs->r_cs & 0xffff;
2141 	if (!VALID_USER_CSEL(sel))
2142 		return EINVAL;
2143 
2144 	if (regs->r_rip >= VM_MAXUSER_ADDRESS)
2145 		return EINVAL;
2146 
2147 	return 0;
2148 }
2149 
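/*
 * Editor's note: an illustration of the PSL_USERSTATIC idea above,
 * with a made-up mask (the real mask is defined by the kernel).
 * Any bit covered by the mask must be identical in the old and new
 * flag words, so user-supplied state cannot flip privileged rflags
 * bits such as IOPL or the interrupt-enable flag.
 */
#include <stdio.h>
#include <stdint.h>

#define DEMO_STATIC	0x3200ULL	/* hypothetical: IOPL | IF | ... */

static int
flags_ok(uint64_t new_rflags, uint64_t cur_rflags)
{
	return ((new_rflags ^ cur_rflags) & DEMO_STATIC) == 0;
}

int
main(void)
{
	printf("%d\n", flags_ok(0x202, 0x202));		/* 1: unchanged */
	printf("%d\n", flags_ok(0x3202, 0x202));	/* 0: sets IOPL=3 */
	return 0;
}
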
2150 int amd64_delay_quality;
2151 
2152 void
2153 delay_init(void(*fn)(int), int fn_quality)
2154 {
2155 	if (fn_quality > amd64_delay_quality) {
2156 		delay_func = fn;
2157 		amd64_delay_quality = fn_quality;
2158 	}
2159 }
2160 
2161 void
2162 delay_fini(void (*fn)(int))
2163 {
2164 	if (fn == delay_func) {
2165 		delay_func = i8254_delay;
2166 		amd64_delay_quality = 0;
2167 	}
2168 }
2169
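/*
 * Editor's note: a compact sketch of the quality-ranked registration
 * pattern implemented by delay_init()/delay_fini() above.  Stub
 * functions stand in for i8254_delay and a better calibrated source;
 * the highest quality wins, and removal falls back to the baseline.
 */
#include <stdio.h>

static void baseline_delay(int us) { printf("baseline %d us\n", us); }
static void tsc_delay(int us)      { printf("tsc %d us\n", us); }

static void (*delay_fn)(int) = baseline_delay;
static int delay_quality;

static void
register_delay(void (*fn)(int), int quality)
{
	if (quality > delay_quality) {	/* better source wins */
		delay_fn = fn;
		delay_quality = quality;
	}
}

static void
unregister_delay(void (*fn)(int))
{
	if (fn == delay_fn) {		/* losing the active source */
		delay_fn = baseline_delay;
		delay_quality = 0;
	}
}

int
main(void)
{
	register_delay(tsc_delay, 5000);
	delay_fn(10);			/* -> tsc */
	unregister_delay(tsc_delay);
	delay_fn(10);			/* -> baseline again */
	return 0;
}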