/*	$OpenBSD: machdep.c,v 1.294 2024/06/07 16:53:35 kettenis Exp $	*/
/*	$NetBSD: machdep.c,v 1.3 2003/05/07 22:58:18 fvdl Exp $	*/

/*-
 * Copyright (c) 1996, 1997, 1998, 2000 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace
 * Simulation Facility, NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/user.h>
#include <sys/exec.h>
#include <sys/buf.h>
#include <sys/reboot.h>
#include <sys/conf.h>
#include <sys/msgbuf.h>
#include <sys/mount.h>
#include <sys/extent.h>
#include <sys/core.h>
#include <sys/kcore.h>
#include <sys/syscallargs.h>

#include <dev/cons.h>
#include <stand/boot/bootarg.h>

#include <net/if.h>
#include <uvm/uvm_extern.h>

#include <sys/sysctl.h>

#include <machine/cpu_full.h>
#include <machine/cpufunc.h>
#include <machine/pio.h>
#include <machine/psl.h>
#include <machine/reg.h>
#include <machine/fpu.h>
#include <machine/biosvar.h>
#include <machine/mpbiosvar.h>
#include <machine/kcore.h>
#include <machine/tss.h>

#include <dev/isa/isareg.h>
#include <dev/ic/i8042reg.h>

#ifdef DDB
#include <machine/db_machdep.h>
#include <ddb/db_extern.h>
extern int db_console;
#endif

#include "isa.h"
#include "isadma.h"
#include "ksyms.h"

#include "acpi.h"
#if NACPI > 0
#include <dev/acpi/acpivar.h>
#endif

#include "com.h"
#if NCOM > 0
#include <sys/tty.h>
#include <dev/ic/comvar.h>
#include <dev/ic/comreg.h>
#endif

#include "efi.h"
#if NEFI > 0
#include <dev/efi/efi.h>
#endif

#include "softraid.h"
#if NSOFTRAID > 0
#include <dev/softraidvar.h>
#endif

#ifdef HIBERNATE
#include <machine/hibernate_var.h>
#endif /* HIBERNATE */

#include "ukbd.h"
#include "pckbc.h"
#if NPCKBC > 0 && NUKBD > 0
#include <dev/ic/pckbcvar.h>
#endif

/* #define MACHDEP_DEBUG */

#ifdef MACHDEP_DEBUG
#define DPRINTF(x...)	do { printf(x); } while(0)
#else
#define DPRINTF(x...)
#endif /* MACHDEP_DEBUG */

/* the following is used externally (sysctl_hw) */
char machine[] = MACHINE;

/*
 * switchto vectors
 */
void cpu_idle_cycle_hlt(void);
void (*cpu_idle_cycle_fcn)(void) = &cpu_idle_cycle_hlt;
void (*cpu_suspend_cycle_fcn)(void);

/* the following is used externally for concurrent handlers */
int setperf_prio = 0;

#ifdef CPURESET_DELAY
int	cpureset_delay = CPURESET_DELAY;
#else
int	cpureset_delay = 0;
#endif

char *ssym = 0, *esym = 0;	/* start and end of symbol table */
dev_t bootdev = 0;		/* device we booted from */
int biosbasemem = 0;		/* base memory reported by BIOS */
u_int bootapiver = 0;		/* /boot API version */

int	physmem;
extern int	boothowto;

paddr_t	dumpmem_paddr;
vaddr_t	dumpmem_vaddr;
psize_t	dumpmem_sz;

vaddr_t kern_end;

vaddr_t	msgbuf_vaddr;
paddr_t msgbuf_paddr;

vaddr_t	idt_vaddr;
paddr_t	idt_paddr;

vaddr_t lo32_vaddr;
paddr_t lo32_paddr;
paddr_t tramp_pdirpa;

int kbd_reset;
int lid_action = 1;
int pwr_action = 1;
int forceukbd;

/*
 * safepri is a safe priority for sleep to set for a spin-wait
 * during autoconfiguration or after a panic.
 */
int	safepri = 0;

struct vm_map *exec_map = NULL;
struct vm_map *phys_map = NULL;

/* UVM constraint ranges. */
struct uvm_constraint_range  isa_constraint = { 0x0, 0x00ffffffUL };
struct uvm_constraint_range  dma_constraint = { 0x0, 0xffffffffUL };
struct uvm_constraint_range *uvm_md_constraints[] = {
    &isa_constraint,
    &dma_constraint,
    NULL,
};
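/*
 * isa_constraint covers the 16MB reachable by 24-bit ISA DMA;
 * dma_constraint the 4GB reachable by 32-bit DMA engines.
 */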

paddr_t avail_start;
paddr_t avail_end;

void (*delay_func)(int) = i8254_delay;
void (*initclock_func)(void) = i8254_initclocks;
void (*startclock_func)(void) = i8254_start_both_clocks;
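/*
 * These i8254 defaults are normally replaced once a better timer
 * (lapic, TSC) has been attached and calibrated.
 */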

/*
 * Format of boot information passed to us by 32-bit /boot
 */
typedef struct _boot_args32 {
	int	ba_type;
	int	ba_size;
	int	ba_nextX;	/* a ptr in 32-bit world, but not here */
	char	ba_arg[1];
} bootarg32_t;

#define BOOTARGC_MAX	NBPG	/* one page */

bios_bootmac_t *bios_bootmac;

/* locore copies the arguments from /boot to here for us */
char bootinfo[BOOTARGC_MAX];
int bootinfo_size = BOOTARGC_MAX;

void getbootinfo(char *, int);

/* Data passed to us by /boot, filled in by getbootinfo() */
bios_diskinfo_t	*bios_diskinfo;
bios_memmap_t	*bios_memmap;
u_int32_t	bios_cksumlen;
bios_efiinfo_t	*bios_efiinfo;
bios_ucode_t	*bios_ucode;

#if NEFI > 0
EFI_MEMORY_DESCRIPTOR *mmap;
#endif

/*
 * Size of memory segments, before any memory is stolen.
 */
phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX];
int	mem_cluster_cnt;

int	cpu_dump(void);
int	cpu_dumpsize(void);
u_long	cpu_dump_mempagecnt(void);
void	dumpsys(void);
void	cpu_init_extents(void);
void	map_tramps(void);
void	init_x86_64(paddr_t);
void	(*cpuresetfn)(void);
void	enter_shared_special_pages(void);

#ifdef APERTURE
int allowaperture = 0;
#endif

/*
 * Machine-dependent startup code
 */
void
cpu_startup(void)
{
	vaddr_t minaddr, maxaddr;

	msgbuf_vaddr = PMAP_DIRECT_MAP(msgbuf_paddr);
	initmsgbuf((caddr_t)msgbuf_vaddr, round_page(MSGBUFSIZE));

	printf("%s", version);
	startclocks();
	rtcinit();

	printf("real mem = %lu (%luMB)\n", ptoa((psize_t)physmem),
	    ptoa((psize_t)physmem)/1024/1024);

	/*
	 * Allocate a submap for exec arguments.  This map effectively
	 * limits the number of processes exec'ing at any time.
	 */
	minaddr = vm_map_min(kernel_map);
	exec_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
				   16*NCARGS, VM_MAP_PAGEABLE, FALSE, NULL);

	/*
	 * Allocate a submap for physio
	 */
	minaddr = vm_map_min(kernel_map);
	phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
				   VM_PHYS_SIZE, 0, FALSE, NULL);

	printf("avail mem = %lu (%luMB)\n", ptoa((psize_t)uvmexp.free),
	    ptoa((psize_t)uvmexp.free)/1024/1024);

	bufinit();

	if (boothowto & RB_CONFIG) {
#ifdef BOOT_CONFIG
		user_config();
#else
		printf("kernel does not support -c; continuing..\n");
#endif
	}

	/* Safe for i/o port / memory space allocation to use malloc now. */
	x86_bus_space_mallocok();

#ifndef SMALL_KERNEL
	cpu_ucode_setup();
	cpu_ucode_apply(&cpu_info_primary);
#endif
	cpu_tsx_disable(&cpu_info_primary);

	/* enter the IDT and trampoline code in the u-k maps */
	enter_shared_special_pages();

	/* initialize CPU0's TSS and GDT and put them in the u-k maps */
	cpu_enter_pages(&cpu_info_full_primary);
}

/*
 * enter_shared_special_pages
 *
 * Requests mapping of various special pages required in the Intel Meltdown
 * case (to be entered into the U-K page table):
 *
 *  1 IDT page
 *  Various number of pages covering the U-K ".kutext" section. This section
 *   contains code needed during trampoline operation
 *  Various number of pages covering the U-K ".kudata" section. This section
 *   contains data accessed by the trampoline, before switching to U+K
 *   (for example, various shared global variables used by IPIs, etc)
 *
 * The linker script places the required symbols in the sections above.
 *
 * On CPUs not affected by Meltdown, the calls to pmap_enter_special below
 * become no-ops.
 */
void
enter_shared_special_pages(void)
{
	extern char __kutext_start[], __kutext_end[], __kernel_kutext_phys[];
	extern char __text_page_start[], __text_page_end[];
	extern char __kernel_kutext_page_phys[];
	extern char __kudata_start[], __kudata_end[], __kernel_kudata_phys[];
	vaddr_t va;
	paddr_t pa;

	/* idt */
	pmap_enter_special(idt_vaddr, idt_paddr, PROT_READ);
	DPRINTF("%s: entered idt page va 0x%llx pa 0x%llx\n", __func__,
	    (uint64_t)idt_vaddr, (uint64_t)idt_paddr);

	/* .kutext section */
	va = (vaddr_t)__kutext_start;
	pa = (paddr_t)__kernel_kutext_phys;
	while (va < (vaddr_t)__kutext_end) {
		pmap_enter_special(va, pa, PROT_READ | PROT_EXEC);
		DPRINTF("%s: entered kutext page va 0x%llx pa 0x%llx\n",
		    __func__, (uint64_t)va, (uint64_t)pa);
		va += PAGE_SIZE;
		pa += PAGE_SIZE;
	}

	/* .kutext.page section */
	va = (vaddr_t)__text_page_start;
	pa = (paddr_t)__kernel_kutext_page_phys;
	while (va < (vaddr_t)__text_page_end) {
		pmap_enter_special(va, pa, PROT_READ | PROT_EXEC);
		DPRINTF("%s: entered kutext.page va 0x%llx pa 0x%llx\n",
		    __func__, (uint64_t)va, (uint64_t)pa);
		va += PAGE_SIZE;
		pa += PAGE_SIZE;
	}

	/* .kudata section */
	va = (vaddr_t)__kudata_start;
	pa = (paddr_t)__kernel_kudata_phys;
	while (va < (vaddr_t)__kudata_end) {
		pmap_enter_special(va, pa, PROT_READ | PROT_WRITE);
		DPRINTF("%s: entered kudata page va 0x%llx pa 0x%llx\n",
		    __func__, (uint64_t)va, (uint64_t)pa);
		va += PAGE_SIZE;
		pa += PAGE_SIZE;
	}
}

/*
 * Set up proc0's PCB and the cpu's TSS.
 */
void
x86_64_proc0_tss_ldt_init(void)
{
	struct pcb *pcb;

	cpu_info_primary.ci_curpcb = pcb = &proc0.p_addr->u_pcb;
	pcb->pcb_fsbase = 0;
	pcb->pcb_kstack = (u_int64_t)proc0.p_addr + USPACE - 16;
	proc0.p_md.md_regs = (struct trapframe *)pcb->pcb_kstack - 1;

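	/* Load the TSS for proc0; amd64 runs with a null LDT. */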
	ltr(GSYSSEL(GPROC0_SEL, SEL_KPL));
	lldt(0);
}

bios_diskinfo_t *
bios_getdiskinfo(dev_t dev)
{
	bios_diskinfo_t *pdi;

	if (bios_diskinfo == NULL)
		return NULL;

	for (pdi = bios_diskinfo; pdi->bios_number != -1; pdi++) {
		if ((dev & B_MAGICMASK) == B_DEVMAGIC) { /* search by bootdev */
			if (pdi->bsd_dev == dev)
				break;
		} else {
			if (pdi->bios_number == dev)
				break;
		}
	}

	if (pdi->bios_number == -1)
		return NULL;
	else
		return pdi;
}

int
bios_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
    size_t newlen, struct proc *p)
{
	bios_diskinfo_t *pdi;
	int biosdev;

	/* all sysctl names at this level except diskinfo are terminal */
	if (namelen != 1 && name[0] != BIOS_DISKINFO)
		return (ENOTDIR);		/* overloaded */

	if (!(bootapiver & BAPIV_VECTOR))
		return EOPNOTSUPP;

	switch (name[0]) {
	case BIOS_DEV:
		if ((pdi = bios_getdiskinfo(bootdev)) == NULL)
			return ENXIO;
		biosdev = pdi->bios_number;
		return sysctl_rdint(oldp, oldlenp, newp, biosdev);
	case BIOS_DISKINFO:
		if (namelen != 2)
			return ENOTDIR;
		if ((pdi = bios_getdiskinfo(name[1])) == NULL)
			return ENXIO;
		return sysctl_rdstruct(oldp, oldlenp, newp, pdi, sizeof(*pdi));
	case BIOS_CKSUMLEN:
		return sysctl_rdint(oldp, oldlenp, newp, bios_cksumlen);
	default:
		return EOPNOTSUPP;
	}
	/* NOTREACHED */
}

extern int tsc_is_invariant;
extern int amd64_has_xcrypt;
extern int need_retpoline;

const struct sysctl_bounded_args cpuctl_vars[] = {
	{ CPU_LIDACTION, &lid_action, 0, 2 },
	{ CPU_PWRACTION, &pwr_action, 0, 2 },
	{ CPU_CPUID, &cpu_id, SYSCTL_INT_READONLY },
	{ CPU_CPUFEATURE, &cpu_feature, SYSCTL_INT_READONLY },
	{ CPU_XCRYPT, &amd64_has_xcrypt, SYSCTL_INT_READONLY },
	{ CPU_INVARIANTTSC, &tsc_is_invariant, SYSCTL_INT_READONLY },
	{ CPU_RETPOLINE, &need_retpoline, SYSCTL_INT_READONLY },
};

/*
 * machine dependent system variables.
 */
int
cpu_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
    size_t newlen, struct proc *p)
{
	extern uint64_t tsc_frequency;
	dev_t consdev;
	dev_t dev;

	switch (name[0]) {
	case CPU_CONSDEV:
		if (namelen != 1)
			return (ENOTDIR);		/* overloaded */
		if (cn_tab != NULL)
			consdev = cn_tab->cn_dev;
		else
			consdev = NODEV;
		return (sysctl_rdstruct(oldp, oldlenp, newp, &consdev,
		    sizeof consdev));
	case CPU_CHR2BLK:
		if (namelen != 2)
			return (ENOTDIR);		/* overloaded */
		dev = chrtoblk((dev_t)name[1]);
		return sysctl_rdstruct(oldp, oldlenp, newp, &dev, sizeof(dev));
	case CPU_BIOS:
		return bios_sysctl(name + 1, namelen - 1, oldp, oldlenp,
		    newp, newlen, p);
	case CPU_CPUVENDOR:
		return (sysctl_rdstring(oldp, oldlenp, newp, cpu_vendor));
	case CPU_KBDRESET:
		return (sysctl_securelevel_int(oldp, oldlenp, newp, newlen,
		    &kbd_reset));
	case CPU_ALLOWAPERTURE:
		if (namelen != 1)
			return (ENOTDIR);		/* overloaded */
#ifdef APERTURE
		if (securelevel > 0)
			return (sysctl_int_lower(oldp, oldlenp, newp, newlen,
			    &allowaperture));
		else
			return (sysctl_int(oldp, oldlenp, newp, newlen,
			    &allowaperture));
#else
		return (sysctl_rdint(oldp, oldlenp, newp, 0));
#endif
#if NPCKBC > 0 && NUKBD > 0
	case CPU_FORCEUKBD:
		{
		int error;

		if (forceukbd)
			return (sysctl_rdint(oldp, oldlenp, newp, forceukbd));

		error = sysctl_int(oldp, oldlenp, newp, newlen, &forceukbd);
		if (forceukbd)
			pckbc_release_console();
		return (error);
		}
#endif
	case CPU_TSCFREQ:
		return (sysctl_rdquad(oldp, oldlenp, newp, tsc_frequency));
	default:
		return (sysctl_bounded_arr(cpuctl_vars, nitems(cpuctl_vars),
		    name, namelen, oldp, oldlenp, newp, newlen));
	}
	/* NOTREACHED */
}

static inline void
maybe_enable_user_cet(struct proc *p)
{
#ifndef SMALL_KERNEL
	/* Enable indirect-branch tracking if present and not disabled */
	if ((xsave_mask & XFEATURE_CET_U) &&
	    (p->p_p->ps_flags & PS_NOBTCFI) == 0) {
		uint64_t msr = rdmsr(MSR_U_CET);
		wrmsr(MSR_U_CET, msr | MSR_CET_ENDBR_EN | MSR_CET_NO_TRACK_EN);
	}
#endif
}

static inline void
initialize_thread_xstate(struct proc *p)
{
	if (cpu_use_xsaves) {
		xrstors(fpu_cleandata, xsave_mask);
		maybe_enable_user_cet(p);
	} else {
		/* Reset FPU state in PCB */
		memcpy(&p->p_addr->u_pcb.pcb_savefpu, fpu_cleandata,
		    fpu_save_len);

		if (curcpu()->ci_pflags & CPUPF_USERXSTATE) {
			/* state in CPU is obsolete; reset it */
			fpureset();
		}
	}

	/* The reset state _is_ the userspace state for this thread now */
	curcpu()->ci_pflags |= CPUPF_USERXSTATE;
}

/*
 * Copy out the FPU state, massaging it to be usable from userspace
 * and acceptable to xrstor_user()
 */
static inline int
copyoutfpu(struct savefpu *sfp, char *sp, size_t len)
{
	uint64_t bvs[2];

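	/*
	 * After the bulk copyout below, xstate_bv/xstate_xcomp_bv are
	 * overwritten with values clipped to the XCR0-enabled features,
	 * so a later xrstor of this image cannot fault on unknown bits.
	 */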
	if (copyout(sfp, sp, len))
		return 1;
	if (len > offsetof(struct savefpu, fp_xstate.xstate_bv)) {
		sp  += offsetof(struct savefpu, fp_xstate.xstate_bv);
		len -= offsetof(struct savefpu, fp_xstate.xstate_bv);
		bvs[0] = sfp->fp_xstate.xstate_bv & XFEATURE_XCR0_MASK;
		bvs[1] = sfp->fp_xstate.xstate_xcomp_bv &
		    (XFEATURE_XCR0_MASK | XFEATURE_COMPRESSED);
		if (copyout(bvs, sp, min(len, sizeof bvs)))
			return 1;
	}
	return 0;
}

/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode to call routine, followed by
 * syscall to sigreturn routine below.  After sigreturn resets the
 * signal mask, the stack, and the frame pointer, it returns to the
 * user specified pc.
 */
int
sendsig(sig_t catcher, int sig, sigset_t mask, const siginfo_t *ksip,
    int info, int onstack)
{
	struct proc *p = curproc;
	struct trapframe *tf = p->p_md.md_regs;
	struct sigcontext ksc;
	struct savefpu *sfp = &p->p_addr->u_pcb.pcb_savefpu;
	register_t sp, scp, sip;
	u_long sss;

	memset(&ksc, 0, sizeof ksc);
	ksc.sc_rdi = tf->tf_rdi;
	ksc.sc_rsi = tf->tf_rsi;
	ksc.sc_rdx = tf->tf_rdx;
	ksc.sc_rcx = tf->tf_rcx;
	ksc.sc_r8  = tf->tf_r8;
	ksc.sc_r9  = tf->tf_r9;
	ksc.sc_r10 = tf->tf_r10;
	ksc.sc_r11 = tf->tf_r11;
	ksc.sc_r12 = tf->tf_r12;
	ksc.sc_r13 = tf->tf_r13;
	ksc.sc_r14 = tf->tf_r14;
	ksc.sc_r15 = tf->tf_r15;
	ksc.sc_rbx = tf->tf_rbx;
	ksc.sc_rax = tf->tf_rax;
	ksc.sc_rbp = tf->tf_rbp;
	ksc.sc_rip = tf->tf_rip;
	ksc.sc_cs  = tf->tf_cs;
	ksc.sc_rflags = tf->tf_rflags;
	ksc.sc_rsp = tf->tf_rsp;
	ksc.sc_ss  = tf->tf_ss;
	ksc.sc_mask = mask;

	/* Allocate space for the signal handler context. */
	if ((p->p_sigstk.ss_flags & SS_DISABLE) == 0 &&
	    !sigonstack(tf->tf_rsp) && onstack)
		sp = trunc_page((vaddr_t)p->p_sigstk.ss_sp + p->p_sigstk.ss_size);
	else
		sp = tf->tf_rsp - 128;
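	/*
	 * (the 128 bytes stepped over above are the amd64 ABI "red
	 *  zone", which the interrupted code may still be using)
	 */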

	sp -= fpu_save_len;
	if (cpu_use_xsaves)
		sp &= ~63ULL;	/* just in case */
	else
		sp &= ~15ULL;	/* just in case */
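	/*
	 * xsaves needs a 64-byte-aligned save area; plain fxsave64
	 * only requires 16-byte alignment.
	 */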

	/* Save FPU state to PCB if necessary, then copy it out */
	if (curcpu()->ci_pflags & CPUPF_USERXSTATE)
		fpusave(&p->p_addr->u_pcb.pcb_savefpu);
	if (copyoutfpu(sfp, (void *)sp, fpu_save_len))
		return 1;

	initialize_thread_xstate(p);
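	/* The handler itself starts with clean FPU/extended state. */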

	ksc.sc_fpstate = (struct fxsave64 *)sp;
	sss = (sizeof(ksc) + 15) & ~15;
	sip = 0;
	if (info) {
		sip = sp - ((sizeof(*ksip) + 15) & ~15);
		sss += (sizeof(*ksip) + 15) & ~15;

		if (copyout(ksip, (void *)sip, sizeof(*ksip)))
			return 1;
	}
	scp = sp - sss;

	ksc.sc_cookie = (long)scp ^ p->p_p->ps_sigcookie;
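	/*
	 * Tying the cookie to the context's address with a per-process
	 * secret lets sys_sigreturn() reject forged or replayed frames.
	 */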
	if (copyout(&ksc, (void *)scp, sizeof(ksc)))
		return 1;

	/*
	 * Build context to run handler in.
	 */
	tf->tf_rax = (u_int64_t)catcher;
	tf->tf_rdi = sig;
	tf->tf_rsi = sip;
	tf->tf_rdx = scp;

	tf->tf_rip = (u_int64_t)p->p_p->ps_sigcode;
	tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
	tf->tf_rflags &= ~(PSL_T|PSL_D|PSL_VM|PSL_AC);
	tf->tf_rsp = scp;
	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);

	return 0;
}

/*
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig. Check carefully to
 * make sure that the user has not modified the
 * psl to gain improper privileges or to cause
 * a machine fault.
 */
int
sys_sigreturn(struct proc *p, void *v, register_t *retval)
{
	struct sys_sigreturn_args /* {
		syscallarg(struct sigcontext *) sigcntxp;
	} */ *uap = v;
	struct sigcontext ksc, *scp = SCARG(uap, sigcntxp);
	struct trapframe *tf = p->p_md.md_regs;
	struct savefpu *sfp = &p->p_addr->u_pcb.pcb_savefpu;
	int error;

	if (PROC_PC(p) != p->p_p->ps_sigcoderet) {
		sigexit(p, SIGILL);
		return (EPERM);
	}

	if ((error = copyin((caddr_t)scp, &ksc, sizeof ksc)))
		return (error);

	if (ksc.sc_cookie != ((long)scp ^ p->p_p->ps_sigcookie)) {
		sigexit(p, SIGILL);
		return (EFAULT);
	}

	/* Prevent reuse of the sigcontext cookie */
	ksc.sc_cookie = 0;
	(void)copyout(&ksc.sc_cookie, (caddr_t)scp +
	    offsetof(struct sigcontext, sc_cookie), sizeof (ksc.sc_cookie));

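	/*
	 * Refuse to change any rflags bits userland may not touch, and
	 * refuse to "return" to a non-user code segment.
	 */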
	if (((ksc.sc_rflags ^ tf->tf_rflags) & PSL_USERSTATIC) != 0 ||
	    !USERMODE(ksc.sc_cs, ksc.sc_rflags))
		return (EINVAL);

	/* Current FPU state is obsolete; toss it and force a reload */
	if (curcpu()->ci_pflags & CPUPF_USERXSTATE) {
		curcpu()->ci_pflags &= ~CPUPF_USERXSTATE;
		fpureset();
	}

	/* Copy in the FPU state to restore */
	if (__predict_true(ksc.sc_fpstate != NULL)) {
		if ((error = copyin(ksc.sc_fpstate, sfp, fpu_save_len)))
			return error;
		if (xrstor_user(sfp, xsave_mask)) {
			memcpy(sfp, fpu_cleandata, fpu_save_len);
			return EINVAL;
		}
		maybe_enable_user_cet(p);
		curcpu()->ci_pflags |= CPUPF_USERXSTATE;
	} else {
		/* shouldn't happen, but handle it */
		initialize_thread_xstate(p);
	}

	tf->tf_rdi = ksc.sc_rdi;
	tf->tf_rsi = ksc.sc_rsi;
	tf->tf_rdx = ksc.sc_rdx;
	tf->tf_rcx = ksc.sc_rcx;
	tf->tf_r8  = ksc.sc_r8;
	tf->tf_r9  = ksc.sc_r9;
	tf->tf_r10 = ksc.sc_r10;
	tf->tf_r11 = ksc.sc_r11;
	tf->tf_r12 = ksc.sc_r12;
	tf->tf_r13 = ksc.sc_r13;
	tf->tf_r14 = ksc.sc_r14;
	tf->tf_r15 = ksc.sc_r15;
	tf->tf_rbx = ksc.sc_rbx;
	tf->tf_rax = ksc.sc_rax;
	tf->tf_rbp = ksc.sc_rbp;
	tf->tf_rip = ksc.sc_rip;
	tf->tf_cs  = ksc.sc_cs;
	tf->tf_rflags = ksc.sc_rflags;
	tf->tf_rsp = ksc.sc_rsp;
	tf->tf_ss  = ksc.sc_ss;

	/* Restore signal mask. */
	p->p_sigmask = ksc.sc_mask & ~sigcantmask;

	/*
	 * sigreturn() needs to return to userspace via the 'iretq'
	 * method, so that if the process was interrupted (by tick,
	 * an IPI, whatever) as opposed to already being in the kernel
	 * when a signal was being delivered, the process will be
	 * completely restored, including the userland %rcx and %r11
	 * registers which the 'sysretq' instruction cannot restore.
	 * Also need to make sure we can handle faulting on xrstor.
	 */
	p->p_md.md_flags |= MDP_IRET;

	return (EJUSTRETURN);
}

#ifdef MULTIPROCESSOR
/* force a CPU into the kernel, whether or not it's idle */
void
cpu_kick(struct cpu_info *ci)
{
	/* only need to kick other CPUs */
	if (ci != curcpu()) {
		if (cpu_mwait_size > 0) {
			/*
			 * If not idling, then send an IPI, else
			 * just clear the "keep idling" bit.
			 */
			if ((ci->ci_mwait & MWAIT_IN_IDLE) == 0)
				x86_send_ipi(ci, X86_IPI_NOP);
			else
				atomic_clearbits_int(&ci->ci_mwait,
				    MWAIT_KEEP_IDLING);
		} else {
			/* no mwait, so need an IPI */
			x86_send_ipi(ci, X86_IPI_NOP);
		}
	}
}
#endif

/*
 * Notify the current process (p) that it has a signal pending,
 * process as soon as possible.
 */
void
signotify(struct proc *p)
{
	aston(p);
	cpu_kick(p->p_cpu);
}

#ifdef MULTIPROCESSOR
void
cpu_unidle(struct cpu_info *ci)
{
	if (cpu_mwait_size > 0 && (ci->ci_mwait & MWAIT_ONLY)) {
		/*
		 * Just clear the "keep idling" bit; if it wasn't
		 * idling then we didn't need to do anything anyway.
		 */
		atomic_clearbits_int(&ci->ci_mwait, MWAIT_KEEP_IDLING);
		return;
	}

	if (ci != curcpu())
		x86_send_ipi(ci, X86_IPI_NOP);
}
#endif

int	waittime = -1;
struct pcb dumppcb;

__dead void
boot(int howto)
{
	if ((howto & RB_POWERDOWN) != 0)
		lid_action = 0;

	if ((howto & RB_RESET) != 0)
		goto doreset;

	if (cold) {
		if ((howto & RB_USERREQ) == 0)
			howto |= RB_HALT;
		goto haltsys;
	}

	boothowto = howto;
	if ((howto & RB_NOSYNC) == 0 && waittime < 0) {
		waittime = 0;
		vfs_shutdown(curproc);

		if ((howto & RB_TIMEBAD) == 0) {
			resettodr();
		} else {
			printf("WARNING: not updating battery clock\n");
		}
	}
	if_downall();

	uvm_shutdown();
	splhigh();
	cold = 1;

	if ((howto & RB_DUMP) != 0)
		dumpsys();

haltsys:
	config_suspend_all(DVACT_POWERDOWN);

#ifdef MULTIPROCESSOR
	x86_broadcast_ipi(X86_IPI_HALT);
#endif

	if ((howto & RB_HALT) != 0) {
#if NACPI > 0 && !defined(SMALL_KERNEL)
		extern int acpi_enabled;

		if (acpi_enabled) {
			delay(500000);
			if ((howto & RB_POWERDOWN) != 0)
				acpi_powerdown();
		}
#endif
		printf("\n");
		printf("The operating system has halted.\n");
		printf("Please press any key to reboot.\n\n");
		cnpollc(1);	/* for proper keyboard command handling */
		cngetc();
		cnpollc(0);
	}

doreset:
	printf("rebooting...\n");
	if (cpureset_delay > 0)
		delay(cpureset_delay * 1000);
	cpu_reset();
	for (;;)
		continue;
	/* NOTREACHED */
}

/*
 * These variables are needed by /sbin/savecore
 */
u_long	dumpmag = 0x8fca0101;	/* magic number */
int	dumpsize = 0;		/* pages */
long	dumplo = 0;		/* blocks */

/*
 * cpu_dump: dump the machine-dependent kernel core dump headers.
 */
int
cpu_dump(void)
{
	int (*dump)(dev_t, daddr_t, caddr_t, size_t);
	char buf[dbtob(1)];
	kcore_seg_t *segp;
	cpu_kcore_hdr_t *cpuhdrp;
	phys_ram_seg_t *memsegp;
	caddr_t va;
	int i;

	dump = bdevsw[major(dumpdev)].d_dump;

	memset(buf, 0, sizeof buf);
	segp = (kcore_seg_t *)buf;
	cpuhdrp = (cpu_kcore_hdr_t *)&buf[ALIGN(sizeof(*segp))];
	memsegp = (phys_ram_seg_t *)&buf[ALIGN(sizeof(*segp)) +
	    ALIGN(sizeof(*cpuhdrp))];

	/*
	 * Generate a segment header.
	 */
	CORE_SETMAGIC(*segp, KCORE_MAGIC, MID_MACHINE, CORE_CPU);
	segp->c_size = dbtob(1) - ALIGN(sizeof(*segp));

	/*
	 * Add the machine-dependent header info.
	 */
	cpuhdrp->ptdpaddr = proc0.p_addr->u_pcb.pcb_cr3;
	cpuhdrp->nmemsegs = mem_cluster_cnt;

	/*
	 * Fill in the memory segment descriptors.
	 */
	for (i = 0; i < mem_cluster_cnt; i++) {
		memsegp[i].start = mem_clusters[i].start;
		memsegp[i].size = mem_clusters[i].size & ~PAGE_MASK;
	}

	/*
	 * If we have dump memory then assume the kernel stack is in high
	 * memory and bounce
	 */
	if (dumpmem_vaddr != 0) {
		memcpy((char *)dumpmem_vaddr, buf, sizeof(buf));
		va = (caddr_t)dumpmem_vaddr;
	} else {
		va = (caddr_t)buf;
	}
	return (dump(dumpdev, dumplo, va, dbtob(1)));
}

/*
 * This is called by main to set dumplo and dumpsize.
 * Dumps always skip the first PAGE_SIZE of disk space
 * in case there might be a disk label stored there.
 * If there is extra space, put dump at the end to
 * reduce the chance that swapping trashes it.
 */
void
dumpconf(void)
{
	int nblks, dumpblks;	/* size of dump area */

	if (dumpdev == NODEV ||
	    (nblks = (bdevsw[major(dumpdev)].d_psize)(dumpdev)) == 0)
		return;
	if (nblks <= ctod(1))
		return;

	dumpblks = cpu_dumpsize();
	if (dumpblks < 0)
		return;
	dumpblks += ctod(cpu_dump_mempagecnt());

	/* If dump won't fit (incl. room for possible label), punt. */
	if (dumpblks > (nblks - ctod(1)))
		return;

	/* Put dump at end of partition */
	dumplo = nblks - dumpblks;

	/* dumpsize is in page units, and doesn't include headers. */
	dumpsize = cpu_dump_mempagecnt();
}

/*
 * Doadump comes here after turning off memory management and
 * getting on the dump stack, either when called above, or by
 * the auto-restart code.
 */
#define BYTES_PER_DUMP  MAXPHYS /* must be a multiple of pagesize */

void
dumpsys(void)
{
	u_long totalbytesleft, bytes, i, n, memseg;
	u_long maddr;
	daddr_t blkno;
	void *va;
	int (*dump)(dev_t, daddr_t, caddr_t, size_t);
	int error;

	/* Save registers. */
	savectx(&dumppcb);

	if (dumpdev == NODEV)
		return;

	/*
	 * For dumps during autoconfiguration,
	 * if dump device has already configured...
	 */
	if (dumpsize == 0)
		dumpconf();
	if (dumplo <= 0 || dumpsize == 0) {
		printf("\ndump to dev %u,%u not possible\n", major(dumpdev),
		    minor(dumpdev));
		return;
	}
	printf("\ndumping to dev %u,%u offset %ld\n", major(dumpdev),
	    minor(dumpdev), dumplo);

	error = (*bdevsw[major(dumpdev)].d_psize)(dumpdev);
	printf("dump ");
	if (error == -1) {
		printf("area unavailable\n");
		return;
	}

	if ((error = cpu_dump()) != 0)
		goto err;

	totalbytesleft = ptoa(cpu_dump_mempagecnt());
	blkno = dumplo + cpu_dumpsize();
	dump = bdevsw[major(dumpdev)].d_dump;
	error = 0;

	for (memseg = 0; memseg < mem_cluster_cnt; memseg++) {
		maddr = mem_clusters[memseg].start;
		bytes = mem_clusters[memseg].size;

		for (i = 0; i < bytes; i += n, totalbytesleft -= n) {
			/* Print out how many MBs we have left to go. */
			if ((totalbytesleft % (1024*1024)) < BYTES_PER_DUMP)
				printf("%ld ", totalbytesleft / (1024 * 1024));

			/* Limit size for next transfer. */
			n = bytes - i;
			if (n > BYTES_PER_DUMP)
				n = BYTES_PER_DUMP;
			if (maddr > 0xffffffff) {
				va = (void *)dumpmem_vaddr;
				if (n > dumpmem_sz)
					n = dumpmem_sz;
				memcpy(va, (void *)PMAP_DIRECT_MAP(maddr), n);
			} else {
				va = (void *)PMAP_DIRECT_MAP(maddr);
			}

			error = (*dump)(dumpdev, blkno, va, n);
			if (error)
				goto err;
			maddr += n;
			blkno += btodb(n);		/* XXX? */

#if 0	/* XXX this doesn't work.  grr. */
			/* operator aborting dump? */
			if (sget() != NULL) {
				error = EINTR;
				break;
			}
#endif
		}
	}

 err:
	switch (error) {

	case ENXIO:
		printf("device bad\n");
		break;

	case EFAULT:
		printf("device not ready\n");
		break;

	case EINVAL:
		printf("area improper\n");
		break;

	case EIO:
		printf("i/o error\n");
		break;

	case EINTR:
		printf("aborted from console\n");
		break;

	case 0:
		printf("succeeded\n");
		break;

	default:
		printf("error %d\n", error);
		break;
	}
	printf("\n\n");
	delay(5000000);		/* 5 seconds */
}

/*
 * Force the userspace FS.base to be reloaded from the PCB on return from
 * the kernel, and reset the segment registers (%ds, %es, %fs, and %gs)
 * to their expected userspace value.
 */
void
reset_segs(void)
{
	/*
	 * This operates like the cpu_switchto() sequence: if we
	 * haven't reset %[defg]s already, do so now.
	 */
	if (curcpu()->ci_pflags & CPUPF_USERSEGS) {
		curcpu()->ci_pflags &= ~CPUPF_USERSEGS;
		__asm volatile(
		    "movw %%ax,%%ds\n\t"
		    "movw %%ax,%%es\n\t"
		    "movw %%ax,%%fs\n\t"
		    "cli\n\t"		/* block intr when on user GS.base */
		    "swapgs\n\t"	/* swap from kernel to user GS.base */
		    "movw %%ax,%%gs\n\t"/* set %gs to UDATA and GS.base to 0 */
		    "swapgs\n\t"	/* back to kernel GS.base */
		    "sti" : : "a"(GSEL(GUDATA_SEL, SEL_UPL)));
	}
}

/*
 * Clear registers on exec
 */
void
setregs(struct proc *p, struct exec_package *pack, u_long stack,
    struct ps_strings *arginfo)
{
	struct trapframe *tf;

	initialize_thread_xstate(p);

	/* To reset all registers we have to return via iretq */
	p->p_md.md_flags |= MDP_IRET;

	reset_segs();
	p->p_addr->u_pcb.pcb_fsbase = 0;

	tf = p->p_md.md_regs;
	memset(tf, 0, sizeof *tf);
	tf->tf_rip = pack->ep_entry;
	tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
	tf->tf_rflags = PSL_USERSET;
	tf->tf_rsp = stack;
	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
}

/*
 * Initialize segments and descriptor tables
 */

struct gate_descriptor *idt;
char idt_allocmap[NIDT];
struct user *proc0paddr = NULL;

void
setgate(struct gate_descriptor *gd, void *func, int ist, int type, int dpl,
    int sel)
{
	gd->gd_looffset = (u_int64_t)func & 0xffff;
	gd->gd_selector = sel;
	gd->gd_ist = ist;
	gd->gd_type = type;
	gd->gd_dpl = dpl;
	gd->gd_p = 1;
	gd->gd_hioffset = (u_int64_t)func >> 16;
	gd->gd_zero = 0;
	gd->gd_xx1 = 0;
	gd->gd_xx2 = 0;
	gd->gd_xx3 = 0;
}

void
unsetgate(struct gate_descriptor *gd)
{
	memset(gd, 0, sizeof (*gd));
}

void
setregion(struct region_descriptor *rd, void *base, u_int16_t limit)
{
	rd->rd_limit = limit;
	rd->rd_base = (u_int64_t)base;
}

/*
 * Note that the base and limit fields are ignored in long mode.
 */
void
set_mem_segment(struct mem_segment_descriptor *sd, void *base, size_t limit,
    int type, int dpl, int gran, int def32, int is64)
{
	sd->sd_lolimit = (unsigned)limit;
	sd->sd_lobase = (unsigned long)base;
	sd->sd_type = type;
	sd->sd_dpl = dpl;
	sd->sd_p = 1;
	sd->sd_hilimit = (unsigned)limit >> 16;
	sd->sd_avl = 0;
	sd->sd_long = is64;
	sd->sd_def32 = def32;
	sd->sd_gran = gran;
	sd->sd_hibase = (unsigned long)base >> 24;
}

void
set_sys_segment(struct sys_segment_descriptor *sd, void *base, size_t limit,
    int type, int dpl, int gran)
{
	memset(sd, 0, sizeof *sd);
	sd->sd_lolimit = (unsigned)limit;
	sd->sd_lobase = (u_int64_t)base;
	sd->sd_type = type;
	sd->sd_dpl = dpl;
	sd->sd_p = 1;
	sd->sd_hilimit = (unsigned)limit >> 16;
	sd->sd_gran = gran;
	sd->sd_hibase = (u_int64_t)base >> 24;
}

void
cpu_init_idt(void)
{
	struct region_descriptor region;

	setregion(&region, idt, NIDT * sizeof(idt[0]) - 1);
	lidt(&region);
}

void
cpu_init_extents(void)
{
	extern struct extent *iomem_ex;
	static int already_done;
	int i;

	/* We get called for each CPU, only first should do this */
	if (already_done)
		return;

	/*
	 * Allocate the physical addresses used by RAM from the iomem
	 * extent map.
	 */
	for (i = 0; i < mem_cluster_cnt; i++) {
		if (extent_alloc_region(iomem_ex, mem_clusters[i].start,
		    mem_clusters[i].size, EX_NOWAIT)) {
			/* XXX What should we do? */
			printf("WARNING: CAN'T ALLOCATE RAM (%llx-%llx)"
			    " FROM IOMEM EXTENT MAP!\n", mem_clusters[i].start,
			    mem_clusters[i].start + mem_clusters[i].size - 1);
		}
	}

	already_done = 1;
}

void
map_tramps(void)
{
#if defined(MULTIPROCESSOR) || \
    (NACPI > 0 && !defined(SMALL_KERNEL))
	struct pmap *kmp = pmap_kernel();
	extern paddr_t tramp_pdirpa;
#ifdef MULTIPROCESSOR
	extern u_char cpu_spinup_trampoline[];
	extern u_char cpu_spinup_trampoline_end[];
	extern u_char mp_tramp_data_start[];
	extern u_char mp_tramp_data_end[];
	extern u_int32_t mp_pdirpa;
#endif

	/*
	 * The initial PML4 pointer must be below 4G, so if the
	 * current one isn't, use a "bounce buffer" and save it
	 * for tramps to use.
	 */
	if (kmp->pm_pdirpa > 0xffffffff) {
		pmap_kenter_pa(lo32_vaddr, lo32_paddr, PROT_READ | PROT_WRITE);
		memcpy((void *)lo32_vaddr, kmp->pm_pdir, PAGE_SIZE);
		tramp_pdirpa = lo32_paddr;
		pmap_kremove(lo32_vaddr, PAGE_SIZE);
	} else
		tramp_pdirpa = kmp->pm_pdirpa;


#ifdef MULTIPROCESSOR
	/* Map MP tramp code and data pages RW for copy */
	pmap_kenter_pa(MP_TRAMPOLINE, MP_TRAMPOLINE,
	    PROT_READ | PROT_WRITE);

	pmap_kenter_pa(MP_TRAMP_DATA, MP_TRAMP_DATA,
	    PROT_READ | PROT_WRITE);

	memset((caddr_t)MP_TRAMPOLINE, 0xcc, PAGE_SIZE);
	memset((caddr_t)MP_TRAMP_DATA, 0xcc, PAGE_SIZE);

	memcpy((caddr_t)MP_TRAMPOLINE,
	    cpu_spinup_trampoline,
	    cpu_spinup_trampoline_end-cpu_spinup_trampoline);

	memcpy((caddr_t)MP_TRAMP_DATA,
	    mp_tramp_data_start,
	    mp_tramp_data_end - mp_tramp_data_start);

	/*
	 * We need to patch this after we copy the tramp data,
	 * the symbol points into the copied tramp data page.
	 */
	mp_pdirpa = tramp_pdirpa;

	/* Unmap, will be remapped in cpu_start_secondary */
	pmap_kremove(MP_TRAMPOLINE, PAGE_SIZE);
	pmap_kremove(MP_TRAMP_DATA, PAGE_SIZE);
#endif /* MULTIPROCESSOR */
#endif
}

void
cpu_set_vendor(struct cpu_info *ci, int level, const char *vendor)
{
	ci->ci_cpuid_level = level;
	cpuid_level = MIN(cpuid_level, level);

	/* map the vendor string to an integer */
	if (strcmp(vendor, "AuthenticAMD") == 0)
		ci->ci_vendor = CPUV_AMD;
	else if (strcmp(vendor, "GenuineIntel") == 0)
		ci->ci_vendor = CPUV_INTEL;
	else if (strcmp(vendor, "CentaurHauls") == 0)
		ci->ci_vendor = CPUV_VIA;
	else
		ci->ci_vendor = CPUV_UNKNOWN;
}

#define	IDTVEC(name)	__CONCAT(X, name)
typedef void (vector)(void);
extern vector *IDTVEC(exceptions)[];

paddr_t early_pte_pages;

void
init_x86_64(paddr_t first_avail)
{
	struct region_descriptor region;
	bios_memmap_t *bmp;
	int x, ist;
	uint64_t max_dm_size = ((uint64_t)512 * NUM_L4_SLOT_DIRECT) << 30;

	/*
	 * locore0 mapped 3 pages for use before the pmap is initialized
	 * starting at first_avail. These pages are currently used by
	 * efifb to create early-use VAs for the framebuffer before efifb
	 * is attached.
	 */
	early_pte_pages = first_avail;
	first_avail += 3 * NBPG;

	cpu_set_vendor(&cpu_info_primary, cpuid_level, cpu_vendor);
	cpu_init_msrs(&cpu_info_primary);

	proc0.p_addr = proc0paddr;
	cpu_info_primary.ci_curpcb = &proc0.p_addr->u_pcb;

	x86_bus_space_init();

	i8254_startclock();

	/*
	 * Initialize PAGE_SIZE-dependent variables.
	 */
	uvm_setpagesize();

	/*
	 * Boot arguments are in a single page specified by /boot.
	 *
	 * We require the "new" vector form, as well as memory ranges
	 * to be given in bytes rather than KB.
	 *
	 * locore copies the data into bootinfo[] for us.
	 */
	if ((bootapiver & (BAPIV_VECTOR | BAPIV_BMEMMAP)) ==
	    (BAPIV_VECTOR | BAPIV_BMEMMAP)) {
		if (bootinfo_size >= sizeof(bootinfo))
			panic("boot args too big");

		getbootinfo(bootinfo, bootinfo_size);
	} else
		panic("invalid /boot");

	cninit();

/*
 * Memory on the AMD64 port is described by three different things.
 *
 * 1. biosbasemem - This is outdated, and should really only be used to
 *    sanitize the other values. This is what we get back from the BIOS
 *    using the legacy routines, describing memory below 640KB.
 *
 * 2. bios_memmap[] - This is the memory map as the bios has returned
 *    it to us.  It includes memory the kernel occupies, etc.
 *
 * 3. mem_cluster[] - This is the massaged free memory segments after
 *    taking into account the contents of bios_memmap, biosbasemem,
 *    and locore/machdep/pmap kernel allocations of physical
 *    pages.
 *
 * The other thing is that the physical page *RANGE* is described by
 * three more variables:
 *
 * avail_start - This is a physical address of the start of available
 *               pages, until IOM_BEGIN.  This is basically the start
 *               of the UVM managed range of memory, with some holes...
 *
 * avail_end - This is the end of physical pages.  All physical pages
 *             that UVM manages are between avail_start and avail_end.
 *             There are holes...
 *
 * first_avail - This is the first available physical page after the
 *               kernel, page tables, etc.
 *
 * We skip the first few pages for trampolines, hibernate, and to avoid
 * buggy SMI implementations that could corrupt the first 64KB.
 */
	avail_start = 16*PAGE_SIZE;

#ifdef MULTIPROCESSOR
	if (avail_start < MP_TRAMPOLINE + PAGE_SIZE)
		avail_start = MP_TRAMPOLINE + PAGE_SIZE;
	if (avail_start < MP_TRAMP_DATA + PAGE_SIZE)
		avail_start = MP_TRAMP_DATA + PAGE_SIZE;
#endif

#if (NACPI > 0 && !defined(SMALL_KERNEL))
	if (avail_start < ACPI_TRAMPOLINE + PAGE_SIZE)
		avail_start = ACPI_TRAMPOLINE + PAGE_SIZE;
	if (avail_start < ACPI_TRAMP_DATA + PAGE_SIZE)
		avail_start = ACPI_TRAMP_DATA + PAGE_SIZE;
#endif

#ifdef HIBERNATE
	if (avail_start < HIBERNATE_HIBALLOC_PAGE + PAGE_SIZE)
		avail_start = HIBERNATE_HIBALLOC_PAGE + PAGE_SIZE;
#endif /* HIBERNATE */

	/*
	 * We need to go through the BIOS memory map given, and
	 * fill out mem_clusters and mem_cluster_cnt stuff, taking
	 * into account all the points listed above.
	 */
	avail_end = mem_cluster_cnt = 0;
	for (bmp = bios_memmap; bmp->type != BIOS_MAP_END; bmp++) {
		paddr_t s1, s2, e1, e2;

		/* Ignore non-free memory */
		if (bmp->type != BIOS_MAP_FREE)
			continue;
		if (bmp->size < PAGE_SIZE)
			continue;

		/* Init our segment(s), round/trunc to pages */
		s1 = round_page(bmp->addr);
		e1 = trunc_page(bmp->addr + bmp->size);
		s2 = e2 = 0;

		/*
		 * XXX Some buggy ACPI BIOSes use memory that they
		 * declare as free.  Current worst offender is
		 * Supermicro 5019D-FTN4.  Typically the affected memory
		 * areas are small blocks between areas reserved for
		 * ACPI and other BIOS goo.  So skip areas smaller
		 * than 32 MB above the 16 MB boundary (to avoid
		 * affecting legacy stuff).
		 */
		if (s1 > 16*1024*1024 && (e1 - s1) < 32*1024*1024)
			continue;

		/* Check and adjust our segment(s) */
		/* Nuke low pages */
		if (s1 < avail_start) {
			s1 = avail_start;
			if (s1 > e1)
				continue;
		}

		/*
		 * The direct map is limited to 512GB * NUM_L4_SLOT_DIRECT of
		 * memory, so discard anything above that.
		 */
		if (e1 >= max_dm_size) {
			e1 = max_dm_size;
			if (s1 > e1)
				continue;
		}

		/* Crop stuff into "640K hole" */
		if (s1 < IOM_BEGIN && e1 > IOM_BEGIN)
			e1 = IOM_BEGIN;
		if (s1 < biosbasemem && e1 > biosbasemem)
			e1 = biosbasemem;

		/* Split any segments straddling the 16MB boundary */
		if (s1 < 16*1024*1024 && e1 > 16*1024*1024) {
			e2 = e1;
			s2 = e1 = 16*1024*1024;
		}
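		/*
		 * (the split above keeps memory below the 16MB ISA DMA
		 *  limit in its own cluster, for isa_constraint's sake)
		 */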

		/* Store segment(s) */
		if (e1 - s1 >= PAGE_SIZE) {
			mem_clusters[mem_cluster_cnt].start = s1;
			mem_clusters[mem_cluster_cnt].size = e1 - s1;
			mem_cluster_cnt++;
		}
		if (e2 - s2 >= PAGE_SIZE) {
			mem_clusters[mem_cluster_cnt].start = s2;
			mem_clusters[mem_cluster_cnt].size = e2 - s2;
			mem_cluster_cnt++;
		}
		if (avail_end < e1) avail_end = e1;
		if (avail_end < e2) avail_end = e2;
	}

	/*
	 * Call pmap initialization to make new kernel address space.
	 * We must do this before loading pages into the VM system.
	 */
	first_avail = pmap_bootstrap(first_avail, trunc_page(avail_end));

#if NEFI > 0
	/* Relocate the EFI memory map. */
	if (bios_efiinfo && bios_efiinfo->mmap_start) {
		mmap = (EFI_MEMORY_DESCRIPTOR *)PMAP_DIRECT_MAP(first_avail);
		memcpy(mmap, (void *)PMAP_DIRECT_MAP(bios_efiinfo->mmap_start),
		    bios_efiinfo->mmap_size);
		first_avail += round_page(bios_efiinfo->mmap_size);
	}
#endif

	/* Allocate these out of the 640KB base memory */
	if (avail_start != PAGE_SIZE)
		avail_start = pmap_prealloc_lowmem_ptps(avail_start);

	cpu_init_extents();

	/* Make sure the end of the space used by the kernel is rounded. */
	first_avail = round_page(first_avail);
	kern_end = KERNBASE + first_avail;

	/*
	 * Now, load the memory clusters (which have already been
	 * flensed) into the VM system.
	 */
	for (x = 0; x < mem_cluster_cnt; x++) {
		paddr_t seg_start = mem_clusters[x].start;
		paddr_t seg_end = seg_start + mem_clusters[x].size;

		if (seg_start < first_avail) seg_start = first_avail;
		if (seg_start > seg_end) continue;
		if (seg_end - seg_start < PAGE_SIZE) continue;

		physmem += atop(mem_clusters[x].size);

#if DEBUG_MEMLOAD
		printf("loading 0x%lx-0x%lx (0x%lx-0x%lx)\n",
		    seg_start, seg_end, atop(seg_start), atop(seg_end));
#endif
		uvm_page_physload(atop(seg_start), atop(seg_end),
		    atop(seg_start), atop(seg_end), 0);
	}

	/*
	 * Now, load the memory between the end of I/O memory "hole"
	 * and the kernel.
	 */
	{
		paddr_t seg_start = round_page(IOM_END);
		paddr_t seg_end = trunc_page(KERNTEXTOFF - KERNBASE);

		if (seg_start < seg_end) {
#if DEBUG_MEMLOAD
			printf("loading 0x%lx-0x%lx\n", seg_start, seg_end);
#endif
			uvm_page_physload(atop(seg_start), atop(seg_end),
			    atop(seg_start), atop(seg_end), 0);
		}
	}

#if DEBUG_MEMLOAD
	printf("avail_start = 0x%lx\n", avail_start);
	printf("avail_end = 0x%lx\n", avail_end);
	printf("first_avail = 0x%lx\n", first_avail);
#endif

	/*
	 * Steal memory for the message buffer (at end of core).
	 */
	{
		struct vm_physseg *vps = NULL;
		psize_t sz = round_page(MSGBUFSIZE);
		psize_t reqsz = sz;

		for (x = 0; x < vm_nphysseg; x++) {
			vps = &vm_physmem[x];
			if (ptoa(vps->avail_end) == avail_end)
				break;
		}
		if (x == vm_nphysseg)
			panic("init_x86_64: can't find end of memory");

		/* Shrink so it'll fit in the last segment. */
		if ((vps->avail_end - vps->avail_start) < atop(sz))
			sz = ptoa(vps->avail_end - vps->avail_start);

		vps->avail_end -= atop(sz);
		vps->end -= atop(sz);
		msgbuf_paddr = ptoa(vps->avail_end);

		/* Remove the last segment if it now has no pages. */
		if (vps->start == vps->end) {
			for (vm_nphysseg--; x < vm_nphysseg; x++)
				vm_physmem[x] = vm_physmem[x + 1];
		}

		/* Now find where the new avail_end is. */
		for (avail_end = 0, x = 0; x < vm_nphysseg; x++)
			if (vm_physmem[x].avail_end > avail_end)
				avail_end = vm_physmem[x].avail_end;
		avail_end = ptoa(avail_end);

		/* Warn if the message buffer had to be shrunk. */
		if (sz != reqsz)
			printf("WARNING: %ld bytes not available for msgbuf "
			    "in last cluster (%ld used)\n", reqsz, sz);
	}

	/*
	 * Steal some memory for a dump bouncebuffer if we have memory over
	 * the 32-bit barrier.
	 */
	if (avail_end > 0xffffffff) {
		struct vm_physseg *vps = NULL;
		psize_t sz = round_page(MAX(BYTES_PER_DUMP, dbtob(1)));

		/* XXX assumes segments are ordered */
		for (x = 0; x < vm_nphysseg; x++) {
			vps = &vm_physmem[x];
			/* Find something between 16meg and 4gig */
			if (ptoa(vps->avail_end) <= 0xffffffff &&
			    ptoa(vps->avail_start) >= 0xffffff)
				break;
		}
		if (x == vm_nphysseg)
			panic("init_x86_64: no memory between "
			    "0xffffff-0xffffffff");

		/* Shrink so it'll fit in the segment. */
		if ((vps->avail_end - vps->avail_start) < atop(sz))
			sz = ptoa(vps->avail_end - vps->avail_start);

		vps->avail_end -= atop(sz);
		vps->end -= atop(sz);
		dumpmem_paddr = ptoa(vps->avail_end);
		dumpmem_vaddr = PMAP_DIRECT_MAP(dumpmem_paddr);
		dumpmem_sz = sz;

		/* Remove the last segment if it now has no pages. */
		if (vps->start == vps->end) {
			for (vm_nphysseg--; x < vm_nphysseg; x++)
				vm_physmem[x] = vm_physmem[x + 1];
		}
	}

	pmap_growkernel(VM_MIN_KERNEL_ADDRESS + 32 * 1024 * 1024);

	pmap_kenter_pa(idt_vaddr, idt_paddr, PROT_READ | PROT_WRITE);

	idt = (struct gate_descriptor *)idt_vaddr;
	cpu_info_primary.ci_tss = &cpu_info_full_primary.cif_tss;
	cpu_info_primary.ci_gdt = &cpu_info_full_primary.cif_gdt;

	/* make gdt gates and memory segments */
	set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GCODE_SEL), 0,
	    0xfffff, SDT_MEMERA, SEL_KPL, 1, 0, 1);

	set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GDATA_SEL), 0,
	    0xfffff, SDT_MEMRWA, SEL_KPL, 1, 0, 1);

	set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GUDATA_SEL), 0,
	    atop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMRWA, SEL_UPL, 1, 0, 1);

	set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GUCODE_SEL), 0,
	    atop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMERA, SEL_UPL, 1, 0, 1);

	set_sys_segment(GDT_ADDR_SYS(cpu_info_primary.ci_gdt, GPROC0_SEL),
	    cpu_info_primary.ci_tss, sizeof (struct x86_64_tss)-1,
	    SDT_SYS386TSS, SEL_KPL, 0);

	/* exceptions */
	for (x = 0; x < 32; x++) {
		/* trap2 == NMI, trap8 == double fault */
		ist = (x == 2) ? 2 : (x == 8) ? 1 : 0;
		setgate(&idt[x], IDTVEC(exceptions)[x], ist, SDT_SYS386IGT,
		    (x == 3) ? SEL_UPL : SEL_KPL,
		    GSEL(GCODE_SEL, SEL_KPL));
		idt_allocmap[x] = 1;
	}
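	/*
	 * (ist 1 and 2 select dedicated stacks from the TSS, so double
	 *  faults and NMIs are handled on known-good stacks)
	 */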
1782 
1783 	setregion(&region, cpu_info_primary.ci_gdt, GDT_SIZE - 1);
1784 	lgdt(&region);
1785 
1786 	cpu_init_idt();
1787 
1788 	intr_default_setup();
1789 
1790 	fpuinit(&cpu_info_primary);
1791 
1792 	softintr_init();
1793 	splraise(IPL_IPI);
1794 	intr_enable();
1795 
1796 #ifdef DDB
1797 	db_machine_init();
1798 	ddb_init();
1799 	if (boothowto & RB_KDB)
1800 		db_enter();
1801 #endif
1802 }
1803 
void
cpu_reset(void)
{
	intr_disable();

	if (cpuresetfn)
		(*cpuresetfn)();

	/*
	 * The keyboard controller has 4 random output pins, one of which is
	 * connected to the RESET pin on the CPU in many PCs.  We tell the
	 * keyboard controller to pulse this line a couple of times.
	 */
	outb(IO_KBD + KBCMDP, KBC_PULSE0);
	delay(100000);
	outb(IO_KBD + KBCMDP, KBC_PULSE0);
	delay(100000);

	/*
	 * Try to cause a triple fault and watchdog reset by making the IDT
	 * invalid and causing a fault.
	 */
	memset((caddr_t)idt, 0, NIDT * sizeof(idt[0]));
	__asm volatile("divl %0,%1" : : "q" (0), "a" (0));

	for (;;)
		continue;
	/* NOTREACHED */
}

/*
 * cpu_dumpsize: calculate size of machine-dependent kernel core dump
 * headers.  Returns the size in disk blocks (always 1), or -1 if the
 * headers do not fit in a single block.
 */
int
cpu_dumpsize(void)
{
	int size;

	size = ALIGN(sizeof(kcore_seg_t)) +
	    ALIGN(mem_cluster_cnt * sizeof(phys_ram_seg_t));
	if (roundup(size, dbtob(1)) != dbtob(1))
		return (-1);

	return (1);
}

/*
 * cpu_dump_mempagecnt: calculate the size of RAM (in pages) to be dumped.
 */
u_long
cpu_dump_mempagecnt(void)
{
	u_long i, n;

	n = 0;
	for (i = 0; i < mem_cluster_cnt; i++)
		n += atop(mem_clusters[i].size);
	return (n);
}

/*
 * Figure out which portions of memory are used by the kernel/system.
 */
int
amd64_pa_used(paddr_t addr)
{
	struct vm_page	*pg;

	/* Kernel manages these */
	if ((pg = PHYS_TO_VM_PAGE(addr)) && (pg->pg_flags & PG_DEV) == 0)
		return 1;

	/* Kernel is loaded here */
	if (addr > IOM_END && addr < (kern_end - KERNBASE))
		return 1;

	/* Low memory used for various bootstrap things */
	if (addr < avail_start)
		return 1;

	/*
	 * The only regions I can think of that are left are the things
	 * we steal away from UVM.  The message buffer?
	 * XXX - ignore these for now.
	 */

	return 0;
}

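/*
 * Clock initialization and startup go through function pointers so the
 * attached clock driver (e.g. the local APIC timer, with the i8254 as
 * the fallback) can substitute its own routines.
 */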
void
cpu_initclocks(void)
{
	(*initclock_func)();
}

void
cpu_startclock(void)
{
	(*startclock_func)();
}

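/*
 * Ask for a reschedule on the given CPU: record the request, post an
 * AST on the running process and kick the CPU so an idle or remote
 * processor notices promptly.
 */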
void
need_resched(struct cpu_info *ci)
{
	ci->ci_want_resched = 1;

	/* There's a risk we'll be called before the idle threads start */
	if (ci->ci_curproc) {
		aston(ci->ci_curproc);
		cpu_kick(ci);
	}
}

/*
 * Allocate an IDT vector slot within the given range.
 * XXX needs locking to avoid MP allocation races.
 */

int
idt_vec_alloc(int low, int high)
{
	int vec;

	for (vec = low; vec <= high; vec++) {
		if (idt_allocmap[vec] == 0) {
			idt_allocmap[vec] = 1;
			return vec;
		}
	}
	return 0;
}

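/*
 * Allocate "num" (a power of two) consecutive, naturally aligned IDT
 * vectors within [low, high]: round low up and high down to the block
 * alignment (e.g. num = 4 maps low = 13 to 16 and high = 30 to 27),
 * then scan each aligned block for a fully free run.
 */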
int
idt_vec_alloc_range(int low, int high, int num)
{
	int i, vec;

	KASSERT(powerof2(num));
	low = (low + num - 1) & ~(num - 1);
	high = ((high + 1) & ~(num - 1)) - 1;

	for (vec = low; vec <= high; vec += num) {
		for (i = 0; i < num; i++) {
			if (idt_allocmap[vec + i] != 0)
				break;
		}
		if (i == num) {
			for (i = 0; i < num; i++)
				idt_allocmap[vec + i] = 1;
			return vec;
		}
	}
	return 0;
}

void
idt_vec_set(int vec, void (*function)(void))
{
	/*
	 * Vector should be allocated, so no locking needed.
	 */
	KASSERT(idt_allocmap[vec] == 1);
	setgate(&idt[vec], function, 0, SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
}

void
idt_vec_free(int vec)
{
	unsetgate(&idt[vec]);
	idt_allocmap[vec] = 0;
}

#ifdef DIAGNOSTIC
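/*
 * Back end of splassert(9): the current IPL must be at least "wantipl",
 * and any interrupt handler we are running inside must itself have been
 * entered at or below that level.
 */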
void
splassert_check(int wantipl, const char *func)
{
	int cpl = curcpu()->ci_ilevel;
	int floor = curcpu()->ci_handled_intr_level;

	if (cpl < wantipl) {
		splassert_fail(wantipl, cpl, func);
	}
	if (floor > wantipl) {
		splassert_fail(wantipl, floor, func);
	}
}
#endif

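/*
 * Atomically fetch one aligned 32-bit word from user space.  Requiring
 * natural alignment guarantees the access cannot span a page boundary,
 * so a single load suffices.
 */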
int
copyin32(const uint32_t *uaddr, uint32_t *kaddr)
{
	if ((vaddr_t)uaddr & 0x3)
		return EFAULT;

	/* copyin(9) is atomic */
	return copyin(uaddr, kaddr, sizeof(uint32_t));
}

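/*
 * Walk the boot argument list handed over by the boot loader: a packed
 * sequence of (type, size, payload) records terminated by BOOTARG_END
 * or by the end of the buffer, latching each known record type.
 */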
void
getbootinfo(char *bootinfo, int bootinfo_size)
{
	bootarg32_t *q;
	bios_ddb_t *bios_ddb;
	bios_bootduid_t *bios_bootduid;
	bios_bootsr_t *bios_bootsr;
#undef BOOTINFO_DEBUG
#ifdef BOOTINFO_DEBUG
	printf("bootargv:");
#endif

	for (q = (bootarg32_t *)bootinfo;
	    (q->ba_type != BOOTARG_END) &&
	    ((((char *)q) - bootinfo) < bootinfo_size);
	    q = (bootarg32_t *)(((char *)q) + q->ba_size)) {

		switch (q->ba_type) {
		case BOOTARG_MEMMAP:
			bios_memmap = (bios_memmap_t *)q->ba_arg;
#ifdef BOOTINFO_DEBUG
			printf(" memmap %p", bios_memmap);
#endif
			break;
		case BOOTARG_DISKINFO:
			bios_diskinfo = (bios_diskinfo_t *)q->ba_arg;
#ifdef BOOTINFO_DEBUG
			printf(" diskinfo %p", bios_diskinfo);
#endif
			break;
		case BOOTARG_APMINFO:
			/* generated by i386 boot loader */
			break;
		case BOOTARG_CKSUMLEN:
			bios_cksumlen = *(u_int32_t *)q->ba_arg;
#ifdef BOOTINFO_DEBUG
			printf(" cksumlen %d", bios_cksumlen);
#endif
			break;
		case BOOTARG_PCIINFO:
			/* generated by i386 boot loader */
			break;
		case BOOTARG_CONSDEV: {
#if NCOM > 0
			bios_consdev_t *cdp = (bios_consdev_t*)q->ba_arg;
			static const int ports[] =
			    { 0x3f8, 0x2f8, 0x3e8, 0x2e8 };
			int unit = minor(cdp->consdev);
			uint64_t consaddr = cdp->consaddr;
			if (consaddr == -1 && unit >= 0 && unit < nitems(ports))
				consaddr = ports[unit];
			if (major(cdp->consdev) == 8 && consaddr != -1) {
				comconsunit = unit;
				comconsaddr = consaddr;
				comconsrate = cdp->conspeed;
				comconsfreq = cdp->consfreq;
				comcons_reg_width = cdp->reg_width;
				comcons_reg_shift = cdp->reg_shift;
				if (cdp->flags & BCD_MMIO)
					comconsiot = X86_BUS_SPACE_MEM;
				else
					comconsiot = X86_BUS_SPACE_IO;
			}
#endif
#ifdef BOOTINFO_DEBUG
			printf(" console 0x%x:%d", cdp->consdev, cdp->conspeed);
#endif
			break;
		}
		case BOOTARG_BOOTMAC:
			bios_bootmac = (bios_bootmac_t *)q->ba_arg;
			break;

		case BOOTARG_DDB:
			bios_ddb = (bios_ddb_t *)q->ba_arg;
#ifdef DDB
			db_console = bios_ddb->db_console;
#endif
			break;

		case BOOTARG_BOOTDUID:
			bios_bootduid = (bios_bootduid_t *)q->ba_arg;
			memcpy(bootduid, bios_bootduid, sizeof(bootduid));
			break;

		case BOOTARG_BOOTSR:
			bios_bootsr = (bios_bootsr_t *)q->ba_arg;
#if NSOFTRAID > 0
			memcpy(&sr_bootuuid, &bios_bootsr->uuid,
			    sizeof(sr_bootuuid));
			memcpy(&sr_bootkey, &bios_bootsr->maskkey,
			    sizeof(sr_bootkey));
#endif
			explicit_bzero(bios_bootsr, sizeof(bios_bootsr_t));
			break;

		case BOOTARG_EFIINFO:
			bios_efiinfo = (bios_efiinfo_t *)q->ba_arg;
			break;

		case BOOTARG_UCODE:
			bios_ucode = (bios_ucode_t *)q->ba_arg;
			break;

		default:
#ifdef BOOTINFO_DEBUG
			printf(" unsupported arg (%d) %p", q->ba_type,
			    q->ba_arg);
#endif
			break;
		}
	}
#ifdef BOOTINFO_DEBUG
	printf("\n");
#endif
}

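/*
 * Sanity-check user-supplied register state before it is loaded into a
 * trapframe: no privileged rflags bits may change, the code and stack
 * selectors must be valid user selectors, and %rip must point into
 * user virtual address space.
 */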
int
check_context(const struct reg *regs, struct trapframe *tf)
{
	uint16_t sel;

	if (((regs->r_rflags ^ tf->tf_rflags) & PSL_USERSTATIC) != 0)
		return EINVAL;

	sel = regs->r_ss & 0xffff;
	if (!VALID_USER_DSEL(sel))
		return EINVAL;

	sel = regs->r_cs & 0xffff;
	if (!VALID_USER_CSEL(sel))
		return EINVAL;

	if (regs->r_rip >= VM_MAXUSER_ADDRESS)
		return EINVAL;

	return 0;
}

int amd64_delay_quality;

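/*
 * delay(9) backend selection: timer drivers register their busy-wait
 * routine together with a quality score, and the highest quality seen
 * so far wins; if that timer goes away we fall back to the i8254.
 */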
void
delay_init(void(*fn)(int), int fn_quality)
{
	if (fn_quality > amd64_delay_quality) {
		delay_func = fn;
		amd64_delay_quality = fn_quality;
	}
}

void
delay_fini(void (*fn)(int))
{
	if (fn == delay_func) {
		delay_func = i8254_delay;
		amd64_delay_quality = 0;
	}
}