1 /* $OpenBSD: machdep.c,v 1.294 2024/06/07 16:53:35 kettenis Exp $ */
2 /* $NetBSD: machdep.c,v 1.3 2003/05/07 22:58:18 fvdl Exp $ */
3
4 /*-
5 * Copyright (c) 1996, 1997, 1998, 2000 The NetBSD Foundation, Inc.
6 * All rights reserved.
7 *
8 * This code is derived from software contributed to The NetBSD Foundation
9 * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace
10 * Simulation Facility, NASA Ames Research Center.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
25 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 /*-
35 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
36 * All rights reserved.
37 *
38 * This code is derived from software contributed to Berkeley by
39 * William Jolitz.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)machdep.c 7.4 (Berkeley) 6/3/91
66 */
67
68 #include <sys/param.h>
69 #include <sys/systm.h>
70 #include <sys/signal.h>
71 #include <sys/signalvar.h>
72 #include <sys/proc.h>
73 #include <sys/user.h>
74 #include <sys/exec.h>
75 #include <sys/buf.h>
76 #include <sys/reboot.h>
77 #include <sys/conf.h>
78 #include <sys/msgbuf.h>
79 #include <sys/mount.h>
80 #include <sys/extent.h>
81 #include <sys/core.h>
82 #include <sys/kcore.h>
83 #include <sys/syscallargs.h>
84
85 #include <dev/cons.h>
86 #include <stand/boot/bootarg.h>
87
88 #include <net/if.h>
89 #include <uvm/uvm_extern.h>
90
91 #include <sys/sysctl.h>
92
93 #include <machine/cpu_full.h>
94 #include <machine/cpufunc.h>
95 #include <machine/pio.h>
96 #include <machine/psl.h>
97 #include <machine/reg.h>
98 #include <machine/fpu.h>
99 #include <machine/biosvar.h>
100 #include <machine/mpbiosvar.h>
101 #include <machine/kcore.h>
102 #include <machine/tss.h>
103
104 #include <dev/isa/isareg.h>
105 #include <dev/ic/i8042reg.h>
106
107 #ifdef DDB
108 #include <machine/db_machdep.h>
109 #include <ddb/db_extern.h>
110 extern int db_console;
111 #endif
112
113 #include "isa.h"
114 #include "isadma.h"
115 #include "ksyms.h"
116
117 #include "acpi.h"
118 #if NACPI > 0
119 #include <dev/acpi/acpivar.h>
120 #endif
121
122 #include "com.h"
123 #if NCOM > 0
124 #include <sys/tty.h>
125 #include <dev/ic/comvar.h>
126 #include <dev/ic/comreg.h>
127 #endif
128
129 #include "efi.h"
130 #if NEFI > 0
131 #include <dev/efi/efi.h>
132 #endif
133
134 #include "softraid.h"
135 #if NSOFTRAID > 0
136 #include <dev/softraidvar.h>
137 #endif
138
139 #ifdef HIBERNATE
140 #include <machine/hibernate_var.h>
141 #endif /* HIBERNATE */
142
143 #include "ukbd.h"
144 #include "pckbc.h"
145 #if NPCKBC > 0 && NUKBD > 0
146 #include <dev/ic/pckbcvar.h>
147 #endif
148
/* #define MACHDEP_DEBUG */

/*
 * DPRINTF: debug-only printf.  Forwards its arguments to printf() when
 * MACHDEP_DEBUG is defined and expands to nothing otherwise.
 */
#ifndef MACHDEP_DEBUG
#define DPRINTF(x...)
#else
#define DPRINTF(x...)	do { printf(x); } while(0)
#endif /* !MACHDEP_DEBUG */
156
157 /* the following is used externally (sysctl_hw) */
158 char machine[] = MACHINE;
159
160 /*
161 * switchto vectors
162 */
163 void cpu_idle_cycle_hlt(void);
164 void (*cpu_idle_cycle_fcn)(void) = &cpu_idle_cycle_hlt;
165 void (*cpu_suspend_cycle_fcn)(void);
166
167 /* the following is used externally for concurrent handlers */
168 int setperf_prio = 0;
169
170 #ifdef CPURESET_DELAY
171 int cpureset_delay = CPURESET_DELAY;
172 #else
173 int cpureset_delay = 0;
174 #endif
175
176 char *ssym = 0, *esym = 0; /* start and end of symbol table */
177 dev_t bootdev = 0; /* device we booted from */
178 int biosbasemem = 0; /* base memory reported by BIOS */
179 u_int bootapiver = 0; /* /boot API version */
180
181 int physmem;
182 extern int boothowto;
183
184 paddr_t dumpmem_paddr;
185 vaddr_t dumpmem_vaddr;
186 psize_t dumpmem_sz;
187
188 vaddr_t kern_end;
189
190 vaddr_t msgbuf_vaddr;
191 paddr_t msgbuf_paddr;
192
193 vaddr_t idt_vaddr;
194 paddr_t idt_paddr;
195
196 vaddr_t lo32_vaddr;
197 paddr_t lo32_paddr;
198 paddr_t tramp_pdirpa;
199
200 int kbd_reset;
201 int lid_action = 1;
202 int pwr_action = 1;
203 int forceukbd;
204
205 /*
206 * safepri is a safe priority for sleep to set for a spin-wait
207 * during autoconfiguration or after a panic.
208 */
209 int safepri = 0;
210
211 struct vm_map *exec_map = NULL;
212 struct vm_map *phys_map = NULL;
213
214 /* UVM constraint ranges. */
215 struct uvm_constraint_range isa_constraint = { 0x0, 0x00ffffffUL };
216 struct uvm_constraint_range dma_constraint = { 0x0, 0xffffffffUL };
217 struct uvm_constraint_range *uvm_md_constraints[] = {
218 &isa_constraint,
219 &dma_constraint,
220 NULL,
221 };
222
223 paddr_t avail_start;
224 paddr_t avail_end;
225
226 void (*delay_func)(int) = i8254_delay;
227 void (*initclock_func)(void) = i8254_initclocks;
228 void (*startclock_func)(void) = i8254_start_both_clocks;
229
230 /*
231 * Format of boot information passed to us by 32-bit /boot
232 */
233 typedef struct _boot_args32 {
234 int ba_type;
235 int ba_size;
236 int ba_nextX; /* a ptr in 32-bit world, but not here */
237 char ba_arg[1];
238 } bootarg32_t;
239
240 #define BOOTARGC_MAX NBPG /* one page */
241
242 bios_bootmac_t *bios_bootmac;
243
244 /* locore copies the arguments from /boot to here for us */
245 char bootinfo[BOOTARGC_MAX];
246 int bootinfo_size = BOOTARGC_MAX;
247
248 void getbootinfo(char *, int);
249
250 /* Data passed to us by /boot, filled in by getbootinfo() */
251 bios_diskinfo_t *bios_diskinfo;
252 bios_memmap_t *bios_memmap;
253 u_int32_t bios_cksumlen;
254 bios_efiinfo_t *bios_efiinfo;
255 bios_ucode_t *bios_ucode;
256
257 #if NEFI > 0
258 EFI_MEMORY_DESCRIPTOR *mmap;
259 #endif
260
261 /*
262 * Size of memory segments, before any memory is stolen.
263 */
264 phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX];
265 int mem_cluster_cnt;
266
267 int cpu_dump(void);
268 int cpu_dumpsize(void);
269 u_long cpu_dump_mempagecnt(void);
270 void dumpsys(void);
271 void cpu_init_extents(void);
272 void map_tramps(void);
273 void init_x86_64(paddr_t);
274 void (*cpuresetfn)(void);
275 void enter_shared_special_pages(void);
276
277 #ifdef APERTURE
278 int allowaperture = 0;
279 #endif
280
281 /*
282 * Machine-dependent startup code
283 */
284 void
cpu_startup(void)285 cpu_startup(void)
286 {
287 vaddr_t minaddr, maxaddr;
288
289 msgbuf_vaddr = PMAP_DIRECT_MAP(msgbuf_paddr);
290 initmsgbuf((caddr_t)msgbuf_vaddr, round_page(MSGBUFSIZE));
291
292 printf("%s", version);
293 startclocks();
294 rtcinit();
295
296 printf("real mem = %lu (%luMB)\n", ptoa((psize_t)physmem),
297 ptoa((psize_t)physmem)/1024/1024);
298
299 /*
300 * Allocate a submap for exec arguments. This map effectively
301 * limits the number of processes exec'ing at any time.
302 */
303 minaddr = vm_map_min(kernel_map);
304 exec_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
305 16*NCARGS, VM_MAP_PAGEABLE, FALSE, NULL);
306
307 /*
308 * Allocate a submap for physio
309 */
310 minaddr = vm_map_min(kernel_map);
311 phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
312 VM_PHYS_SIZE, 0, FALSE, NULL);
313
314 printf("avail mem = %lu (%luMB)\n", ptoa((psize_t)uvmexp.free),
315 ptoa((psize_t)uvmexp.free)/1024/1024);
316
317 bufinit();
318
319 if (boothowto & RB_CONFIG) {
320 #ifdef BOOT_CONFIG
321 user_config();
322 #else
323 printf("kernel does not support -c; continuing..\n");
324 #endif
325 }
326
327 /* Safe for i/o port / memory space allocation to use malloc now. */
328 x86_bus_space_mallocok();
329
330 #ifndef SMALL_KERNEL
331 cpu_ucode_setup();
332 cpu_ucode_apply(&cpu_info_primary);
333 #endif
334 cpu_tsx_disable(&cpu_info_primary);
335
336 /* enter the IDT and trampoline code in the u-k maps */
337 enter_shared_special_pages();
338
339 /* initialize CPU0's TSS and GDT and put them in the u-k maps */
340 cpu_enter_pages(&cpu_info_full_primary);
341 }
342
343 /*
344 * enter_shared_special_pages
345 *
346 * Requests mapping of various special pages required in the Intel Meltdown
347 * case (to be entered into the U-K page table):
348 *
349 * 1 IDT page
350 * Various number of pages covering the U-K ".kutext" section. This section
351 * contains code needed during trampoline operation
352 * Various number of pages covering the U-K ".kudata" section. This section
353 * contains data accessed by the trampoline, before switching to U+K
354 * (for example, various shared global variables used by IPIs, etc)
355 *
356 * The linker script places the required symbols in the sections above.
357 *
358 * On CPUs not affected by Meltdown, the calls to pmap_enter_special below
359 * become no-ops.
360 */
361 void
enter_shared_special_pages(void)362 enter_shared_special_pages(void)
363 {
364 extern char __kutext_start[], __kutext_end[], __kernel_kutext_phys[];
365 extern char __text_page_start[], __text_page_end[];
366 extern char __kernel_kutext_page_phys[];
367 extern char __kudata_start[], __kudata_end[], __kernel_kudata_phys[];
368 vaddr_t va;
369 paddr_t pa;
370
371 /* idt */
372 pmap_enter_special(idt_vaddr, idt_paddr, PROT_READ);
373 DPRINTF("%s: entered idt page va 0x%llx pa 0x%llx\n", __func__,
374 (uint64_t)idt_vaddr, (uint64_t)idt_paddr);
375
376 /* .kutext section */
377 va = (vaddr_t)__kutext_start;
378 pa = (paddr_t)__kernel_kutext_phys;
379 while (va < (vaddr_t)__kutext_end) {
380 pmap_enter_special(va, pa, PROT_READ | PROT_EXEC);
381 DPRINTF("%s: entered kutext page va 0x%llx pa 0x%llx\n",
382 __func__, (uint64_t)va, (uint64_t)pa);
383 va += PAGE_SIZE;
384 pa += PAGE_SIZE;
385 }
386
387 /* .kutext.page section */
388 va = (vaddr_t)__text_page_start;
389 pa = (paddr_t)__kernel_kutext_page_phys;
390 while (va < (vaddr_t)__text_page_end) {
391 pmap_enter_special(va, pa, PROT_READ | PROT_EXEC);
392 DPRINTF("%s: entered kutext.page va 0x%llx pa 0x%llx\n",
393 __func__, (uint64_t)va, (uint64_t)pa);
394 va += PAGE_SIZE;
395 pa += PAGE_SIZE;
396 }
397
398 /* .kudata section */
399 va = (vaddr_t)__kudata_start;
400 pa = (paddr_t)__kernel_kudata_phys;
401 while (va < (vaddr_t)__kudata_end) {
402 pmap_enter_special(va, pa, PROT_READ | PROT_WRITE);
403 DPRINTF("%s: entered kudata page va 0x%llx pa 0x%llx\n",
404 __func__, (uint64_t)va, (uint64_t)pa);
405 va += PAGE_SIZE;
406 pa += PAGE_SIZE;
407 }
408 }
409
410 /*
411 * Set up proc0's PCB and the cpu's TSS.
412 */
413 void
x86_64_proc0_tss_ldt_init(void)414 x86_64_proc0_tss_ldt_init(void)
415 {
416 struct pcb *pcb;
417
418 cpu_info_primary.ci_curpcb = pcb = &proc0.p_addr->u_pcb;
419 pcb->pcb_fsbase = 0;
420 pcb->pcb_kstack = (u_int64_t)proc0.p_addr + USPACE - 16;
421 proc0.p_md.md_regs = (struct trapframe *)pcb->pcb_kstack - 1;
422
423 ltr(GSYSSEL(GPROC0_SEL, SEL_KPL));
424 lldt(0);
425 }
426
427 bios_diskinfo_t *
bios_getdiskinfo(dev_t dev)428 bios_getdiskinfo(dev_t dev)
429 {
430 bios_diskinfo_t *pdi;
431
432 if (bios_diskinfo == NULL)
433 return NULL;
434
435 for (pdi = bios_diskinfo; pdi->bios_number != -1; pdi++) {
436 if ((dev & B_MAGICMASK) == B_DEVMAGIC) { /* search by bootdev */
437 if (pdi->bsd_dev == dev)
438 break;
439 } else {
440 if (pdi->bios_number == dev)
441 break;
442 }
443 }
444
445 if (pdi->bios_number == -1)
446 return NULL;
447 else
448 return pdi;
449 }
450
451 int
bios_sysctl(int * name,u_int namelen,void * oldp,size_t * oldlenp,void * newp,size_t newlen,struct proc * p)452 bios_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
453 size_t newlen, struct proc *p)
454 {
455 bios_diskinfo_t *pdi;
456 int biosdev;
457
458 /* all sysctl names at this level except diskinfo are terminal */
459 if (namelen != 1 && name[0] != BIOS_DISKINFO)
460 return (ENOTDIR); /* overloaded */
461
462 if (!(bootapiver & BAPIV_VECTOR))
463 return EOPNOTSUPP;
464
465 switch (name[0]) {
466 case BIOS_DEV:
467 if ((pdi = bios_getdiskinfo(bootdev)) == NULL)
468 return ENXIO;
469 biosdev = pdi->bios_number;
470 return sysctl_rdint(oldp, oldlenp, newp, biosdev);
471 case BIOS_DISKINFO:
472 if (namelen != 2)
473 return ENOTDIR;
474 if ((pdi = bios_getdiskinfo(name[1])) == NULL)
475 return ENXIO;
476 return sysctl_rdstruct(oldp, oldlenp, newp, pdi, sizeof(*pdi));
477 case BIOS_CKSUMLEN:
478 return sysctl_rdint(oldp, oldlenp, newp, bios_cksumlen);
479 default:
480 return EOPNOTSUPP;
481 }
482 /* NOTREACHED */
483 }
484
485 extern int tsc_is_invariant;
486 extern int amd64_has_xcrypt;
487 extern int need_retpoline;
488
489 const struct sysctl_bounded_args cpuctl_vars[] = {
490 { CPU_LIDACTION, &lid_action, 0, 2 },
491 { CPU_PWRACTION, &pwr_action, 0, 2 },
492 { CPU_CPUID, &cpu_id, SYSCTL_INT_READONLY },
493 { CPU_CPUFEATURE, &cpu_feature, SYSCTL_INT_READONLY },
494 { CPU_XCRYPT, &amd64_has_xcrypt, SYSCTL_INT_READONLY },
495 { CPU_INVARIANTTSC, &tsc_is_invariant, SYSCTL_INT_READONLY },
496 { CPU_RETPOLINE, &need_retpoline, SYSCTL_INT_READONLY },
497 };
498
499 /*
500 * machine dependent system variables.
501 */
502 int
cpu_sysctl(int * name,u_int namelen,void * oldp,size_t * oldlenp,void * newp,size_t newlen,struct proc * p)503 cpu_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
504 size_t newlen, struct proc *p)
505 {
506 extern uint64_t tsc_frequency;
507 dev_t consdev;
508 dev_t dev;
509
510 switch (name[0]) {
511 case CPU_CONSDEV:
512 if (namelen != 1)
513 return (ENOTDIR); /* overloaded */
514 if (cn_tab != NULL)
515 consdev = cn_tab->cn_dev;
516 else
517 consdev = NODEV;
518 return (sysctl_rdstruct(oldp, oldlenp, newp, &consdev,
519 sizeof consdev));
520 case CPU_CHR2BLK:
521 if (namelen != 2)
522 return (ENOTDIR); /* overloaded */
523 dev = chrtoblk((dev_t)name[1]);
524 return sysctl_rdstruct(oldp, oldlenp, newp, &dev, sizeof(dev));
525 case CPU_BIOS:
526 return bios_sysctl(name + 1, namelen - 1, oldp, oldlenp,
527 newp, newlen, p);
528 case CPU_CPUVENDOR:
529 return (sysctl_rdstring(oldp, oldlenp, newp, cpu_vendor));
530 case CPU_KBDRESET:
531 return (sysctl_securelevel_int(oldp, oldlenp, newp, newlen,
532 &kbd_reset));
533 case CPU_ALLOWAPERTURE:
534 if (namelen != 1)
535 return (ENOTDIR); /* overloaded */
536 #ifdef APERTURE
537 if (securelevel > 0)
538 return (sysctl_int_lower(oldp, oldlenp, newp, newlen,
539 &allowaperture));
540 else
541 return (sysctl_int(oldp, oldlenp, newp, newlen,
542 &allowaperture));
543 #else
544 return (sysctl_rdint(oldp, oldlenp, newp, 0));
545 #endif
546 #if NPCKBC > 0 && NUKBD > 0
547 case CPU_FORCEUKBD:
548 {
549 int error;
550
551 if (forceukbd)
552 return (sysctl_rdint(oldp, oldlenp, newp, forceukbd));
553
554 error = sysctl_int(oldp, oldlenp, newp, newlen, &forceukbd);
555 if (forceukbd)
556 pckbc_release_console();
557 return (error);
558 }
559 #endif
560 case CPU_TSCFREQ:
561 return (sysctl_rdquad(oldp, oldlenp, newp, tsc_frequency));
562 default:
563 return (sysctl_bounded_arr(cpuctl_vars, nitems(cpuctl_vars),
564 name, namelen, oldp, oldlenp, newp, newlen));
565 }
566 /* NOTREACHED */
567 }
568
569 static inline void
maybe_enable_user_cet(struct proc * p)570 maybe_enable_user_cet(struct proc *p)
571 {
572 #ifndef SMALL_KERNEL
573 /* Enable indirect-branch tracking if present and not disabled */
574 if ((xsave_mask & XFEATURE_CET_U) &&
575 (p->p_p->ps_flags & PS_NOBTCFI) == 0) {
576 uint64_t msr = rdmsr(MSR_U_CET);
577 wrmsr(MSR_U_CET, msr | MSR_CET_ENDBR_EN | MSR_CET_NO_TRACK_EN);
578 }
579 #endif
580 }
581
582 static inline void
initialize_thread_xstate(struct proc * p)583 initialize_thread_xstate(struct proc *p)
584 {
585 if (cpu_use_xsaves) {
586 xrstors(fpu_cleandata, xsave_mask);
587 maybe_enable_user_cet(p);
588 } else {
589 /* Reset FPU state in PCB */
590 memcpy(&p->p_addr->u_pcb.pcb_savefpu, fpu_cleandata,
591 fpu_save_len);
592
593 if (curcpu()->ci_pflags & CPUPF_USERXSTATE) {
594 /* state in CPU is obsolete; reset it */
595 fpureset();
596 }
597 }
598
599 /* The reset state _is_ the userspace state for this thread now */
600 curcpu()->ci_pflags |= CPUPF_USERXSTATE;
601 }
602
603 /*
604 * Copy out the FPU state, massaging it to be usable from userspace
605 * and acceptable to xrstor_user()
606 */
607 static inline int
copyoutfpu(struct savefpu * sfp,char * sp,size_t len)608 copyoutfpu(struct savefpu *sfp, char *sp, size_t len)
609 {
610 uint64_t bvs[2];
611
612 if (copyout(sfp, sp, len))
613 return 1;
614 if (len > offsetof(struct savefpu, fp_xstate.xstate_bv)) {
615 sp += offsetof(struct savefpu, fp_xstate.xstate_bv);
616 len -= offsetof(struct savefpu, fp_xstate.xstate_bv);
617 bvs[0] = sfp->fp_xstate.xstate_bv & XFEATURE_XCR0_MASK;
618 bvs[1] = sfp->fp_xstate.xstate_xcomp_bv &
619 (XFEATURE_XCR0_MASK | XFEATURE_COMPRESSED);
620 if (copyout(bvs, sp, min(len, sizeof bvs)))
621 return 1;
622 }
623 return 0;
624 }
625
626 /*
627 * Send an interrupt to process.
628 *
629 * Stack is set up to allow sigcode to call routine, followed by
630 * syscall to sigreturn routine below. After sigreturn resets the
631 * signal mask, the stack, and the frame pointer, it returns to the
632 * user specified pc.
633 */
634 int
sendsig(sig_t catcher,int sig,sigset_t mask,const siginfo_t * ksip,int info,int onstack)635 sendsig(sig_t catcher, int sig, sigset_t mask, const siginfo_t *ksip,
636 int info, int onstack)
637 {
638 struct proc *p = curproc;
639 struct trapframe *tf = p->p_md.md_regs;
640 struct sigcontext ksc;
641 struct savefpu *sfp = &p->p_addr->u_pcb.pcb_savefpu;
642 register_t sp, scp, sip;
643 u_long sss;
644
645 memset(&ksc, 0, sizeof ksc);
646 ksc.sc_rdi = tf->tf_rdi;
647 ksc.sc_rsi = tf->tf_rsi;
648 ksc.sc_rdx = tf->tf_rdx;
649 ksc.sc_rcx = tf->tf_rcx;
650 ksc.sc_r8 = tf->tf_r8;
651 ksc.sc_r9 = tf->tf_r9;
652 ksc.sc_r10 = tf->tf_r10;
653 ksc.sc_r11 = tf->tf_r11;
654 ksc.sc_r12 = tf->tf_r12;
655 ksc.sc_r13 = tf->tf_r13;
656 ksc.sc_r14 = tf->tf_r14;
657 ksc.sc_r15 = tf->tf_r15;
658 ksc.sc_rbx = tf->tf_rbx;
659 ksc.sc_rax = tf->tf_rax;
660 ksc.sc_rbp = tf->tf_rbp;
661 ksc.sc_rip = tf->tf_rip;
662 ksc.sc_cs = tf->tf_cs;
663 ksc.sc_rflags = tf->tf_rflags;
664 ksc.sc_rsp = tf->tf_rsp;
665 ksc.sc_ss = tf->tf_ss;
666 ksc.sc_mask = mask;
667
668 /* Allocate space for the signal handler context. */
669 if ((p->p_sigstk.ss_flags & SS_DISABLE) == 0 &&
670 !sigonstack(tf->tf_rsp) && onstack)
671 sp = trunc_page((vaddr_t)p->p_sigstk.ss_sp + p->p_sigstk.ss_size);
672 else
673 sp = tf->tf_rsp - 128;
674
675 sp -= fpu_save_len;
676 if (cpu_use_xsaves)
677 sp &= ~63ULL; /* just in case */
678 else
679 sp &= ~15ULL; /* just in case */
680
681 /* Save FPU state to PCB if necessary, then copy it out */
682 if (curcpu()->ci_pflags & CPUPF_USERXSTATE)
683 fpusave(&p->p_addr->u_pcb.pcb_savefpu);
684 if (copyoutfpu(sfp, (void *)sp, fpu_save_len))
685 return 1;
686
687 initialize_thread_xstate(p);
688
689 ksc.sc_fpstate = (struct fxsave64 *)sp;
690 sss = (sizeof(ksc) + 15) & ~15;
691 sip = 0;
692 if (info) {
693 sip = sp - ((sizeof(*ksip) + 15) & ~15);
694 sss += (sizeof(*ksip) + 15) & ~15;
695
696 if (copyout(ksip, (void *)sip, sizeof(*ksip)))
697 return 1;
698 }
699 scp = sp - sss;
700
701 ksc.sc_cookie = (long)scp ^ p->p_p->ps_sigcookie;
702 if (copyout(&ksc, (void *)scp, sizeof(ksc)))
703 return 1;
704
705 /*
706 * Build context to run handler in.
707 */
708 tf->tf_rax = (u_int64_t)catcher;
709 tf->tf_rdi = sig;
710 tf->tf_rsi = sip;
711 tf->tf_rdx = scp;
712
713 tf->tf_rip = (u_int64_t)p->p_p->ps_sigcode;
714 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
715 tf->tf_rflags &= ~(PSL_T|PSL_D|PSL_VM|PSL_AC);
716 tf->tf_rsp = scp;
717 tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
718
719 return 0;
720 }
721
722 /*
723 * System call to cleanup state after a signal
724 * has been taken. Reset signal mask and
725 * stack state from context left by sendsig (above).
726 * Return to previous pc and psl as specified by
727 * context left by sendsig. Check carefully to
728 * make sure that the user has not modified the
729 * psl to gain improper privileges or to cause
730 * a machine fault.
731 */
732 int
sys_sigreturn(struct proc * p,void * v,register_t * retval)733 sys_sigreturn(struct proc *p, void *v, register_t *retval)
734 {
735 struct sys_sigreturn_args /* {
736 syscallarg(struct sigcontext *) sigcntxp;
737 } */ *uap = v;
738 struct sigcontext ksc, *scp = SCARG(uap, sigcntxp);
739 struct trapframe *tf = p->p_md.md_regs;
740 struct savefpu *sfp = &p->p_addr->u_pcb.pcb_savefpu;
741 int error;
742
743 if (PROC_PC(p) != p->p_p->ps_sigcoderet) {
744 sigexit(p, SIGILL);
745 return (EPERM);
746 }
747
748 if ((error = copyin((caddr_t)scp, &ksc, sizeof ksc)))
749 return (error);
750
751 if (ksc.sc_cookie != ((long)scp ^ p->p_p->ps_sigcookie)) {
752 sigexit(p, SIGILL);
753 return (EFAULT);
754 }
755
756 /* Prevent reuse of the sigcontext cookie */
757 ksc.sc_cookie = 0;
758 (void)copyout(&ksc.sc_cookie, (caddr_t)scp +
759 offsetof(struct sigcontext, sc_cookie), sizeof (ksc.sc_cookie));
760
761 if (((ksc.sc_rflags ^ tf->tf_rflags) & PSL_USERSTATIC) != 0 ||
762 !USERMODE(ksc.sc_cs, ksc.sc_eflags))
763 return (EINVAL);
764
765 /* Current FPU state is obsolete; toss it and force a reload */
766 if (curcpu()->ci_pflags & CPUPF_USERXSTATE) {
767 curcpu()->ci_pflags &= ~CPUPF_USERXSTATE;
768 fpureset();
769 }
770
771 /* Copy in the FPU state to restore */
772 if (__predict_true(ksc.sc_fpstate != NULL)) {
773 if ((error = copyin(ksc.sc_fpstate, sfp, fpu_save_len)))
774 return error;
775 if (xrstor_user(sfp, xsave_mask)) {
776 memcpy(sfp, fpu_cleandata, fpu_save_len);
777 return EINVAL;
778 }
779 maybe_enable_user_cet(p);
780 curcpu()->ci_pflags |= CPUPF_USERXSTATE;
781 } else {
782 /* shouldn't happen, but handle it */
783 initialize_thread_xstate(p);
784 }
785
786 tf->tf_rdi = ksc.sc_rdi;
787 tf->tf_rsi = ksc.sc_rsi;
788 tf->tf_rdx = ksc.sc_rdx;
789 tf->tf_rcx = ksc.sc_rcx;
790 tf->tf_r8 = ksc.sc_r8;
791 tf->tf_r9 = ksc.sc_r9;
792 tf->tf_r10 = ksc.sc_r10;
793 tf->tf_r11 = ksc.sc_r11;
794 tf->tf_r12 = ksc.sc_r12;
795 tf->tf_r13 = ksc.sc_r13;
796 tf->tf_r14 = ksc.sc_r14;
797 tf->tf_r15 = ksc.sc_r15;
798 tf->tf_rbx = ksc.sc_rbx;
799 tf->tf_rax = ksc.sc_rax;
800 tf->tf_rbp = ksc.sc_rbp;
801 tf->tf_rip = ksc.sc_rip;
802 tf->tf_cs = ksc.sc_cs;
803 tf->tf_rflags = ksc.sc_rflags;
804 tf->tf_rsp = ksc.sc_rsp;
805 tf->tf_ss = ksc.sc_ss;
806
807 /* Restore signal mask. */
808 p->p_sigmask = ksc.sc_mask & ~sigcantmask;
809
810 /*
811 * sigreturn() needs to return to userspace via the 'iretq'
812 * method, so that if the process was interrupted (by tick,
813 * an IPI, whatever) as opposed to already being in the kernel
814 * when a signal was being delivered, the process will be
815 * completely restored, including the userland %rcx and %r11
816 * registers which the 'sysretq' instruction cannot restore.
817 * Also need to make sure we can handle faulting on xrstor.
818 */
819 p->p_md.md_flags |= MDP_IRET;
820
821 return (EJUSTRETURN);
822 }
823
#ifdef MULTIPROCESSOR
/*
 * cpu_kick: force a CPU into the kernel, whether or not it's idle.
 *
 * Nothing is done for the calling CPU.  A remote CPU that is idling in
 * mwait only needs its "keep idling" bit cleared; in every other case
 * (no mwait support, or the CPU is not in the idle loop) a NOP IPI is
 * sent.
 */
void
cpu_kick(struct cpu_info *ci)
{
	/* only need to kick other CPUs */
	if (ci == curcpu())
		return;

	if (cpu_mwait_size > 0 && (ci->ci_mwait & MWAIT_IN_IDLE) != 0) {
		/* idling in mwait: just clear the "keep idling" bit */
		atomic_clearbits_int(&ci->ci_mwait, MWAIT_KEEP_IDLING);
		return;
	}

	/* not idling, or no mwait at all: need an IPI */
	x86_send_ipi(ci, X86_IPI_NOP);
}
#endif
848
849 /*
850 * Notify the current process (p) that it has a signal pending,
851 * process as soon as possible.
852 */
853 void
signotify(struct proc * p)854 signotify(struct proc *p)
855 {
856 aston(p);
857 cpu_kick(p->p_cpu);
858 }
859
#ifdef MULTIPROCESSOR
/*
 * cpu_unidle: get CPU ci out of its idle loop.
 *
 * When mwait is available and ci has MWAIT_ONLY set, clearing the
 * "keep idling" bit is all that is needed.  Otherwise a NOP IPI is
 * sent, unless ci is the calling CPU.
 */
void
cpu_unidle(struct cpu_info *ci)
{
	if (cpu_mwait_size > 0 && (ci->ci_mwait & MWAIT_ONLY) != 0) {
		/*
		 * Clearing the bit suffices; had the CPU not been
		 * idling there was nothing to do in the first place.
		 */
		atomic_clearbits_int(&ci->ci_mwait, MWAIT_KEEP_IDLING);
	} else if (ci != curcpu()) {
		x86_send_ipi(ci, X86_IPI_NOP);
	}
}
#endif
877
878 int waittime = -1;
879 struct pcb dumppcb;
880
881 __dead void
boot(int howto)882 boot(int howto)
883 {
884 if ((howto & RB_POWERDOWN) != 0)
885 lid_action = 0;
886
887 if ((howto & RB_RESET) != 0)
888 goto doreset;
889
890 if (cold) {
891 if ((howto & RB_USERREQ) == 0)
892 howto |= RB_HALT;
893 goto haltsys;
894 }
895
896 boothowto = howto;
897 if ((howto & RB_NOSYNC) == 0 && waittime < 0) {
898 waittime = 0;
899 vfs_shutdown(curproc);
900
901 if ((howto & RB_TIMEBAD) == 0) {
902 resettodr();
903 } else {
904 printf("WARNING: not updating battery clock\n");
905 }
906 }
907 if_downall();
908
909 uvm_shutdown();
910 splhigh();
911 cold = 1;
912
913 if ((howto & RB_DUMP) != 0)
914 dumpsys();
915
916 haltsys:
917 config_suspend_all(DVACT_POWERDOWN);
918
919 #ifdef MULTIPROCESSOR
920 x86_broadcast_ipi(X86_IPI_HALT);
921 #endif
922
923 if ((howto & RB_HALT) != 0) {
924 #if NACPI > 0 && !defined(SMALL_KERNEL)
925 extern int acpi_enabled;
926
927 if (acpi_enabled) {
928 delay(500000);
929 if ((howto & RB_POWERDOWN) != 0)
930 acpi_powerdown();
931 }
932 #endif
933 printf("\n");
934 printf("The operating system has halted.\n");
935 printf("Please press any key to reboot.\n\n");
936 cnpollc(1); /* for proper keyboard command handling */
937 cngetc();
938 cnpollc(0);
939 }
940
941 doreset:
942 printf("rebooting...\n");
943 if (cpureset_delay > 0)
944 delay(cpureset_delay * 1000);
945 cpu_reset();
946 for (;;)
947 continue;
948 /* NOTREACHED */
949 }
950
951 /*
952 * These variables are needed by /sbin/savecore
953 */
954 u_long dumpmag = 0x8fca0101; /* magic number */
955 int dumpsize = 0; /* pages */
956 long dumplo = 0; /* blocks */
957
958 /*
959 * cpu_dump: dump the machine-dependent kernel core dump headers.
960 */
961 int
cpu_dump(void)962 cpu_dump(void)
963 {
964 int (*dump)(dev_t, daddr_t, caddr_t, size_t);
965 char buf[dbtob(1)];
966 kcore_seg_t *segp;
967 cpu_kcore_hdr_t *cpuhdrp;
968 phys_ram_seg_t *memsegp;
969 caddr_t va;
970 int i;
971
972 dump = bdevsw[major(dumpdev)].d_dump;
973
974 memset(buf, 0, sizeof buf);
975 segp = (kcore_seg_t *)buf;
976 cpuhdrp = (cpu_kcore_hdr_t *)&buf[ALIGN(sizeof(*segp))];
977 memsegp = (phys_ram_seg_t *)&buf[ALIGN(sizeof(*segp)) +
978 ALIGN(sizeof(*cpuhdrp))];
979
980 /*
981 * Generate a segment header.
982 */
983 CORE_SETMAGIC(*segp, KCORE_MAGIC, MID_MACHINE, CORE_CPU);
984 segp->c_size = dbtob(1) - ALIGN(sizeof(*segp));
985
986 /*
987 * Add the machine-dependent header info.
988 */
989 cpuhdrp->ptdpaddr = proc0.p_addr->u_pcb.pcb_cr3;
990 cpuhdrp->nmemsegs = mem_cluster_cnt;
991
992 /*
993 * Fill in the memory segment descriptors.
994 */
995 for (i = 0; i < mem_cluster_cnt; i++) {
996 memsegp[i].start = mem_clusters[i].start;
997 memsegp[i].size = mem_clusters[i].size & ~PAGE_MASK;
998 }
999
1000 /*
1001 * If we have dump memory then assume the kernel stack is in high
1002 * memory and bounce
1003 */
1004 if (dumpmem_vaddr != 0) {
1005 memcpy((char *)dumpmem_vaddr, buf, sizeof(buf));
1006 va = (caddr_t)dumpmem_vaddr;
1007 } else {
1008 va = (caddr_t)buf;
1009 }
1010 return (dump(dumpdev, dumplo, va, dbtob(1)));
1011 }
1012
1013 /*
1014 * This is called by main to set dumplo and dumpsize.
1015 * Dumps always skip the first PAGE_SIZE of disk space
1016 * in case there might be a disk label stored there.
1017 * If there is extra space, put dump at the end to
1018 * reduce the chance that swapping trashes it.
1019 */
1020 void
dumpconf(void)1021 dumpconf(void)
1022 {
1023 int nblks, dumpblks; /* size of dump area */
1024
1025 if (dumpdev == NODEV ||
1026 (nblks = (bdevsw[major(dumpdev)].d_psize)(dumpdev)) == 0)
1027 return;
1028 if (nblks <= ctod(1))
1029 return;
1030
1031 dumpblks = cpu_dumpsize();
1032 if (dumpblks < 0)
1033 return;
1034 dumpblks += ctod(cpu_dump_mempagecnt());
1035
1036 /* If dump won't fit (incl. room for possible label), punt. */
1037 if (dumpblks > (nblks - ctod(1)))
1038 return;
1039
1040 /* Put dump at end of partition */
1041 dumplo = nblks - dumpblks;
1042
1043 /* dumpsize is in page units, and doesn't include headers. */
1044 dumpsize = cpu_dump_mempagecnt();
1045 }
1046
1047 /*
1048 * Doadump comes here after turning off memory management and
1049 * getting on the dump stack, either when called above, or by
1050 * the auto-restart code.
1051 */
1052 #define BYTES_PER_DUMP MAXPHYS /* must be a multiple of pagesize */
1053
1054 void
dumpsys(void)1055 dumpsys(void)
1056 {
1057 u_long totalbytesleft, bytes, i, n, memseg;
1058 u_long maddr;
1059 daddr_t blkno;
1060 void *va;
1061 int (*dump)(dev_t, daddr_t, caddr_t, size_t);
1062 int error;
1063
1064 /* Save registers. */
1065 savectx(&dumppcb);
1066
1067 if (dumpdev == NODEV)
1068 return;
1069
1070 /*
1071 * For dumps during autoconfiguration,
1072 * if dump device has already configured...
1073 */
1074 if (dumpsize == 0)
1075 dumpconf();
1076 if (dumplo <= 0 || dumpsize == 0) {
1077 printf("\ndump to dev %u,%u not possible\n", major(dumpdev),
1078 minor(dumpdev));
1079 return;
1080 }
1081 printf("\ndumping to dev %u,%u offset %ld\n", major(dumpdev),
1082 minor(dumpdev), dumplo);
1083
1084 error = (*bdevsw[major(dumpdev)].d_psize)(dumpdev);
1085 printf("dump ");
1086 if (error == -1) {
1087 printf("area unavailable\n");
1088 return;
1089 }
1090
1091 if ((error = cpu_dump()) != 0)
1092 goto err;
1093
1094 totalbytesleft = ptoa(cpu_dump_mempagecnt());
1095 blkno = dumplo + cpu_dumpsize();
1096 dump = bdevsw[major(dumpdev)].d_dump;
1097 error = 0;
1098
1099 for (memseg = 0; memseg < mem_cluster_cnt; memseg++) {
1100 maddr = mem_clusters[memseg].start;
1101 bytes = mem_clusters[memseg].size;
1102
1103 for (i = 0; i < bytes; i += n, totalbytesleft -= n) {
1104 /* Print out how many MBs we have left to go. */
1105 if ((totalbytesleft % (1024*1024)) < BYTES_PER_DUMP)
1106 printf("%ld ", totalbytesleft / (1024 * 1024));
1107
1108 /* Limit size for next transfer. */
1109 n = bytes - i;
1110 if (n > BYTES_PER_DUMP)
1111 n = BYTES_PER_DUMP;
1112 if (maddr > 0xffffffff) {
1113 va = (void *)dumpmem_vaddr;
1114 if (n > dumpmem_sz)
1115 n = dumpmem_sz;
1116 memcpy(va, (void *)PMAP_DIRECT_MAP(maddr), n);
1117 } else {
1118 va = (void *)PMAP_DIRECT_MAP(maddr);
1119 }
1120
1121 error = (*dump)(dumpdev, blkno, va, n);
1122 if (error)
1123 goto err;
1124 maddr += n;
1125 blkno += btodb(n); /* XXX? */
1126
1127 #if 0 /* XXX this doesn't work. grr. */
1128 /* operator aborting dump? */
1129 if (sget() != NULL) {
1130 error = EINTR;
1131 break;
1132 }
1133 #endif
1134 }
1135 }
1136
1137 err:
1138 switch (error) {
1139
1140 case ENXIO:
1141 printf("device bad\n");
1142 break;
1143
1144 case EFAULT:
1145 printf("device not ready\n");
1146 break;
1147
1148 case EINVAL:
1149 printf("area improper\n");
1150 break;
1151
1152 case EIO:
1153 printf("i/o error\n");
1154 break;
1155
1156 case EINTR:
1157 printf("aborted from console\n");
1158 break;
1159
1160 case 0:
1161 printf("succeeded\n");
1162 break;
1163
1164 default:
1165 printf("error %d\n", error);
1166 break;
1167 }
1168 printf("\n\n");
1169 delay(5000000); /* 5 seconds */
1170 }
1171
1172 /*
1173 * Force the userspace FS.base to be reloaded from the PCB on return from
1174 * the kernel, and reset the segment registers (%ds, %es, %fs, and %gs)
1175 * to their expected userspace value.
1176 */
1177 void
reset_segs(void)1178 reset_segs(void)
1179 {
1180 /*
1181 * This operates like the cpu_switchto() sequence: if we
1182 * haven't reset %[defg]s already, do so now.
1183 */
1184 if (curcpu()->ci_pflags & CPUPF_USERSEGS) {
1185 curcpu()->ci_pflags &= ~CPUPF_USERSEGS;
1186 __asm volatile(
1187 "movw %%ax,%%ds\n\t"
1188 "movw %%ax,%%es\n\t"
1189 "movw %%ax,%%fs\n\t"
1190 "cli\n\t" /* block intr when on user GS.base */
1191 "swapgs\n\t" /* swap from kernel to user GS.base */
1192 "movw %%ax,%%gs\n\t"/* set %gs to UDATA and GS.base to 0 */
1193 "swapgs\n\t" /* back to kernel GS.base */
1194 "sti" : : "a"(GSEL(GUDATA_SEL, SEL_UPL)));
1195 }
1196 }
1197
1198 /*
1199 * Clear registers on exec
1200 */
1201 void
setregs(struct proc * p,struct exec_package * pack,u_long stack,struct ps_strings * arginfo)1202 setregs(struct proc *p, struct exec_package *pack, u_long stack,
1203 struct ps_strings *arginfo)
1204 {
1205 struct trapframe *tf;
1206
1207 initialize_thread_xstate(p);
1208
1209 /* To reset all registers we have to return via iretq */
1210 p->p_md.md_flags |= MDP_IRET;
1211
1212 reset_segs();
1213 p->p_addr->u_pcb.pcb_fsbase = 0;
1214
1215 tf = p->p_md.md_regs;
1216 memset(tf, 0, sizeof *tf);
1217 tf->tf_rip = pack->ep_entry;
1218 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
1219 tf->tf_rflags = PSL_USERSET;
1220 tf->tf_rsp = stack;
1221 tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
1222 }
1223
1224 /*
1225 * Initialize segments and descriptor tables
1226 */
1227
/* Interrupt descriptor table; init_x86_64() points it at idt_vaddr. */
struct gate_descriptor *idt;
/* One flag per IDT vector; set to 1 while the vector is allocated. */
char idt_allocmap[NIDT];
/* Address of proc0's struct user; init_x86_64() stores it in proc0.p_addr. */
struct user *proc0paddr = NULL;
1231
1232 void
setgate(struct gate_descriptor * gd,void * func,int ist,int type,int dpl,int sel)1233 setgate(struct gate_descriptor *gd, void *func, int ist, int type, int dpl,
1234 int sel)
1235 {
1236 gd->gd_looffset = (u_int64_t)func & 0xffff;
1237 gd->gd_selector = sel;
1238 gd->gd_ist = ist;
1239 gd->gd_type = type;
1240 gd->gd_dpl = dpl;
1241 gd->gd_p = 1;
1242 gd->gd_hioffset = (u_int64_t)func >> 16;
1243 gd->gd_zero = 0;
1244 gd->gd_xx1 = 0;
1245 gd->gd_xx2 = 0;
1246 gd->gd_xx3 = 0;
1247 }
1248
1249 void
unsetgate(struct gate_descriptor * gd)1250 unsetgate(struct gate_descriptor *gd)
1251 {
1252 memset(gd, 0, sizeof (*gd));
1253 }
1254
1255 void
setregion(struct region_descriptor * rd,void * base,u_int16_t limit)1256 setregion(struct region_descriptor *rd, void *base, u_int16_t limit)
1257 {
1258 rd->rd_limit = limit;
1259 rd->rd_base = (u_int64_t)base;
1260 }
1261
1262 /*
1263 * Note that the base and limit fields are ignored in long mode.
1264 */
1265 void
set_mem_segment(struct mem_segment_descriptor * sd,void * base,size_t limit,int type,int dpl,int gran,int def32,int is64)1266 set_mem_segment(struct mem_segment_descriptor *sd, void *base, size_t limit,
1267 int type, int dpl, int gran, int def32, int is64)
1268 {
1269 sd->sd_lolimit = (unsigned)limit;
1270 sd->sd_lobase = (unsigned long)base;
1271 sd->sd_type = type;
1272 sd->sd_dpl = dpl;
1273 sd->sd_p = 1;
1274 sd->sd_hilimit = (unsigned)limit >> 16;
1275 sd->sd_avl = 0;
1276 sd->sd_long = is64;
1277 sd->sd_def32 = def32;
1278 sd->sd_gran = gran;
1279 sd->sd_hibase = (unsigned long)base >> 24;
1280 }
1281
1282 void
set_sys_segment(struct sys_segment_descriptor * sd,void * base,size_t limit,int type,int dpl,int gran)1283 set_sys_segment(struct sys_segment_descriptor *sd, void *base, size_t limit,
1284 int type, int dpl, int gran)
1285 {
1286 memset(sd, 0, sizeof *sd);
1287 sd->sd_lolimit = (unsigned)limit;
1288 sd->sd_lobase = (u_int64_t)base;
1289 sd->sd_type = type;
1290 sd->sd_dpl = dpl;
1291 sd->sd_p = 1;
1292 sd->sd_hilimit = (unsigned)limit >> 16;
1293 sd->sd_gran = gran;
1294 sd->sd_hibase = (u_int64_t)base >> 24;
1295 }
1296
cpu_init_idt(void)1297 void cpu_init_idt(void)
1298 {
1299 struct region_descriptor region;
1300
1301 setregion(®ion, idt, NIDT * sizeof(idt[0]) - 1);
1302 lidt(®ion);
1303 }
1304
1305 void
cpu_init_extents(void)1306 cpu_init_extents(void)
1307 {
1308 extern struct extent *iomem_ex;
1309 static int already_done;
1310 int i;
1311
1312 /* We get called for each CPU, only first should do this */
1313 if (already_done)
1314 return;
1315
1316 /*
1317 * Allocate the physical addresses used by RAM from the iomem
1318 * extent map.
1319 */
1320 for (i = 0; i < mem_cluster_cnt; i++) {
1321 if (extent_alloc_region(iomem_ex, mem_clusters[i].start,
1322 mem_clusters[i].size, EX_NOWAIT)) {
1323 /* XXX What should we do? */
1324 printf("WARNING: CAN'T ALLOCATE RAM (%llx-%llx)"
1325 " FROM IOMEM EXTENT MAP!\n", mem_clusters[i].start,
1326 mem_clusters[i].start + mem_clusters[i].size - 1);
1327 }
1328 }
1329
1330 already_done = 1;
1331 }
1332
/*
 * Prepare the low-memory trampolines.
 *
 * Picks a page directory physical address below 4GB for the
 * trampolines to use (tramp_pdirpa) and, on MULTIPROCESSOR kernels,
 * copies the MP trampoline code and data into their pages.  The
 * function body is empty when neither MULTIPROCESSOR nor full ACPI
 * support is configured.
 */
void
map_tramps(void)
{
#if defined(MULTIPROCESSOR) || \
    (NACPI > 0 && !defined(SMALL_KERNEL))
	struct pmap *kmp = pmap_kernel();
	extern paddr_t tramp_pdirpa;
#ifdef MULTIPROCESSOR
	extern u_char cpu_spinup_trampoline[];
	extern u_char cpu_spinup_trampoline_end[];
	extern u_char mp_tramp_data_start[];
	extern u_char mp_tramp_data_end[];
	extern u_int32_t mp_pdirpa;
#endif

	/*
	 * The initial PML4 pointer must be below 4G.  Use the current
	 * one when it qualifies; otherwise copy it into a "bounce
	 * buffer" page below 4G and let the tramps use that.
	 */
	if (kmp->pm_pdirpa <= 0xffffffff) {
		tramp_pdirpa = kmp->pm_pdirpa;
	} else {
		pmap_kenter_pa(lo32_vaddr, lo32_paddr, PROT_READ | PROT_WRITE);
		memcpy((void *)lo32_vaddr, kmp->pm_pdir, PAGE_SIZE);
		tramp_pdirpa = lo32_paddr;
		pmap_kremove(lo32_vaddr, PAGE_SIZE);
	}

#ifdef MULTIPROCESSOR
	/* Map MP tramp code and data pages RW for copy */
	pmap_kenter_pa(MP_TRAMPOLINE, MP_TRAMPOLINE, PROT_READ | PROT_WRITE);
	pmap_kenter_pa(MP_TRAMP_DATA, MP_TRAMP_DATA, PROT_READ | PROT_WRITE);

	/* Pre-fill both pages with 0xcc before copying the payload in. */
	memset((caddr_t)MP_TRAMPOLINE, 0xcc, PAGE_SIZE);
	memset((caddr_t)MP_TRAMP_DATA, 0xcc, PAGE_SIZE);

	memcpy((caddr_t)MP_TRAMPOLINE, cpu_spinup_trampoline,
	    cpu_spinup_trampoline_end - cpu_spinup_trampoline);
	memcpy((caddr_t)MP_TRAMP_DATA, mp_tramp_data_start,
	    mp_tramp_data_end - mp_tramp_data_start);

	/*
	 * We need to patch this after we copy the tramp data,
	 * the symbol points into the copied tramp data page.
	 */
	mp_pdirpa = tramp_pdirpa;

	/* Unmap, will be remapped in cpu_start_secondary */
	pmap_kremove(MP_TRAMPOLINE, PAGE_SIZE);
	pmap_kremove(MP_TRAMP_DATA, PAGE_SIZE);
#endif /* MULTIPROCESSOR */
#endif
}
1393
1394 void
cpu_set_vendor(struct cpu_info * ci,int level,const char * vendor)1395 cpu_set_vendor(struct cpu_info *ci, int level, const char *vendor)
1396 {
1397 ci->ci_cpuid_level = level;
1398 cpuid_level = MIN(cpuid_level, level);
1399
1400 /* map the vendor string to an integer */
1401 if (strcmp(vendor, "AuthenticAMD") == 0)
1402 ci->ci_vendor = CPUV_AMD;
1403 else if (strcmp(vendor, "GenuineIntel") == 0)
1404 ci->ci_vendor = CPUV_INTEL;
1405 else if (strcmp(vendor, "CentaurHauls") == 0)
1406 ci->ci_vendor = CPUV_VIA;
1407 else
1408 ci->ci_vendor = CPUV_UNKNOWN;
1409 }
1410
/* Entry-point symbol for "name"; presumably the identifier X<name> (see __CONCAT). */
#define IDTVEC(name) __CONCAT(X, name)
/* Function type of a low-level vector entry point. */
typedef void (vector)(void);
/* Exception entry points, indexed by exception number in init_x86_64(). */
extern vector *IDTVEC(exceptions)[];

/* Set by init_x86_64() to the first_avail value it was called with. */
paddr_t early_pte_pages;
1416
1417 void
init_x86_64(paddr_t first_avail)1418 init_x86_64(paddr_t first_avail)
1419 {
1420 struct region_descriptor region;
1421 bios_memmap_t *bmp;
1422 int x, ist;
1423 uint64_t max_dm_size = ((uint64_t)512 * NUM_L4_SLOT_DIRECT) << 30;
1424
1425 /*
1426 * locore0 mapped 3 pages for use before the pmap is initialized
1427 * starting at first_avail. These pages are currently used by
1428 * efifb to create early-use VAs for the framebuffer before efifb
1429 * is attached.
1430 */
1431 early_pte_pages = first_avail;
1432 first_avail += 3 * NBPG;
1433
1434 cpu_set_vendor(&cpu_info_primary, cpuid_level, cpu_vendor);
1435 cpu_init_msrs(&cpu_info_primary);
1436
1437 proc0.p_addr = proc0paddr;
1438 cpu_info_primary.ci_curpcb = &proc0.p_addr->u_pcb;
1439
1440 x86_bus_space_init();
1441
1442 i8254_startclock();
1443
1444 /*
1445 * Initialize PAGE_SIZE-dependent variables.
1446 */
1447 uvm_setpagesize();
1448
1449 /*
1450 * Boot arguments are in a single page specified by /boot.
1451 *
1452 * We require the "new" vector form, as well as memory ranges
1453 * to be given in bytes rather than KB.
1454 *
1455 * locore copies the data into bootinfo[] for us.
1456 */
1457 if ((bootapiver & (BAPIV_VECTOR | BAPIV_BMEMMAP)) ==
1458 (BAPIV_VECTOR | BAPIV_BMEMMAP)) {
1459 if (bootinfo_size >= sizeof(bootinfo))
1460 panic("boot args too big");
1461
1462 getbootinfo(bootinfo, bootinfo_size);
1463 } else
1464 panic("invalid /boot");
1465
1466 cninit();
1467
1468 /*
1469 * Memory on the AMD64 port is described by three different things.
1470 *
1471 * 1. biosbasemem - This is outdated, and should really only be used to
1472 * sanitize the other values. This is what we get back from the BIOS
1473 * using the legacy routines, describing memory below 640KB.
1474 *
1475 * 2. bios_memmap[] - This is the memory map as the bios has returned
1476 * it to us. It includes memory the kernel occupies, etc.
1477 *
1478 * 3. mem_cluster[] - This is the massaged free memory segments after
1479 * taking into account the contents of bios_memmap, biosbasemem,
1480 * and locore/machdep/pmap kernel allocations of physical
1481 * pages.
1482 *
1483 * The other thing is that the physical page *RANGE* is described by
1484 * three more variables:
1485 *
1486 * avail_start - This is a physical address of the start of available
1487 * pages, until IOM_BEGIN. This is basically the start
1488 * of the UVM managed range of memory, with some holes...
1489 *
1490 * avail_end - This is the end of physical pages. All physical pages
1491 * that UVM manages are between avail_start and avail_end.
1492 * There are holes...
1493 *
1494 * first_avail - This is the first available physical page after the
1495 * kernel, page tables, etc.
1496 *
1497 * We skip the first few pages for trampolines, hibernate, and to avoid
1498 * buggy SMI implementations that could corrupt the first 64KB.
1499 */
1500 avail_start = 16*PAGE_SIZE;
1501
1502 #ifdef MULTIPROCESSOR
1503 if (avail_start < MP_TRAMPOLINE + PAGE_SIZE)
1504 avail_start = MP_TRAMPOLINE + PAGE_SIZE;
1505 if (avail_start < MP_TRAMP_DATA + PAGE_SIZE)
1506 avail_start = MP_TRAMP_DATA + PAGE_SIZE;
1507 #endif
1508
1509 #if (NACPI > 0 && !defined(SMALL_KERNEL))
1510 if (avail_start < ACPI_TRAMPOLINE + PAGE_SIZE)
1511 avail_start = ACPI_TRAMPOLINE + PAGE_SIZE;
1512 if (avail_start < ACPI_TRAMP_DATA + PAGE_SIZE)
1513 avail_start = ACPI_TRAMP_DATA + PAGE_SIZE;
1514 #endif
1515
1516 #ifdef HIBERNATE
1517 if (avail_start < HIBERNATE_HIBALLOC_PAGE + PAGE_SIZE)
1518 avail_start = HIBERNATE_HIBALLOC_PAGE + PAGE_SIZE;
1519 #endif /* HIBERNATE */
1520
1521 /*
1522 * We need to go through the BIOS memory map given, and
1523 * fill out mem_clusters and mem_cluster_cnt stuff, taking
1524 * into account all the points listed above.
1525 */
1526 avail_end = mem_cluster_cnt = 0;
1527 for (bmp = bios_memmap; bmp->type != BIOS_MAP_END; bmp++) {
1528 paddr_t s1, s2, e1, e2;
1529
1530 /* Ignore non-free memory */
1531 if (bmp->type != BIOS_MAP_FREE)
1532 continue;
1533 if (bmp->size < PAGE_SIZE)
1534 continue;
1535
1536 /* Init our segment(s), round/trunc to pages */
1537 s1 = round_page(bmp->addr);
1538 e1 = trunc_page(bmp->addr + bmp->size);
1539 s2 = e2 = 0;
1540
1541 /*
1542 * XXX Some buggy ACPI BIOSes use memory that they
1543 * declare as free. Current worst offender is
1544 * Supermicro 5019D-FTN4. Typically the affected memory
1545 * areas are small blocks between areas reserved for
1546 * ACPI and other BIOS goo. So skip areas smaller
1547 * than 32 MB above the 16 MB boundary (to avoid
1548 * affecting legacy stuff).
1549 */
1550 if (s1 > 16*1024*1024 && (e1 - s1) < 32*1024*1024)
1551 continue;
1552
1553 /* Check and adjust our segment(s) */
1554 /* Nuke low pages */
1555 if (s1 < avail_start) {
1556 s1 = avail_start;
1557 if (s1 > e1)
1558 continue;
1559 }
1560
1561 /*
1562 * The direct map is limited to 512GB * NUM_L4_SLOT_DIRECT of
1563 * memory, so discard anything above that.
1564 */
1565 if (e1 >= max_dm_size) {
1566 e1 = max_dm_size;
1567 if (s1 > e1)
1568 continue;
1569 }
1570
1571 /* Crop stuff into "640K hole" */
1572 if (s1 < IOM_BEGIN && e1 > IOM_BEGIN)
1573 e1 = IOM_BEGIN;
1574 if (s1 < biosbasemem && e1 > biosbasemem)
1575 e1 = biosbasemem;
1576
1577 /* Split any segments straddling the 16MB boundary */
1578 if (s1 < 16*1024*1024 && e1 > 16*1024*1024) {
1579 e2 = e1;
1580 s2 = e1 = 16*1024*1024;
1581 }
1582
1583 /* Store segment(s) */
1584 if (e1 - s1 >= PAGE_SIZE) {
1585 mem_clusters[mem_cluster_cnt].start = s1;
1586 mem_clusters[mem_cluster_cnt].size = e1 - s1;
1587 mem_cluster_cnt++;
1588 }
1589 if (e2 - s2 >= PAGE_SIZE) {
1590 mem_clusters[mem_cluster_cnt].start = s2;
1591 mem_clusters[mem_cluster_cnt].size = e2 - s2;
1592 mem_cluster_cnt++;
1593 }
1594 if (avail_end < e1) avail_end = e1;
1595 if (avail_end < e2) avail_end = e2;
1596 }
1597
1598 /*
1599 * Call pmap initialization to make new kernel address space.
1600 * We must do this before loading pages into the VM system.
1601 */
1602 first_avail = pmap_bootstrap(first_avail, trunc_page(avail_end));
1603
1604 #if NEFI > 0
1605 /* Relocate the EFI memory map. */
1606 if (bios_efiinfo && bios_efiinfo->mmap_start) {
1607 mmap = (EFI_MEMORY_DESCRIPTOR *)PMAP_DIRECT_MAP(first_avail);
1608 memcpy(mmap, (void *)PMAP_DIRECT_MAP(bios_efiinfo->mmap_start),
1609 bios_efiinfo->mmap_size);
1610 first_avail += round_page(bios_efiinfo->mmap_size);
1611 }
1612 #endif
1613
1614 /* Allocate these out of the 640KB base memory */
1615 if (avail_start != PAGE_SIZE)
1616 avail_start = pmap_prealloc_lowmem_ptps(avail_start);
1617
1618 cpu_init_extents();
1619
1620 /* Make sure the end of the space used by the kernel is rounded. */
1621 first_avail = round_page(first_avail);
1622 kern_end = KERNBASE + first_avail;
1623
1624 /*
1625 * Now, load the memory clusters (which have already been
1626 * flensed) into the VM system.
1627 */
1628 for (x = 0; x < mem_cluster_cnt; x++) {
1629 paddr_t seg_start = mem_clusters[x].start;
1630 paddr_t seg_end = seg_start + mem_clusters[x].size;
1631
1632 if (seg_start < first_avail) seg_start = first_avail;
1633 if (seg_start > seg_end) continue;
1634 if (seg_end - seg_start < PAGE_SIZE) continue;
1635
1636 physmem += atop(mem_clusters[x].size);
1637
1638 #if DEBUG_MEMLOAD
1639 printf("loading 0x%lx-0x%lx (0x%lx-0x%lx)\n",
1640 seg_start, seg_end, atop(seg_start), atop(seg_end));
1641 #endif
1642 uvm_page_physload(atop(seg_start), atop(seg_end),
1643 atop(seg_start), atop(seg_end), 0);
1644 }
1645
1646 /*
1647 * Now, load the memory between the end of I/O memory "hole"
1648 * and the kernel.
1649 */
1650 {
1651 paddr_t seg_start = round_page(IOM_END);
1652 paddr_t seg_end = trunc_page(KERNTEXTOFF - KERNBASE);
1653
1654 if (seg_start < seg_end) {
1655 #if DEBUG_MEMLOAD
1656 printf("loading 0x%lx-0x%lx\n", seg_start, seg_end);
1657 #endif
1658 uvm_page_physload(atop(seg_start), atop(seg_end),
1659 atop(seg_start), atop(seg_end), 0);
1660 }
1661 }
1662
1663 #if DEBUG_MEMLOAD
1664 printf("avail_start = 0x%lx\n", avail_start);
1665 printf("avail_end = 0x%lx\n", avail_end);
1666 printf("first_avail = 0x%lx\n", first_avail);
1667 #endif
1668
1669 /*
1670 * Steal memory for the message buffer (at end of core).
1671 */
1672 {
1673 struct vm_physseg *vps = NULL;
1674 psize_t sz = round_page(MSGBUFSIZE);
1675 psize_t reqsz = sz;
1676
1677 for (x = 0; x < vm_nphysseg; x++) {
1678 vps = &vm_physmem[x];
1679 if (ptoa(vps->avail_end) == avail_end)
1680 break;
1681 }
1682 if (x == vm_nphysseg)
1683 panic("init_x86_64: can't find end of memory");
1684
1685 /* Shrink so it'll fit in the last segment. */
1686 if ((vps->avail_end - vps->avail_start) < atop(sz))
1687 sz = ptoa(vps->avail_end - vps->avail_start);
1688
1689 vps->avail_end -= atop(sz);
1690 vps->end -= atop(sz);
1691 msgbuf_paddr = ptoa(vps->avail_end);
1692
1693 /* Remove the last segment if it now has no pages. */
1694 if (vps->start == vps->end) {
1695 for (vm_nphysseg--; x < vm_nphysseg; x++)
1696 vm_physmem[x] = vm_physmem[x + 1];
1697 }
1698
1699 /* Now find where the new avail_end is. */
1700 for (avail_end = 0, x = 0; x < vm_nphysseg; x++)
1701 if (vm_physmem[x].avail_end > avail_end)
1702 avail_end = vm_physmem[x].avail_end;
1703 avail_end = ptoa(avail_end);
1704
1705 /* Warn if the message buffer had to be shrunk. */
1706 if (sz != reqsz)
1707 printf("WARNING: %ld bytes not available for msgbuf "
1708 "in last cluster (%ld used)\n", reqsz, sz);
1709 }
1710
1711 /*
1712 * Steal some memory for a dump bouncebuffer if we have memory over
1713 * the 32-bit barrier.
1714 */
1715 if (avail_end > 0xffffffff) {
1716 struct vm_physseg *vps = NULL;
1717 psize_t sz = round_page(MAX(BYTES_PER_DUMP, dbtob(1)));
1718
1719 /* XXX assumes segments are ordered */
1720 for (x = 0; x < vm_nphysseg; x++) {
1721 vps = &vm_physmem[x];
1722 /* Find something between 16meg and 4gig */
1723 if (ptoa(vps->avail_end) <= 0xffffffff &&
1724 ptoa(vps->avail_start) >= 0xffffff)
1725 break;
1726 }
1727 if (x == vm_nphysseg)
1728 panic("init_x86_64: no memory between "
1729 "0xffffff-0xffffffff");
1730
1731 /* Shrink so it'll fit in the segment. */
1732 if ((vps->avail_end - vps->avail_start) < atop(sz))
1733 sz = ptoa(vps->avail_end - vps->avail_start);
1734
1735 vps->avail_end -= atop(sz);
1736 vps->end -= atop(sz);
1737 dumpmem_paddr = ptoa(vps->avail_end);
1738 dumpmem_vaddr = PMAP_DIRECT_MAP(dumpmem_paddr);
1739 dumpmem_sz = sz;
1740
1741 /* Remove the last segment if it now has no pages. */
1742 if (vps->start == vps->end) {
1743 for (vm_nphysseg--; x < vm_nphysseg; x++)
1744 vm_physmem[x] = vm_physmem[x + 1];
1745 }
1746 }
1747
1748 pmap_growkernel(VM_MIN_KERNEL_ADDRESS + 32 * 1024 * 1024);
1749
1750 pmap_kenter_pa(idt_vaddr, idt_paddr, PROT_READ | PROT_WRITE);
1751
1752 idt = (struct gate_descriptor *)idt_vaddr;
1753 cpu_info_primary.ci_tss = &cpu_info_full_primary.cif_tss;
1754 cpu_info_primary.ci_gdt = &cpu_info_full_primary.cif_gdt;
1755
1756 /* make gdt gates and memory segments */
1757 set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GCODE_SEL), 0,
1758 0xfffff, SDT_MEMERA, SEL_KPL, 1, 0, 1);
1759
1760 set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GDATA_SEL), 0,
1761 0xfffff, SDT_MEMRWA, SEL_KPL, 1, 0, 1);
1762
1763 set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GUDATA_SEL), 0,
1764 atop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMRWA, SEL_UPL, 1, 0, 1);
1765
1766 set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GUCODE_SEL), 0,
1767 atop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMERA, SEL_UPL, 1, 0, 1);
1768
1769 set_sys_segment(GDT_ADDR_SYS(cpu_info_primary.ci_gdt, GPROC0_SEL),
1770 cpu_info_primary.ci_tss, sizeof (struct x86_64_tss)-1,
1771 SDT_SYS386TSS, SEL_KPL, 0);
1772
1773 /* exceptions */
1774 for (x = 0; x < 32; x++) {
1775 /* trap2 == NMI, trap8 == double fault */
1776 ist = (x == 2) ? 2 : (x == 8) ? 1 : 0;
1777 setgate(&idt[x], IDTVEC(exceptions)[x], ist, SDT_SYS386IGT,
1778 (x == 3) ? SEL_UPL : SEL_KPL,
1779 GSEL(GCODE_SEL, SEL_KPL));
1780 idt_allocmap[x] = 1;
1781 }
1782
1783 setregion(®ion, cpu_info_primary.ci_gdt, GDT_SIZE - 1);
1784 lgdt(®ion);
1785
1786 cpu_init_idt();
1787
1788 intr_default_setup();
1789
1790 fpuinit(&cpu_info_primary);
1791
1792 softintr_init();
1793 splraise(IPL_IPI);
1794 intr_enable();
1795
1796 #ifdef DDB
1797 db_machine_init();
1798 ddb_init();
1799 if (boothowto & RB_KDB)
1800 db_enter();
1801 #endif
1802 }
1803
1804 void
cpu_reset(void)1805 cpu_reset(void)
1806 {
1807 intr_disable();
1808
1809 if (cpuresetfn)
1810 (*cpuresetfn)();
1811
1812 /*
1813 * The keyboard controller has 4 random output pins, one of which is
1814 * connected to the RESET pin on the CPU in many PCs. We tell the
1815 * keyboard controller to pulse this line a couple of times.
1816 */
1817 outb(IO_KBD + KBCMDP, KBC_PULSE0);
1818 delay(100000);
1819 outb(IO_KBD + KBCMDP, KBC_PULSE0);
1820 delay(100000);
1821
1822 /*
1823 * Try to cause a triple fault and watchdog reset by making the IDT
1824 * invalid and causing a fault.
1825 */
1826 memset((caddr_t)idt, 0, NIDT * sizeof(idt[0]));
1827 __asm volatile("divl %0,%1" : : "q" (0), "a" (0));
1828
1829 for (;;)
1830 continue;
1831 /* NOTREACHED */
1832 }
1833
1834 /*
1835 * cpu_dumpsize: calculate size of machine-dependent kernel core dump headers.
1836 */
1837 int
cpu_dumpsize(void)1838 cpu_dumpsize(void)
1839 {
1840 int size;
1841
1842 size = ALIGN(sizeof(kcore_seg_t)) +
1843 ALIGN(mem_cluster_cnt * sizeof(phys_ram_seg_t));
1844 if (roundup(size, dbtob(1)) != dbtob(1))
1845 return (-1);
1846
1847 return (1);
1848 }
1849
1850 /*
1851 * cpu_dump_mempagecnt: calculate the size of RAM (in pages) to be dumped.
1852 */
1853 u_long
cpu_dump_mempagecnt(void)1854 cpu_dump_mempagecnt(void)
1855 {
1856 u_long i, n;
1857
1858 n = 0;
1859 for (i = 0; i < mem_cluster_cnt; i++)
1860 n += atop(mem_clusters[i].size);
1861 return (n);
1862 }
1863
1864 /*
1865 * Figure out which portions of memory are used by the kernel/system.
1866 */
1867 int
amd64_pa_used(paddr_t addr)1868 amd64_pa_used(paddr_t addr)
1869 {
1870 struct vm_page *pg;
1871
1872 /* Kernel manages these */
1873 if ((pg = PHYS_TO_VM_PAGE(addr)) && (pg->pg_flags & PG_DEV) == 0)
1874 return 1;
1875
1876 /* Kernel is loaded here */
1877 if (addr > IOM_END && addr < (kern_end - KERNBASE))
1878 return 1;
1879
1880 /* Low memory used for various bootstrap things */
1881 if (addr < avail_start)
1882 return 1;
1883
1884 /*
1885 * The only regions I can think of that are left are the things
1886 * we steal away from UVM. The message buffer?
1887 * XXX - ignore these for now.
1888 */
1889
1890 return 0;
1891 }
1892
/*
 * Run the clock initialization routine currently installed in
 * initclock_func.
 */
void
cpu_initclocks(void)
{
	initclock_func();
}
1898
/*
 * Run the clock start routine currently installed in startclock_func.
 */
void
cpu_startclock(void)
{
	startclock_func();
}
1904
1905 void
need_resched(struct cpu_info * ci)1906 need_resched(struct cpu_info *ci)
1907 {
1908 ci->ci_want_resched = 1;
1909
1910 /* There's a risk we'll be called before the idle threads start */
1911 if (ci->ci_curproc) {
1912 aston(ci->ci_curproc);
1913 cpu_kick(ci);
1914 }
1915 }
1916
1917 /*
1918 * Allocate an IDT vector slot within the given range.
1919 * XXX needs locking to avoid MP allocation races.
1920 */
1921
1922 int
idt_vec_alloc(int low,int high)1923 idt_vec_alloc(int low, int high)
1924 {
1925 int vec;
1926
1927 for (vec = low; vec <= high; vec++) {
1928 if (idt_allocmap[vec] == 0) {
1929 idt_allocmap[vec] = 1;
1930 return vec;
1931 }
1932 }
1933 return 0;
1934 }
1935
1936 int
idt_vec_alloc_range(int low,int high,int num)1937 idt_vec_alloc_range(int low, int high, int num)
1938 {
1939 int i, vec;
1940
1941 KASSERT(powerof2(num));
1942 low = (low + num - 1) & ~(num - 1);
1943 high = ((high + 1) & ~(num - 1)) - 1;
1944
1945 for (vec = low; vec <= high; vec += num) {
1946 for (i = 0; i < num; i++) {
1947 if (idt_allocmap[vec + i] != 0)
1948 break;
1949 }
1950 if (i == num) {
1951 for (i = 0; i < num; i++)
1952 idt_allocmap[vec + i] = 1;
1953 return vec;
1954 }
1955 }
1956 return 0;
1957 }
1958
1959 void
idt_vec_set(int vec,void (* function)(void))1960 idt_vec_set(int vec, void (*function)(void))
1961 {
1962 /*
1963 * Vector should be allocated, so no locking needed.
1964 */
1965 KASSERT(idt_allocmap[vec] == 1);
1966 setgate(&idt[vec], function, 0, SDT_SYS386IGT, SEL_KPL,
1967 GSEL(GCODE_SEL, SEL_KPL));
1968 }
1969
1970 void
idt_vec_free(int vec)1971 idt_vec_free(int vec)
1972 {
1973 unsetgate(&idt[vec]);
1974 idt_allocmap[vec] = 0;
1975 }
1976
#ifdef DIAGNOSTIC
/*
 * Check that the current CPU runs at the interrupt priority level
 * wantipl expected by func.
 *
 * splassert_fail() is called when the current level is below wantipl,
 * and also when the level of the interrupt being handled is above it.
 */
void
splassert_check(int wantipl, const char *func)
{
	int cur_ipl = curcpu()->ci_ilevel;
	int handled_ipl = curcpu()->ci_handled_intr_level;

	if (cur_ipl < wantipl)
		splassert_fail(wantipl, cur_ipl, func);

	if (handled_ipl > wantipl)
		splassert_fail(wantipl, handled_ipl, func);
}
#endif
1993
1994 int
copyin32(const uint32_t * uaddr,uint32_t * kaddr)1995 copyin32(const uint32_t *uaddr, uint32_t *kaddr)
1996 {
1997 if ((vaddr_t)uaddr & 0x3)
1998 return EFAULT;
1999
2000 /* copyin(9) is atomic */
2001 return copyin(uaddr, kaddr, sizeof(uint32_t));
2002 }
2003
2004 void
getbootinfo(char * bootinfo,int bootinfo_size)2005 getbootinfo(char *bootinfo, int bootinfo_size)
2006 {
2007 bootarg32_t *q;
2008 bios_ddb_t *bios_ddb;
2009 bios_bootduid_t *bios_bootduid;
2010 bios_bootsr_t *bios_bootsr;
2011 #undef BOOTINFO_DEBUG
2012 #ifdef BOOTINFO_DEBUG
2013 printf("bootargv:");
2014 #endif
2015
2016 for (q = (bootarg32_t *)bootinfo;
2017 (q->ba_type != BOOTARG_END) &&
2018 ((((char *)q) - bootinfo) < bootinfo_size);
2019 q = (bootarg32_t *)(((char *)q) + q->ba_size)) {
2020
2021 switch (q->ba_type) {
2022 case BOOTARG_MEMMAP:
2023 bios_memmap = (bios_memmap_t *)q->ba_arg;
2024 #ifdef BOOTINFO_DEBUG
2025 printf(" memmap %p", bios_memmap);
2026 #endif
2027 break;
2028 case BOOTARG_DISKINFO:
2029 bios_diskinfo = (bios_diskinfo_t *)q->ba_arg;
2030 #ifdef BOOTINFO_DEBUG
2031 printf(" diskinfo %p", bios_diskinfo);
2032 #endif
2033 break;
2034 case BOOTARG_APMINFO:
2035 /* generated by i386 boot loader */
2036 break;
2037 case BOOTARG_CKSUMLEN:
2038 bios_cksumlen = *(u_int32_t *)q->ba_arg;
2039 #ifdef BOOTINFO_DEBUG
2040 printf(" cksumlen %d", bios_cksumlen);
2041 #endif
2042 break;
2043 case BOOTARG_PCIINFO:
2044 /* generated by i386 boot loader */
2045 break;
2046 case BOOTARG_CONSDEV: {
2047 #if NCOM > 0
2048 bios_consdev_t *cdp = (bios_consdev_t*)q->ba_arg;
2049 static const int ports[] =
2050 { 0x3f8, 0x2f8, 0x3e8, 0x2e8 };
2051 int unit = minor(cdp->consdev);
2052 uint64_t consaddr = cdp->consaddr;
2053 if (consaddr == -1 && unit >= 0 && unit < nitems(ports))
2054 consaddr = ports[unit];
2055 if (major(cdp->consdev) == 8 && consaddr != -1) {
2056 comconsunit = unit;
2057 comconsaddr = consaddr;
2058 comconsrate = cdp->conspeed;
2059 comconsfreq = cdp->consfreq;
2060 comcons_reg_width = cdp->reg_width;
2061 comcons_reg_shift = cdp->reg_shift;
2062 if (cdp->flags & BCD_MMIO)
2063 comconsiot = X86_BUS_SPACE_MEM;
2064 else
2065 comconsiot = X86_BUS_SPACE_IO;
2066 }
2067 #endif
2068 #ifdef BOOTINFO_DEBUG
2069 printf(" console 0x%x:%d", cdp->consdev, cdp->conspeed);
2070 #endif
2071 break;
2072 }
2073 case BOOTARG_BOOTMAC:
2074 bios_bootmac = (bios_bootmac_t *)q->ba_arg;
2075 break;
2076
2077 case BOOTARG_DDB:
2078 bios_ddb = (bios_ddb_t *)q->ba_arg;
2079 #ifdef DDB
2080 db_console = bios_ddb->db_console;
2081 #endif
2082 break;
2083
2084 case BOOTARG_BOOTDUID:
2085 bios_bootduid = (bios_bootduid_t *)q->ba_arg;
2086 memcpy(bootduid, bios_bootduid, sizeof(bootduid));
2087 break;
2088
2089 case BOOTARG_BOOTSR:
2090 bios_bootsr = (bios_bootsr_t *)q->ba_arg;
2091 #if NSOFTRAID > 0
2092 memcpy(&sr_bootuuid, &bios_bootsr->uuid,
2093 sizeof(sr_bootuuid));
2094 memcpy(&sr_bootkey, &bios_bootsr->maskkey,
2095 sizeof(sr_bootkey));
2096 #endif
2097 explicit_bzero(bios_bootsr, sizeof(bios_bootsr_t));
2098 break;
2099
2100 case BOOTARG_EFIINFO:
2101 bios_efiinfo = (bios_efiinfo_t *)q->ba_arg;
2102 break;
2103
2104 case BOOTARG_UCODE:
2105 bios_ucode = (bios_ucode_t *)q->ba_arg;
2106 break;
2107
2108 default:
2109 #ifdef BOOTINFO_DEBUG
2110 printf(" unsupported arg (%d) %p", q->ba_type,
2111 q->ba_arg);
2112 #endif
2113 break;
2114 }
2115 }
2116 #ifdef BOOTINFO_DEBUG
2117 printf("\n");
2118 #endif
2119 }
2120
2121 int
check_context(const struct reg * regs,struct trapframe * tf)2122 check_context(const struct reg *regs, struct trapframe *tf)
2123 {
2124 uint16_t sel;
2125
2126 if (((regs->r_rflags ^ tf->tf_rflags) & PSL_USERSTATIC) != 0)
2127 return EINVAL;
2128
2129 sel = regs->r_ss & 0xffff;
2130 if (!VALID_USER_DSEL(sel))
2131 return EINVAL;
2132
2133 sel = regs->r_cs & 0xffff;
2134 if (!VALID_USER_CSEL(sel))
2135 return EINVAL;
2136
2137 if (regs->r_rip >= VM_MAXUSER_ADDRESS)
2138 return EINVAL;
2139
2140 return 0;
2141 }
2142
2143 int amd64_delay_quality;
2144
2145 void
delay_init(void (* fn)(int),int fn_quality)2146 delay_init(void(*fn)(int), int fn_quality)
2147 {
2148 if (fn_quality > amd64_delay_quality) {
2149 delay_func = fn;
2150 amd64_delay_quality = fn_quality;
2151 }
2152 }
2153
2154 void
delay_fini(void (* fn)(int))2155 delay_fini(void (*fn)(int))
2156 {
2157 if (fn == delay_func) {
2158 delay_func = i8254_delay;
2159 amd64_delay_quality = 0;
2160 }
2161 }
2162