1 /* $OpenBSD: machdep.c,v 1.297 2024/09/21 19:06:07 deraadt Exp $ */
2 /* $NetBSD: machdep.c,v 1.3 2003/05/07 22:58:18 fvdl Exp $ */
3
4 /*-
5 * Copyright (c) 1996, 1997, 1998, 2000 The NetBSD Foundation, Inc.
6 * All rights reserved.
7 *
8 * This code is derived from software contributed to The NetBSD Foundation
9 * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace
10 * Simulation Facility, NASA Ames Research Center.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
25 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 /*-
35 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
36 * All rights reserved.
37 *
38 * This code is derived from software contributed to Berkeley by
39 * William Jolitz.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)machdep.c 7.4 (Berkeley) 6/3/91
66 */
67
68 #include <sys/param.h>
69 #include <sys/systm.h>
70 #include <sys/signal.h>
71 #include <sys/signalvar.h>
72 #include <sys/proc.h>
73 #include <sys/user.h>
74 #include <sys/exec.h>
75 #include <sys/buf.h>
76 #include <sys/reboot.h>
77 #include <sys/conf.h>
78 #include <sys/msgbuf.h>
79 #include <sys/mount.h>
80 #include <sys/extent.h>
81 #include <sys/core.h>
82 #include <sys/kcore.h>
83 #include <sys/syscallargs.h>
84
85 #include <dev/cons.h>
86 #include <stand/boot/bootarg.h>
87
88 #include <net/if.h>
89 #include <uvm/uvm_extern.h>
90
91 #include <sys/sysctl.h>
92
93 #include <machine/cpu_full.h>
94 #include <machine/cpufunc.h>
95 #include <machine/pio.h>
96 #include <machine/psl.h>
97 #include <machine/reg.h>
98 #include <machine/fpu.h>
99 #include <machine/biosvar.h>
100 #include <machine/mpbiosvar.h>
101 #include <machine/kcore.h>
102 #include <machine/tss.h>
103
104 #include <dev/isa/isareg.h>
105 #include <dev/ic/i8042reg.h>
106
107 #ifdef DDB
108 #include <machine/db_machdep.h>
109 #include <ddb/db_extern.h>
110 extern int db_console;
111 #endif
112
113 #include "isa.h"
114 #include "isadma.h"
115 #include "ksyms.h"
116
117 #include "acpi.h"
118 #if NACPI > 0
119 #include <dev/acpi/acpireg.h>
120 #include <dev/acpi/acpivar.h>
121 #endif
122
123 #include "com.h"
124 #if NCOM > 0
125 #include <sys/tty.h>
126 #include <dev/ic/comvar.h>
127 #include <dev/ic/comreg.h>
128 #endif
129
130 #include "efi.h"
131 #if NEFI > 0
132 #include <dev/efi/efi.h>
133 #endif
134
135 #include "softraid.h"
136 #if NSOFTRAID > 0
137 #include <dev/softraidvar.h>
138 #endif
139
140 #ifdef HIBERNATE
141 #include <machine/hibernate_var.h>
142 #endif /* HIBERNATE */
143
144 #include "ukbd.h"
145 #include "pckbc.h"
146 #if NPCKBC > 0 && NUKBD > 0
147 #include <dev/ic/pckbcvar.h>
148 #endif
149
150 /* #define MACHDEP_DEBUG */
151
152 #ifdef MACHDEP_DEBUG
153 #define DPRINTF(x...) do { printf(x); } while(0)
154 #else
155 #define DPRINTF(x...)
156 #endif /* MACHDEP_DEBUG */
157
158 /* the following is used externally (sysctl_hw) */
159 char machine[] = MACHINE;
160
161 /*
162 * switchto vectors
163 */
164 void cpu_idle_cycle_hlt(void);
165 void (*cpu_idle_cycle_fcn)(void) = &cpu_idle_cycle_hlt;
166 void (*cpu_suspend_cycle_fcn)(void);
167
168 /* the following is used externally for concurrent handlers */
169 int setperf_prio = 0;
170
171 #ifdef CPURESET_DELAY
172 int cpureset_delay = CPURESET_DELAY;
173 #else
174 int cpureset_delay = 0;
175 #endif
176
177 char *ssym = 0, *esym = 0; /* start and end of symbol table */
178 dev_t bootdev = 0; /* device we booted from */
179 int biosbasemem = 0; /* base memory reported by BIOS */
180 u_int bootapiver = 0; /* /boot API version */
181
182 int physmem;
183 extern int boothowto;
184
185 paddr_t dumpmem_paddr;
186 vaddr_t dumpmem_vaddr;
187 psize_t dumpmem_sz;
188
189 vaddr_t kern_end;
190
191 vaddr_t msgbuf_vaddr;
192 paddr_t msgbuf_paddr;
193
194 vaddr_t idt_vaddr;
195 paddr_t idt_paddr;
196
197 vaddr_t lo32_vaddr;
198 paddr_t lo32_paddr;
199 paddr_t tramp_pdirpa;
200
201 int kbd_reset;
202 int lid_action = 1;
203 int pwr_action = 1;
204 int forceukbd;
205
206 /*
207 * safepri is a safe priority for sleep to set for a spin-wait
208 * during autoconfiguration or after a panic.
209 */
210 int safepri = 0;
211
212 struct vm_map *exec_map = NULL;
213 struct vm_map *phys_map = NULL;
214
215 /* UVM constraint ranges. */
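/*
 * isa_constraint covers the 16MB reach of 24-bit ISA DMA;
 * dma_constraint covers the 4GB reach of 32-bit DMA devices.
 */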
216 struct uvm_constraint_range isa_constraint = { 0x0, 0x00ffffffUL };
217 struct uvm_constraint_range dma_constraint = { 0x0, 0xffffffffUL };
218 struct uvm_constraint_range *uvm_md_constraints[] = {
219 &isa_constraint,
220 &dma_constraint,
221 NULL,
222 };
223
224 paddr_t avail_start;
225 paddr_t avail_end;
226
227 void (*delay_func)(int) = i8254_delay;
228 void (*initclock_func)(void) = i8254_initclocks;
229 void (*startclock_func)(void) = i8254_start_both_clocks;
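/*
 * These start out pointing at the i8254; better clock drivers
 * (such as the local APIC timer) replace them as they attach.
 */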
230
231 /*
232 * Format of boot information passed to us by 32-bit /boot
233 */
234 typedef struct _boot_args32 {
235 int ba_type;
236 int ba_size;
237 int ba_nextX; /* a ptr in 32-bit world, but not here */
238 char ba_arg[1];
239 } bootarg32_t;
240
241 #define BOOTARGC_MAX NBPG /* one page */
242
243 bios_bootmac_t *bios_bootmac;
244
245 /* locore copies the arguments from /boot to here for us */
246 char bootinfo[BOOTARGC_MAX];
247 int bootinfo_size = BOOTARGC_MAX;
248
249 void getbootinfo(char *, int);
250
251 /* Data passed to us by /boot, filled in by getbootinfo() */
252 bios_diskinfo_t *bios_diskinfo;
253 bios_memmap_t *bios_memmap;
254 u_int32_t bios_cksumlen;
255 bios_efiinfo_t *bios_efiinfo;
256 bios_ucode_t *bios_ucode;
257
258 #if NEFI > 0
259 EFI_MEMORY_DESCRIPTOR *mmap;
260 #endif
261
262 /*
263 * Size of memory segments, before any memory is stolen.
264 */
265 phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX];
266 int mem_cluster_cnt;
267
268 int cpu_dump(void);
269 int cpu_dumpsize(void);
270 u_long cpu_dump_mempagecnt(void);
271 void dumpsys(void);
272 void cpu_init_extents(void);
273 void map_tramps(void);
274 void init_x86_64(paddr_t);
275 void (*cpuresetfn)(void);
276 void enter_shared_special_pages(void);
277
278 #ifdef APERTURE
279 int allowaperture = 0;
280 #endif
281
282 /*
283 * Machine-dependent startup code
284 */
285 void
286 cpu_startup(void)
287 {
288 vaddr_t minaddr, maxaddr;
289
290 msgbuf_vaddr = PMAP_DIRECT_MAP(msgbuf_paddr);
291 initmsgbuf((caddr_t)msgbuf_vaddr, round_page(MSGBUFSIZE));
292
293 printf("%s", version);
294 startclocks();
295 rtcinit();
296
297 printf("real mem = %lu (%luMB)\n", ptoa((psize_t)physmem),
298 ptoa((psize_t)physmem)/1024/1024);
299
300 /*
301 * Allocate a submap for exec arguments. This map effectively
302 * limits the number of processes exec'ing at any time.
303 */
304 minaddr = vm_map_min(kernel_map);
305 exec_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
306 16*NCARGS, VM_MAP_PAGEABLE, FALSE, NULL);
307
308 /*
309 * Allocate a submap for physio
310 */
311 minaddr = vm_map_min(kernel_map);
312 phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
313 VM_PHYS_SIZE, 0, FALSE, NULL);
314
315 printf("avail mem = %lu (%luMB)\n", ptoa((psize_t)uvmexp.free),
316 ptoa((psize_t)uvmexp.free)/1024/1024);
317
318 bufinit();
319
320 if (boothowto & RB_CONFIG) {
321 #ifdef BOOT_CONFIG
322 user_config();
323 #else
324 printf("kernel does not support -c; continuing..\n");
325 #endif
326 }
327
328 /* Safe for i/o port / memory space allocation to use malloc now. */
329 x86_bus_space_mallocok();
330
331 #ifndef SMALL_KERNEL
332 cpu_ucode_setup();
333 cpu_ucode_apply(&cpu_info_primary);
334 #endif
335 cpu_tsx_disable(&cpu_info_primary);
336
337 /* enter the IDT and trampoline code in the u-k maps */
338 enter_shared_special_pages();
339
340 /* initialize CPU0's TSS and GDT and put them in the u-k maps */
341 cpu_enter_pages(&cpu_info_full_primary);
342 }
343
344 /*
345 * enter_shared_special_pages
346 *
347 * Requests mapping of various special pages required in the Intel Meltdown
348 * case (to be entered into the U-K page table):
349 *
350 * 1 IDT page
351 * A variable number of pages covering the U-K ".kutext" section. This
352 * section contains code needed during trampoline operation
353 * A variable number of pages covering the U-K ".kudata" section. This
354 * section contains data accessed by the trampoline, before switching to U+K
355 * (for example, various shared global variables used by IPIs, etc)
356 *
357 * The linker script places the required symbols in the sections above.
358 *
359 * On CPUs not affected by Meltdown, the calls to pmap_enter_special below
360 * become no-ops.
361 */
362 void
363 enter_shared_special_pages(void)
364 {
365 extern char __kutext_start[], __kutext_end[], __kernel_kutext_phys[];
366 extern char __text_page_start[], __text_page_end[];
367 extern char __kernel_kutext_page_phys[];
368 extern char __kudata_start[], __kudata_end[], __kernel_kudata_phys[];
369 vaddr_t va;
370 paddr_t pa;
371
372 /* idt */
373 pmap_enter_special(idt_vaddr, idt_paddr, PROT_READ);
374 DPRINTF("%s: entered idt page va 0x%llx pa 0x%llx\n", __func__,
375 (uint64_t)idt_vaddr, (uint64_t)idt_paddr);
376
377 /* .kutext section */
378 va = (vaddr_t)__kutext_start;
379 pa = (paddr_t)__kernel_kutext_phys;
380 while (va < (vaddr_t)__kutext_end) {
381 pmap_enter_special(va, pa, PROT_READ | PROT_EXEC);
382 DPRINTF("%s: entered kutext page va 0x%llx pa 0x%llx\n",
383 __func__, (uint64_t)va, (uint64_t)pa);
384 va += PAGE_SIZE;
385 pa += PAGE_SIZE;
386 }
387
388 /* .kutext.page section */
389 va = (vaddr_t)__text_page_start;
390 pa = (paddr_t)__kernel_kutext_page_phys;
391 while (va < (vaddr_t)__text_page_end) {
392 pmap_enter_special(va, pa, PROT_READ | PROT_EXEC);
393 DPRINTF("%s: entered kutext.page va 0x%llx pa 0x%llx\n",
394 __func__, (uint64_t)va, (uint64_t)pa);
395 va += PAGE_SIZE;
396 pa += PAGE_SIZE;
397 }
398
399 /* .kudata section */
400 va = (vaddr_t)__kudata_start;
401 pa = (paddr_t)__kernel_kudata_phys;
402 while (va < (vaddr_t)__kudata_end) {
403 pmap_enter_special(va, pa, PROT_READ | PROT_WRITE);
404 DPRINTF("%s: entered kudata page va 0x%llx pa 0x%llx\n",
405 __func__, (uint64_t)va, (uint64_t)pa);
406 va += PAGE_SIZE;
407 pa += PAGE_SIZE;
408 }
409 }
410
411 /*
412 * Set up proc0's PCB and the cpu's TSS.
413 */
414 void
415 x86_64_proc0_tss_ldt_init(void)
416 {
417 struct pcb *pcb;
418
419 cpu_info_primary.ci_curpcb = pcb = &proc0.p_addr->u_pcb;
420 pcb->pcb_fsbase = 0;
421 pcb->pcb_kstack = (u_int64_t)proc0.p_addr + USPACE - 16;
422 proc0.p_md.md_regs = (struct trapframe *)pcb->pcb_kstack - 1;
423
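/*
 * Load the TSS selector into the task register; amd64 does not use
 * an LDT, so load the null selector.
 */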
424 ltr(GSYSSEL(GPROC0_SEL, SEL_KPL));
425 lldt(0);
426 }
427
428 bios_diskinfo_t *
429 bios_getdiskinfo(dev_t dev)
430 {
431 bios_diskinfo_t *pdi;
432
433 if (bios_diskinfo == NULL)
434 return NULL;
435
436 for (pdi = bios_diskinfo; pdi->bios_number != -1; pdi++) {
437 if ((dev & B_MAGICMASK) == B_DEVMAGIC) { /* search by bootdev */
438 if (pdi->bsd_dev == dev)
439 break;
440 } else {
441 if (pdi->bios_number == dev)
442 break;
443 }
444 }
445
446 if (pdi->bios_number == -1)
447 return NULL;
448 else
449 return pdi;
450 }
451
452 int
453 bios_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
454 size_t newlen, struct proc *p)
455 {
456 bios_diskinfo_t *pdi;
457 int biosdev;
458
459 /* all sysctl names at this level except diskinfo are terminal */
460 if (namelen != 1 && name[0] != BIOS_DISKINFO)
461 return (ENOTDIR); /* overloaded */
462
463 if (!(bootapiver & BAPIV_VECTOR))
464 return EOPNOTSUPP;
465
466 switch (name[0]) {
467 case BIOS_DEV:
468 if ((pdi = bios_getdiskinfo(bootdev)) == NULL)
469 return ENXIO;
470 biosdev = pdi->bios_number;
471 return sysctl_rdint(oldp, oldlenp, newp, biosdev);
472 case BIOS_DISKINFO:
473 if (namelen != 2)
474 return ENOTDIR;
475 if ((pdi = bios_getdiskinfo(name[1])) == NULL)
476 return ENXIO;
477 return sysctl_rdstruct(oldp, oldlenp, newp, pdi, sizeof(*pdi));
478 case BIOS_CKSUMLEN:
479 return sysctl_rdint(oldp, oldlenp, newp, bios_cksumlen);
480 default:
481 return EOPNOTSUPP;
482 }
483 /* NOTREACHED */
484 }
485
486 extern int tsc_is_invariant;
487 extern int amd64_has_xcrypt;
488 extern int need_retpoline;
489
490 const struct sysctl_bounded_args cpuctl_vars[] = {
491 { CPU_LIDACTION, &lid_action, -1, 2 },
492 { CPU_PWRACTION, &pwr_action, 0, 2 },
493 { CPU_CPUID, &cpu_id, SYSCTL_INT_READONLY },
494 { CPU_CPUFEATURE, &cpu_feature, SYSCTL_INT_READONLY },
495 { CPU_XCRYPT, &amd64_has_xcrypt, SYSCTL_INT_READONLY },
496 { CPU_INVARIANTTSC, &tsc_is_invariant, SYSCTL_INT_READONLY },
497 { CPU_RETPOLINE, &need_retpoline, SYSCTL_INT_READONLY },
498 };
499
500 /*
501 * machine dependent system variables.
502 */
503 int
504 cpu_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
505 size_t newlen, struct proc *p)
506 {
507 extern uint64_t tsc_frequency;
508 dev_t consdev;
509 dev_t dev;
510
511 switch (name[0]) {
512 case CPU_CONSDEV:
513 if (namelen != 1)
514 return (ENOTDIR); /* overloaded */
515 if (cn_tab != NULL)
516 consdev = cn_tab->cn_dev;
517 else
518 consdev = NODEV;
519 return (sysctl_rdstruct(oldp, oldlenp, newp, &consdev,
520 sizeof consdev));
521 case CPU_CHR2BLK:
522 if (namelen != 2)
523 return (ENOTDIR); /* overloaded */
524 dev = chrtoblk((dev_t)name[1]);
525 return sysctl_rdstruct(oldp, oldlenp, newp, &dev, sizeof(dev));
526 case CPU_BIOS:
527 return bios_sysctl(name + 1, namelen - 1, oldp, oldlenp,
528 newp, newlen, p);
529 case CPU_CPUVENDOR:
530 return (sysctl_rdstring(oldp, oldlenp, newp, cpu_vendor));
531 case CPU_KBDRESET:
532 return (sysctl_securelevel_int(oldp, oldlenp, newp, newlen,
533 &kbd_reset));
534 case CPU_ALLOWAPERTURE:
535 if (namelen != 1)
536 return (ENOTDIR); /* overloaded */
537 #ifdef APERTURE
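/* at securelevel > 0 the value may only be lowered, never raised */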
538 if (securelevel > 0)
539 return (sysctl_int_lower(oldp, oldlenp, newp, newlen,
540 &allowaperture));
541 else
542 return (sysctl_int(oldp, oldlenp, newp, newlen,
543 &allowaperture));
544 #else
545 return (sysctl_rdint(oldp, oldlenp, newp, 0));
546 #endif
547 #if NPCKBC > 0 && NUKBD > 0
548 case CPU_FORCEUKBD:
549 {
550 int error;
551
552 if (forceukbd)
553 return (sysctl_rdint(oldp, oldlenp, newp, forceukbd));
554
555 error = sysctl_int(oldp, oldlenp, newp, newlen, &forceukbd);
556 if (forceukbd)
557 pckbc_release_console();
558 return (error);
559 }
560 #endif
561 case CPU_TSCFREQ:
562 return (sysctl_rdquad(oldp, oldlenp, newp, tsc_frequency));
563 default:
564 return (sysctl_bounded_arr(cpuctl_vars, nitems(cpuctl_vars),
565 name, namelen, oldp, oldlenp, newp, newlen));
566 }
567 /* NOTREACHED */
568 }
569
570 static inline void
571 maybe_enable_user_cet(struct proc *p)
572 {
573 #ifndef SMALL_KERNEL
574 /* Enable indirect-branch tracking if present and not disabled */
575 if ((xsave_mask & XFEATURE_CET_U) &&
576 (p->p_p->ps_flags & PS_NOBTCFI) == 0) {
577 uint64_t msr = rdmsr(MSR_U_CET);
578 wrmsr(MSR_U_CET, msr | MSR_CET_ENDBR_EN | MSR_CET_NO_TRACK_EN);
579 }
580 #endif
581 }
582
583 static inline void
584 initialize_thread_xstate(struct proc *p)
585 {
586 if (cpu_use_xsaves) {
587 xrstors(fpu_cleandata, xsave_mask);
588 maybe_enable_user_cet(p);
589 } else {
590 /* Reset FPU state in PCB */
591 memcpy(&p->p_addr->u_pcb.pcb_savefpu, fpu_cleandata,
592 fpu_save_len);
593
594 if (curcpu()->ci_pflags & CPUPF_USERXSTATE) {
595 /* state in CPU is obsolete; reset it */
596 fpureset();
597 }
598 }
599
600 /* The reset state _is_ the userspace state for this thread now */
601 curcpu()->ci_pflags |= CPUPF_USERXSTATE;
602 }
603
604 /*
605 * Copy out the FPU state, massaging it to be usable from userspace
606 * and acceptable to xrstor_user()
607 */
608 static inline int
609 copyoutfpu(struct savefpu *sfp, char *sp, size_t len)
610 {
611 uint64_t bvs[2];
612
613 if (copyout(sfp, sp, len))
614 return 1;
615 if (len > offsetof(struct savefpu, fp_xstate.xstate_bv)) {
616 sp += offsetof(struct savefpu, fp_xstate.xstate_bv);
617 len -= offsetof(struct savefpu, fp_xstate.xstate_bv);
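/*
 * Overwrite the saved bitvectors with sanitized copies so that
 * xrstor_user() cannot fault on feature bits outside xsave_mask.
 */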
618 bvs[0] = sfp->fp_xstate.xstate_bv & XFEATURE_XCR0_MASK;
619 bvs[1] = sfp->fp_xstate.xstate_xcomp_bv &
620 (XFEATURE_XCR0_MASK | XFEATURE_COMPRESSED);
621 if (copyout(bvs, sp, min(len, sizeof bvs)))
622 return 1;
623 }
624 return 0;
625 }
626
627 /*
628 * Send an interrupt to process.
629 *
630 * Stack is set up to allow sigcode to call routine, followed by
631 * syscall to sigreturn routine below. After sigreturn resets the
632 * signal mask, the stack, and the frame pointer, it returns to the
633 * user specified pc.
634 */
635 int
636 sendsig(sig_t catcher, int sig, sigset_t mask, const siginfo_t *ksip,
637 int info, int onstack)
638 {
639 struct proc *p = curproc;
640 struct trapframe *tf = p->p_md.md_regs;
641 struct sigcontext ksc;
642 struct savefpu *sfp = &p->p_addr->u_pcb.pcb_savefpu;
643 register_t sp, scp, sip;
644 u_long sss;
645
646 memset(&ksc, 0, sizeof ksc);
647 ksc.sc_rdi = tf->tf_rdi;
648 ksc.sc_rsi = tf->tf_rsi;
649 ksc.sc_rdx = tf->tf_rdx;
650 ksc.sc_rcx = tf->tf_rcx;
651 ksc.sc_r8 = tf->tf_r8;
652 ksc.sc_r9 = tf->tf_r9;
653 ksc.sc_r10 = tf->tf_r10;
654 ksc.sc_r11 = tf->tf_r11;
655 ksc.sc_r12 = tf->tf_r12;
656 ksc.sc_r13 = tf->tf_r13;
657 ksc.sc_r14 = tf->tf_r14;
658 ksc.sc_r15 = tf->tf_r15;
659 ksc.sc_rbx = tf->tf_rbx;
660 ksc.sc_rax = tf->tf_rax;
661 ksc.sc_rbp = tf->tf_rbp;
662 ksc.sc_rip = tf->tf_rip;
663 ksc.sc_cs = tf->tf_cs;
664 ksc.sc_rflags = tf->tf_rflags;
665 ksc.sc_rsp = tf->tf_rsp;
666 ksc.sc_ss = tf->tf_ss;
667 ksc.sc_mask = mask;
668
669 /* Allocate space for the signal handler context. */
670 if ((p->p_sigstk.ss_flags & SS_DISABLE) == 0 &&
671 !sigonstack(tf->tf_rsp) && onstack)
672 sp = trunc_page((vaddr_t)p->p_sigstk.ss_sp + p->p_sigstk.ss_size);
673 else
674 sp = tf->tf_rsp - 128;
675
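/*
 * The 128 bytes skipped above are the red zone the amd64 ABI
 * guarantees below the stack pointer.  The FPU save area pushed
 * here must be 64-byte aligned for xsaves, 16-byte for fxsave.
 */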
676 sp -= fpu_save_len;
677 if (cpu_use_xsaves)
678 sp &= ~63ULL; /* just in case */
679 else
680 sp &= ~15ULL; /* just in case */
681
682 /* Save FPU state to PCB if necessary, then copy it out */
683 if (curcpu()->ci_pflags & CPUPF_USERXSTATE)
684 fpusave(&p->p_addr->u_pcb.pcb_savefpu);
685 if (copyoutfpu(sfp, (void *)sp, fpu_save_len))
686 return 1;
687
688 initialize_thread_xstate(p);
689
690 ksc.sc_fpstate = (struct fxsave64 *)sp;
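/* round the sigcontext (and siginfo) sizes up to keep 16-byte alignment */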
691 sss = (sizeof(ksc) + 15) & ~15;
692 sip = 0;
693 if (info) {
694 sip = sp - ((sizeof(*ksip) + 15) & ~15);
695 sss += (sizeof(*ksip) + 15) & ~15;
696
697 if (copyout(ksip, (void *)sip, sizeof(*ksip)))
698 return 1;
699 }
700 scp = sp - sss;
701
702 ksc.sc_cookie = (long)scp ^ p->p_p->ps_sigcookie;
703 if (copyout(&ksc, (void *)scp, sizeof(ksc)))
704 return 1;
705
706 /*
707 * Build context to run handler in.
708 */
709 tf->tf_rax = (u_int64_t)catcher;
710 tf->tf_rdi = sig;
711 tf->tf_rsi = sip;
712 tf->tf_rdx = scp;
713
714 tf->tf_rip = (u_int64_t)p->p_p->ps_sigcode;
715 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
716 tf->tf_rflags &= ~(PSL_T|PSL_D|PSL_VM|PSL_AC);
717 tf->tf_rsp = scp;
718 tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
719
720 return 0;
721 }
722
723 /*
724 * System call to cleanup state after a signal
725 * has been taken. Reset signal mask and
726 * stack state from context left by sendsig (above).
727 * Return to previous pc and psl as specified by
728 * context left by sendsig. Check carefully to
729 * make sure that the user has not modified the
730 * psl to gain improper privileges or to cause
731 * a machine fault.
732 */
733 int
734 sys_sigreturn(struct proc *p, void *v, register_t *retval)
735 {
736 struct sys_sigreturn_args /* {
737 syscallarg(struct sigcontext *) sigcntxp;
738 } */ *uap = v;
739 struct sigcontext ksc, *scp = SCARG(uap, sigcntxp);
740 struct trapframe *tf = p->p_md.md_regs;
741 struct savefpu *sfp = &p->p_addr->u_pcb.pcb_savefpu;
742 int error;
743
744 if (PROC_PC(p) != p->p_p->ps_sigcoderet) {
745 sigexit(p, SIGILL);
746 return (EPERM);
747 }
748
749 if ((error = copyin((caddr_t)scp, &ksc, sizeof ksc)))
750 return (error);
751
752 if (ksc.sc_cookie != ((long)scp ^ p->p_p->ps_sigcookie)) {
753 sigexit(p, SIGILL);
754 return (EFAULT);
755 }
756
757 /* Prevent reuse of the sigcontext cookie */
758 ksc.sc_cookie = 0;
759 (void)copyout(&ksc.sc_cookie, (caddr_t)scp +
760 offsetof(struct sigcontext, sc_cookie), sizeof (ksc.sc_cookie));
761
762 if (((ksc.sc_rflags ^ tf->tf_rflags) & PSL_USERSTATIC) != 0 ||
763 !USERMODE(ksc.sc_cs, ksc.sc_eflags))
764 return (EINVAL);
765
766 /* Current FPU state is obsolete; toss it and force a reload */
767 if (curcpu()->ci_pflags & CPUPF_USERXSTATE) {
768 curcpu()->ci_pflags &= ~CPUPF_USERXSTATE;
769 fpureset();
770 }
771
772 /* Copy in the FPU state to restore */
773 if (__predict_true(ksc.sc_fpstate != NULL)) {
774 if ((error = copyin(ksc.sc_fpstate, sfp, fpu_save_len)))
775 return error;
776 if (xrstor_user(sfp, xsave_mask)) {
777 memcpy(sfp, fpu_cleandata, fpu_save_len);
778 return EINVAL;
779 }
780 maybe_enable_user_cet(p);
781 curcpu()->ci_pflags |= CPUPF_USERXSTATE;
782 } else {
783 /* shouldn't happen, but handle it */
784 initialize_thread_xstate(p);
785 }
786
787 tf->tf_rdi = ksc.sc_rdi;
788 tf->tf_rsi = ksc.sc_rsi;
789 tf->tf_rdx = ksc.sc_rdx;
790 tf->tf_rcx = ksc.sc_rcx;
791 tf->tf_r8 = ksc.sc_r8;
792 tf->tf_r9 = ksc.sc_r9;
793 tf->tf_r10 = ksc.sc_r10;
794 tf->tf_r11 = ksc.sc_r11;
795 tf->tf_r12 = ksc.sc_r12;
796 tf->tf_r13 = ksc.sc_r13;
797 tf->tf_r14 = ksc.sc_r14;
798 tf->tf_r15 = ksc.sc_r15;
799 tf->tf_rbx = ksc.sc_rbx;
800 tf->tf_rax = ksc.sc_rax;
801 tf->tf_rbp = ksc.sc_rbp;
802 tf->tf_rip = ksc.sc_rip;
803 tf->tf_cs = ksc.sc_cs;
804 tf->tf_rflags = ksc.sc_rflags;
805 tf->tf_rsp = ksc.sc_rsp;
806 tf->tf_ss = ksc.sc_ss;
807
808 /* Restore signal mask. */
809 p->p_sigmask = ksc.sc_mask & ~sigcantmask;
810
811 /*
812 * sigreturn() needs to return to userspace via the 'iretq'
813 * method, so that if the process was interrupted (by tick,
814 * an IPI, whatever) as opposed to already being in the kernel
815 * when a signal was being delivered, the process will be
816 * completely restored, including the userland %rcx and %r11
817 * registers which the 'sysretq' instruction cannot restore.
818 * Also need to make sure we can handle faulting on xrstor.
819 */
820 p->p_md.md_flags |= MDP_IRET;
821
822 return (EJUSTRETURN);
823 }
824
825 #ifdef MULTIPROCESSOR
826 /* force a CPU into the kernel, whether or not it's idle */
827 void
828 cpu_kick(struct cpu_info *ci)
829 {
830 /* only need to kick other CPUs */
831 if (ci != curcpu()) {
832 if (cpu_mwait_size > 0) {
833 /*
834 * If not idling, then send an IPI, else
835 * just clear the "keep idling" bit.
836 */
837 if ((ci->ci_mwait & MWAIT_IN_IDLE) == 0)
838 x86_send_ipi(ci, X86_IPI_NOP);
839 else
840 atomic_clearbits_int(&ci->ci_mwait,
841 MWAIT_KEEP_IDLING);
842 } else {
843 /* no mwait, so need an IPI */
844 x86_send_ipi(ci, X86_IPI_NOP);
845 }
846 }
847 }
848 #endif
849
850 /*
851 * Notify the current process (p) that it has a signal pending,
852 * process as soon as possible.
853 */
854 void
855 signotify(struct proc *p)
856 {
857 aston(p);
858 cpu_kick(p->p_cpu);
859 }
860
861 #ifdef MULTIPROCESSOR
862 void
863 cpu_unidle(struct cpu_info *ci)
864 {
865 if (cpu_mwait_size > 0 && (ci->ci_mwait & MWAIT_ONLY)) {
866 /*
867 * Just clear the "keep idling" bit; if it wasn't
868 * idling then we didn't need to do anything anyway.
869 */
870 atomic_clearbits_int(&ci->ci_mwait, MWAIT_KEEP_IDLING);
871 return;
872 }
873
874 if (ci != curcpu())
875 x86_send_ipi(ci, X86_IPI_NOP);
876 }
877 #endif
878
879 int waittime = -1;
880 struct pcb dumppcb;
881
882 __dead void
883 boot(int howto)
884 {
885 #if NACPI > 0
886 if ((howto & RB_POWERDOWN) != 0 && acpi_softc)
887 acpi_softc->sc_state = ACPI_STATE_S5;
888 #endif
889
890 if ((howto & RB_POWERDOWN) != 0)
891 lid_action = 0;
892
893 if ((howto & RB_RESET) != 0)
894 goto doreset;
895
896 if (cold) {
897 if ((howto & RB_USERREQ) == 0)
898 howto |= RB_HALT;
899 goto haltsys;
900 }
901
902 boothowto = howto;
903 if ((howto & RB_NOSYNC) == 0 && waittime < 0) {
904 waittime = 0;
905 vfs_shutdown(curproc);
906
907 if ((howto & RB_TIMEBAD) == 0) {
908 resettodr();
909 } else {
910 printf("WARNING: not updating battery clock\n");
911 }
912 }
913 if_downall();
914
915 uvm_shutdown();
916 splhigh();
917 cold = 1;
918
919 if ((howto & RB_DUMP) != 0)
920 dumpsys();
921
922 haltsys:
923 config_suspend_all(DVACT_POWERDOWN);
924
925 #ifdef MULTIPROCESSOR
926 x86_broadcast_ipi(X86_IPI_HALT);
927 #endif
928
929 if ((howto & RB_HALT) != 0) {
930 #if NACPI > 0 && !defined(SMALL_KERNEL)
931 extern int acpi_enabled;
932
933 if (acpi_enabled) {
934 delay(500000);
935 if ((howto & RB_POWERDOWN) != 0)
936 acpi_powerdown();
937 }
938 #endif
939 printf("\n");
940 printf("The operating system has halted.\n");
941 printf("Please press any key to reboot.\n\n");
942 cnpollc(1); /* for proper keyboard command handling */
943 cngetc();
944 cnpollc(0);
945 }
946
947 doreset:
948 printf("rebooting...\n");
949 if (cpureset_delay > 0)
950 delay(cpureset_delay * 1000);
951 cpu_reset();
952 for (;;)
953 continue;
954 /* NOTREACHED */
955 }
956
957 /*
958 * These variables are needed by /sbin/savecore
959 */
960 u_long dumpmag = 0x8fca0101; /* magic number */
961 int dumpsize = 0; /* pages */
962 long dumplo = 0; /* blocks */
963
964 /*
965 * cpu_dump: dump the machine-dependent kernel core dump headers.
966 */
967 int
968 cpu_dump(void)
969 {
970 int (*dump)(dev_t, daddr_t, caddr_t, size_t);
971 char buf[dbtob(1)];
972 kcore_seg_t *segp;
973 cpu_kcore_hdr_t *cpuhdrp;
974 phys_ram_seg_t *memsegp;
975 caddr_t va;
976 int i;
977
978 dump = bdevsw[major(dumpdev)].d_dump;
979
980 memset(buf, 0, sizeof buf);
981 segp = (kcore_seg_t *)buf;
982 cpuhdrp = (cpu_kcore_hdr_t *)&buf[ALIGN(sizeof(*segp))];
983 memsegp = (phys_ram_seg_t *)&buf[ALIGN(sizeof(*segp)) +
984 ALIGN(sizeof(*cpuhdrp))];
985
986 /*
987 * Generate a segment header.
988 */
989 CORE_SETMAGIC(*segp, KCORE_MAGIC, MID_MACHINE, CORE_CPU);
990 segp->c_size = dbtob(1) - ALIGN(sizeof(*segp));
991
992 /*
993 * Add the machine-dependent header info.
994 */
995 cpuhdrp->ptdpaddr = proc0.p_addr->u_pcb.pcb_cr3;
996 cpuhdrp->nmemsegs = mem_cluster_cnt;
997
998 /*
999 * Fill in the memory segment descriptors.
1000 */
1001 for (i = 0; i < mem_cluster_cnt; i++) {
1002 memsegp[i].start = mem_clusters[i].start;
1003 memsegp[i].size = mem_clusters[i].size & ~PAGE_MASK;
1004 }
1005
1006 /*
1007 * If we have dump bounce memory, assume the kernel stack may be in
1008 * high memory and bounce the headers through the low buffer.
1009 */
1010 if (dumpmem_vaddr != 0) {
1011 memcpy((char *)dumpmem_vaddr, buf, sizeof(buf));
1012 va = (caddr_t)dumpmem_vaddr;
1013 } else {
1014 va = (caddr_t)buf;
1015 }
1016 return (dump(dumpdev, dumplo, va, dbtob(1)));
1017 }
1018
1019 /*
1020 * This is called by main to set dumplo and dumpsize.
1021 * Dumps always skip the first PAGE_SIZE of disk space
1022 * in case there might be a disk label stored there.
1023 * If there is extra space, put dump at the end to
1024 * reduce the chance that swapping trashes it.
1025 */
1026 void
1027 dumpconf(void)
1028 {
1029 int nblks, dumpblks; /* size of dump area */
1030
1031 if (dumpdev == NODEV ||
1032 (nblks = (bdevsw[major(dumpdev)].d_psize)(dumpdev)) == 0)
1033 return;
1034 if (nblks <= ctod(1))
1035 return;
1036
1037 dumpblks = cpu_dumpsize();
1038 if (dumpblks < 0)
1039 return;
1040 dumpblks += ctod(cpu_dump_mempagecnt());
1041
1042 /* If dump won't fit (incl. room for possible label), punt. */
1043 if (dumpblks > (nblks - ctod(1)))
1044 return;
1045
1046 /* Put dump at end of partition */
1047 dumplo = nblks - dumpblks;
1048
1049 /* dumpsize is in page units, and doesn't include headers. */
1050 dumpsize = cpu_dump_mempagecnt();
1051 }
1052
1053 /*
1054 * Doadump comes here after turning off memory management and
1055 * getting on the dump stack, either when called above, or by
1056 * the auto-restart code.
1057 */
1058 #define BYTES_PER_DUMP MAXPHYS /* must be a multiple of pagesize */
1059
1060 void
1061 dumpsys(void)
1062 {
1063 u_long totalbytesleft, bytes, i, n, memseg;
1064 u_long maddr;
1065 daddr_t blkno;
1066 void *va;
1067 int (*dump)(dev_t, daddr_t, caddr_t, size_t);
1068 int error;
1069
1070 /* Save registers. */
1071 savectx(&dumppcb);
1072
1073 if (dumpdev == NODEV)
1074 return;
1075
1076 /*
1077 * For dumps during autoconfiguration,
1078 * if dump device has already configured...
1079 */
1080 if (dumpsize == 0)
1081 dumpconf();
1082 if (dumplo <= 0 || dumpsize == 0) {
1083 printf("\ndump to dev %u,%u not possible\n", major(dumpdev),
1084 minor(dumpdev));
1085 return;
1086 }
1087 printf("\ndumping to dev %u,%u offset %ld\n", major(dumpdev),
1088 minor(dumpdev), dumplo);
1089
1090 error = (*bdevsw[major(dumpdev)].d_psize)(dumpdev);
1091 printf("dump ");
1092 if (error == -1) {
1093 printf("area unavailable\n");
1094 return;
1095 }
1096
1097 if ((error = cpu_dump()) != 0)
1098 goto err;
1099
1100 totalbytesleft = ptoa(cpu_dump_mempagecnt());
1101 blkno = dumplo + cpu_dumpsize();
1102 dump = bdevsw[major(dumpdev)].d_dump;
1103 error = 0;
1104
1105 for (memseg = 0; memseg < mem_cluster_cnt; memseg++) {
1106 maddr = mem_clusters[memseg].start;
1107 bytes = mem_clusters[memseg].size;
1108
1109 for (i = 0; i < bytes; i += n, totalbytesleft -= n) {
1110 /* Print out how many MBs we have left to go. */
1111 if ((totalbytesleft % (1024*1024)) < BYTES_PER_DUMP)
1112 printf("%ld ", totalbytesleft / (1024 * 1024));
1113
1114 /* Limit size for next transfer. */
1115 n = bytes - i;
1116 if (n > BYTES_PER_DUMP)
1117 n = BYTES_PER_DUMP;
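/*
 * Bounce pages above 4GB through the low buffer so the dump
 * routine only ever writes from memory physically below 4GB.
 */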
1118 if (maddr > 0xffffffff) {
1119 va = (void *)dumpmem_vaddr;
1120 if (n > dumpmem_sz)
1121 n = dumpmem_sz;
1122 memcpy(va, (void *)PMAP_DIRECT_MAP(maddr), n);
1123 } else {
1124 va = (void *)PMAP_DIRECT_MAP(maddr);
1125 }
1126
1127 error = (*dump)(dumpdev, blkno, va, n);
1128 if (error)
1129 goto err;
1130 maddr += n;
1131 blkno += btodb(n); /* XXX? */
1132
1133 #if 0 /* XXX this doesn't work. grr. */
1134 /* operator aborting dump? */
1135 if (sget() != NULL) {
1136 error = EINTR;
1137 break;
1138 }
1139 #endif
1140 }
1141 }
1142
1143 err:
1144 switch (error) {
1145
1146 case ENXIO:
1147 printf("device bad\n");
1148 break;
1149
1150 case EFAULT:
1151 printf("device not ready\n");
1152 break;
1153
1154 case EINVAL:
1155 printf("area improper\n");
1156 break;
1157
1158 case EIO:
1159 printf("i/o error\n");
1160 break;
1161
1162 case EINTR:
1163 printf("aborted from console\n");
1164 break;
1165
1166 case 0:
1167 printf("succeeded\n");
1168 break;
1169
1170 default:
1171 printf("error %d\n", error);
1172 break;
1173 }
1174 printf("\n\n");
1175 delay(5000000); /* 5 seconds */
1176 }
1177
1178 /*
1179 * Force the userspace FS.base to be reloaded from the PCB on return from
1180 * the kernel, and reset the segment registers (%ds, %es, %fs, and %gs)
1181 * to their expected userspace value.
1182 */
1183 void
1184 reset_segs(void)
1185 {
1186 /*
1187 * This operates like the cpu_switchto() sequence: if we
1188 * haven't reset %[defg]s already, do so now.
1189 */
1190 if (curcpu()->ci_pflags & CPUPF_USERSEGS) {
1191 curcpu()->ci_pflags &= ~CPUPF_USERSEGS;
1192 __asm volatile(
1193 "movw %%ax,%%ds\n\t"
1194 "movw %%ax,%%es\n\t"
1195 "movw %%ax,%%fs\n\t"
1196 "cli\n\t" /* block intr when on user GS.base */
1197 "swapgs\n\t" /* swap from kernel to user GS.base */
1198 "movw %%ax,%%gs\n\t"/* set %gs to UDATA and GS.base to 0 */
1199 "swapgs\n\t" /* back to kernel GS.base */
1200 "sti" : : "a"(GSEL(GUDATA_SEL, SEL_UPL)));
1201 }
1202 }
1203
1204 /*
1205 * Clear registers on exec
1206 */
1207 void
1208 setregs(struct proc *p, struct exec_package *pack, u_long stack,
1209 struct ps_strings *arginfo)
1210 {
1211 struct trapframe *tf;
1212
1213 initialize_thread_xstate(p);
1214
1215 /* To reset all registers we have to return via iretq */
1216 p->p_md.md_flags |= MDP_IRET;
1217
1218 reset_segs();
1219 p->p_addr->u_pcb.pcb_fsbase = 0;
1220
1221 tf = p->p_md.md_regs;
1222 memset(tf, 0, sizeof *tf);
1223 tf->tf_rip = pack->ep_entry;
1224 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
1225 tf->tf_rflags = PSL_USERSET;
1226 tf->tf_rsp = stack;
1227 tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
1228 }
1229
1230 /*
1231 * Initialize segments and descriptor tables
1232 */
1233
1234 struct gate_descriptor *idt;
1235 char idt_allocmap[NIDT];
1236 struct user *proc0paddr = NULL;
1237
1238 void
1239 setgate(struct gate_descriptor *gd, void *func, int ist, int type, int dpl,
1240 int sel)
1241 {
1242 gd->gd_looffset = (u_int64_t)func & 0xffff;
1243 gd->gd_selector = sel;
1244 gd->gd_ist = ist;
1245 gd->gd_type = type;
1246 gd->gd_dpl = dpl;
1247 gd->gd_p = 1;
1248 gd->gd_hioffset = (u_int64_t)func >> 16;
1249 gd->gd_zero = 0;
1250 gd->gd_xx1 = 0;
1251 gd->gd_xx2 = 0;
1252 gd->gd_xx3 = 0;
1253 }
1254
1255 void
1256 unsetgate(struct gate_descriptor *gd)
1257 {
1258 memset(gd, 0, sizeof (*gd));
1259 }
1260
1261 void
1262 setregion(struct region_descriptor *rd, void *base, u_int16_t limit)
1263 {
1264 rd->rd_limit = limit;
1265 rd->rd_base = (u_int64_t)base;
1266 }
1267
1268 /*
1269 * Note that the base and limit fields are ignored in long mode.
1270 */
1271 void
1272 set_mem_segment(struct mem_segment_descriptor *sd, void *base, size_t limit,
1273 int type, int dpl, int gran, int def32, int is64)
1274 {
1275 sd->sd_lolimit = (unsigned)limit;
1276 sd->sd_lobase = (unsigned long)base;
1277 sd->sd_type = type;
1278 sd->sd_dpl = dpl;
1279 sd->sd_p = 1;
1280 sd->sd_hilimit = (unsigned)limit >> 16;
1281 sd->sd_avl = 0;
1282 sd->sd_long = is64;
1283 sd->sd_def32 = def32;
1284 sd->sd_gran = gran;
1285 sd->sd_hibase = (unsigned long)base >> 24;
1286 }
1287
1288 void
1289 set_sys_segment(struct sys_segment_descriptor *sd, void *base, size_t limit,
1290 int type, int dpl, int gran)
1291 {
1292 memset(sd, 0, sizeof *sd);
1293 sd->sd_lolimit = (unsigned)limit;
1294 sd->sd_lobase = (u_int64_t)base;
1295 sd->sd_type = type;
1296 sd->sd_dpl = dpl;
1297 sd->sd_p = 1;
1298 sd->sd_hilimit = (unsigned)limit >> 16;
1299 sd->sd_gran = gran;
1300 sd->sd_hibase = (u_int64_t)base >> 24;
1301 }
1302
1303 void
1304 cpu_init_idt(void)
1305 {
1306 struct region_descriptor region;
1307
1308 setregion(&region, idt, NIDT * sizeof(idt[0]) - 1);
1309 lidt(&region);
1310 }
1311
1312 void
1313 cpu_init_extents(void)
1314 {
1315 extern struct extent *iomem_ex;
1316 static int already_done;
1317 int i;
1318
1319 /* We get called for each CPU, only first should do this */
1320 if (already_done)
1321 return;
1322
1323 /*
1324 * Allocate the physical addresses used by RAM from the iomem
1325 * extent map.
1326 */
1327 for (i = 0; i < mem_cluster_cnt; i++) {
1328 if (extent_alloc_region(iomem_ex, mem_clusters[i].start,
1329 mem_clusters[i].size, EX_NOWAIT)) {
1330 /* XXX What should we do? */
1331 printf("WARNING: CAN'T ALLOCATE RAM (%llx-%llx)"
1332 " FROM IOMEM EXTENT MAP!\n", mem_clusters[i].start,
1333 mem_clusters[i].start + mem_clusters[i].size - 1);
1334 }
1335 }
1336
1337 already_done = 1;
1338 }
1339
1340 void
1341 map_tramps(void)
1342 {
1343 #if defined(MULTIPROCESSOR) || \
1344 (NACPI > 0 && !defined(SMALL_KERNEL))
1345 struct pmap *kmp = pmap_kernel();
1346 extern paddr_t tramp_pdirpa;
1347 #ifdef MULTIPROCESSOR
1348 extern u_char cpu_spinup_trampoline[];
1349 extern u_char cpu_spinup_trampoline_end[];
1350 extern u_char mp_tramp_data_start[];
1351 extern u_char mp_tramp_data_end[];
1352 extern u_int32_t mp_pdirpa;
1353 #endif
1354
1355 /*
1356 * The initial PML4 pointer must be below 4G, so if the
1357 * current one isn't, use a "bounce buffer" and save it
1358 * for tramps to use.
1359 */
1360 if (kmp->pm_pdirpa > 0xffffffff) {
1361 pmap_kenter_pa(lo32_vaddr, lo32_paddr, PROT_READ | PROT_WRITE);
1362 memcpy((void *)lo32_vaddr, kmp->pm_pdir, PAGE_SIZE);
1363 tramp_pdirpa = lo32_paddr;
1364 pmap_kremove(lo32_vaddr, PAGE_SIZE);
1365 } else
1366 tramp_pdirpa = kmp->pm_pdirpa;
1367
1368
1369 #ifdef MULTIPROCESSOR
1370 /* Map MP tramp code and data pages RW for copy */
1371 pmap_kenter_pa(MP_TRAMPOLINE, MP_TRAMPOLINE,
1372 PROT_READ | PROT_WRITE);
1373
1374 pmap_kenter_pa(MP_TRAMP_DATA, MP_TRAMP_DATA,
1375 PROT_READ | PROT_WRITE);
1376
1377 memset((caddr_t)MP_TRAMPOLINE, 0xcc, PAGE_SIZE);
1378 memset((caddr_t)MP_TRAMP_DATA, 0xcc, PAGE_SIZE);
1379
1380 memcpy((caddr_t)MP_TRAMPOLINE,
1381 cpu_spinup_trampoline,
1382 cpu_spinup_trampoline_end-cpu_spinup_trampoline);
1383
1384 memcpy((caddr_t)MP_TRAMP_DATA,
1385 mp_tramp_data_start,
1386 mp_tramp_data_end - mp_tramp_data_start);
1387
1388 /*
1389 * We need to patch this after we copy the tramp data,
1390 * the symbol points into the copied tramp data page.
1391 */
1392 mp_pdirpa = tramp_pdirpa;
1393
1394 /* Unmap, will be remapped in cpu_start_secondary */
1395 pmap_kremove(MP_TRAMPOLINE, PAGE_SIZE);
1396 pmap_kremove(MP_TRAMP_DATA, PAGE_SIZE);
1397 #endif /* MULTIPROCESSOR */
1398 #endif
1399 }
1400
1401 void
1402 cpu_set_vendor(struct cpu_info *ci, int level, const char *vendor)
1403 {
1404 ci->ci_cpuid_level = level;
1405 cpuid_level = MIN(cpuid_level, level);
1406
1407 /* map the vendor string to an integer */
1408 if (strcmp(vendor, "AuthenticAMD") == 0)
1409 ci->ci_vendor = CPUV_AMD;
1410 else if (strcmp(vendor, "GenuineIntel") == 0)
1411 ci->ci_vendor = CPUV_INTEL;
1412 else if (strcmp(vendor, "CentaurHauls") == 0)
1413 ci->ci_vendor = CPUV_VIA;
1414 else
1415 ci->ci_vendor = CPUV_UNKNOWN;
1416 }
1417
1418 #define IDTVEC(name) __CONCAT(X, name)
1419 typedef void (vector)(void);
1420 extern vector *IDTVEC(exceptions)[];
1421
1422 paddr_t early_pte_pages;
1423
1424 void
1425 init_x86_64(paddr_t first_avail)
1426 {
1427 struct region_descriptor region;
1428 bios_memmap_t *bmp;
1429 int x, ist;
1430 uint64_t max_dm_size = ((uint64_t)512 * NUM_L4_SLOT_DIRECT) << 30;
1431
1432 /*
1433 * locore0 mapped 3 pages for use before the pmap is initialized
1434 * starting at first_avail. These pages are currently used by
1435 * efifb to create early-use VAs for the framebuffer before efifb
1436 * is attached.
1437 */
1438 early_pte_pages = first_avail;
1439 first_avail += 3 * NBPG;
1440
1441 cpu_set_vendor(&cpu_info_primary, cpuid_level, cpu_vendor);
1442 cpu_init_msrs(&cpu_info_primary);
1443
1444 proc0.p_addr = proc0paddr;
1445 cpu_info_primary.ci_curpcb = &proc0.p_addr->u_pcb;
1446
1447 x86_bus_space_init();
1448
1449 i8254_startclock();
1450
1451 /*
1452 * Initialize PAGE_SIZE-dependent variables.
1453 */
1454 uvm_setpagesize();
1455
1456 /*
1457 * Boot arguments are in a single page specified by /boot.
1458 *
1459 * We require the "new" vector form, as well as memory ranges
1460 * to be given in bytes rather than KB.
1461 *
1462 * locore copies the data into bootinfo[] for us.
1463 */
1464 if ((bootapiver & (BAPIV_VECTOR | BAPIV_BMEMMAP)) ==
1465 (BAPIV_VECTOR | BAPIV_BMEMMAP)) {
1466 if (bootinfo_size >= sizeof(bootinfo))
1467 panic("boot args too big");
1468
1469 getbootinfo(bootinfo, bootinfo_size);
1470 } else
1471 panic("invalid /boot");
1472
1473 cninit();
1474
1475 /*
1476 * Memory on the AMD64 port is described by three different things.
1477 *
1478 * 1. biosbasemem - This is outdated, and should really only be used to
1479 * sanitize the other values. This is what we get back from the BIOS
1480 * using the legacy routines, describing memory below 640KB.
1481 *
1482 * 2. bios_memmap[] - This is the memory map as the bios has returned
1483 * it to us. It includes memory the kernel occupies, etc.
1484 *
1485 * 3. mem_cluster[] - This is the massaged free memory segments after
1486 * taking into account the contents of bios_memmap, biosbasemem,
1487 * and locore/machdep/pmap kernel allocations of physical
1488 * pages.
1489 *
1490 * The other thing is that the physical page *RANGE* is described by
1491 * three more variables:
1492 *
1493 * avail_start - This is a physical address of the start of available
1494 * pages, until IOM_BEGIN. This is basically the start
1495 * of the UVM managed range of memory, with some holes...
1496 *
1497 * avail_end - This is the end of physical pages. All physical pages
1498 * that UVM manages are between avail_start and avail_end.
1499 * There are holes...
1500 *
1501 * first_avail - This is the first available physical page after the
1502 * kernel, page tables, etc.
1503 *
1504 * We skip the first few pages for trampolines, hibernate, and to avoid
1505 * buggy SMI implementations that could corrupt the first 64KB.
1506 */
1507 avail_start = 16*PAGE_SIZE;
1508
1509 #ifdef MULTIPROCESSOR
1510 if (avail_start < MP_TRAMPOLINE + PAGE_SIZE)
1511 avail_start = MP_TRAMPOLINE + PAGE_SIZE;
1512 if (avail_start < MP_TRAMP_DATA + PAGE_SIZE)
1513 avail_start = MP_TRAMP_DATA + PAGE_SIZE;
1514 #endif
1515
1516 #if (NACPI > 0 && !defined(SMALL_KERNEL))
1517 if (avail_start < ACPI_TRAMPOLINE + PAGE_SIZE)
1518 avail_start = ACPI_TRAMPOLINE + PAGE_SIZE;
1519 if (avail_start < ACPI_TRAMP_DATA + PAGE_SIZE)
1520 avail_start = ACPI_TRAMP_DATA + PAGE_SIZE;
1521 #endif
1522
1523 #ifdef HIBERNATE
1524 if (avail_start < HIBERNATE_HIBALLOC_PAGE + PAGE_SIZE)
1525 avail_start = HIBERNATE_HIBALLOC_PAGE + PAGE_SIZE;
1526 #endif /* HIBERNATE */
1527
1528 /*
1529 * We need to go through the BIOS memory map given, and
1530 * fill out mem_clusters and mem_cluster_cnt stuff, taking
1531 * into account all the points listed above.
1532 */
1533 avail_end = mem_cluster_cnt = 0;
1534 for (bmp = bios_memmap; bmp->type != BIOS_MAP_END; bmp++) {
1535 paddr_t s1, s2, e1, e2;
1536
1537 /* Ignore non-free memory */
1538 if (bmp->type != BIOS_MAP_FREE)
1539 continue;
1540 if (bmp->size < PAGE_SIZE)
1541 continue;
1542
1543 /* Init our segment(s), round/trunc to pages */
1544 s1 = round_page(bmp->addr);
1545 e1 = trunc_page(bmp->addr + bmp->size);
1546 s2 = e2 = 0;
1547
1548 /*
1549 * XXX Some buggy ACPI BIOSes use memory that they
1550 * declare as free. Current worst offender is
1551 * Supermicro 5019D-FTN4. Typically the affected memory
1552 * areas are small blocks between areas reserved for
1553 * ACPI and other BIOS goo. So skip areas smaller
1554 * than 32 MB above the 16 MB boundary (to avoid
1555 * affecting legacy stuff).
1556 */
1557 if (s1 > 16*1024*1024 && (e1 - s1) < 32*1024*1024)
1558 continue;
1559
1560 /* Check and adjust our segment(s) */
1561 /* Nuke low pages */
1562 if (s1 < avail_start) {
1563 s1 = avail_start;
1564 if (s1 > e1)
1565 continue;
1566 }
1567
1568 /*
1569 * The direct map is limited to 512GB * NUM_L4_SLOT_DIRECT of
1570 * memory, so discard anything above that.
1571 */
1572 if (e1 >= max_dm_size) {
1573 e1 = max_dm_size;
1574 if (s1 > e1)
1575 continue;
1576 }
1577
1578 /* Crop stuff into "640K hole" */
1579 if (s1 < IOM_BEGIN && e1 > IOM_BEGIN)
1580 e1 = IOM_BEGIN;
1581 if (s1 < biosbasemem && e1 > biosbasemem)
1582 e1 = biosbasemem;
1583
1584 /* Split any segments straddling the 16MB boundary */
1585 if (s1 < 16*1024*1024 && e1 > 16*1024*1024) {
1586 e2 = e1;
1587 s2 = e1 = 16*1024*1024;
1588 }
1589
1590 /* Store segment(s) */
1591 if (e1 - s1 >= PAGE_SIZE) {
1592 mem_clusters[mem_cluster_cnt].start = s1;
1593 mem_clusters[mem_cluster_cnt].size = e1 - s1;
1594 mem_cluster_cnt++;
1595 }
1596 if (e2 - s2 >= PAGE_SIZE) {
1597 mem_clusters[mem_cluster_cnt].start = s2;
1598 mem_clusters[mem_cluster_cnt].size = e2 - s2;
1599 mem_cluster_cnt++;
1600 }
1601 if (avail_end < e1) avail_end = e1;
1602 if (avail_end < e2) avail_end = e2;
1603 }
1604
1605 /*
1606 * Call pmap initialization to make new kernel address space.
1607 * We must do this before loading pages into the VM system.
1608 */
1609 first_avail = pmap_bootstrap(first_avail, trunc_page(avail_end));
1610
1611 #if NEFI > 0
1612 /* Relocate the EFI memory map. */
1613 if (bios_efiinfo && bios_efiinfo->mmap_start) {
1614 mmap = (EFI_MEMORY_DESCRIPTOR *)PMAP_DIRECT_MAP(first_avail);
1615 memcpy(mmap, (void *)PMAP_DIRECT_MAP(bios_efiinfo->mmap_start),
1616 bios_efiinfo->mmap_size);
1617 first_avail += round_page(bios_efiinfo->mmap_size);
1618 }
1619 #endif
1620
1621 /* Allocate these out of the 640KB base memory */
1622 if (avail_start != PAGE_SIZE)
1623 avail_start = pmap_prealloc_lowmem_ptps(avail_start);
1624
1625 cpu_init_extents();
1626
1627 /* Make sure the end of the space used by the kernel is rounded. */
1628 first_avail = round_page(first_avail);
1629 kern_end = KERNBASE + first_avail;
1630
1631 /*
1632 * Now, load the memory clusters (which have already been
1633 * flensed) into the VM system.
1634 */
1635 for (x = 0; x < mem_cluster_cnt; x++) {
1636 paddr_t seg_start = mem_clusters[x].start;
1637 paddr_t seg_end = seg_start + mem_clusters[x].size;
1638
1639 if (seg_start < first_avail) seg_start = first_avail;
1640 if (seg_start > seg_end) continue;
1641 if (seg_end - seg_start < PAGE_SIZE) continue;
1642
1643 physmem += atop(mem_clusters[x].size);
1644
1645 #if DEBUG_MEMLOAD
1646 printf("loading 0x%lx-0x%lx (0x%lx-0x%lx)\n",
1647 seg_start, seg_end, atop(seg_start), atop(seg_end));
1648 #endif
1649 uvm_page_physload(atop(seg_start), atop(seg_end),
1650 atop(seg_start), atop(seg_end), 0);
1651 }
1652
1653 /*
1654 * Now, load the memory between the end of I/O memory "hole"
1655 * and the kernel.
1656 */
1657 {
1658 paddr_t seg_start = round_page(IOM_END);
1659 paddr_t seg_end = trunc_page(KERNTEXTOFF - KERNBASE);
1660
1661 if (seg_start < seg_end) {
1662 #if DEBUG_MEMLOAD
1663 printf("loading 0x%lx-0x%lx\n", seg_start, seg_end);
1664 #endif
1665 uvm_page_physload(atop(seg_start), atop(seg_end),
1666 atop(seg_start), atop(seg_end), 0);
1667 }
1668 }
1669
1670 #if DEBUG_MEMLOAD
1671 printf("avail_start = 0x%lx\n", avail_start);
1672 printf("avail_end = 0x%lx\n", avail_end);
1673 printf("first_avail = 0x%lx\n", first_avail);
1674 #endif
1675
1676 /*
1677 * Steal memory for the message buffer (at end of core).
1678 */
1679 {
1680 struct vm_physseg *vps = NULL;
1681 psize_t sz = round_page(MSGBUFSIZE);
1682 psize_t reqsz = sz;
1683
1684 for (x = 0; x < vm_nphysseg; x++) {
1685 vps = &vm_physmem[x];
1686 if (ptoa(vps->avail_end) == avail_end)
1687 break;
1688 }
1689 if (x == vm_nphysseg)
1690 panic("init_x86_64: can't find end of memory");
1691
1692 /* Shrink so it'll fit in the last segment. */
1693 if ((vps->avail_end - vps->avail_start) < atop(sz))
1694 sz = ptoa(vps->avail_end - vps->avail_start);
1695
1696 vps->avail_end -= atop(sz);
1697 vps->end -= atop(sz);
1698 msgbuf_paddr = ptoa(vps->avail_end);
1699
1700 /* Remove the last segment if it now has no pages. */
1701 if (vps->start == vps->end) {
1702 for (vm_nphysseg--; x < vm_nphysseg; x++)
1703 vm_physmem[x] = vm_physmem[x + 1];
1704 }
1705
1706 /* Now find where the new avail_end is. */
1707 for (avail_end = 0, x = 0; x < vm_nphysseg; x++)
1708 if (vm_physmem[x].avail_end > avail_end)
1709 avail_end = vm_physmem[x].avail_end;
1710 avail_end = ptoa(avail_end);
1711
1712 /* Warn if the message buffer had to be shrunk. */
1713 if (sz != reqsz)
1714 printf("WARNING: %ld bytes not available for msgbuf "
1715 "in last cluster (%ld used)\n", reqsz, sz);
1716 }
1717
1718 /*
1719 * Steal some memory for a dump bouncebuffer if we have memory over
1720 * the 32-bit barrier.
1721 */
1722 if (avail_end > 0xffffffff) {
1723 struct vm_physseg *vps = NULL;
1724 psize_t sz = round_page(MAX(BYTES_PER_DUMP, dbtob(1)));
1725
1726 /* XXX assumes segments are ordered */
1727 for (x = 0; x < vm_nphysseg; x++) {
1728 vps = &vm_physmem[x];
1729 /* Find something between 16meg and 4gig */
1730 if (ptoa(vps->avail_end) <= 0xffffffff &&
1731 ptoa(vps->avail_start) >= 0xffffff)
1732 break;
1733 }
1734 if (x == vm_nphysseg)
1735 panic("init_x86_64: no memory between "
1736 "0xffffff-0xffffffff");
1737
1738 /* Shrink so it'll fit in the segment. */
1739 if ((vps->avail_end - vps->avail_start) < atop(sz))
1740 sz = ptoa(vps->avail_end - vps->avail_start);
1741
1742 vps->avail_end -= atop(sz);
1743 vps->end -= atop(sz);
1744 dumpmem_paddr = ptoa(vps->avail_end);
1745 dumpmem_vaddr = PMAP_DIRECT_MAP(dumpmem_paddr);
1746 dumpmem_sz = sz;
1747
1748 /* Remove the last segment if it now has no pages. */
1749 if (vps->start == vps->end) {
1750 for (vm_nphysseg--; x < vm_nphysseg; x++)
1751 vm_physmem[x] = vm_physmem[x + 1];
1752 }
1753 }
1754
1755 pmap_growkernel(VM_MIN_KERNEL_ADDRESS + 32 * 1024 * 1024);
1756
1757 pmap_kenter_pa(idt_vaddr, idt_paddr, PROT_READ | PROT_WRITE);
1758
1759 idt = (struct gate_descriptor *)idt_vaddr;
1760 cpu_info_primary.ci_tss = &cpu_info_full_primary.cif_tss;
1761 cpu_info_primary.ci_gdt = &cpu_info_full_primary.cif_gdt;
1762
1763 /* make gdt gates and memory segments */
1764 set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GCODE_SEL), 0,
1765 0xfffff, SDT_MEMERA, SEL_KPL, 1, 0, 1);
1766
1767 set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GDATA_SEL), 0,
1768 0xfffff, SDT_MEMRWA, SEL_KPL, 1, 0, 1);
1769
1770 set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GUDATA_SEL), 0,
1771 atop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMRWA, SEL_UPL, 1, 0, 1);
1772
1773 set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GUCODE_SEL), 0,
1774 atop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMERA, SEL_UPL, 1, 0, 1);
1775
1776 set_sys_segment(GDT_ADDR_SYS(cpu_info_primary.ci_gdt, GPROC0_SEL),
1777 cpu_info_primary.ci_tss, sizeof (struct x86_64_tss)-1,
1778 SDT_SYS386TSS, SEL_KPL, 0);
1779
1780 /* exceptions */
1781 for (x = 0; x < 32; x++) {
1782 /* trap2 == NMI, trap8 == double fault */
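/* a nonzero IST index makes the CPU switch to a fresh stack from the TSS */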
1783 ist = (x == 2) ? 2 : (x == 8) ? 1 : 0;
1784 setgate(&idt[x], IDTVEC(exceptions)[x], ist, SDT_SYS386IGT,
1785 (x == 3) ? SEL_UPL : SEL_KPL,
1786 GSEL(GCODE_SEL, SEL_KPL));
1787 idt_allocmap[x] = 1;
1788 }
1789
1790 setregion(&region, cpu_info_primary.ci_gdt, GDT_SIZE - 1);
1791 lgdt(&region);
1792
1793 cpu_init_idt();
1794
1795 intr_default_setup();
1796
1797 fpuinit(&cpu_info_primary);
1798
1799 softintr_init();
1800 splraise(IPL_IPI);
1801 intr_enable();
1802
1803 #ifdef DDB
1804 db_machine_init();
1805 ddb_init();
1806 if (boothowto & RB_KDB)
1807 db_enter();
1808 #endif
1809 }
1810
1811 void
1812 cpu_reset(void)
1813 {
1814 intr_disable();
1815
1816 if (cpuresetfn)
1817 (*cpuresetfn)();
1818
1819 /*
1820 * The keyboard controller has 4 random output pins, one of which is
1821 * connected to the RESET pin on the CPU in many PCs. We tell the
1822 * keyboard controller to pulse this line a couple of times.
1823 */
1824 outb(IO_KBD + KBCMDP, KBC_PULSE0);
1825 delay(100000);
1826 outb(IO_KBD + KBCMDP, KBC_PULSE0);
1827 delay(100000);
1828
1829 /*
1830 * Try to cause a triple fault and watchdog reset by making the IDT
1831 * invalid and causing a fault.
1832 */
1833 memset((caddr_t)idt, 0, NIDT * sizeof(idt[0]));
1834 __asm volatile("divl %0,%1" : : "q" (0), "a" (0));
1835
1836 for (;;)
1837 continue;
1838 /* NOTREACHED */
1839 }
1840
1841 /*
1842 * cpu_dumpsize: calculate size of machine-dependent kernel core dump headers.
1843 */
1844 int
1845 cpu_dumpsize(void)
1846 {
1847 int size;
1848
1849 size = ALIGN(sizeof(kcore_seg_t)) +
1850 ALIGN(mem_cluster_cnt * sizeof(phys_ram_seg_t));
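/* the headers must fit in a single disk block */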
1851 if (roundup(size, dbtob(1)) != dbtob(1))
1852 return (-1);
1853
1854 return (1);
1855 }
1856
1857 /*
1858 * cpu_dump_mempagecnt: calculate the size of RAM (in pages) to be dumped.
1859 */
1860 u_long
1861 cpu_dump_mempagecnt(void)
1862 {
1863 u_long i, n;
1864
1865 n = 0;
1866 for (i = 0; i < mem_cluster_cnt; i++)
1867 n += atop(mem_clusters[i].size);
1868 return (n);
1869 }
1870
1871 /*
1872 * Figure out which portions of memory are used by the kernel/system.
1873 */
1874 int
1875 amd64_pa_used(paddr_t addr)
1876 {
1877 struct vm_page *pg;
1878
1879 /* Kernel manages these */
1880 if ((pg = PHYS_TO_VM_PAGE(addr)) && (pg->pg_flags & PG_DEV) == 0)
1881 return 1;
1882
1883 /* Kernel is loaded here */
1884 if (addr > IOM_END && addr < (kern_end - KERNBASE))
1885 return 1;
1886
1887 /* Low memory used for various bootstrap things */
1888 if (addr < avail_start)
1889 return 1;
1890
1891 /*
1892 * The only regions I can think of that are left are the things
1893 * we steal away from UVM. The message buffer?
1894 * XXX - ignore these for now.
1895 */
1896
1897 return 0;
1898 }
1899
1900 void
1901 cpu_initclocks(void)
1902 {
1903 (*initclock_func)();
1904 }
1905
1906 void
1907 cpu_startclock(void)
1908 {
1909 (*startclock_func)();
1910 }
1911
1912 void
1913 need_resched(struct cpu_info *ci)
1914 {
1915 ci->ci_want_resched = 1;
1916
1917 /* There's a risk we'll be called before the idle threads start */
1918 if (ci->ci_curproc) {
1919 aston(ci->ci_curproc);
1920 cpu_kick(ci);
1921 }
1922 }
1923
1924 /*
1925 * Allocate an IDT vector slot within the given range.
1926 * XXX needs locking to avoid MP allocation races.
1927 */
1928
1929 int
1930 idt_vec_alloc(int low, int high)
1931 {
1932 int vec;
1933
1934 for (vec = low; vec <= high; vec++) {
1935 if (idt_allocmap[vec] == 0) {
1936 idt_allocmap[vec] = 1;
1937 return vec;
1938 }
1939 }
1940 return 0;
1941 }
1942
1943 int
1944 idt_vec_alloc_range(int low, int high, int num)
1945 {
1946 int i, vec;
1947
1948 KASSERT(powerof2(num));
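/* align the search window: round low up and high down to multiples of num */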
1949 low = (low + num - 1) & ~(num - 1);
1950 high = ((high + 1) & ~(num - 1)) - 1;
1951
1952 for (vec = low; vec <= high; vec += num) {
1953 for (i = 0; i < num; i++) {
1954 if (idt_allocmap[vec + i] != 0)
1955 break;
1956 }
1957 if (i == num) {
1958 for (i = 0; i < num; i++)
1959 idt_allocmap[vec + i] = 1;
1960 return vec;
1961 }
1962 }
1963 return 0;
1964 }
1965
1966 void
1967 idt_vec_set(int vec, void (*function)(void))
1968 {
1969 /*
1970 * Vector should be allocated, so no locking needed.
1971 */
1972 KASSERT(idt_allocmap[vec] == 1);
1973 setgate(&idt[vec], function, 0, SDT_SYS386IGT, SEL_KPL,
1974 GSEL(GCODE_SEL, SEL_KPL));
1975 }
1976
1977 void
1978 idt_vec_free(int vec)
1979 {
1980 unsetgate(&idt[vec]);
1981 idt_allocmap[vec] = 0;
1982 }
1983
1984 #ifdef DIAGNOSTIC
1985 void
1986 splassert_check(int wantipl, const char *func)
1987 {
1988 int cpl = curcpu()->ci_ilevel;
1989 int floor = curcpu()->ci_handled_intr_level;
1990
1991 if (cpl < wantipl) {
1992 splassert_fail(wantipl, cpl, func);
1993 }
1994 if (floor > wantipl) {
1995 splassert_fail(wantipl, floor, func);
1996 }
1997
1998 }
1999 #endif
2000
2001 int
2002 copyin32(const uint32_t *uaddr, uint32_t *kaddr)
2003 {
2004 if ((vaddr_t)uaddr & 0x3)
2005 return EFAULT;
2006
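/* a naturally aligned 32-bit load cannot tear across a page boundary */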
2007 /* copyin(9) is atomic */
2008 return copyin(uaddr, kaddr, sizeof(uint32_t));
2009 }
2010
2011 void
2012 getbootinfo(char *bootinfo, int bootinfo_size)
2013 {
2014 bootarg32_t *q;
2015 bios_ddb_t *bios_ddb;
2016 bios_bootduid_t *bios_bootduid;
2017 bios_bootsr_t *bios_bootsr;
2018 #undef BOOTINFO_DEBUG
2019 #ifdef BOOTINFO_DEBUG
2020 printf("bootargv:");
2021 #endif
2022
2023 for (q = (bootarg32_t *)bootinfo;
2024 (q->ba_type != BOOTARG_END) &&
2025 ((((char *)q) - bootinfo) < bootinfo_size);
2026 q = (bootarg32_t *)(((char *)q) + q->ba_size)) {
2027
2028 switch (q->ba_type) {
2029 case BOOTARG_MEMMAP:
2030 bios_memmap = (bios_memmap_t *)q->ba_arg;
2031 #ifdef BOOTINFO_DEBUG
2032 printf(" memmap %p", bios_memmap);
2033 #endif
2034 break;
2035 case BOOTARG_DISKINFO:
2036 bios_diskinfo = (bios_diskinfo_t *)q->ba_arg;
2037 #ifdef BOOTINFO_DEBUG
2038 printf(" diskinfo %p", bios_diskinfo);
2039 #endif
2040 break;
2041 case BOOTARG_APMINFO:
2042 /* generated by i386 boot loader */
2043 break;
2044 case BOOTARG_CKSUMLEN:
2045 bios_cksumlen = *(u_int32_t *)q->ba_arg;
2046 #ifdef BOOTINFO_DEBUG
2047 printf(" cksumlen %d", bios_cksumlen);
2048 #endif
2049 break;
2050 case BOOTARG_PCIINFO:
2051 /* generated by i386 boot loader */
2052 break;
2053 case BOOTARG_CONSDEV: {
2054 #if NCOM > 0
2055 bios_consdev_t *cdp = (bios_consdev_t*)q->ba_arg;
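/* legacy I/O addresses for com0-com3, used when /boot passes no consaddr */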
2056 static const int ports[] =
2057 { 0x3f8, 0x2f8, 0x3e8, 0x2e8 };
2058 int unit = minor(cdp->consdev);
2059 uint64_t consaddr = cdp->consaddr;
2060 if (consaddr == -1 && unit >= 0 && unit < nitems(ports))
2061 consaddr = ports[unit];
2062 if (major(cdp->consdev) == 8 && consaddr != -1) {
2063 comconsunit = unit;
2064 comconsaddr = consaddr;
2065 comconsrate = cdp->conspeed;
2066 comconsfreq = cdp->consfreq;
2067 comcons_reg_width = cdp->reg_width;
2068 comcons_reg_shift = cdp->reg_shift;
2069 if (cdp->flags & BCD_MMIO)
2070 comconsiot = X86_BUS_SPACE_MEM;
2071 else
2072 comconsiot = X86_BUS_SPACE_IO;
2073 }
2074 #endif
2075 #ifdef BOOTINFO_DEBUG
2076 printf(" console 0x%x:%d", cdp->consdev, cdp->conspeed);
2077 #endif
2078 break;
2079 }
2080 case BOOTARG_BOOTMAC:
2081 bios_bootmac = (bios_bootmac_t *)q->ba_arg;
2082 break;
2083
2084 case BOOTARG_DDB:
2085 bios_ddb = (bios_ddb_t *)q->ba_arg;
2086 #ifdef DDB
2087 db_console = bios_ddb->db_console;
2088 #endif
2089 break;
2090
2091 case BOOTARG_BOOTDUID:
2092 bios_bootduid = (bios_bootduid_t *)q->ba_arg;
2093 memcpy(bootduid, bios_bootduid, sizeof(bootduid));
2094 break;
2095
2096 case BOOTARG_BOOTSR:
2097 bios_bootsr = (bios_bootsr_t *)q->ba_arg;
2098 #if NSOFTRAID > 0
2099 memcpy(&sr_bootuuid, &bios_bootsr->uuid,
2100 sizeof(sr_bootuuid));
2101 memcpy(&sr_bootkey, &bios_bootsr->maskkey,
2102 sizeof(sr_bootkey));
2103 #endif
2104 explicit_bzero(bios_bootsr, sizeof(bios_bootsr_t));
2105 break;
2106
2107 case BOOTARG_EFIINFO:
2108 bios_efiinfo = (bios_efiinfo_t *)q->ba_arg;
2109 break;
2110
2111 case BOOTARG_UCODE:
2112 bios_ucode = (bios_ucode_t *)q->ba_arg;
2113 break;
2114
2115 default:
2116 #ifdef BOOTINFO_DEBUG
2117 printf(" unsupported arg (%d) %p", q->ba_type,
2118 q->ba_arg);
2119 #endif
2120 break;
2121 }
2122 }
2123 #ifdef BOOTINFO_DEBUG
2124 printf("\n");
2125 #endif
2126 }
2127
2128 int
2129 check_context(const struct reg *regs, struct trapframe *tf)
2130 {
2131 uint16_t sel;
2132
2133 if (((regs->r_rflags ^ tf->tf_rflags) & PSL_USERSTATIC) != 0)
2134 return EINVAL;
2135
2136 sel = regs->r_ss & 0xffff;
2137 if (!VALID_USER_DSEL(sel))
2138 return EINVAL;
2139
2140 sel = regs->r_cs & 0xffff;
2141 if (!VALID_USER_CSEL(sel))
2142 return EINVAL;
2143
2144 if (regs->r_rip >= VM_MAXUSER_ADDRESS)
2145 return EINVAL;
2146
2147 return 0;
2148 }
2149
2150 int amd64_delay_quality;
2151
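/*
 * delay_func starts out as the i8254 busy-wait; higher-quality delay
 * sources (e.g. the TSC) replace it via delay_init() as they attach.
 */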
2152 void
2153 delay_init(void(*fn)(int), int fn_quality)
2154 {
2155 if (fn_quality > amd64_delay_quality) {
2156 delay_func = fn;
2157 amd64_delay_quality = fn_quality;
2158 }
2159 }
2160
2161 void
2162 delay_fini(void (*fn)(int))
2163 {
2164 if (fn == delay_func) {
2165 delay_func = i8254_delay;
2166 amd64_delay_quality = 0;
2167 }
2168 }
2169