1 /* $NetBSD: machdep.c,v 1.367 2023/07/16 19:55:43 riastradh Exp $ */
2
3 /*
4 * Copyright (c) 1996, 1997, 1998, 2000, 2006, 2007, 2008, 2011
5 * The NetBSD Foundation, Inc.
6 * All rights reserved.
7 *
8 * This code is derived from software contributed to The NetBSD Foundation
9 * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace
10 * Simulation Facility, NASA Ames Research Center.
11 *
12 * This code is derived from software contributed to The NetBSD Foundation
13 * by Coyote Point Systems, Inc. which was written under contract to Coyote
14 * Point by Jed Davis and Devon O'Dell.
15 *
16 * Redistribution and use in source and binary forms, with or without
17 * modification, are permitted provided that the following conditions
18 * are met:
19 * 1. Redistributions of source code must retain the above copyright
20 * notice, this list of conditions and the following disclaimer.
21 * 2. Redistributions in binary form must reproduce the above copyright
22 * notice, this list of conditions and the following disclaimer in the
23 * documentation and/or other materials provided with the distribution.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
26 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
36 */
37
38 /*
39 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
40 *
41 * Permission to use, copy, modify, and distribute this software for any
42 * purpose with or without fee is hereby granted, provided that the above
43 * copyright notice and this permission notice appear in all copies.
44 *
45 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
46 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
47 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
48 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
49 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
50 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
51 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
52 */
53
54 /*
55 * Copyright (c) 2007 Manuel Bouyer.
56 *
57 * Redistribution and use in source and binary forms, with or without
58 * modification, are permitted provided that the following conditions
59 * are met:
60 * 1. Redistributions of source code must retain the above copyright
61 * notice, this list of conditions and the following disclaimer.
62 * 2. Redistributions in binary form must reproduce the above copyright
63 * notice, this list of conditions and the following disclaimer in the
64 * documentation and/or other materials provided with the distribution.
65 *
66 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
67 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
68 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
69 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
70 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
71 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
72 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
73 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
74 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
75 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
76 */
77
78 /*
79 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
80 * All rights reserved.
81 *
82 * This code is derived from software contributed to Berkeley by
83 * William Jolitz.
84 *
85 * Redistribution and use in source and binary forms, with or without
86 * modification, are permitted provided that the following conditions
87 * are met:
88 * 1. Redistributions of source code must retain the above copyright
89 * notice, this list of conditions and the following disclaimer.
90 * 2. Redistributions in binary form must reproduce the above copyright
91 * notice, this list of conditions and the following disclaimer in the
92 * documentation and/or other materials provided with the distribution.
93 * 3. Neither the name of the University nor the names of its contributors
94 * may be used to endorse or promote products derived from this software
95 * without specific prior written permission.
96 *
97 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
98 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
99 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
100 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
101 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
102 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
103 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
104 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
105 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
106 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
107 * SUCH DAMAGE.
108 *
109 * @(#)machdep.c 7.4 (Berkeley) 6/3/91
110 */
111
112 #include <sys/cdefs.h>
113 __KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.367 2023/07/16 19:55:43 riastradh Exp $");
114
115 #include "opt_modular.h"
116 #include "opt_user_ldt.h"
117 #include "opt_ddb.h"
118 #include "opt_kgdb.h"
119 #include "opt_cpureset_delay.h"
120 #include "opt_mtrr.h"
121 #include "opt_realmem.h"
122 #include "opt_xen.h"
123 #include "opt_svs.h"
124 #include "opt_kaslr.h"
125 #ifndef XENPV
126 #include "opt_physmem.h"
127 #endif
128 #include "isa.h"
129 #include "pci.h"
130
131 #include <sys/param.h>
132 #include <sys/systm.h>
133 #include <sys/signal.h>
134 #include <sys/signalvar.h>
135 #include <sys/kernel.h>
136 #include <sys/cpu.h>
137 #include <sys/exec.h>
138 #include <sys/exec_aout.h> /* for MID_* */
139 #include <sys/reboot.h>
140 #include <sys/conf.h>
141 #include <sys/msgbuf.h>
142 #include <sys/mount.h>
143 #include <sys/core.h>
144 #include <sys/kcore.h>
145 #include <sys/ucontext.h>
146 #include <machine/kcore.h>
147 #include <sys/ras.h>
148 #include <sys/syscallargs.h>
149 #include <sys/ksyms.h>
150 #include <sys/device.h>
151 #include <sys/lwp.h>
152 #include <sys/proc.h>
153 #include <sys/asan.h>
154 #include <sys/csan.h>
155 #include <sys/msan.h>
156 #include <sys/module.h>
157 #include <sys/timevar.h>
158
159 #ifdef KGDB
160 #include <sys/kgdb.h>
161 #endif
162
163 #include <lib/libkern/entpool.h> /* XXX */
164
165 #include <dev/cons.h>
166 #include <dev/mm.h>
167
168 #include <uvm/uvm.h>
169 #include <uvm/uvm_page.h>
170
171 #include <sys/sysctl.h>
172
173 #include <machine/cpu.h>
174 #include <machine/cpu_rng.h>
175 #include <machine/cpufunc.h>
176 #include <machine/gdt.h>
177 #include <machine/intr.h>
178 #include <machine/pio.h>
179 #include <machine/psl.h>
180 #include <machine/reg.h>
181 #include <machine/specialreg.h>
182 #include <machine/bootinfo.h>
183 #include <x86/fpu.h>
184 #include <x86/dbregs.h>
185 #include <machine/mtrr.h>
186 #include <machine/mpbiosvar.h>
187 #include <machine/pmap_private.h>
188
189 #include <x86/bootspace.h>
190 #include <x86/cputypes.h>
191 #include <x86/cpuvar.h>
192 #include <x86/machdep.h>
193 #include <x86/x86/tsc.h>
194
195 #include <dev/isa/isareg.h>
196 #include <machine/isa_machdep.h>
197 #include <dev/ic/i8042reg.h>
198
199 #ifdef XEN
200 #include <xen/xen.h>
201 #include <xen/hypervisor.h>
202 #include <xen/evtchn.h>
203 #include <xen/include/public/version.h>
204 #include <xen/include/public/vcpu.h>
205 #endif /* XEN */
206
207 #include <ddb/db_active.h>
208
209 #ifdef DDB
210 #include <machine/db_machdep.h>
211 #include <ddb/db_extern.h>
212 #include <ddb/db_output.h>
213 #include <ddb/db_interface.h>
214 #endif
215
216 #include "acpica.h"
217
218 #if NACPICA > 0
219 #include <dev/acpi/acpivar.h>
220 #define ACPI_MACHDEP_PRIVATE
221 #include <machine/acpi_machdep.h>
222 #else
223 #include <machine/i82489var.h>
224 #endif
225
226 #include "isa.h"
227 #include "isadma.h"
228 #include "ksyms.h"
229
230 /* the following is used externally (sysctl_hw) */
231 char machine[] = "amd64"; /* CPU "architecture" */
232 char machine_arch[] = "x86_64"; /* machine == machine_arch */
233
234 #ifdef CPURESET_DELAY
235 int cpureset_delay = CPURESET_DELAY;
236 #else
237 int cpureset_delay = 2000; /* default to 2s */
238 #endif
239
240 int cpu_class = CPUCLASS_686;
241
242 #ifdef MTRR
243 const struct mtrr_funcs *mtrr_funcs;
244 #endif
245
246 int cpu_class;
247 int use_pae;
248
249 #ifndef NO_SPARSE_DUMP
250 int sparse_dump = 1;
251
252 paddr_t max_paddr = 0;
253 unsigned char *sparse_dump_physmap;
254 #endif
255
256 char *dump_headerbuf, *dump_headerbuf_ptr;
257 #define dump_headerbuf_size PAGE_SIZE
258 #define dump_headerbuf_end (dump_headerbuf + dump_headerbuf_size)
259 #define dump_headerbuf_avail (dump_headerbuf_end - dump_headerbuf_ptr)
260 daddr_t dump_header_blkno;
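/*
 * A rough sketch of how the staging buffer above is consumed (assuming
 * PAGE_SIZE is 4096): dump_headerbuf spans one page, dump_headerbuf_ptr
 * advances as header bytes are added, and dump_headerbuf_avail is what
 * remains, e.g. 4096 - 100 = 3996 bytes once 100 bytes are buffered.
 * When the buffer fills, it is flushed to the dump device and the
 * pointer rewinds to the start.
 */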
261
262 size_t dump_nmemsegs;
263 size_t dump_npages;
264 size_t dump_header_size;
265 size_t dump_totalbytesleft;
266
267 vaddr_t idt_vaddr;
268 paddr_t idt_paddr;
269 vaddr_t gdt_vaddr;
270 paddr_t gdt_paddr;
271 vaddr_t ldt_vaddr;
272 paddr_t ldt_paddr;
273
274 static struct vm_map module_map_store;
275 extern struct bootspace bootspace;
276 extern struct slotspace slotspace;
277
278 vaddr_t vm_min_kernel_address __read_mostly = VM_MIN_KERNEL_ADDRESS_DEFAULT;
279 vaddr_t vm_max_kernel_address __read_mostly = VM_MAX_KERNEL_ADDRESS_DEFAULT;
280 pd_entry_t *pte_base __read_mostly;
281
282 struct vm_map *phys_map = NULL;
283
284 extern paddr_t lowmem_rsvd;
285 extern paddr_t avail_start, avail_end;
286 #ifdef XENPV
287 extern paddr_t pmap_pa_start, pmap_pa_end;
288 #endif
289
290 struct nmistore {
291 uint64_t cr3;
292 uint64_t scratch;
293 } __packed;
294
295 /*
296 * Size of memory segments, before any memory is stolen.
297 */
298 phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX];
299 int mem_cluster_cnt;
300
301 int cpu_dump(void);
302 int cpu_dumpsize(void);
303 u_long cpu_dump_mempagecnt(void);
304 void dodumpsys(void);
305 void dumpsys(void);
306
307 static void x86_64_proc0_pcb_ldt_init(void);
308
309 void dump_misc_init(void);
310 void dump_seg_prep(void);
311 int dump_seg_iter(int (*)(paddr_t, paddr_t));
312
313 #ifndef NO_SPARSE_DUMP
314 void sparse_dump_reset(void);
315 void sparse_dump_mark(void);
316 void cpu_dump_prep_sparse(void);
317 #endif
318
319 void dump_header_start(void);
320 int dump_header_flush(void);
321 int dump_header_addbytes(const void*, size_t);
322 int dump_header_addseg(paddr_t, paddr_t);
323 int dump_header_finish(void);
324
325 int dump_seg_count_range(paddr_t, paddr_t);
326 int dumpsys_seg(paddr_t, paddr_t);
327
328 void init_bootspace(void);
329 void init_slotspace(void);
330 void init_x86_64(paddr_t);
331
332 /*
333 * Machine-dependent startup code
334 */
335 void
336 cpu_startup(void)
337 {
338 int x, y;
339 vaddr_t minaddr, maxaddr;
340 psize_t sz;
341
342 /*
343 * For console drivers that require uvm and pmap to be initialized,
344 * we'll give them one more chance here...
345 */
346 consinit();
347
348 /*
349 * Initialize error message buffer (at end of core).
350 */
351 if (msgbuf_p_cnt == 0)
352 panic("msgbuf paddr map has not been set up");
353 for (x = 0, sz = 0; x < msgbuf_p_cnt; sz += msgbuf_p_seg[x++].sz)
354 continue;
355
356 msgbuf_vaddr = uvm_km_alloc(kernel_map, sz, 0, UVM_KMF_VAONLY);
357 if (msgbuf_vaddr == 0)
358 panic("failed to valloc msgbuf_vaddr");
359
360 for (y = 0, sz = 0; y < msgbuf_p_cnt; y++) {
361 for (x = 0; x < btoc(msgbuf_p_seg[y].sz); x++, sz += PAGE_SIZE)
362 pmap_kenter_pa((vaddr_t)msgbuf_vaddr + sz,
363 msgbuf_p_seg[y].paddr + x * PAGE_SIZE,
364 VM_PROT_READ|VM_PROT_WRITE, 0);
365 }
366
367 pmap_update(pmap_kernel());
368
369 initmsgbuf((void *)msgbuf_vaddr, round_page(sz));
370
371 minaddr = 0;
372
373 /*
374 * Allocate a submap for physio.
375 */
376 phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
377 VM_PHYS_SIZE, 0, false, NULL);
378
379 /*
380 * Create the module map.
381 *
382 * The kernel uses RIP-relative addressing with a maximum offset of
383 * 2GB. Because of that, we can't put the kernel modules in kernel_map
384 * (like i386 does), since kernel_map is too far away in memory from
385 * the kernel sections. So we have to create a special module_map.
386 *
387 * The module map is taken as what is left of the bootstrap memory
388 * created in locore/prekern.
389 */
390 uvm_map_setup(&module_map_store, bootspace.smodule,
391 bootspace.emodule, 0);
392 module_map_store.pmap = pmap_kernel();
393 module_map = &module_map_store;
394
395 /* Say hello. */
396 banner();
397
398 #if NISA > 0 || NPCI > 0
399 /* Safe for i/o port / memory space allocation to use malloc now. */
400 x86_bus_space_mallocok();
401 #endif
402
403 #ifdef __HAVE_PCPU_AREA
404 cpu_pcpuarea_init(&cpu_info_primary);
405 #endif
406 gdt_init();
407 x86_64_proc0_pcb_ldt_init();
408
409 cpu_init_tss(&cpu_info_primary);
410 #if !defined(XENPV)
411 ltr(cpu_info_primary.ci_tss_sel);
412 #endif
413
414 x86_startup();
415 }
416
417 #ifdef XENPV
418 /* used in assembly */
419 void hypervisor_callback(void);
420 void failsafe_callback(void);
421 void x86_64_switch_context(struct pcb *);
422 void x86_64_tls_switch(struct lwp *);
423
424 void
425 x86_64_switch_context(struct pcb *new)
426 {
427 HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), new->pcb_rsp0);
428 struct physdev_set_iopl set_iopl;
429 set_iopl.iopl = new->pcb_iopl;
430 HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
431 }
432
433 void
434 x86_64_tls_switch(struct lwp *l)
435 {
436 struct cpu_info *ci = curcpu();
437 struct pcb *pcb = lwp_getpcb(l);
438 struct trapframe *tf = l->l_md.md_regs;
439 uint64_t zero = 0;
440
441 /*
442 * Raise the IPL to IPL_HIGH. XXX Still needed?
443 */
444 (void)splhigh();
445
446 /* Update segment registers */
447 if (pcb->pcb_flags & PCB_COMPAT32) {
448 update_descriptor(&ci->ci_gdt[GUFS_SEL], &pcb->pcb_fs);
449 update_descriptor(&ci->ci_gdt[GUGS_SEL], &pcb->pcb_gs);
450 setds(GSEL(GUDATA32_SEL, SEL_UPL));
451 setes(GSEL(GUDATA32_SEL, SEL_UPL));
452 setfs(GSEL(GUDATA32_SEL, SEL_UPL));
453 HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, tf->tf_gs);
454 } else {
455 update_descriptor(&ci->ci_gdt[GUFS_SEL], &zero);
456 update_descriptor(&ci->ci_gdt[GUGS_SEL], &zero);
457 setds(GSEL(GUDATA_SEL, SEL_UPL));
458 setes(GSEL(GUDATA_SEL, SEL_UPL));
459 setfs(0);
460 HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, 0);
461 HYPERVISOR_set_segment_base(SEGBASE_FS, pcb->pcb_fs);
462 HYPERVISOR_set_segment_base(SEGBASE_GS_USER, pcb->pcb_gs);
463 }
464 }
465 #endif /* XENPV */
466
467 /*
468 * Set up proc0's PCB and LDT.
469 */
470 static void
471 x86_64_proc0_pcb_ldt_init(void)
472 {
473 struct lwp *l = &lwp0;
474 struct pcb *pcb = lwp_getpcb(l);
475
476 pcb->pcb_flags = 0;
477 pcb->pcb_fs = 0;
478 pcb->pcb_gs = 0;
479 pcb->pcb_rsp0 = (uvm_lwp_getuarea(l) + USPACE - 16) & ~0xf;
480 pcb->pcb_iopl = IOPL_KPL;
481 pcb->pcb_dbregs = NULL;
482 pcb->pcb_cr0 = rcr0() & ~CR0_TS;
483 l->l_md.md_regs = (struct trapframe *)pcb->pcb_rsp0 - 1;
484
485 #if !defined(XENPV)
486 lldt(GSYSSEL(GLDT_SEL, SEL_KPL));
487 #else
488 xen_set_ldt((vaddr_t)ldtstore, LDT_SIZE >> 3);
489 /* Reset TS bit and set kernel stack for interrupt handlers */
490 HYPERVISOR_fpu_taskswitch(1);
491 HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), pcb->pcb_rsp0);
492 struct physdev_set_iopl set_iopl;
493 set_iopl.iopl = pcb->pcb_iopl;
494 HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
495 #endif
496 }
497
498 /*
499 * Set up TSS and I/O bitmap.
500 */
501 void
502 cpu_init_tss(struct cpu_info *ci)
503 {
504 #ifdef __HAVE_PCPU_AREA
505 const cpuid_t cid = cpu_index(ci);
506 #endif
507 struct cpu_tss *cputss;
508 struct nmistore *store;
509 uintptr_t p;
510
511 #ifdef __HAVE_PCPU_AREA
512 cputss = (struct cpu_tss *)&pcpuarea->ent[cid].tss;
513 #else
514 cputss = (struct cpu_tss *)uvm_km_alloc(kernel_map,
515 sizeof(struct cpu_tss), 0, UVM_KMF_WIRED|UVM_KMF_ZERO);
516 #endif
517
518 cputss->tss.tss_iobase = IOMAP_INVALOFF << 16;
519
520 /* DDB stack */
521 #ifdef __HAVE_PCPU_AREA
522 p = (vaddr_t)&pcpuarea->ent[cid].ist0;
523 #else
524 p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED|UVM_KMF_ZERO);
525 #endif
526 cputss->tss.tss_ist[0] = p + PAGE_SIZE - 16;
527
528 /* double fault */
529 #ifdef __HAVE_PCPU_AREA
530 p = (vaddr_t)&pcpuarea->ent[cid].ist1;
531 #else
532 p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED|UVM_KMF_ZERO);
533 #endif
534 cputss->tss.tss_ist[1] = p + PAGE_SIZE - 16;
535
536 /* NMI - store a structure at the top of the stack */
537 #ifdef __HAVE_PCPU_AREA
538 p = (vaddr_t)&pcpuarea->ent[cid].ist2;
539 #else
540 p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED|UVM_KMF_ZERO);
541 #endif
542 cputss->tss.tss_ist[2] = p + PAGE_SIZE - sizeof(struct nmistore);
543 store = (struct nmistore *)(p + PAGE_SIZE - sizeof(struct nmistore));
544 store->cr3 = pmap_pdirpa(pmap_kernel(), 0);
545
546 /* DB */
547 #ifdef __HAVE_PCPU_AREA
548 p = (vaddr_t)&pcpuarea->ent[cid].ist3;
549 #else
550 p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED|UVM_KMF_ZERO);
551 #endif
552 cputss->tss.tss_ist[3] = p + PAGE_SIZE - 16;
553
554 ci->ci_tss = cputss;
555 ci->ci_tss_sel = tss_alloc(&cputss->tss);
556 }
557
558 void
559 buildcontext(struct lwp *l, void *catcher, void *f)
560 {
561 struct trapframe *tf = l->l_md.md_regs;
562
563 tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
564 tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
565 tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
566 tf->tf_gs = GSEL(GUDATA_SEL, SEL_UPL);
567
568 tf->tf_rip = (uint64_t)catcher;
569 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
570 tf->tf_rflags &= ~PSL_CLEARSIG;
571 tf->tf_rsp = (uint64_t)f;
572 tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
573
574 /* Ensure FP state is sane */
575 fpu_sigreset(l);
576 }
577
578 void
579 sendsig_sigcontext(const ksiginfo_t *ksi, const sigset_t *mask)
580 {
581
582 printf("sendsig_sigcontext: illegal\n");
583 sigexit(curlwp, SIGILL);
584 }
585
586 void
587 sendsig_siginfo(const ksiginfo_t *ksi, const sigset_t *mask)
588 {
589 struct lwp *l = curlwp;
590 struct proc *p = l->l_proc;
591 struct sigacts *ps = p->p_sigacts;
592 int onstack, error;
593 int sig = ksi->ksi_signo;
594 struct sigframe_siginfo *fp, frame;
595 sig_t catcher = SIGACTION(p, sig).sa_handler;
596 struct trapframe *tf = l->l_md.md_regs;
597 char *sp;
598
599 KASSERT(mutex_owned(p->p_lock));
600
601 /* Do we need to jump onto the signal stack? */
602 onstack =
603 (l->l_sigstk.ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0 &&
604 (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0;
605
606 /* Allocate space for the signal handler context. */
607 if (onstack)
608 sp = ((char *)l->l_sigstk.ss_sp + l->l_sigstk.ss_size);
609 else
610 		/* AMD64 ABI 128-byte "red zone". */
611 sp = (char *)tf->tf_rsp - 128;
612
613 sp -= sizeof(struct sigframe_siginfo);
614 /* Round down the stackpointer to a multiple of 16 for the ABI. */
615 fp = (struct sigframe_siginfo *)(((unsigned long)sp & ~15) - 8);
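	/*
	 * Illustrative arithmetic for the adjustment above: masking with
	 * ~15 yields a 16-byte aligned address, and the extra -8 leaves
	 * %rsp congruent to 8 mod 16 when the handler starts, matching
	 * the alignment state the ABI guarantees right after a call
	 * instruction.  E.g. sp = 0x7f7fff001234 -> & ~15 =
	 * 0x7f7fff001230 -> - 8 = 0x7f7fff001228.
	 */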
616
617 memset(&frame, 0, sizeof(frame));
618 frame.sf_ra = (uint64_t)ps->sa_sigdesc[sig].sd_tramp;
619 frame.sf_si._info = ksi->ksi_info;
620 frame.sf_uc.uc_flags = _UC_SIGMASK;
621 frame.sf_uc.uc_sigmask = *mask;
622 frame.sf_uc.uc_link = l->l_ctxlink;
623 frame.sf_uc.uc_flags |= (l->l_sigstk.ss_flags & SS_ONSTACK)
624 ? _UC_SETSTACK : _UC_CLRSTACK;
625 sendsig_reset(l, sig);
626
627 mutex_exit(p->p_lock);
628 cpu_getmcontext(l, &frame.sf_uc.uc_mcontext, &frame.sf_uc.uc_flags);
629 /* Copyout all the fp regs, the signal handler might expect them. */
630 error = copyout(&frame, fp, sizeof frame);
631 mutex_enter(p->p_lock);
632
633 if (error != 0) {
634 /*
635 * Process has trashed its stack; give it an illegal
636 * instruction to halt it in its tracks.
637 */
638 sigexit(l, SIGILL);
639 /* NOTREACHED */
640 }
641
642 buildcontext(l, catcher, fp);
643
644 tf->tf_rdi = sig;
645 tf->tf_rsi = (uint64_t)&fp->sf_si;
646 tf->tf_rdx = tf->tf_r15 = (uint64_t)&fp->sf_uc;
647
648 /* Remember that we're now on the signal stack. */
649 if (onstack)
650 l->l_sigstk.ss_flags |= SS_ONSTACK;
651
652 if ((vaddr_t)catcher >= VM_MAXUSER_ADDRESS) {
653 /*
654 		 * The process has given an invalid address for the
655 		 * handler.  Stop it, but do not do so any earlier, so that
656 		 * we can return the right info to userland (or in the core dump).
657 */
658 sigexit(l, SIGILL);
659 /* NOTREACHED */
660 }
661 }
662
663 struct pcb dumppcb;
664
665 void
666 cpu_reboot(int howto, char *bootstr)
667 {
668 static bool syncdone = false;
669 int s = IPL_NONE;
670 __USE(s); /* ugly otherwise */
671
672 if (cold) {
673 howto |= RB_HALT;
674 goto haltsys;
675 }
676
677 boothowto = howto;
678
679 /* i386 maybe_dump() */
680
681 /*
682 * If we've panic'd, don't make the situation potentially
683 * worse by syncing or unmounting the file systems.
684 */
685 if ((howto & RB_NOSYNC) == 0 && panicstr == NULL) {
686 if (!syncdone) {
687 syncdone = true;
688 /* XXX used to force unmount as well, here */
689 vfs_sync_all(curlwp);
690 /*
691 * If we've been adjusting the clock, the todr
692 * will be out of synch; adjust it now.
693 *
694 * XXX used to do this after unmounting all
695 * filesystems with vfs_shutdown().
696 */
697 if (time_adjusted != 0)
698 resettodr();
699 }
700
701 while (vfs_unmountall1(curlwp, false, false) ||
702 config_detach_all(boothowto) ||
703 vfs_unmount_forceone(curlwp))
704 ; /* do nothing */
705 } else {
706 if (!db_active)
707 suspendsched();
708 }
709
710 pmf_system_shutdown(boothowto);
711
712 /* Disable interrupts. */
713 s = splhigh();
714
715 /* Do a dump if requested. */
716 if ((howto & (RB_DUMP | RB_HALT)) == RB_DUMP)
717 dumpsys();
718
719 haltsys:
720 doshutdownhooks();
721
722 if ((howto & RB_POWERDOWN) == RB_POWERDOWN) {
723 #if NACPICA > 0
724 if (s != IPL_NONE)
725 splx(s);
726
727 acpi_enter_sleep_state(ACPI_STATE_S5);
728 #endif
729 #ifdef XEN
730 if (vm_guest == VM_GUEST_XENPV ||
731 vm_guest == VM_GUEST_XENPVH ||
732 vm_guest == VM_GUEST_XENPVHVM)
733 HYPERVISOR_shutdown();
734 #endif /* XEN */
735 }
736
737 cpu_broadcast_halt();
738
739 if (howto & RB_HALT) {
740 #if NACPICA > 0
741 acpi_disable();
742 #endif
743
744 printf("\n");
745 printf("The operating system has halted.\n");
746 printf("Please press any key to reboot.\n\n");
747 cnpollc(1); /* for proper keyboard command handling */
748 if (cngetc() == 0) {
749 /* no console attached, so just hlt */
750 printf("No keyboard - cannot reboot after all.\n");
751 for(;;) {
752 x86_hlt();
753 }
754 }
755 cnpollc(0);
756 }
757
758 printf("rebooting...\n");
759 if (cpureset_delay > 0)
760 delay(cpureset_delay * 1000);
761 cpu_reset();
762 for(;;) ;
763 /*NOTREACHED*/
764 }
765
766 /*
767 * XXXfvdl share dumpcode.
768 */
769
770 /*
771 * Perform assorted dump-related initialization tasks. Assumes that
772 * the maximum physical memory address will not increase afterwards.
773 */
774 void
775 dump_misc_init(void)
776 {
777 #ifndef NO_SPARSE_DUMP
778 int i;
779 #endif
780
781 if (dump_headerbuf != NULL)
782 return; /* already called */
783
784 #ifndef NO_SPARSE_DUMP
785 for (i = 0; i < mem_cluster_cnt; ++i) {
786 paddr_t top = mem_clusters[i].start + mem_clusters[i].size;
787 if (max_paddr < top)
788 max_paddr = top;
789 }
790 #ifdef DEBUG
791 printf("dump_misc_init: max_paddr = 0x%lx\n",
792 (unsigned long)max_paddr);
793 #endif
794 if (max_paddr == 0) {
795 printf("Your machine does not initialize mem_clusters; "
796 "sparse_dumps disabled\n");
797 sparse_dump = 0;
798 } else {
799 sparse_dump_physmap = (void *)uvm_km_alloc(kernel_map,
800 roundup(max_paddr / (PAGE_SIZE * NBBY), PAGE_SIZE),
801 PAGE_SIZE, UVM_KMF_WIRED|UVM_KMF_ZERO);
802 }
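	/*
	 * Sizing sketch for the bitmap above (assuming PAGE_SIZE 4096
	 * and NBBY 8): one bit per physical page, so a max_paddr of
	 * 16 GiB needs 16 GiB / 4096 / 8 = 512 KiB, rounded up to a
	 * whole number of pages before being allocated.
	 */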
803 #endif
804 dump_headerbuf = (void *)uvm_km_alloc(kernel_map,
805 dump_headerbuf_size,
806 PAGE_SIZE, UVM_KMF_WIRED|UVM_KMF_ZERO);
807 /* XXXjld should check for failure here, disable dumps if so. */
808 }
809
810 #ifndef NO_SPARSE_DUMP
811 /*
812 * Clear the set of pages to include in a sparse dump.
813 */
814 void
815 sparse_dump_reset(void)
816 {
817 memset(sparse_dump_physmap, 0,
818 roundup(max_paddr / (PAGE_SIZE * NBBY), PAGE_SIZE));
819 }
820
821 /*
822 * Include or exclude pages in a sparse dump.
823 */
824 void
825 sparse_dump_mark(void)
826 {
827 paddr_t p, pstart, pend;
828 struct vm_page *pg;
829 int i;
830 uvm_physseg_t upm;
831
832 /*
833 * Mark all memory pages, then unmark pages that are uninteresting.
834 	 * Dereferencing pg->uobject might crash again if another CPU
835 * frees the object out from under us, but we can't lock anything
836 * so it's a risk we have to take.
837 */
838
839 for (i = 0; i < mem_cluster_cnt; ++i) {
840 pstart = mem_clusters[i].start / PAGE_SIZE;
841 pend = pstart + mem_clusters[i].size / PAGE_SIZE;
842
843 for (p = pstart; p < pend; p++) {
844 setbit(sparse_dump_physmap, p);
845 }
846 }
847 for (upm = uvm_physseg_get_first();
848 uvm_physseg_valid_p(upm);
849 upm = uvm_physseg_get_next(upm)) {
850 paddr_t pfn;
851
852 /*
853 * We assume that seg->start to seg->end are
854 * uvm_page_physload()ed
855 */
856 for (pfn = uvm_physseg_get_start(upm);
857 pfn < uvm_physseg_get_end(upm);
858 pfn++) {
859 pg = PHYS_TO_VM_PAGE(ptoa(pfn));
860
861 if (pg->uanon || (pg->flags & PG_FREE) ||
862 (pg->uobject && pg->uobject->pgops)) {
863 p = VM_PAGE_TO_PHYS(pg) / PAGE_SIZE;
864 clrbit(sparse_dump_physmap, p);
865 }
866 }
867 }
868 }
869
870 /*
871 * Machine-dependently decides on the contents of a sparse dump, using
872 * the above.
873 */
874 void
875 cpu_dump_prep_sparse(void)
876 {
877 sparse_dump_reset();
878 /* XXX could the alternate recursive page table be skipped? */
879 sparse_dump_mark();
880 /* Memory for I/O buffers could be unmarked here, for example. */
881 /* The kernel text could also be unmarked, but gdb would be upset. */
882 }
883 #endif
884
885 /*
886 * Abstractly iterate over the collection of memory segments to be
887 * dumped; the callback lacks the customary environment-pointer
888 * argument because none of the current users really need one.
889 *
890 * To be used only after dump_seg_prep is called to set things up.
891 */
892 int
893 dump_seg_iter(int (*callback)(paddr_t, paddr_t))
894 {
895 int error, i;
896
897 #define CALLBACK(start,size) do { \
898 error = callback(start,size); \
899 if (error) \
900 return error; \
901 } while(0)
902
903 for (i = 0; i < mem_cluster_cnt; ++i) {
904 #ifndef NO_SPARSE_DUMP
905 /*
906 * The bitmap is scanned within each memory segment,
907 * rather than over its entire domain, in case any
908 * pages outside of the memory proper have been mapped
909 * into kva; they might be devices that wouldn't
910 * appreciate being arbitrarily read, and including
911 * them could also break the assumption that a sparse
912 * dump will always be smaller than a full one.
913 */
914 if (sparse_dump && sparse_dump_physmap) {
915 paddr_t p, sp_start, sp_end;
916 int lastset;
917
918 sp_start = mem_clusters[i].start;
919 sp_end = sp_start + mem_clusters[i].size;
920 sp_start = rounddown(sp_start, PAGE_SIZE); /* unnecessary? */
921 lastset = 0;
922 for (p = sp_start; p < sp_end; p += PAGE_SIZE) {
923 int thisset = isset(sparse_dump_physmap,
924 p/PAGE_SIZE);
925
926 if (!lastset && thisset)
927 sp_start = p;
928 if (lastset && !thisset)
929 CALLBACK(sp_start, p - sp_start);
930 lastset = thisset;
931 }
932 if (lastset)
933 CALLBACK(sp_start, p - sp_start);
934 } else
935 #endif
936 CALLBACK(mem_clusters[i].start, mem_clusters[i].size);
937 }
938 return 0;
939 #undef CALLBACK
940 }
941
942 /*
943 * Prepare for an impending core dump: decide what's being dumped and
944 * how much space it will take up.
945 */
946 void
947 dump_seg_prep(void)
948 {
949 #ifndef NO_SPARSE_DUMP
950 if (sparse_dump && sparse_dump_physmap)
951 cpu_dump_prep_sparse();
952 #endif
953
954 dump_nmemsegs = 0;
955 dump_npages = 0;
956 dump_seg_iter(dump_seg_count_range);
957
958 dump_header_size = ALIGN(sizeof(kcore_seg_t)) +
959 ALIGN(sizeof(cpu_kcore_hdr_t)) +
960 ALIGN(dump_nmemsegs * sizeof(phys_ram_seg_t));
961 dump_header_size = roundup(dump_header_size, dbtob(1));
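	/*
	 * Rough size example for the header computed above (assuming
	 * 512-byte disk blocks): the two fixed headers are well under a
	 * block, so the total is dominated by the segment array; a
	 * sparse dump emitting 2000 phys_ram_seg_t entries of 16 bytes
	 * each needs about 32 KiB, padded to the next block boundary.
	 */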
962
963 /*
964 * savecore(8) will read this to decide how many pages to
965 * copy, and cpu_dumpconf has already used the pessimistic
966 * value to set dumplo, so it's time to tell the truth.
967 */
968 dumpsize = dump_npages; /* XXX could these just be one variable? */
969 }
970
971 int
972 dump_seg_count_range(paddr_t start, paddr_t size)
973 {
974 ++dump_nmemsegs;
975 dump_npages += size / PAGE_SIZE;
976 return 0;
977 }
978
979 /*
980 * A sparse dump's header may be rather large, due to the number of
981 * "segments" emitted. These routines manage a simple output buffer,
982 * so that the header can be written to disk incrementally.
983 */
984 void
985 dump_header_start(void)
986 {
987 dump_headerbuf_ptr = dump_headerbuf;
988 dump_header_blkno = dumplo;
989 }
990
991 int
992 dump_header_flush(void)
993 {
994 const struct bdevsw *bdev;
995 size_t to_write;
996 int error;
997
998 bdev = bdevsw_lookup(dumpdev);
999 to_write = roundup(dump_headerbuf_ptr - dump_headerbuf, dbtob(1));
1000 error = bdev->d_dump(dumpdev, dump_header_blkno,
1001 dump_headerbuf, to_write);
1002 dump_header_blkno += btodb(to_write);
1003 dump_headerbuf_ptr = dump_headerbuf;
1004 return error;
1005 }
1006
1007 int
1008 dump_header_addbytes(const void* vptr, size_t n)
1009 {
1010 const char* ptr = vptr;
1011 int error;
1012
1013 while (n > dump_headerbuf_avail) {
1014 memcpy(dump_headerbuf_ptr, ptr, dump_headerbuf_avail);
1015 ptr += dump_headerbuf_avail;
1016 n -= dump_headerbuf_avail;
1017 dump_headerbuf_ptr = dump_headerbuf_end;
1018 error = dump_header_flush();
1019 if (error)
1020 return error;
1021 }
1022 memcpy(dump_headerbuf_ptr, ptr, n);
1023 dump_headerbuf_ptr += n;
1024
1025 return 0;
1026 }
1027
1028 int
1029 dump_header_addseg(paddr_t start, paddr_t size)
1030 {
1031 phys_ram_seg_t seg = { start, size };
1032
1033 return dump_header_addbytes(&seg, sizeof(seg));
1034 }
1035
1036 int
1037 dump_header_finish(void)
1038 {
1039 memset(dump_headerbuf_ptr, 0, dump_headerbuf_avail);
1040 return dump_header_flush();
1041 }
1042
1043
1044 /*
1045 * These variables are needed by /sbin/savecore
1046 */
1047 uint32_t dumpmag = 0x8fca0101; /* magic number */
1048 int dumpsize = 0; /* pages */
1049 long dumplo = 0; /* blocks */
1050
1051 /*
1052 * cpu_dumpsize: calculate size of machine-dependent kernel core dump headers
1053 * for a full (non-sparse) dump.
1054 */
1055 int
1056 cpu_dumpsize(void)
1057 {
1058 int size;
1059
1060 size = ALIGN(sizeof(kcore_seg_t)) + ALIGN(sizeof(cpu_kcore_hdr_t)) +
1061 ALIGN(mem_cluster_cnt * sizeof(phys_ram_seg_t));
1062 if (roundup(size, dbtob(1)) != dbtob(1))
1063 return (-1);
1064
1065 return (1);
1066 }
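/*
 * The check above effectively requires the full-dump header to fit in a
 * single disk block: assuming dbtob(1) is 512, the two fixed headers plus
 * mem_cluster_cnt descriptors of 16 bytes each must stay under 512 bytes,
 * otherwise -1 is returned and cpu_dumpconf() gives up (the sparse path
 * builds its larger header incrementally instead).
 */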
1067
1068 /*
1069 * cpu_dump_mempagecnt: calculate the size of RAM (in pages) to be dumped
1070 * for a full (non-sparse) dump.
1071 */
1072 u_long
1073 cpu_dump_mempagecnt(void)
1074 {
1075 u_long i, n;
1076
1077 n = 0;
1078 for (i = 0; i < mem_cluster_cnt; i++)
1079 n += atop(mem_clusters[i].size);
1080 return (n);
1081 }
1082
1083 /*
1084 * cpu_dump: dump the machine-dependent kernel core dump headers.
1085 */
1086 int
1087 cpu_dump(void)
1088 {
1089 kcore_seg_t seg;
1090 cpu_kcore_hdr_t cpuhdr;
1091 const struct bdevsw *bdev;
1092
1093 bdev = bdevsw_lookup(dumpdev);
1094 if (bdev == NULL)
1095 return (ENXIO);
1096
1097 /*
1098 * Generate a segment header.
1099 */
1100 CORE_SETMAGIC(seg, KCORE_MAGIC, MID_MACHINE, CORE_CPU);
1101 seg.c_size = dump_header_size - ALIGN(sizeof(seg));
1102 (void)dump_header_addbytes(&seg, ALIGN(sizeof(seg)));
1103
1104 /*
1105 * Add the machine-dependent header info.
1106 */
1107 cpuhdr.ptdpaddr = PDPpaddr;
1108 cpuhdr.nmemsegs = dump_nmemsegs;
1109 (void)dump_header_addbytes(&cpuhdr, ALIGN(sizeof(cpuhdr)));
1110
1111 /*
1112 * Write out the memory segment descriptors.
1113 */
1114 return dump_seg_iter(dump_header_addseg);
1115 }
1116
1117 /*
1118 * Doadump comes here after turning off memory management and
1119 * getting on the dump stack, either when called above, or by
1120 * the auto-restart code.
1121 */
1122 #define BYTES_PER_DUMP PAGE_SIZE /* must be a multiple of pagesize XXX small */
1123 static vaddr_t dumpspace;
1124
1125 vaddr_t
1126 reserve_dumppages(vaddr_t p)
1127 {
1128
1129 dumpspace = p;
1130 return (p + BYTES_PER_DUMP);
1131 }
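/*
 * A note on transfer granularity: with BYTES_PER_DUMP equal to PAGE_SIZE,
 * dumpsys_seg() below maps and writes one page at a time through the
 * single page of KVA reserved here, so (assuming 4 KiB pages) a 512 MiB
 * segment takes 131072 map/write/unmap iterations.
 */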
1132
1133 int
1134 dumpsys_seg(paddr_t maddr, paddr_t bytes)
1135 {
1136 u_long i, m, n;
1137 daddr_t blkno;
1138 const struct bdevsw *bdev;
1139 int (*dump)(dev_t, daddr_t, void *, size_t);
1140 int error;
1141
1142 if (dumpdev == NODEV)
1143 return ENODEV;
1144 bdev = bdevsw_lookup(dumpdev);
1145 if (bdev == NULL || bdev->d_psize == NULL)
1146 return ENODEV;
1147
1148 dump = bdev->d_dump;
1149
1150 blkno = dump_header_blkno;
1151 for (i = 0; i < bytes; i += n, dump_totalbytesleft -= n) {
1152 /* Print out how many MBs we have left to go. */
1153 if ((dump_totalbytesleft % (1024*1024)) == 0)
1154 printf_nolog("%lu ", (unsigned long)
1155 (dump_totalbytesleft / (1024 * 1024)));
1156
1157 /* Limit size for next transfer. */
1158 n = bytes - i;
1159 if (n > BYTES_PER_DUMP)
1160 n = BYTES_PER_DUMP;
1161
1162 for (m = 0; m < n; m += NBPG)
1163 pmap_kenter_pa(dumpspace + m, maddr + m,
1164 VM_PROT_READ, 0);
1165 pmap_update(pmap_kernel());
1166
1167 error = (*dump)(dumpdev, blkno, (void *)dumpspace, n);
1168 pmap_kremove_local(dumpspace, n);
1169 if (error)
1170 return error;
1171 maddr += n;
1172 blkno += btodb(n); /* XXX? */
1173
1174 #if 0 /* XXX this doesn't work. grr. */
1175 /* operator aborting dump? */
1176 if (sget() != NULL)
1177 return EINTR;
1178 #endif
1179 }
1180 dump_header_blkno = blkno;
1181
1182 return 0;
1183 }
1184
1185 void
1186 dodumpsys(void)
1187 {
1188 const struct bdevsw *bdev;
1189 int dumpend, psize;
1190 int error;
1191
1192 if (dumpdev == NODEV)
1193 return;
1194
1195 bdev = bdevsw_lookup(dumpdev);
1196 if (bdev == NULL || bdev->d_psize == NULL)
1197 return;
1198 /*
1199 * For dumps during autoconfiguration,
1200 	 * if the dump device has already been configured...
1201 */
1202 if (dumpsize == 0)
1203 cpu_dumpconf();
1204
1205 printf("\ndumping to dev %llu,%llu (offset=%ld, size=%d):",
1206 (unsigned long long)major(dumpdev),
1207 (unsigned long long)minor(dumpdev), dumplo, dumpsize);
1208
1209 if (dumplo <= 0 || dumpsize <= 0) {
1210 printf(" not possible\n");
1211 return;
1212 }
1213
1214 psize = bdev_size(dumpdev);
1215 printf("\ndump ");
1216 if (psize == -1) {
1217 printf("area unavailable\n");
1218 return;
1219 }
1220
1221 #if 0 /* XXX this doesn't work. grr. */
1222 /* toss any characters present prior to dump */
1223 while (sget() != NULL); /*syscons and pccons differ */
1224 #endif
1225
1226 dump_seg_prep();
1227 dumpend = dumplo + btodb(dump_header_size) + ctod(dump_npages);
1228 if (dumpend > psize) {
1229 printf("failed: insufficient space (%d < %d)\n",
1230 psize, dumpend);
1231 goto failed;
1232 }
1233
1234 dump_header_start();
1235 if ((error = cpu_dump()) != 0)
1236 goto err;
1237 if ((error = dump_header_finish()) != 0)
1238 goto err;
1239
1240 if (dump_header_blkno != dumplo + btodb(dump_header_size)) {
1241 printf("BAD header size (%ld [written] != %ld [expected])\n",
1242 (long)(dump_header_blkno - dumplo),
1243 (long)btodb(dump_header_size));
1244 goto failed;
1245 }
1246
1247 dump_totalbytesleft = roundup(ptoa(dump_npages), BYTES_PER_DUMP);
1248 error = dump_seg_iter(dumpsys_seg);
1249
1250 if (error == 0 && dump_header_blkno != dumpend) {
1251 printf("BAD dump size (%ld [written] != %ld [expected])\n",
1252 (long)(dumpend - dumplo),
1253 (long)(dump_header_blkno - dumplo));
1254 goto failed;
1255 }
1256
1257 err:
1258 switch (error) {
1259
1260 case ENXIO:
1261 printf("device bad\n");
1262 break;
1263
1264 case EFAULT:
1265 printf("device not ready\n");
1266 break;
1267
1268 case EINVAL:
1269 printf("area improper\n");
1270 break;
1271
1272 case EIO:
1273 printf("i/o error\n");
1274 break;
1275
1276 case EINTR:
1277 printf("aborted from console\n");
1278 break;
1279
1280 case 0:
1281 printf("succeeded\n");
1282 break;
1283
1284 default:
1285 printf("error %d\n", error);
1286 break;
1287 }
1288 failed:
1289 printf("\n\n");
1290 delay(5000000); /* 5 seconds */
1291 }
1292
1293 /*
1294 * This is called by main to set dumplo and dumpsize.
1295 * Dumps always skip the first PAGE_SIZE of disk space
1296 * in case there might be a disk label stored there.
1297 * If there is extra space, put dump at the end to
1298 * reduce the chance that swapping trashes it.
1299 *
1300 	 * Sparse dumps can't be placed as close to the end as possible, because
1301 * savecore(8) has to know where to start reading in the dump device
1302 * before it has access to any of the crashed system's state.
1303 *
1304 * Note also that a sparse dump will never be larger than a full one:
1305 * in order to add a phys_ram_seg_t to the header, at least one page
1306 * must be removed.
1307 */
1308 void
1309 cpu_dumpconf(void)
1310 {
1311 int nblks, dumpblks; /* size of dump area */
1312
1313 if (dumpdev == NODEV)
1314 goto bad;
1315 nblks = bdev_size(dumpdev);
1316 if (nblks <= ctod(1))
1317 goto bad;
1318
1319 dumpblks = cpu_dumpsize();
1320 if (dumpblks < 0)
1321 goto bad;
1322
1323 /* dumpsize is in page units, and doesn't include headers. */
1324 dumpsize = cpu_dump_mempagecnt();
1325
1326 dumpblks += ctod(dumpsize);
1327
1328 /* If dump won't fit (incl. room for possible label), punt. */
1329 if (dumpblks > (nblks - ctod(1))) {
1330 #ifndef NO_SPARSE_DUMP
1331 /* A sparse dump might (and hopefully will) fit. */
1332 dumplo = ctod(1);
1333 #else
1334 /* But if we're not configured for that, punt. */
1335 goto bad;
1336 #endif
1337 } else {
1338 /* Put dump at end of partition */
1339 dumplo = nblks - dumpblks;
1340 }
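	/*
	 * Placement example (assuming 512-byte blocks and 4 KiB pages):
	 * for a 4 GiB dump partition nblks is 8388608 blocks; a full
	 * dump of 2 GiB of RAM needs ctod(524288) = 4194304 blocks plus
	 * a one-block header, so dumplo lands near block 4194303 and the
	 * front of the partition stays untouched for the label.
	 */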
1341
1342
1343 /* Now that we've decided this will work, init ancillary stuff. */
1344 dump_misc_init();
1345 return;
1346
1347 bad:
1348 dumpsize = 0;
1349 }
1350
1351 /*
1352 * Clear registers on exec
1353 */
1354 void
1355 setregs(struct lwp *l, struct exec_package *pack, vaddr_t stack)
1356 {
1357 struct pcb *pcb = lwp_getpcb(l);
1358 struct trapframe *tf;
1359
1360 #ifdef USER_LDT
1361 pmap_ldt_cleanup(l);
1362 #endif
1363
1364 fpu_clear(l, pack->ep_osversion >= 699002600
1365 ? __NetBSD_NPXCW__ : __NetBSD_COMPAT_NPXCW__);
1366 x86_dbregs_clear(l);
1367
1368 kpreempt_disable();
1369 pcb->pcb_flags = 0;
1370 l->l_proc->p_flag &= ~PK_32;
1371 l->l_md.md_flags = MDL_IRET;
1372 cpu_segregs64_zero(l);
1373 kpreempt_enable();
1374
1375 tf = l->l_md.md_regs;
1376 tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
1377 tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
1378 tf->tf_rdi = 0;
1379 tf->tf_rsi = 0;
1380 tf->tf_rbp = 0;
1381 tf->tf_rbx = l->l_proc->p_psstrp;
1382 tf->tf_rdx = 0;
1383 tf->tf_rcx = 0;
1384 tf->tf_rax = 0;
1385 tf->tf_rip = pack->ep_entry;
1386 tf->tf_cs = LSEL(LUCODE_SEL, SEL_UPL);
1387 tf->tf_rflags = PSL_USERSET;
1388 tf->tf_rsp = stack;
1389 tf->tf_ss = LSEL(LUDATA_SEL, SEL_UPL);
1390 }
1391
1392 /*
1393 * Initialize segments and descriptor tables
1394 */
1395 char *ldtstore;
1396 char *gdtstore;
1397
1398 void
1399 setgate(struct gate_descriptor *gd, void *func,
1400 int ist, int type, int dpl, int sel)
1401 {
1402 vaddr_t vaddr;
1403
1404 vaddr = ((vaddr_t)gd) & ~PAGE_MASK;
1405
1406 kpreempt_disable();
1407 pmap_changeprot_local(vaddr, VM_PROT_READ|VM_PROT_WRITE);
1408
1409 gd->gd_looffset = (uint64_t)func & 0xffff;
1410 gd->gd_selector = sel;
1411 gd->gd_ist = ist;
1412 gd->gd_type = type;
1413 gd->gd_dpl = dpl;
1414 gd->gd_p = 1;
1415 gd->gd_hioffset = (uint64_t)func >> 16;
1416 gd->gd_zero = 0;
1417 gd->gd_xx1 = 0;
1418 gd->gd_xx2 = 0;
1419 gd->gd_xx3 = 0;
1420
1421 pmap_changeprot_local(vaddr, VM_PROT_READ);
1422 kpreempt_enable();
1423 }
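/*
 * Offset packing example for the gate filled in above: a handler at, say,
 * 0xffffffff80123456 is split so that gd_looffset holds 0x3456 and
 * gd_hioffset holds 0xffffffff8012; the CPU reassembles the full 64-bit
 * entry point when the gate is used.
 */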
1424
1425 void
1426 unsetgate(struct gate_descriptor *gd)
1427 {
1428 vaddr_t vaddr;
1429
1430 vaddr = ((vaddr_t)gd) & ~PAGE_MASK;
1431
1432 kpreempt_disable();
1433 pmap_changeprot_local(vaddr, VM_PROT_READ|VM_PROT_WRITE);
1434
1435 memset(gd, 0, sizeof (*gd));
1436
1437 pmap_changeprot_local(vaddr, VM_PROT_READ);
1438 kpreempt_enable();
1439 }
1440
1441 void
1442 setregion(struct region_descriptor *rd, void *base, uint16_t limit)
1443 {
1444 rd->rd_limit = limit;
1445 rd->rd_base = (uint64_t)base;
1446 }
1447
1448 /*
1449 * Note that the base and limit fields are ignored in long mode.
1450 */
1451 void
1452 set_mem_segment(struct mem_segment_descriptor *sd, void *base, size_t limit,
1453 int type, int dpl, int gran, int def32, int is64)
1454 {
1455 sd->sd_lolimit = (unsigned)limit;
1456 sd->sd_lobase = (unsigned long)base;
1457 sd->sd_type = type;
1458 sd->sd_dpl = dpl;
1459 sd->sd_p = 1;
1460 sd->sd_hilimit = (unsigned)limit >> 16;
1461 sd->sd_avl = 0;
1462 sd->sd_long = is64;
1463 sd->sd_def32 = def32;
1464 sd->sd_gran = gran;
1465 sd->sd_hibase = (unsigned long)base >> 24;
1466 }
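/*
 * Field packing sketch for the descriptor above: the base is split with
 * bits 0-23 in sd_lobase and bits 24-31 in sd_hibase, and the 20-bit
 * limit with bits 0-15 in sd_lolimit and 16-19 in sd_hilimit.  With gran
 * set the limit is counted in 4 KiB units, so the 0xfffff used for the
 * GDT entries below covers the full 4 GiB 32-bit space (and, as noted
 * above, base and limit are ignored for 64-bit segments anyway).
 */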
1467
1468 void
1469 set_sys_segment(struct sys_segment_descriptor *sd, void *base, size_t limit,
1470 int type, int dpl, int gran)
1471 {
1472 memset(sd, 0, sizeof *sd);
1473 sd->sd_lolimit = (unsigned)limit;
1474 sd->sd_lobase = (uint64_t)base;
1475 sd->sd_type = type;
1476 sd->sd_dpl = dpl;
1477 sd->sd_p = 1;
1478 sd->sd_hilimit = (unsigned)limit >> 16;
1479 sd->sd_gran = gran;
1480 sd->sd_hibase = (uint64_t)base >> 24;
1481 }
1482
1483 void
1484 cpu_init_idt(struct cpu_info *ci)
1485 {
1486 struct region_descriptor region;
1487 idt_descriptor_t *idt;
1488
1489 idt = ci->ci_idtvec.iv_idt;
1490 	setregion(&region, idt, NIDT * sizeof(idt[0]) - 1);
1491 	lidt(&region);
1492 }
1493
1494 #define IDTVEC(name) __CONCAT(X, name)
1495 typedef void (vector)(void);
1496 extern vector IDTVEC(syscall);
1497 extern vector IDTVEC(syscall32);
1498 extern vector IDTVEC(osyscall);
1499 extern vector *x86_exceptions[];
1500
1501 #ifndef XENPV
1502 static void
1503 init_x86_64_ksyms(void)
1504 {
1505 #if NKSYMS || defined(DDB) || defined(MODULAR)
1506 extern int end;
1507 extern int *esym;
1508 struct btinfo_symtab *symtab;
1509 vaddr_t tssym, tesym;
1510
1511 #ifdef DDB
1512 db_machine_init();
1513 #endif
1514
1515 symtab = lookup_bootinfo(BTINFO_SYMTAB);
1516 if (symtab) {
1517 #ifdef KASLR
1518 tssym = bootspace.head.va;
1519 tesym = bootspace.head.va; /* (unused...) */
1520 #else
1521 tssym = (vaddr_t)symtab->ssym + KERNBASE;
1522 tesym = (vaddr_t)symtab->esym + KERNBASE;
1523 #endif
1524 ksyms_addsyms_elf(symtab->nsym, (void *)tssym, (void *)tesym);
1525 } else {
1526 uintptr_t endp = (uintptr_t)(void *)&end;
1527
1528 ksyms_addsyms_elf(*(long *)endp,
1529 ((long *)endp) + 1, esym);
1530 }
1531 #endif
1532 }
1533 #endif /* XENPV */
1534
1535 void __noasan
1536 init_bootspace(void)
1537 {
1538 extern char __rodata_start;
1539 extern char __data_start;
1540 extern char __kernel_end;
1541 size_t i = 0;
1542
1543 memset(&bootspace, 0, sizeof(bootspace));
1544
1545 bootspace.head.va = KERNTEXTOFF;
1546 bootspace.head.pa = KERNTEXTOFF - KERNBASE;
1547 bootspace.head.sz = 0;
1548
1549 bootspace.segs[i].type = BTSEG_TEXT;
1550 bootspace.segs[i].va = KERNTEXTOFF;
1551 bootspace.segs[i].pa = KERNTEXTOFF - KERNBASE;
1552 bootspace.segs[i].sz = (size_t)&__rodata_start - KERNTEXTOFF;
1553 i++;
1554
1555 bootspace.segs[i].type = BTSEG_RODATA;
1556 bootspace.segs[i].va = (vaddr_t)&__rodata_start;
1557 bootspace.segs[i].pa = (paddr_t)&__rodata_start - KERNBASE;
1558 bootspace.segs[i].sz = (size_t)&__data_start - (size_t)&__rodata_start;
1559 i++;
1560
1561 bootspace.segs[i].type = BTSEG_DATA;
1562 bootspace.segs[i].va = (vaddr_t)&__data_start;
1563 bootspace.segs[i].pa = (paddr_t)&__data_start - KERNBASE;
1564 bootspace.segs[i].sz = (size_t)&__kernel_end - (size_t)&__data_start;
1565 i++;
1566
1567 bootspace.boot.va = (vaddr_t)&__kernel_end;
1568 bootspace.boot.pa = (paddr_t)&__kernel_end - KERNBASE;
1569 bootspace.boot.sz = (size_t)(atdevbase + IOM_SIZE) -
1570 (size_t)&__kernel_end;
1571
1572 /* In locore.S, we allocated a tmp va. We will use it now. */
1573 bootspace.spareva = KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2;
1574
1575 /* Virtual address of the L4 page. */
1576 bootspace.pdir = (vaddr_t)(PDPpaddr + KERNBASE);
1577
1578 /* Kernel module map. */
1579 bootspace.smodule = (vaddr_t)atdevbase + IOM_SIZE;
1580 bootspace.emodule = KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2;
1581 }
1582
1583 static void
1584 init_pte(void)
1585 {
1586 #ifndef XENPV
1587 extern uint32_t nox_flag;
1588 pd_entry_t *pdir = (pd_entry_t *)bootspace.pdir;
1589 pdir[L4_SLOT_PTE] = PDPpaddr | PTE_W | ((uint64_t)nox_flag << 32) |
1590 PTE_P;
1591 #endif
1592
1593 extern pd_entry_t *normal_pdes[3];
1594 normal_pdes[0] = L2_BASE;
1595 normal_pdes[1] = L3_BASE;
1596 normal_pdes[2] = L4_BASE;
1597 }
1598
1599 void
1600 init_slotspace(void)
1601 {
1602 /*
1603 * XXX Too early to use cprng(9), or even entropy_extract.
1604 */
1605 struct entpool pool;
1606 size_t randhole;
1607 vaddr_t randva;
1608 uint64_t sample;
1609 vaddr_t va;
1610
1611 memset(&pool, 0, sizeof pool);
1612 cpu_rng_early_sample(&sample);
1613 entpool_enter(&pool, &sample, sizeof sample);
1614
1615 memset(&slotspace, 0, sizeof(slotspace));
1616
1617 /* User. [256, because we want to land in >= 256] */
1618 slotspace.area[SLAREA_USER].sslot = 0;
1619 slotspace.area[SLAREA_USER].nslot = PDIR_SLOT_USERLIM+1;
1620 slotspace.area[SLAREA_USER].active = true;
1621
1622 #ifdef XENPV
1623 /* PTE. */
1624 slotspace.area[SLAREA_PTE].sslot = PDIR_SLOT_PTE;
1625 slotspace.area[SLAREA_PTE].nslot = 1;
1626 slotspace.area[SLAREA_PTE].active = true;
1627 #endif
1628
1629 #ifdef __HAVE_PCPU_AREA
1630 /* Per-CPU. */
1631 slotspace.area[SLAREA_PCPU].sslot = PDIR_SLOT_PCPU;
1632 slotspace.area[SLAREA_PCPU].nslot = 1;
1633 slotspace.area[SLAREA_PCPU].active = true;
1634 #endif
1635
1636 #ifdef __HAVE_DIRECT_MAP
1637 /* Direct Map. [Randomized later] */
1638 slotspace.area[SLAREA_DMAP].active = false;
1639 #endif
1640
1641 #ifdef XENPV
1642 /* Hypervisor. */
1643 slotspace.area[SLAREA_HYPV].sslot = 256;
1644 slotspace.area[SLAREA_HYPV].nslot = 17;
1645 slotspace.area[SLAREA_HYPV].active = true;
1646 #endif
1647
1648 #ifdef KASAN
1649 /* ASAN. */
1650 slotspace.area[SLAREA_ASAN].sslot = L4_SLOT_KASAN;
1651 slotspace.area[SLAREA_ASAN].nslot = NL4_SLOT_KASAN;
1652 slotspace.area[SLAREA_ASAN].active = true;
1653 #endif
1654
1655 #ifdef KMSAN
1656 /* MSAN. */
1657 slotspace.area[SLAREA_MSAN].sslot = L4_SLOT_KMSAN;
1658 slotspace.area[SLAREA_MSAN].nslot = NL4_SLOT_KMSAN;
1659 slotspace.area[SLAREA_MSAN].active = true;
1660 #endif
1661
1662 /* Kernel. */
1663 slotspace.area[SLAREA_KERN].sslot = L4_SLOT_KERNBASE;
1664 slotspace.area[SLAREA_KERN].nslot = 1;
1665 slotspace.area[SLAREA_KERN].active = true;
1666
1667 /* Main. */
1668 cpu_rng_early_sample(&sample);
1669 entpool_enter(&pool, &sample, sizeof sample);
1670 entpool_extract(&pool, &randhole, sizeof randhole);
1671 entpool_extract(&pool, &randva, sizeof randva);
1672 va = slotspace_rand(SLAREA_MAIN, NKL4_MAX_ENTRIES * NBPD_L4,
1673 NBPD_L4, randhole, randva); /* TODO: NBPD_L1 */
1674 vm_min_kernel_address = va;
1675 vm_max_kernel_address = va + NKL4_MAX_ENTRIES * NBPD_L4;
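	/*
	 * Scale of the window chosen above, assuming NBPD_L4 is 512 GiB
	 * (one L4 slot): the main kernel VA area spans NKL4_MAX_ENTRIES
	 * such slots starting at a randomly chosen slot, and
	 * vm_min/vm_max_kernel_address simply bracket that window.
	 */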
1676
1677 #ifndef XENPV
1678 /* PTE. */
1679 cpu_rng_early_sample(&sample);
1680 entpool_enter(&pool, &sample, sizeof sample);
1681 entpool_extract(&pool, &randhole, sizeof randhole);
1682 entpool_extract(&pool, &randva, sizeof randva);
1683 va = slotspace_rand(SLAREA_PTE, NBPD_L4, NBPD_L4, randhole, randva);
1684 pte_base = (pd_entry_t *)va;
1685 #endif
1686
1687 explicit_memset(&pool, 0, sizeof pool);
1688 }
1689
1690 void
1691 init_x86_64(paddr_t first_avail)
1692 {
1693 extern void consinit(void);
1694 struct region_descriptor region;
1695 struct mem_segment_descriptor *ldt_segp;
1696 struct idt_vec *iv;
1697 idt_descriptor_t *idt;
1698 int x;
1699 struct pcb *pcb;
1700 extern vaddr_t lwp0uarea;
1701 #ifndef XENPV
1702 extern paddr_t local_apic_pa;
1703 #endif
1704
1705 KASSERT(first_avail % PAGE_SIZE == 0);
1706
1707 #ifdef XENPV
1708 KASSERT(HYPERVISOR_shared_info != NULL);
1709 cpu_info_primary.ci_vcpu = &HYPERVISOR_shared_info->vcpu_info[0];
1710 #endif
1711
1712 #ifdef XEN
1713 if (vm_guest == VM_GUEST_XENPVH)
1714 xen_parse_cmdline(XEN_PARSE_BOOTFLAGS, NULL);
1715 #endif
1716 init_pte();
1717
1718 uvm_lwp_setuarea(&lwp0, lwp0uarea);
1719
1720 cpu_probe(&cpu_info_primary);
1721 #ifdef SVS
1722 svs_init();
1723 #endif
1724
1725 /*
1726 * Initialize MSRs on cpu0:
1727 *
1728 * - Enables SYSCALL/SYSRET.
1729 *
1730 * - Sets up %fs and %gs so that %gs points to the current
1731 * struct cpu_info as needed for CPUVAR(...), curcpu(), and
1732 * curlwp.
1733 *
1734 * - Enables the no-execute bit if supported.
1735 *
1736 * Thus, after this point, CPUVAR(...), curcpu(), and curlwp
1737 * will work on cpu0.
1738 *
1739 * Note: The call to cpu_init_msrs for secondary CPUs happens
1740 * in cpu_hatch.
1741 */
1742 cpu_init_msrs(&cpu_info_primary, true);
1743
1744 #ifndef XENPV
1745 cpu_speculation_init(&cpu_info_primary);
1746 #endif
1747
1748 use_pae = 1; /* PAE always enabled in long mode */
1749
1750 pcb = lwp_getpcb(&lwp0);
1751 #ifdef XENPV
1752 mutex_init(&pte_lock, MUTEX_DEFAULT, IPL_VM);
1753 pcb->pcb_cr3 = xen_start_info.pt_base - KERNBASE;
1754 #else
1755 pcb->pcb_cr3 = PDPpaddr;
1756 #endif
1757
1758 #if NISA > 0 || NPCI > 0
1759 x86_bus_space_init();
1760 #endif
1761
1762 pat_init(&cpu_info_primary);
1763
1764 consinit(); /* XXX SHOULD NOT BE DONE HERE */
1765
1766 /*
1767 * Initialize RNG to get entropy ASAP either from CPU
1768 * RDRAND/RDSEED or from seed on disk. Must happen after
1769 * cpu_init_msrs. Prefer to happen after consinit so we have
1770 * the opportunity to print useful feedback.
1771 */
1772 cpu_rng_init();
1773 x86_rndseed();
1774
1775 /*
1776 * Initialize PAGE_SIZE-dependent variables.
1777 */
1778 uvm_md_init();
1779
1780 uvmexp.ncolors = 2;
1781
1782 avail_start = first_avail;
1783
1784 #ifndef XENPV
1785 /*
1786 * Low memory reservations:
1787 * Page 0: BIOS data
1788 * Page 1: BIOS callback (not used yet, for symmetry with i386)
1789 * Page 2: MP bootstrap code (MP_TRAMPOLINE)
1790 * Page 3: ACPI wakeup code (ACPI_WAKEUP_ADDR)
1791 * Page 4: Temporary page table for 0MB-4MB
1792 * Page 5: Temporary page directory
1793 * Page 6: Temporary page map level 3
1794 * Page 7: Temporary page map level 4
1795 */
1796 lowmem_rsvd = 8 * PAGE_SIZE;
1797
1798 /* Initialize the memory clusters (needed in pmap_bootstrap). */
1799 init_x86_clusters();
1800 #else
1801 /* Parse Xen command line (replace bootinfo) */
1802 xen_parse_cmdline(XEN_PARSE_BOOTFLAGS, NULL);
1803
1804 avail_end = ctob(xen_start_info.nr_pages);
1805 pmap_pa_start = (KERNTEXTOFF - KERNBASE);
1806 pmap_pa_end = avail_end;
1807 #endif
1808
1809 /*
1810 * Call pmap initialization to make new kernel address space.
1811 * We must do this before loading pages into the VM system.
1812 */
1813 pmap_bootstrap(VM_MIN_KERNEL_ADDRESS);
1814
1815 #ifndef XENPV
1816 /* Internalize the physical pages into the VM system. */
1817 init_x86_vm(avail_start);
1818 #else
1819 physmem = xen_start_info.nr_pages;
1820 uvm_page_physload(atop(avail_start), atop(avail_end),
1821 atop(avail_start), atop(avail_end), VM_FREELIST_DEFAULT);
1822 #endif
1823
1824 init_x86_msgbuf();
1825
1826 kasan_init();
1827 kcsan_init();
1828 kmsan_init((void *)lwp0uarea);
1829
1830 pmap_growkernel(VM_MIN_KERNEL_ADDRESS + 32 * 1024 * 1024);
1831
1832 kpreempt_disable();
1833
1834 #ifndef XENPV
1835 pmap_kenter_pa(local_apic_va, local_apic_pa,
1836 VM_PROT_READ|VM_PROT_WRITE, 0);
1837 pmap_update(pmap_kernel());
1838 memset((void *)local_apic_va, 0, PAGE_SIZE);
1839 #endif
1840
1841 pmap_kenter_pa(idt_vaddr, idt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
1842 pmap_kenter_pa(gdt_vaddr, gdt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
1843 pmap_kenter_pa(ldt_vaddr, ldt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
1844 pmap_update(pmap_kernel());
1845 memset((void *)idt_vaddr, 0, PAGE_SIZE);
1846 memset((void *)gdt_vaddr, 0, PAGE_SIZE);
1847 memset((void *)ldt_vaddr, 0, PAGE_SIZE);
1848
1849 #ifndef XENPV
1850 pmap_changeprot_local(idt_vaddr, VM_PROT_READ);
1851 #endif
1852
1853 pmap_update(pmap_kernel());
1854
1855 iv = &(cpu_info_primary.ci_idtvec);
1856 idt_vec_init_cpu_md(iv, cpu_index(&cpu_info_primary));
1857 idt = iv->iv_idt;
1858 gdtstore = (char *)gdt_vaddr;
1859 ldtstore = (char *)ldt_vaddr;
1860
1861 /*
1862 * Make GDT gates and memory segments.
1863 */
1864 set_mem_segment(GDT_ADDR_MEM(gdtstore, GCODE_SEL), 0,
1865 0xfffff, SDT_MEMERA, SEL_KPL, 1, 0, 1);
1866
1867 set_mem_segment(GDT_ADDR_MEM(gdtstore, GDATA_SEL), 0,
1868 0xfffff, SDT_MEMRWA, SEL_KPL, 1, 0, 1);
1869
1870 set_mem_segment(GDT_ADDR_MEM(gdtstore, GUCODE_SEL), 0,
1871 x86_btop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMERA, SEL_UPL, 1, 0, 1);
1872
1873 set_mem_segment(GDT_ADDR_MEM(gdtstore, GUDATA_SEL), 0,
1874 x86_btop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMRWA, SEL_UPL, 1, 0, 1);
1875
1876 #ifndef XENPV
1877 set_sys_segment(GDT_ADDR_SYS(gdtstore, GLDT_SEL), ldtstore,
1878 LDT_SIZE - 1, SDT_SYSLDT, SEL_KPL, 0);
1879 #endif
1880
1881 /*
1882 * Make LDT memory segments.
1883 */
1884 *(struct mem_segment_descriptor *)(ldtstore + LUCODE_SEL) =
1885 *GDT_ADDR_MEM(gdtstore, GUCODE_SEL);
1886 *(struct mem_segment_descriptor *)(ldtstore + LUDATA_SEL) =
1887 *GDT_ADDR_MEM(gdtstore, GUDATA_SEL);
1888
1889 /*
1890 * 32 bit GDT entries.
1891 */
1892 set_mem_segment(GDT_ADDR_MEM(gdtstore, GUCODE32_SEL), 0,
1893 x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMERA, SEL_UPL, 1, 1, 0);
1894
1895 set_mem_segment(GDT_ADDR_MEM(gdtstore, GUDATA32_SEL), 0,
1896 x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0);
1897
1898 set_mem_segment(GDT_ADDR_MEM(gdtstore, GUFS_SEL), 0,
1899 x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0);
1900
1901 set_mem_segment(GDT_ADDR_MEM(gdtstore, GUGS_SEL), 0,
1902 x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0);
1903
1904 /*
1905 * 32 bit LDT entries.
1906 */
1907 ldt_segp = (struct mem_segment_descriptor *)(ldtstore + LUCODE32_SEL);
1908 set_mem_segment(ldt_segp, 0, x86_btop(VM_MAXUSER_ADDRESS32) - 1,
1909 SDT_MEMERA, SEL_UPL, 1, 1, 0);
1910 ldt_segp = (struct mem_segment_descriptor *)(ldtstore + LUDATA32_SEL);
1911 set_mem_segment(ldt_segp, 0, x86_btop(VM_MAXUSER_ADDRESS32) - 1,
1912 SDT_MEMRWA, SEL_UPL, 1, 1, 0);
1913
1914 /* CPU-specific IDT exceptions. */
1915 for (x = 0; x < NCPUIDT; x++) {
1916 int sel, ist;
1917
1918 /* Reset to default. Special cases below */
1919 sel = SEL_KPL;
1920 ist = 0;
1921
1922 idt_vec_reserve(iv, x);
1923
1924 switch (x) {
1925 case 1: /* DB */
1926 ist = 4;
1927 break;
1928 case 2: /* NMI */
1929 ist = 3;
1930 break;
1931 case 3: /* BP */
1932 case 4: /* OF */
1933 sel = SEL_UPL;
1934 break;
1935 case 8: /* double fault */
1936 ist = 2;
1937 break;
1938 #ifdef XENPV
1939 case 18: /* MCA */
1940 sel |= 0x4; /* Auto EOI/mask */
1941 break;
1942 #endif /* XENPV */
1943 default:
1944 break;
1945 }
1946
1947 set_idtgate(&idt[x], x86_exceptions[x], ist, SDT_SYS386IGT,
1948 sel, GSEL(GCODE_SEL, SEL_KPL));
1949 }
1950
1951 /* new-style interrupt gate for syscalls */
1952 idt_vec_reserve(iv, 128);
1953 set_idtgate(&idt[128], &IDTVEC(osyscall), 0, SDT_SYS386IGT, SEL_UPL,
1954 GSEL(GCODE_SEL, SEL_KPL));
1955
1956 kpreempt_enable();
1957
1958 setregion(&region, gdtstore, DYNSEL_START - 1);
1959 lgdt(&region);
1960
1961 #ifdef XENPV
1962 /* Init Xen callbacks and syscall handlers */
1963 if (HYPERVISOR_set_callbacks(
1964 (unsigned long) hypervisor_callback,
1965 (unsigned long) failsafe_callback,
1966 (unsigned long) Xsyscall))
1967 panic("HYPERVISOR_set_callbacks() failed");
1968 #endif /* XENPV */
1969
1970 cpu_init_idt(&cpu_info_primary);
1971
1972 #ifdef XENPV
1973 xen_init_ksyms();
1974 #else /* XENPV */
1975 #ifdef XEN
1976 if (vm_guest == VM_GUEST_XENPVH)
1977 xen_init_ksyms();
1978 else
1979 #endif /* XEN */
1980 init_x86_64_ksyms();
1981 #endif /* XENPV */
1982
1983 #ifndef XENPV
1984 intr_default_setup();
1985 #else
1986 events_default_setup();
1987 #endif
1988
1989 splraise(IPL_HIGH);
1990 x86_enable_intr();
1991
1992 #ifdef DDB
1993 if (boothowto & RB_KDB)
1994 Debugger();
1995 #endif
1996 #ifdef KGDB
1997 kgdb_port_init();
1998 if (boothowto & RB_KDB) {
1999 kgdb_debug_init = 1;
2000 kgdb_connect(1);
2001 }
2002 #endif
2003
2004 pcb->pcb_dbregs = NULL;
2005 x86_dbregs_init();
2006 }
2007
2008 void
2009 cpu_reset(void)
2010 {
2011 #ifndef XENPV
2012 idt_descriptor_t *idt;
2013 vaddr_t vaddr;
2014
2015 idt = cpu_info_primary.ci_idtvec.iv_idt;
2016 vaddr = (vaddr_t)idt;
2017 #endif
2018
2019 x86_disable_intr();
2020
2021 #ifdef XENPV
2022 HYPERVISOR_reboot();
2023 #else
2024
2025 x86_reset();
2026
2027 /*
2028 * Try to cause a triple fault and watchdog reset by making the IDT
2029 * invalid and causing a fault.
2030 */
2031 kpreempt_disable();
2032 pmap_changeprot_local(vaddr, VM_PROT_READ|VM_PROT_WRITE);
2033 memset((void *)idt, 0, NIDT * sizeof(idt[0]));
2034 kpreempt_enable();
2035 breakpoint();
2036
2037 #if 0
2038 /*
2039 * Try to cause a triple fault and watchdog reset by unmapping the
2040 * entire address space and doing a TLB flush.
2041 */
2042 memset((void *)PTD, 0, PAGE_SIZE);
2043 tlbflush();
2044 #endif
2045 #endif /* XENPV */
2046
2047 for (;;);
2048 }
2049
2050 void
2051 cpu_getmcontext(struct lwp *l, mcontext_t *mcp, unsigned int *flags)
2052 {
2053 const struct trapframe *tf = l->l_md.md_regs;
2054 __greg_t ras_rip;
2055
2056 mcp->__gregs[_REG_RDI] = tf->tf_rdi;
2057 mcp->__gregs[_REG_RSI] = tf->tf_rsi;
2058 mcp->__gregs[_REG_RDX] = tf->tf_rdx;
2059 mcp->__gregs[_REG_R10] = tf->tf_r10;
2060 mcp->__gregs[_REG_R8] = tf->tf_r8;
2061 mcp->__gregs[_REG_R9] = tf->tf_r9;
2062 /* argX not touched */
2063 mcp->__gregs[_REG_RCX] = tf->tf_rcx;
2064 mcp->__gregs[_REG_R11] = tf->tf_r11;
2065 mcp->__gregs[_REG_R12] = tf->tf_r12;
2066 mcp->__gregs[_REG_R13] = tf->tf_r13;
2067 mcp->__gregs[_REG_R14] = tf->tf_r14;
2068 mcp->__gregs[_REG_R15] = tf->tf_r15;
2069 mcp->__gregs[_REG_RBP] = tf->tf_rbp;
2070 mcp->__gregs[_REG_RBX] = tf->tf_rbx;
2071 mcp->__gregs[_REG_RAX] = tf->tf_rax;
2072 mcp->__gregs[_REG_GS] = 0;
2073 mcp->__gregs[_REG_FS] = 0;
2074 mcp->__gregs[_REG_ES] = GSEL(GUDATA_SEL, SEL_UPL);
2075 mcp->__gregs[_REG_DS] = GSEL(GUDATA_SEL, SEL_UPL);
2076 mcp->__gregs[_REG_TRAPNO] = tf->tf_trapno;
2077 mcp->__gregs[_REG_ERR] = tf->tf_err;
2078 mcp->__gregs[_REG_RIP] = tf->tf_rip;
2079 mcp->__gregs[_REG_CS] = LSEL(LUCODE_SEL, SEL_UPL);
2080 mcp->__gregs[_REG_RFLAGS] = tf->tf_rflags;
2081 mcp->__gregs[_REG_RSP] = tf->tf_rsp;
2082 mcp->__gregs[_REG_SS] = LSEL(LUDATA_SEL, SEL_UPL);
2083
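/*
 * If the PC lies inside a registered restartable atomic sequence,
 * report the sequence's restart address instead.
 */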
2084 if ((ras_rip = (__greg_t)ras_lookup(l->l_proc,
2085 (void *) mcp->__gregs[_REG_RIP])) != -1)
2086 mcp->__gregs[_REG_RIP] = ras_rip;
2087
2088 *flags |= _UC_CPU;
2089
2090 mcp->_mc_tlsbase = (uintptr_t)l->l_private;
2091 *flags |= _UC_TLSBASE;
2092
2093 process_read_fpregs_xmm(l, (struct fxsave *)&mcp->__fpregs);
2094 *flags |= _UC_FPU;
2095 }
2096
2097 int
2098 cpu_setmcontext(struct lwp *l, const mcontext_t *mcp, unsigned int flags)
2099 {
2100 struct trapframe *tf = l->l_md.md_regs;
2101 const __greg_t *gr = mcp->__gregs;
2102 struct proc *p = l->l_proc;
2103 int error;
2104 int64_t rflags;
2105
2106 CTASSERT(sizeof (mcontext_t) == 26 * 8 + 8 + 512);
2107
2108 if ((flags & _UC_CPU) != 0) {
2109 error = cpu_mcontext_validate(l, mcp);
2110 if (error != 0)
2111 return error;
2112
2113 tf->tf_rdi = gr[_REG_RDI];
2114 tf->tf_rsi = gr[_REG_RSI];
2115 tf->tf_rdx = gr[_REG_RDX];
2116 tf->tf_r10 = gr[_REG_R10];
2117 tf->tf_r8 = gr[_REG_R8];
2118 tf->tf_r9 = gr[_REG_R9];
2119 /* argX not touched */
2120 tf->tf_rcx = gr[_REG_RCX];
2121 tf->tf_r11 = gr[_REG_R11];
2122 tf->tf_r12 = gr[_REG_R12];
2123 tf->tf_r13 = gr[_REG_R13];
2124 tf->tf_r14 = gr[_REG_R14];
2125 tf->tf_r15 = gr[_REG_R15];
2126 tf->tf_rbp = gr[_REG_RBP];
2127 tf->tf_rbx = gr[_REG_RBX];
2128 tf->tf_rax = gr[_REG_RAX];
2129 tf->tf_gs = 0;
2130 tf->tf_fs = 0;
2131 tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
2132 tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
2133 /* trapno, err not touched */
2134 tf->tf_rip = gr[_REG_RIP];
2135 tf->tf_cs = LSEL(LUCODE_SEL, SEL_UPL);
2136 rflags = tf->tf_rflags;
2137 rflags &= ~PSL_USER;
2138 tf->tf_rflags = rflags | (gr[_REG_RFLAGS] & PSL_USER);
2139 tf->tf_rsp = gr[_REG_RSP];
2140 tf->tf_ss = LSEL(LUDATA_SEL, SEL_UPL);
2141
2142 l->l_md.md_flags |= MDL_IRET;
2143 }
2144
2145 if ((flags & _UC_FPU) != 0)
2146 process_write_fpregs_xmm(l, (const struct fxsave *)&mcp->__fpregs);
2147
2148 if ((flags & _UC_TLSBASE) != 0)
2149 lwp_setprivate(l, (void *)(uintptr_t)mcp->_mc_tlsbase);
2150
2151 mutex_enter(p->p_lock);
2152 if (flags & _UC_SETSTACK)
2153 l->l_sigstk.ss_flags |= SS_ONSTACK;
2154 if (flags & _UC_CLRSTACK)
2155 l->l_sigstk.ss_flags &= ~SS_ONSTACK;
2156 mutex_exit(p->p_lock);
2157
2158 return 0;
2159 }
2160
2161 int
2162 cpu_mcontext_validate(struct lwp *l, const mcontext_t *mcp)
2163 {
2164 struct proc *p __diagused = l->l_proc;
2165 struct trapframe *tf = l->l_md.md_regs;
2166 const __greg_t *gr;
2167 uint16_t sel;
2168
2169 KASSERT((p->p_flag & PK_32) == 0);
2170 gr = mcp->__gregs;
2171
2172 if (((gr[_REG_RFLAGS] ^ tf->tf_rflags) & PSL_USERSTATIC) != 0)
2173 return EINVAL;
2174
2175 sel = gr[_REG_ES] & 0xffff;
2176 if (sel != 0 && !VALID_USER_DSEL(sel))
2177 return EINVAL;
2178
2179 sel = gr[_REG_FS] & 0xffff;
2180 if (sel != 0 && !VALID_USER_DSEL(sel))
2181 return EINVAL;
2182
2183 sel = gr[_REG_GS] & 0xffff;
2184 if (sel != 0 && !VALID_USER_DSEL(sel))
2185 return EINVAL;
2186
2187 sel = gr[_REG_DS] & 0xffff;
2188 if (!VALID_USER_DSEL(sel))
2189 return EINVAL;
2190
2191 #ifndef XENPV
2192 sel = gr[_REG_SS] & 0xffff;
2193 if (!VALID_USER_DSEL(sel))
2194 return EINVAL;
2195
2196 sel = gr[_REG_CS] & 0xffff;
2197 if (!VALID_USER_CSEL(sel))
2198 return EINVAL;
2199 #endif
2200
2201 if (gr[_REG_RIP] >= VM_MAXUSER_ADDRESS)
2202 return EINVAL;
2203
2204 return 0;
2205 }
2206
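/*
 * Machine-dependent kernel access check, used by the mm(4) /dev/mem
 * driver: decide whether the given kernel address falls in one of the
 * boot-time mapped regions or the module map, and whether the
 * requested protection is permitted there.
 */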
2207 int
2208 mm_md_kernacc(void *ptr, vm_prot_t prot, bool *handled)
2209 {
2210 const vaddr_t v = (vaddr_t)ptr;
2211 vaddr_t kva, kva_end;
2212 size_t i;
2213
2214 kva = bootspace.head.va;
2215 kva_end = kva + bootspace.head.sz;
2216 if (v >= kva && v < kva_end) {
2217 *handled = true;
2218 return 0;
2219 }
2220
2221 for (i = 0; i < BTSPACE_NSEGS; i++) {
2222 kva = bootspace.segs[i].va;
2223 kva_end = kva + bootspace.segs[i].sz;
2224 if (v < kva || v >= kva_end)
2225 continue;
2226 *handled = true;
2227 if (bootspace.segs[i].type == BTSEG_TEXT ||
2228 bootspace.segs[i].type == BTSEG_RODATA) {
2229 if (prot & VM_PROT_WRITE) {
2230 return EFAULT;
2231 }
2232 }
2233 return 0;
2234 }
2235
2236 kva = bootspace.boot.va;
2237 kva_end = kva + bootspace.boot.sz;
2238 if (v >= kva && v < kva_end) {
2239 *handled = true;
2240 return 0;
2241 }
2242
2243 if (v >= bootspace.smodule && v < bootspace.emodule) {
2244 *handled = true;
2245 if (!uvm_map_checkprot(module_map, v, v + 1, prot)) {
2246 return EFAULT;
2247 }
2248 } else {
2249 *handled = false;
2250 }
2251 return 0;
2252 }
2253
2254 /*
2255 * Zero out a 64-bit LWP's segment registers. Used when exec'ing a new
2256 * 64-bit program.
2257 */
2258 void
2259 cpu_segregs64_zero(struct lwp *l)
2260 {
2261 struct trapframe * const tf = l->l_md.md_regs;
2262 struct pcb *pcb;
2263 uint64_t zero = 0;
2264
2265 KASSERT(kpreempt_disabled());
2266 KASSERT((l->l_proc->p_flag & PK_32) == 0);
2267 KASSERT(l == curlwp);
2268
2269 pcb = lwp_getpcb(l);
2270
2271 tf->tf_fs = 0;
2272 tf->tf_gs = 0;
2273 setds(GSEL(GUDATA_SEL, SEL_UPL));
2274 setes(GSEL(GUDATA_SEL, SEL_UPL));
2275 setfs(0);
2276 setusergs(0);
2277
2278 #ifndef XENPV
2279 wrmsr(MSR_FSBASE, 0);
2280 wrmsr(MSR_KERNELGSBASE, 0);
2281 #else
2282 HYPERVISOR_set_segment_base(SEGBASE_FS, 0);
2283 HYPERVISOR_set_segment_base(SEGBASE_GS_USER, 0);
2284 #endif
2285
2286 pcb->pcb_fs = 0;
2287 pcb->pcb_gs = 0;
2288 update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &zero);
2289 update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &zero);
2290 }
2291
2292 /*
2293 * Zero out a 32-bit LWP's segment registers. Used when exec'ing a new
2294 * 32-bit program.
2295 */
2296 void
2297 cpu_segregs32_zero(struct lwp *l)
2298 {
2299 struct trapframe * const tf = l->l_md.md_regs;
2300 struct pcb *pcb;
2301 uint64_t zero = 0;
2302
2303 KASSERT(kpreempt_disabled());
2304 KASSERT(l->l_proc->p_flag & PK_32);
2305 KASSERT(l == curlwp);
2306
2307 pcb = lwp_getpcb(l);
2308
2309 tf->tf_fs = 0;
2310 tf->tf_gs = 0;
2311 setds(GSEL(GUDATA32_SEL, SEL_UPL));
2312 setes(GSEL(GUDATA32_SEL, SEL_UPL));
2313 setfs(0);
2314 setusergs(0);
2315 pcb->pcb_fs = 0;
2316 pcb->pcb_gs = 0;
2317 update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &zero);
2318 update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &zero);
2319 }
2320
2321 /*
2322 * Load an LWP's TLS context, possibly changing the %fs and %gs selectors.
2323 * Used only for 32-bit processes.
2324 */
2325 void
2326 cpu_fsgs_reload(struct lwp *l, int fssel, int gssel)
2327 {
2328 struct trapframe *tf;
2329 struct pcb *pcb;
2330
2331 KASSERT(l->l_proc->p_flag & PK_32);
2332 KASSERT(l == curlwp);
2333
2334 tf = l->l_md.md_regs;
2335 fssel &= 0xFFFF;
2336 gssel &= 0xFFFF;
2337
2338 pcb = lwp_getpcb(l);
2339 kpreempt_disable();
2340 update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &pcb->pcb_fs);
2341 update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &pcb->pcb_gs);
2342
2343 #ifdef XENPV
2344 setusergs(gssel);
2345 #endif
2346
2347 tf->tf_fs = fssel;
2348 tf->tf_gs = gssel;
2349 kpreempt_enable();
2350 }
2351
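/*
 * If the address lies within the pmap direct map, return the
 * corresponding physical address.
 */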
2352 bool
2353 mm_md_direct_mapped_io(void *addr, paddr_t *paddr)
2354 {
2355 vaddr_t va = (vaddr_t)addr;
2356
2357 #ifdef __HAVE_DIRECT_MAP
2358 if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
2359 *paddr = PMAP_DIRECT_UNMAP(va);
2360 return true;
2361 }
2362 #else
2363 __USE(va);
2364 #endif
2365
2366 return false;
2367 }
2368
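/*
 * Translate a physical address to its direct-map virtual address,
 * when a direct map is configured.
 */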
2369 bool
2370 mm_md_direct_mapped_phys(paddr_t paddr, vaddr_t *vaddr)
2371 {
2372 #ifdef __HAVE_DIRECT_MAP
2373 *vaddr = PMAP_DIRECT_MAP(paddr);
2374 return true;
2375 #else
2376 return false;
2377 #endif
2378 }
2379
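/*
 * Copy the source CPU's IDT and vector allocation bitmap into the
 * destination, temporarily making the destination IDT page writable.
 */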
2380 static void
2381 idt_vec_copy(struct idt_vec *dst, struct idt_vec *src)
2382 {
2383 idt_descriptor_t *idt_dst;
2384
2385 idt_dst = dst->iv_idt;
2386
2387 kpreempt_disable();
2388 pmap_changeprot_local((vaddr_t)idt_dst, VM_PROT_READ|VM_PROT_WRITE);
2389
2390 memcpy(idt_dst, src->iv_idt, PAGE_SIZE);
2391 memcpy(dst->iv_allocmap, src->iv_allocmap, sizeof(dst->iv_allocmap));
2392
2393 pmap_changeprot_local((vaddr_t)idt_dst, VM_PROT_READ);
2394 kpreempt_enable();
2395 }
2396
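/*
 * Set up the IDT for a CPU.  The primary CPU (or the non-per-CPU IDT
 * case) uses the statically allocated IDT page; other CPUs get their
 * own page, from the per-CPU area or freshly allocated, initialized
 * with a copy of the primary CPU's IDT.
 */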
2397 void
2398 idt_vec_init_cpu_md(struct idt_vec *iv, cpuid_t cid)
2399 {
2400 vaddr_t va;
2401
2402 if (cid != cpu_index(&cpu_info_primary) &&
2403 idt_vec_is_pcpu()) {
2404 #ifdef __HAVE_PCPU_AREA
2405 va = (vaddr_t)&pcpuarea->ent[cid].idt;
2406 #else
2407 struct vm_page *pg;
2408
2409 va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
2410 UVM_KMF_VAONLY);
2411 pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
2412 if (pg == NULL) {
2413 panic("failed to allocate a page for IDT");
2414 }
2415 pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
2416 VM_PROT_READ|VM_PROT_WRITE, 0);
2417 pmap_update(pmap_kernel());
2418 #endif
2419
2420 memset((void *)va, 0, PAGE_SIZE);
2421 #ifndef XENPV
2422 pmap_changeprot_local(va, VM_PROT_READ);
2423 #endif
2424 pmap_update(pmap_kernel());
2425
2426 iv->iv_idt = (void *)va;
2427 idt_vec_copy(iv, &(cpu_info_primary.ci_idtvec));
2428 } else {
2429 iv->iv_idt = (void *)idt_vaddr;
2430 }
2431 }
2432