1 /*	$NetBSD: machdep.c,v 1.226 2016/07/22 14:08:33 maxv Exp $	*/
2 
3 /*-
4  * Copyright (c) 1996, 1997, 1998, 2000, 2006, 2007, 2008, 2011
5  *     The NetBSD Foundation, Inc.
6  * All rights reserved.
7  *
8  * This code is derived from software contributed to The NetBSD Foundation
9  * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace
10  * Simulation Facility, NASA Ames Research Center.
11  *
12  * This code is derived from software contributed to The NetBSD Foundation
13  * by Coyote Point Systems, Inc. which was written under contract to Coyote
14  * Point by Jed Davis and Devon O'Dell.
15  *
16  * Redistribution and use in source and binary forms, with or without
17  * modification, are permitted provided that the following conditions
18  * are met:
19  * 1. Redistributions of source code must retain the above copyright
20  *    notice, this list of conditions and the following disclaimer.
21  * 2. Redistributions in binary form must reproduce the above copyright
22  *    notice, this list of conditions and the following disclaimer in the
23  *    documentation and/or other materials provided with the distribution.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
26  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
29  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35  * POSSIBILITY OF SUCH DAMAGE.
36  */
37 
38 /*
39  * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
40  *
41  * Permission to use, copy, modify, and distribute this software for any
42  * purpose with or without fee is hereby granted, provided that the above
43  * copyright notice and this permission notice appear in all copies.
44  *
45  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
46  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
47  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
48  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
49  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
50  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
51  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
52  */
53 
54 /*
55  * Copyright (c) 2007 Manuel Bouyer.
56  *
57  * Redistribution and use in source and binary forms, with or without
58  * modification, are permitted provided that the following conditions
59  * are met:
60  * 1. Redistributions of source code must retain the above copyright
61  *    notice, this list of conditions and the following disclaimer.
62  * 2. Redistributions in binary form must reproduce the above copyright
63  *    notice, this list of conditions and the following disclaimer in the
64  *    documentation and/or other materials provided with the distribution.
65  *
66  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
67  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
68  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
69  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
70  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
71  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
72  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
73  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
74  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
75  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
76  *
77  */
78 
79 /*-
80  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
81  * All rights reserved.
82  *
83  * This code is derived from software contributed to Berkeley by
84  * William Jolitz.
85  *
86  * Redistribution and use in source and binary forms, with or without
87  * modification, are permitted provided that the following conditions
88  * are met:
89  * 1. Redistributions of source code must retain the above copyright
90  *    notice, this list of conditions and the following disclaimer.
91  * 2. Redistributions in binary form must reproduce the above copyright
92  *    notice, this list of conditions and the following disclaimer in the
93  *    documentation and/or other materials provided with the distribution.
94  * 3. Neither the name of the University nor the names of its contributors
95  *    may be used to endorse or promote products derived from this software
96  *    without specific prior written permission.
97  *
98  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
99  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
100  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
101  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
102  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
103  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
104  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
105  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
106  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
107  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
108  * SUCH DAMAGE.
109  *
110  *	@(#)machdep.c	7.4 (Berkeley) 6/3/91
111  */
112 
113 #include <sys/cdefs.h>
114 __KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.226 2016/07/22 14:08:33 maxv Exp $");
115 
116 /* #define XENDEBUG_LOW  */
117 
118 #include "opt_modular.h"
119 #include "opt_user_ldt.h"
120 #include "opt_ddb.h"
121 #include "opt_kgdb.h"
122 #include "opt_cpureset_delay.h"
123 #include "opt_mtrr.h"
124 #include "opt_realmem.h"
125 #include "opt_xen.h"
126 #ifndef XEN
127 #include "opt_physmem.h"
128 #endif
129 #include "isa.h"
130 #include "pci.h"
131 
132 #include <sys/param.h>
133 #include <sys/systm.h>
134 #include <sys/signal.h>
135 #include <sys/signalvar.h>
136 #include <sys/kernel.h>
137 #include <sys/cpu.h>
138 #include <sys/exec.h>
139 #include <sys/exec_aout.h>	/* for MID_* */
140 #include <sys/reboot.h>
141 #include <sys/conf.h>
142 #include <sys/mbuf.h>
143 #include <sys/msgbuf.h>
144 #include <sys/mount.h>
145 #include <sys/core.h>
146 #include <sys/kcore.h>
147 #include <sys/ucontext.h>
148 #include <machine/kcore.h>
149 #include <sys/ras.h>
150 #include <sys/syscallargs.h>
151 #include <sys/ksyms.h>
152 #include <sys/device.h>
153 #include <sys/lwp.h>
154 #include <sys/proc.h>
155 
156 #ifdef KGDB
157 #include <sys/kgdb.h>
158 #endif
159 
160 #include <dev/cons.h>
161 #include <dev/mm.h>
162 
163 #include <uvm/uvm.h>
164 #include <uvm/uvm_page.h>
165 
166 #include <sys/sysctl.h>
167 
168 #include <machine/cpu.h>
169 #include <machine/cpufunc.h>
170 #include <machine/gdt.h>
171 #include <machine/intr.h>
172 #include <machine/pio.h>
173 #include <machine/psl.h>
174 #include <machine/reg.h>
175 #include <machine/specialreg.h>
176 #include <machine/bootinfo.h>
177 #include <x86/fpu.h>
178 #include <machine/mtrr.h>
179 #include <machine/mpbiosvar.h>
180 
181 #include <x86/cputypes.h>
182 #include <x86/cpuvar.h>
183 #include <x86/machdep.h>
184 
185 #include <x86/x86/tsc.h>
186 
187 #include <dev/isa/isareg.h>
188 #include <machine/isa_machdep.h>
189 #include <dev/ic/i8042reg.h>
190 
191 #ifdef XEN
192 #include <xen/xen.h>
193 #include <xen/hypervisor.h>
194 #include <xen/evtchn.h>
195 #endif
196 
197 #ifdef DDB
198 #include <machine/db_machdep.h>
199 #include <ddb/db_extern.h>
200 #include <ddb/db_output.h>
201 #include <ddb/db_interface.h>
202 #endif
203 
204 #include "acpica.h"
205 
206 #if NACPICA > 0
207 #include <dev/acpi/acpivar.h>
208 #define ACPI_MACHDEP_PRIVATE
209 #include <machine/acpi_machdep.h>
210 #endif
211 
212 #include "isa.h"
213 #include "isadma.h"
214 #include "ksyms.h"
215 
216 /* the following is used externally (sysctl_hw) */
217 char machine[] = "amd64";		/* CPU "architecture" */
218 char machine_arch[] = "x86_64";		/* sysctl hw.machine_arch */
219 
220 #ifdef CPURESET_DELAY
221 int cpureset_delay = CPURESET_DELAY;
222 #else
223 int cpureset_delay = 2000; /* default to 2s */
224 #endif
225 
226 int cpu_class = CPUCLASS_686;
227 
228 #ifdef MTRR
229 struct mtrr_funcs *mtrr_funcs;
230 #endif
231 
232 uint64_t dumpmem_low;
233 uint64_t dumpmem_high;
234 int cpu_class;
235 int use_pae;
236 
237 #ifndef NO_SPARSE_DUMP
238 int sparse_dump = 1;
239 
240 paddr_t max_paddr = 0;
241 unsigned char *sparse_dump_physmap;
242 #endif
243 
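/* Staging buffer for the crash dump header; see dump_header_start()/dump_header_addbytes()/dump_header_flush() below. */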
244 char *dump_headerbuf, *dump_headerbuf_ptr;
245 #define dump_headerbuf_size PAGE_SIZE
246 #define dump_headerbuf_end (dump_headerbuf + dump_headerbuf_size)
247 #define dump_headerbuf_avail (dump_headerbuf_end - dump_headerbuf_ptr)
248 daddr_t dump_header_blkno;
249 
250 size_t dump_nmemsegs;
251 size_t dump_npages;
252 size_t dump_header_size;
253 size_t dump_totalbytesleft;
254 
255 vaddr_t msgbuf_vaddr;
256 
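/* Physical segments backing the kernel message buffer; mapped into KVA in cpu_startup(). */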
257 struct {
258 	paddr_t paddr;
259 	psize_t sz;
260 } msgbuf_p_seg[VM_PHYSSEG_MAX];
261 unsigned int msgbuf_p_cnt = 0;
262 
263 vaddr_t idt_vaddr;
264 paddr_t idt_paddr;
265 
266 vaddr_t module_start, module_end;
267 static struct vm_map module_map_store;
268 extern struct vm_map *module_map;
269 vaddr_t kern_end;
270 
271 struct vm_map *phys_map = NULL;
272 
273 extern paddr_t avail_start, avail_end;
274 #ifdef XEN
275 extern paddr_t pmap_pa_start, pmap_pa_end;
276 #endif
277 
278 #ifndef XEN
279 void (*delay_func)(unsigned int) = i8254_delay;
280 void (*initclock_func)(void) = i8254_initclocks;
281 #else /* XEN */
282 void (*delay_func)(unsigned int) = xen_delay;
283 void (*initclock_func)(void) = xen_initclocks;
284 #endif
285 
286 
287 /*
288  * Size of memory segments, before any memory is stolen.
289  */
290 phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX];
291 int mem_cluster_cnt;
292 
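/* Dedicated stack for the double fault handler, referenced via tss_ist[1] in cpu_init_tss(). */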
293 char x86_64_doubleflt_stack[4096];
294 
295 int cpu_dump(void);
296 int cpu_dumpsize(void);
297 u_long cpu_dump_mempagecnt(void);
298 void dodumpsys(void);
299 void dumpsys(void);
300 
301 extern int time_adjusted;	/* XXX no common header */
302 
303 void dump_misc_init(void);
304 void dump_seg_prep(void);
305 int dump_seg_iter(int (*)(paddr_t, paddr_t));
306 
307 #ifndef NO_SPARSE_DUMP
308 void sparse_dump_reset(void);
309 void sparse_dump_mark(void);
310 void cpu_dump_prep_sparse(void);
311 #endif
312 
313 void dump_header_start(void);
314 int dump_header_flush(void);
315 int dump_header_addbytes(const void*, size_t);
316 int dump_header_addseg(paddr_t, paddr_t);
317 int dump_header_finish(void);
318 
319 int dump_seg_count_range(paddr_t, paddr_t);
320 int dumpsys_seg(paddr_t, paddr_t);
321 
322 void init_x86_64(paddr_t);
323 
324 static int valid_user_selector(struct lwp *, uint64_t);
325 
326 /*
327  * Machine-dependent startup code
328  */
329 void
330 cpu_startup(void)
331 {
332 	int x, y;
333 	vaddr_t minaddr, maxaddr;
334 	psize_t sz;
335 
336 	/*
337 	 * For console drivers that require uvm and pmap to be initialized,
338 	 * we'll give them one more chance here...
339 	 */
340 	consinit();
341 
342 	/*
343 	 * Initialize error message buffer (at end of core).
344 	 */
345 	if (msgbuf_p_cnt == 0)
346 		panic("msgbuf paddr map has not been set up");
347 	for (x = 0, sz = 0; x < msgbuf_p_cnt; sz += msgbuf_p_seg[x++].sz)
348 		continue;
349 
350 	msgbuf_vaddr = uvm_km_alloc(kernel_map, sz, 0, UVM_KMF_VAONLY);
351 	if (msgbuf_vaddr == 0)
352 		panic("failed to valloc msgbuf_vaddr");
353 
354 	for (y = 0, sz = 0; y < msgbuf_p_cnt; y++) {
355 		for (x = 0; x < btoc(msgbuf_p_seg[y].sz); x++, sz += PAGE_SIZE)
356 			pmap_kenter_pa((vaddr_t)msgbuf_vaddr + sz,
357 			    msgbuf_p_seg[y].paddr + x * PAGE_SIZE,
358 			    VM_PROT_READ|VM_PROT_WRITE, 0);
359 	}
360 
361 	pmap_update(pmap_kernel());
362 
363 	initmsgbuf((void *)msgbuf_vaddr, round_page(sz));
364 
365 	minaddr = 0;
366 
367 	/*
368 	 * Allocate a submap for physio.
369 	 */
370 	phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
371 	    VM_PHYS_SIZE, 0, false, NULL);
372 
373 	/*
374 	 * Create the module map.
375 	 *
376 	 * XXX: the module map is taken as what is left of the bootstrap memory
377 	 * created in locore.S, which is not big enough if we want to load many
378 	 * modules dynamically. We really should be using kernel_map instead.
379 	 *
380 	 * But the modules must be located above the kernel image, and that
381 	 * wouldn't be guaranteed if we were using kernel_map.
382 	 */
383 	uvm_map_setup(&module_map_store, module_start, module_end, 0);
384 	module_map_store.pmap = pmap_kernel();
385 	module_map = &module_map_store;
386 
387 	/* Say hello. */
388 	banner();
389 
390 #if NISA > 0 || NPCI > 0
391 	/* Safe for i/o port / memory space allocation to use malloc now. */
392 	x86_bus_space_mallocok();
393 #endif
394 
395 	gdt_init();
396 	x86_64_proc0_tss_ldt_init();
397 
398 	cpu_init_tss(&cpu_info_primary);
399 #if !defined(XEN)
400 	ltr(cpu_info_primary.ci_tss_sel);
401 #endif /* !defined(XEN) */
402 
403 	x86_startup();
404 }
405 
406 #ifdef XEN
407 /* used in assembly */
408 void hypervisor_callback(void);
409 void failsafe_callback(void);
410 void x86_64_switch_context(struct pcb *);
411 void x86_64_tls_switch(struct lwp *);
412 
413 void
414 x86_64_switch_context(struct pcb *new)
415 {
416 	HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), new->pcb_rsp0);
417 	struct physdev_op physop;
418 	physop.cmd = PHYSDEVOP_SET_IOPL;
419 	physop.u.set_iopl.iopl = new->pcb_iopl;
420 	HYPERVISOR_physdev_op(&physop);
421 }
422 
423 void
424 x86_64_tls_switch(struct lwp *l)
425 {
426 	struct cpu_info *ci = curcpu();
427 	struct pcb *pcb = lwp_getpcb(l);
428 	struct trapframe *tf = l->l_md.md_regs;
429 
430 	/*
431 	 * Raise the IPL to IPL_HIGH.
432 	 * FPU IPIs can alter the LWP's saved cr0.  Dropping the priority
433 	 * is deferred until mi_switch(), when cpu_switchto() returns.
434 	 */
435 	(void)splhigh();
436 	/*
437 	 * If our floating point registers are on a different CPU,
438 	 * set CR0_TS so we'll trap rather than reuse bogus state.
439 	 */
440 	if (l != ci->ci_fpcurlwp) {
441 		HYPERVISOR_fpu_taskswitch(1);
442 	}
443 
444 	/* Update TLS segment pointers */
445 	if (pcb->pcb_flags & PCB_COMPAT32) {
446 		update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &pcb->pcb_fs);
447 		update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &pcb->pcb_gs);
448 		setfs(tf->tf_fs);
449 		HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, tf->tf_gs);
450 	} else {
451 		setfs(0);
452 		HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, 0);
453 		HYPERVISOR_set_segment_base(SEGBASE_FS, pcb->pcb_fs);
454 		HYPERVISOR_set_segment_base(SEGBASE_GS_USER, pcb->pcb_gs);
455 	}
456 }
457 #endif /* XEN */
458 
459 /*
460  * Set up proc0's TSS and LDT.
461  */
462 void
463 x86_64_proc0_tss_ldt_init(void)
464 {
465 	struct lwp *l = &lwp0;
466 	struct pcb *pcb = lwp_getpcb(l);
467 
468 	pcb->pcb_flags = 0;
469 	pcb->pcb_fs = 0;
470 	pcb->pcb_gs = 0;
471 	pcb->pcb_rsp0 = (uvm_lwp_getuarea(l) + USPACE - 16) & ~0xf;
472 	pcb->pcb_iopl = SEL_KPL;
473 
474 	pmap_kernel()->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
475 	pcb->pcb_cr0 = rcr0() & ~CR0_TS;
476 	l->l_md.md_regs = (struct trapframe *)pcb->pcb_rsp0 - 1;
477 
478 #if !defined(XEN)
479 	lldt(pmap_kernel()->pm_ldt_sel);
480 #else
481 	{
482 	struct physdev_op physop;
483 	xen_set_ldt((vaddr_t) ldtstore, LDT_SIZE >> 3);
484 	/* Reset TS bit and set kernel stack for interrupt handlers */
485 	HYPERVISOR_fpu_taskswitch(1);
486 	HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), pcb->pcb_rsp0);
487 	physop.cmd = PHYSDEVOP_SET_IOPL;
488 	physop.u.set_iopl.iopl = pcb->pcb_iopl;
489 	HYPERVISOR_physdev_op(&physop);
490 	}
491 #endif /* XEN */
492 }
493 
494 /*
495  * Set up TSS and I/O bitmap.
496  */
497 void
498 cpu_init_tss(struct cpu_info *ci)
499 {
500 	struct x86_64_tss *tss = &ci->ci_tss;
501 	uintptr_t p;
502 
503 	tss->tss_iobase = IOMAP_INVALOFF << 16;
504 	/* tss->tss_ist[0] is filled by cpu_intr_init */
505 
506 	/* double fault */
507 	tss->tss_ist[1] = (uint64_t)x86_64_doubleflt_stack + PAGE_SIZE - 16;
508 
509 	/* NMI */
510 	p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED);
511 	tss->tss_ist[2] = p + PAGE_SIZE - 16;
512 	ci->ci_tss_sel = tss_alloc(tss);
513 }
514 
515 void
516 buildcontext(struct lwp *l, void *catcher, void *f)
517 {
518 	struct trapframe *tf = l->l_md.md_regs;
519 
520 	tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
521 	tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
522 	tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
523 	tf->tf_gs = GSEL(GUDATA_SEL, SEL_UPL);
524 
525 	tf->tf_rip = (uint64_t)catcher;
526 	tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
527 	tf->tf_rflags &= ~PSL_CLEARSIG;
528 	tf->tf_rsp = (uint64_t)f;
529 	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
530 
531 	/* Ensure FP state is sane */
532 	fpu_save_area_reset(l);
533 }
534 
535 void
536 sendsig_sigcontext(const ksiginfo_t *ksi, const sigset_t *mask)
537 {
538 
539 	printf("sendsig_sigcontext: illegal\n");
540 	sigexit(curlwp, SIGILL);
541 }
542 
543 void
544 sendsig_siginfo(const ksiginfo_t *ksi, const sigset_t *mask)
545 {
546 	struct lwp *l = curlwp;
547 	struct proc *p = l->l_proc;
548 	struct sigacts *ps = p->p_sigacts;
549 	int onstack, error;
550 	int sig = ksi->ksi_signo;
551 	struct sigframe_siginfo *fp, frame;
552 	sig_t catcher = SIGACTION(p, sig).sa_handler;
553 	struct trapframe *tf = l->l_md.md_regs;
554 	char *sp;
555 
556 	KASSERT(mutex_owned(p->p_lock));
557 
558 	/* Do we need to jump onto the signal stack? */
559 	onstack =
560 	    (l->l_sigstk.ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0 &&
561 	    (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0;
562 
563 	/* Allocate space for the signal handler context. */
564 	if (onstack)
565 		sp = ((char *)l->l_sigstk.ss_sp + l->l_sigstk.ss_size);
566 	else
567 		/* AMD64 ABI 128-bytes "red zone". */
568 		sp = (char *)tf->tf_rsp - 128;
569 
570 	sp -= sizeof(struct sigframe_siginfo);
571 	/* Align the frame for the ABI: with sf_ra acting as the return address on top, the handler must see a 16-byte aligned stack, so fp lands at 16n-8. */
572 	fp = (struct sigframe_siginfo *)(((unsigned long)sp & ~15) - 8);
573 
574 	frame.sf_ra = (uint64_t)ps->sa_sigdesc[sig].sd_tramp;
575 	frame.sf_si._info = ksi->ksi_info;
576 	frame.sf_uc.uc_flags = _UC_SIGMASK;
577 	frame.sf_uc.uc_sigmask = *mask;
578 	frame.sf_uc.uc_link = l->l_ctxlink;
579 	frame.sf_uc.uc_flags |= (l->l_sigstk.ss_flags & SS_ONSTACK)
580 	    ? _UC_SETSTACK : _UC_CLRSTACK;
581 	memset(&frame.sf_uc.uc_stack, 0, sizeof(frame.sf_uc.uc_stack));
582 	sendsig_reset(l, sig);
583 
584 	mutex_exit(p->p_lock);
585 	cpu_getmcontext(l, &frame.sf_uc.uc_mcontext, &frame.sf_uc.uc_flags);
586 	/* Copyout all the fp regs, the signal handler might expect them. */
587 	error = copyout(&frame, fp, sizeof frame);
588 	mutex_enter(p->p_lock);
589 
590 	if (error != 0) {
591 		/*
592 		 * Process has trashed its stack; give it an illegal
593 		 * instruction to halt it in its tracks.
594 		 */
595 		sigexit(l, SIGILL);
596 		/* NOTREACHED */
597 	}
598 
599 	buildcontext(l, catcher, fp);
600 
601 	tf->tf_rdi = sig;
602 	tf->tf_rsi = (uint64_t)&fp->sf_si;
603 	tf->tf_rdx = tf->tf_r15 = (uint64_t)&fp->sf_uc;
604 
605 	/* Remember that we're now on the signal stack. */
606 	if (onstack)
607 		l->l_sigstk.ss_flags |= SS_ONSTACK;
608 
609 	if ((vaddr_t)catcher >= VM_MAXUSER_ADDRESS) {
610 		/*
611 		 * The process has given an invalid address for the
612 		 * handler. Stop it, but only after building the frame
613 		 * above, so the right info reaches userland (or the core dump).
614 		 */
615 		sigexit(l, SIGILL);
616 		/* NOTREACHED */
617 	}
618 }
619 
620 struct pcb dumppcb;
621 
622 void
623 cpu_reboot(int howto, char *bootstr)
624 {
625 	static bool syncdone = false;
626 	int s = IPL_NONE;
627 	__USE(s);	/* ugly otherwise */
628 
629 	if (cold) {
630 		howto |= RB_HALT;
631 		goto haltsys;
632 	}
633 
634 	boothowto = howto;
635 
636 	/* i386 maybe_dump() */
637 
638 	/*
639 	 * If we've panic'd, don't make the situation potentially
640 	 * worse by syncing or unmounting the file systems.
641 	 */
642 	if ((howto & RB_NOSYNC) == 0 && panicstr == NULL) {
643 		if (!syncdone) {
644 			syncdone = true;
645 			/* XXX used to force unmount as well, here */
646 			vfs_sync_all(curlwp);
647 			/*
648 			 * If we've been adjusting the clock, the todr
649 			 * will be out of synch; adjust it now.
650 			 *
651 			 * XXX used to do this after unmounting all
652 			 * filesystems with vfs_shutdown().
653 			 */
654 			if (time_adjusted != 0)
655 				resettodr();
656 		}
657 
658 		while (vfs_unmountall1(curlwp, false, false) ||
659 		       config_detach_all(boothowto) ||
660 		       vfs_unmount_forceone(curlwp))
661 			;	/* do nothing */
662 	} else
663 		suspendsched();
664 
665 	pmf_system_shutdown(boothowto);
666 
667 	/* Disable interrupts. */
668 	s = splhigh();
669 
670 	/* Do a dump if requested. */
671 	if ((howto & (RB_DUMP | RB_HALT)) == RB_DUMP)
672 		dumpsys();
673 
674 haltsys:
675 	doshutdownhooks();
676 
677 	if ((howto & RB_POWERDOWN) == RB_POWERDOWN) {
678 #if NACPICA > 0
679 		if (s != IPL_NONE)
680 			splx(s);
681 
682 		acpi_enter_sleep_state(ACPI_STATE_S5);
683 #endif
684 #ifdef XEN
685 		HYPERVISOR_shutdown();
686 #endif /* XEN */
687 	}
688 
689 	cpu_broadcast_halt();
690 
691 	if (howto & RB_HALT) {
692 #if NACPICA > 0
693 		acpi_disable();
694 #endif
695 
696 		printf("\n");
697 		printf("The operating system has halted.\n");
698 		printf("Please press any key to reboot.\n\n");
699 		cnpollc(1);	/* for proper keyboard command handling */
700 		if (cngetc() == 0) {
701 			/* no console attached, so just hlt */
702 			printf("No keyboard - cannot reboot after all.\n");
703 			for(;;) {
704 				x86_hlt();
705 			}
706 		}
707 		cnpollc(0);
708 	}
709 
710 	printf("rebooting...\n");
711 	if (cpureset_delay > 0)
712 		delay(cpureset_delay * 1000);
713 	cpu_reset();
714 	for(;;) ;
715 	/*NOTREACHED*/
716 }
717 
718 /*
719  * XXXfvdl share dumpcode.
720  */
721 
722 /*
723  * Perform assorted dump-related initialization tasks.  Assumes that
724  * the maximum physical memory address will not increase afterwards.
725  */
726 void
727 dump_misc_init(void)
728 {
729 #ifndef NO_SPARSE_DUMP
730 	int i;
731 #endif
732 
733 	if (dump_headerbuf != NULL)
734 		return; /* already called */
735 
736 #ifndef NO_SPARSE_DUMP
737 	for (i = 0; i < mem_cluster_cnt; ++i) {
738 		paddr_t top = mem_clusters[i].start + mem_clusters[i].size;
739 		if (max_paddr < top)
740 			max_paddr = top;
741 	}
742 #ifdef DEBUG
743 	printf("dump_misc_init: max_paddr = 0x%lx\n",
744 	    (unsigned long)max_paddr);
745 #endif
746 	if (max_paddr == 0) {
747 		printf("Your machine does not initialize mem_clusters; "
748 		    "sparse_dumps disabled\n");
749 		sparse_dump = 0;
750 	} else {
751 		sparse_dump_physmap = (void *)uvm_km_alloc(kernel_map,
752 		    roundup(max_paddr / (PAGE_SIZE * NBBY), PAGE_SIZE),
753 		    PAGE_SIZE, UVM_KMF_WIRED|UVM_KMF_ZERO);
754 	}
755 #endif
756 	dump_headerbuf = (void *)uvm_km_alloc(kernel_map,
757 	    dump_headerbuf_size,
758 	    PAGE_SIZE, UVM_KMF_WIRED|UVM_KMF_ZERO);
759 	/* XXXjld should check for failure here, disable dumps if so. */
760 }
761 
762 #ifndef NO_SPARSE_DUMP
763 /*
764  * Clear the set of pages to include in a sparse dump.
765  */
766 void
767 sparse_dump_reset(void)
768 {
769 	memset(sparse_dump_physmap, 0,
770 	    roundup(max_paddr / (PAGE_SIZE * NBBY), PAGE_SIZE));
771 }
772 
773 /*
774  * Include or exclude pages in a sparse dump.
775  */
776 void
777 sparse_dump_mark(void)
778 {
779 	paddr_t p, pstart, pend;
780 	struct vm_page *pg;
781 	int i;
782 
783 	/*
784 	 * Mark all memory pages, then unmark pages that are uninteresting.
785 	 * Dereferencing pg->uobject might crash again if another CPU
786 	 * frees the object out from under us, but we can't lock anything
787 	 * so it's a risk we have to take.
788 	 */
789 
790 	for (i = 0; i < mem_cluster_cnt; ++i) {
791 		pstart = mem_clusters[i].start / PAGE_SIZE;
792 		pend = pstart + mem_clusters[i].size / PAGE_SIZE;
793 
794 		for (p = pstart; p < pend; p++) {
795 			setbit(sparse_dump_physmap, p);
796 		}
797 	}
798 	for (i = 0; i < vm_nphysseg; i++) {
799 		struct vm_physseg *seg = VM_PHYSMEM_PTR(i);
800 
801 		for (pg = seg->pgs; pg < seg->lastpg; pg++) {
802 			if (pg->uanon || (pg->pqflags & PQ_FREE) ||
803 			    (pg->uobject && pg->uobject->pgops)) {
804 				p = VM_PAGE_TO_PHYS(pg) / PAGE_SIZE;
805 				clrbit(sparse_dump_physmap, p);
806 			}
807 		}
808 	}
809 }
810 
811 /*
812  * Machine-dependently decides on the contents of a sparse dump, using
813  * the above.
814  */
815 void
816 cpu_dump_prep_sparse(void)
817 {
818 	sparse_dump_reset();
819 	/* XXX could the alternate recursive page table be skipped? */
820 	sparse_dump_mark();
821 	/* Memory for I/O buffers could be unmarked here, for example. */
822 	/* The kernel text could also be unmarked, but gdb would be upset. */
823 }
824 #endif
825 
826 /*
827  * Abstractly iterate over the collection of memory segments to be
828  * dumped; the callback lacks the customary environment-pointer
829  * argument because none of the current users really need one.
830  *
831  * To be used only after dump_seg_prep is called to set things up.
832  */
833 int
834 dump_seg_iter(int (*callback)(paddr_t, paddr_t))
835 {
836 	int error, i;
837 
838 #define CALLBACK(start,size) do {     \
839 	error = callback(start,size); \
840 	if (error)                    \
841 		return error;         \
842 } while(0)
843 
844 	for (i = 0; i < mem_cluster_cnt; ++i) {
845 #ifndef NO_SPARSE_DUMP
846 		/*
847 		 * The bitmap is scanned within each memory segment,
848 		 * rather than over its entire domain, in case any
849 		 * pages outside of the memory proper have been mapped
850 		 * into kva; they might be devices that wouldn't
851 		 * appreciate being arbitrarily read, and including
852 		 * them could also break the assumption that a sparse
853 		 * dump will always be smaller than a full one.
854 		 */
855 		if (sparse_dump && sparse_dump_physmap) {
856 			paddr_t p, start, end;
857 			int lastset;
858 
859 			start = mem_clusters[i].start;
860 			end = start + mem_clusters[i].size;
861 			start = rounddown(start, PAGE_SIZE); /* unnecessary? */
862 			lastset = 0;
863 			for (p = start; p < end; p += PAGE_SIZE) {
864 				int thisset = isset(sparse_dump_physmap,
865 				    p/PAGE_SIZE);
866 
867 				if (!lastset && thisset)
868 					start = p;
869 				if (lastset && !thisset)
870 					CALLBACK(start, p - start);
871 				lastset = thisset;
872 			}
873 			if (lastset)
874 				CALLBACK(start, p - start);
875 		} else
876 #endif
877 			CALLBACK(mem_clusters[i].start, mem_clusters[i].size);
878 	}
879 	return 0;
880 #undef CALLBACK
881 }
882 
883 /*
884  * Prepare for an impending core dump: decide what's being dumped and
885  * how much space it will take up.
886  */
887 void
888 dump_seg_prep(void)
889 {
890 #ifndef NO_SPARSE_DUMP
891 	if (sparse_dump && sparse_dump_physmap)
892 		cpu_dump_prep_sparse();
893 #endif
894 
895 	dump_nmemsegs = 0;
896 	dump_npages = 0;
897 	dump_seg_iter(dump_seg_count_range);
898 
899 	dump_header_size = ALIGN(sizeof(kcore_seg_t)) +
900 	    ALIGN(sizeof(cpu_kcore_hdr_t)) +
901 	    ALIGN(dump_nmemsegs * sizeof(phys_ram_seg_t));
902 	dump_header_size = roundup(dump_header_size, dbtob(1));
903 
904 	/*
905 	 * savecore(8) will read this to decide how many pages to
906 	 * copy, and cpu_dumpconf has already used the pessimistic
907 	 * value to set dumplo, so it's time to tell the truth.
908 	 */
909 	dumpsize = dump_npages; /* XXX could these just be one variable? */
910 }
911 
912 int
913 dump_seg_count_range(paddr_t start, paddr_t size)
914 {
915 	++dump_nmemsegs;
916 	dump_npages += size / PAGE_SIZE;
917 	return 0;
918 }
919 
920 /*
921  * A sparse dump's header may be rather large, due to the number of
922  * "segments" emitted.  These routines manage a simple output buffer,
923  * so that the header can be written to disk incrementally.
924  */
925 void
926 dump_header_start(void)
927 {
928 	dump_headerbuf_ptr = dump_headerbuf;
929 	dump_header_blkno = dumplo;
930 }
931 
932 int
933 dump_header_flush(void)
934 {
935 	const struct bdevsw *bdev;
936 	size_t to_write;
937 	int error;
938 
939 	bdev = bdevsw_lookup(dumpdev);
940 	to_write = roundup(dump_headerbuf_ptr - dump_headerbuf, dbtob(1));
941 	error = bdev->d_dump(dumpdev, dump_header_blkno,
942 	    dump_headerbuf, to_write);
943 	dump_header_blkno += btodb(to_write);
944 	dump_headerbuf_ptr = dump_headerbuf;
945 	return error;
946 }
947 
948 int
949 dump_header_addbytes(const void* vptr, size_t n)
950 {
951 	const char* ptr = vptr;
952 	int error;
953 
954 	while (n > dump_headerbuf_avail) {
955 		memcpy(dump_headerbuf_ptr, ptr, dump_headerbuf_avail);
956 		ptr += dump_headerbuf_avail;
957 		n -= dump_headerbuf_avail;
958 		dump_headerbuf_ptr = dump_headerbuf_end;
959 		error = dump_header_flush();
960 		if (error)
961 			return error;
962 	}
963 	memcpy(dump_headerbuf_ptr, ptr, n);
964 	dump_headerbuf_ptr += n;
965 
966 	return 0;
967 }
968 
969 int
970 dump_header_addseg(paddr_t start, paddr_t size)
971 {
972 	phys_ram_seg_t seg = { start, size };
973 
974 	return dump_header_addbytes(&seg, sizeof(seg));
975 }
976 
977 int
978 dump_header_finish(void)
979 {
980 	memset(dump_headerbuf_ptr, 0, dump_headerbuf_avail);
981 	return dump_header_flush();
982 }
983 
984 
985 /*
986  * These variables are needed by /sbin/savecore
987  */
988 uint32_t	dumpmag = 0x8fca0101;	/* magic number */
989 int 	dumpsize = 0;		/* pages */
990 long	dumplo = 0; 		/* blocks */
991 
992 /*
993  * cpu_dumpsize: calculate size of machine-dependent kernel core dump headers
994  * for a full (non-sparse) dump.
995  */
996 int
997 cpu_dumpsize(void)
998 {
999 	int size;
1000 
1001 	size = ALIGN(sizeof(kcore_seg_t)) + ALIGN(sizeof(cpu_kcore_hdr_t)) +
1002 	    ALIGN(mem_cluster_cnt * sizeof(phys_ram_seg_t));
1003 	if (roundup(size, dbtob(1)) != dbtob(1))
1004 		return (-1);
1005 
1006 	return (1);
1007 }
1008 
1009 /*
1010  * cpu_dump_mempagecnt: calculate the size of RAM (in pages) to be dumped
1011  * for a full (non-sparse) dump.
1012  */
1013 u_long
1014 cpu_dump_mempagecnt(void)
1015 {
1016 	u_long i, n;
1017 
1018 	n = 0;
1019 	for (i = 0; i < mem_cluster_cnt; i++)
1020 		n += atop(mem_clusters[i].size);
1021 	return (n);
1022 }
1023 
1024 /*
1025  * cpu_dump: dump the machine-dependent kernel core dump headers.
1026  */
1027 int
1028 cpu_dump(void)
1029 {
1030 	kcore_seg_t seg;
1031 	cpu_kcore_hdr_t cpuhdr;
1032 	const struct bdevsw *bdev;
1033 
1034 	bdev = bdevsw_lookup(dumpdev);
1035 	if (bdev == NULL)
1036 		return (ENXIO);
1037 
1038 	/*
1039 	 * Generate a segment header.
1040 	 */
1041 	CORE_SETMAGIC(seg, KCORE_MAGIC, MID_MACHINE, CORE_CPU);
1042 	seg.c_size = dump_header_size - ALIGN(sizeof(seg));
1043 	(void)dump_header_addbytes(&seg, ALIGN(sizeof(seg)));
1044 
1045 	/*
1046 	 * Add the machine-dependent header info.
1047 	 */
1048 	cpuhdr.ptdpaddr = PDPpaddr;
1049 	cpuhdr.nmemsegs = dump_nmemsegs;
1050 	(void)dump_header_addbytes(&cpuhdr, ALIGN(sizeof(cpuhdr)));
1051 
1052 	/*
1053 	 * Write out the memory segment descriptors.
1054 	 */
1055 	return dump_seg_iter(dump_header_addseg);
1056 }
1057 
1058 /*
1059  * Doadump comes here after turning off memory management and
1060  * getting on the dump stack, either when called above, or by
1061  * the auto-restart code.
1062  */
1063 #define BYTES_PER_DUMP  PAGE_SIZE /* must be a multiple of pagesize XXX small */
1064 static vaddr_t dumpspace;
1065 
1066 vaddr_t
1067 reserve_dumppages(vaddr_t p)
1068 {
1069 
1070 	dumpspace = p;
1071 	return (p + BYTES_PER_DUMP);
1072 }
1073 
1074 int
1075 dumpsys_seg(paddr_t maddr, paddr_t bytes)
1076 {
1077 	u_long i, m, n;
1078 	daddr_t blkno;
1079 	const struct bdevsw *bdev;
1080 	int (*dump)(dev_t, daddr_t, void *, size_t);
1081 	int error;
1082 
1083 	if (dumpdev == NODEV)
1084 		return ENODEV;
1085 	bdev = bdevsw_lookup(dumpdev);
1086 	if (bdev == NULL || bdev->d_psize == NULL)
1087 		return ENODEV;
1088 
1089 	dump = bdev->d_dump;
1090 
1091 	blkno = dump_header_blkno;
1092 	for (i = 0; i < bytes; i += n, dump_totalbytesleft -= n) {
1093 		/* Print out how many MBs we have left to go. */
1094 		if ((dump_totalbytesleft % (1024*1024)) == 0)
1095 			printf_nolog("%lu ", (unsigned long)
1096 			    (dump_totalbytesleft / (1024 * 1024)));
1097 
1098 		/* Limit size for next transfer. */
1099 		n = bytes - i;
1100 		if (n > BYTES_PER_DUMP)
1101 			n = BYTES_PER_DUMP;
1102 
1103 		for (m = 0; m < n; m += NBPG)
1104 			pmap_kenter_pa(dumpspace + m, maddr + m,
1105 			    VM_PROT_READ, 0);
1106 		pmap_update(pmap_kernel());
1107 
1108 		error = (*dump)(dumpdev, blkno, (void *)dumpspace, n);
1109 		pmap_kremove_local(dumpspace, n);
1110 		if (error)
1111 			return error;
1112 		maddr += n;
1113 		blkno += btodb(n);		/* XXX? */
1114 
1115 #if 0	/* XXX this doesn't work.  grr. */
1116 		/* operator aborting dump? */
1117 		if (sget() != NULL)
1118 			return EINTR;
1119 #endif
1120 	}
1121 	dump_header_blkno = blkno;
1122 
1123 	return 0;
1124 }
1125 
1126 void
1127 dodumpsys(void)
1128 {
1129 	const struct bdevsw *bdev;
1130 	int dumpend, psize;
1131 	int error;
1132 
1133 	if (dumpdev == NODEV)
1134 		return;
1135 
1136 	bdev = bdevsw_lookup(dumpdev);
1137 	if (bdev == NULL || bdev->d_psize == NULL)
1138 		return;
1139 	/*
1140 	 * For dumps during autoconfiguration,
1141 	 * if the dump device has already been configured...
1142 	 */
1143 	if (dumpsize == 0)
1144 		cpu_dumpconf();
1145 
1146 	printf("\ndumping to dev %llu,%llu (offset=%ld, size=%d):",
1147 	    (unsigned long long)major(dumpdev),
1148 	    (unsigned long long)minor(dumpdev), dumplo, dumpsize);
1149 
1150 	if (dumplo <= 0 || dumpsize <= 0) {
1151 		printf(" not possible\n");
1152 		return;
1153 	}
1154 
1155 	psize = bdev_size(dumpdev);
1156 	printf("\ndump ");
1157 	if (psize == -1) {
1158 		printf("area unavailable\n");
1159 		return;
1160 	}
1161 
1162 #if 0	/* XXX this doesn't work.  grr. */
1163 	/* toss any characters present prior to dump */
1164 	while (sget() != NULL); /*syscons and pccons differ */
1165 #endif
1166 
1167 	dump_seg_prep();
1168 	dumpend = dumplo + btodb(dump_header_size) + ctod(dump_npages);
1169 	if (dumpend > psize) {
1170 		printf("failed: insufficient space (%d < %d)\n",
1171 		    psize, dumpend);
1172 		goto failed;
1173 	}
1174 
1175 	dump_header_start();
1176 	if ((error = cpu_dump()) != 0)
1177 		goto err;
1178 	if ((error = dump_header_finish()) != 0)
1179 		goto err;
1180 
1181 	if (dump_header_blkno != dumplo + btodb(dump_header_size)) {
1182 		printf("BAD header size (%ld [written] != %ld [expected])\n",
1183 		    (long)(dump_header_blkno - dumplo),
1184 		    (long)btodb(dump_header_size));
1185 		goto failed;
1186 	}
1187 
1188 	dump_totalbytesleft = roundup(ptoa(dump_npages), BYTES_PER_DUMP);
1189 	error = dump_seg_iter(dumpsys_seg);
1190 
1191 	if (error == 0 && dump_header_blkno != dumpend) {
1192 		printf("BAD dump size (%ld [written] != %ld [expected])\n",
1193 		    (long)(dumpend - dumplo),
1194 		    (long)(dump_header_blkno - dumplo));
1195 		goto failed;
1196 	}
1197 
1198 err:
1199 	switch (error) {
1200 
1201 	case ENXIO:
1202 		printf("device bad\n");
1203 		break;
1204 
1205 	case EFAULT:
1206 		printf("device not ready\n");
1207 		break;
1208 
1209 	case EINVAL:
1210 		printf("area improper\n");
1211 		break;
1212 
1213 	case EIO:
1214 		printf("i/o error\n");
1215 		break;
1216 
1217 	case EINTR:
1218 		printf("aborted from console\n");
1219 		break;
1220 
1221 	case 0:
1222 		printf("succeeded\n");
1223 		break;
1224 
1225 	default:
1226 		printf("error %d\n", error);
1227 		break;
1228 	}
1229 failed:
1230 	printf("\n\n");
1231 	delay(5000000);		/* 5 seconds */
1232 }
1233 
1234 /*
1235  * This is called by main to set dumplo and dumpsize.
1236  * Dumps always skip the first PAGE_SIZE of disk space
1237  * in case there might be a disk label stored there.
1238  * If there is extra space, put dump at the end to
1239  * reduce the chance that swapping trashes it.
1240  *
1241  * Sparse dumps can't be placed as close to the end as possible, because
1242  * savecore(8) has to know where to start reading in the dump device
1243  * before it has access to any of the crashed system's state.
1244  *
1245  * Note also that a sparse dump will never be larger than a full one:
1246  * in order to add a phys_ram_seg_t to the header, at least one page
1247  * must be removed.
1248  */
1249 void
1250 cpu_dumpconf(void)
1251 {
1252 	int nblks, dumpblks;	/* size of dump area */
1253 
1254 	if (dumpdev == NODEV)
1255 		goto bad;
1256 	nblks = bdev_size(dumpdev);
1257 	if (nblks <= ctod(1))
1258 		goto bad;
1259 
1260 	dumpblks = cpu_dumpsize();
1261 	if (dumpblks < 0)
1262 		goto bad;
1263 
1264 	/* dumpsize is in page units, and doesn't include headers. */
1265 	dumpsize = cpu_dump_mempagecnt();
1266 
1267 	dumpblks += ctod(dumpsize);
1268 
1269 	/* If dump won't fit (incl. room for possible label), punt. */
1270 	if (dumpblks > (nblks - ctod(1))) {
1271 #ifndef NO_SPARSE_DUMP
1272 		/* A sparse dump might (and hopefully will) fit. */
1273 		dumplo = ctod(1);
1274 #else
1275 		/* But if we're not configured for that, punt. */
1276 		goto bad;
1277 #endif
1278 	} else {
1279 		/* Put dump at end of partition */
1280 		dumplo = nblks - dumpblks;
1281 	}
1282 
1283 
1284 	/* Now that we've decided this will work, init ancillary stuff. */
1285 	dump_misc_init();
1286 	return;
1287 
1288  bad:
1289 	dumpsize = 0;
1290 }
1291 
1292 /*
1293  * Clear registers on exec
1294  */
1295 void
1296 setregs(struct lwp *l, struct exec_package *pack, vaddr_t stack)
1297 {
1298 	struct pcb *pcb = lwp_getpcb(l);
1299 	struct trapframe *tf;
1300 
1301 #ifdef USER_LDT
1302 	pmap_ldt_cleanup(l);
1303 #endif
1304 
1305 	fpu_save_area_clear(l, pack->ep_osversion >= 699002600
1306 	    ? __NetBSD_NPXCW__ : __NetBSD_COMPAT_NPXCW__);
1307 	pcb->pcb_flags = 0;
1308 
1309 	l->l_proc->p_flag &= ~PK_32;
1310 
1311 	tf = l->l_md.md_regs;
1312 	tf->tf_ds = LSEL(LUDATA_SEL, SEL_UPL);
1313 	tf->tf_es = LSEL(LUDATA_SEL, SEL_UPL);
1314 	cpu_fsgs_zero(l);
1315 	tf->tf_rdi = 0;
1316 	tf->tf_rsi = 0;
1317 	tf->tf_rbp = 0;
1318 	tf->tf_rbx = l->l_proc->p_psstrp;
1319 	tf->tf_rdx = 0;
1320 	tf->tf_rcx = 0;
1321 	tf->tf_rax = 0;
1322 	tf->tf_rip = pack->ep_entry;
1323 	tf->tf_cs = LSEL(LUCODE_SEL, SEL_UPL);
1324 	tf->tf_rflags = PSL_USERSET;
1325 	tf->tf_rsp = stack;
1326 	tf->tf_ss = LSEL(LUDATA_SEL, SEL_UPL);
1327 }
1328 
1329 /*
1330  * Initialize segments and descriptor tables
1331  */
1332 
1333 #ifdef XEN
1334 struct trap_info *xen_idt;
1335 int xen_idt_idx;
1336 #endif
1337 char *ldtstore;
1338 char *gdtstore;
1339 
1340 void
1341 setgate(struct gate_descriptor *gd, void *func, int ist, int type, int dpl, int sel)
1342 {
1343 
1344 	kpreempt_disable();
1345 	pmap_changeprot_local(idt_vaddr, VM_PROT_READ|VM_PROT_WRITE);
1346 
1347 	gd->gd_looffset = (uint64_t)func & 0xffff;
1348 	gd->gd_selector = sel;
1349 	gd->gd_ist = ist;
1350 	gd->gd_type = type;
1351 	gd->gd_dpl = dpl;
1352 	gd->gd_p = 1;
1353 	gd->gd_hioffset = (uint64_t)func >> 16;
1354 	gd->gd_zero = 0;
1355 	gd->gd_xx1 = 0;
1356 	gd->gd_xx2 = 0;
1357 	gd->gd_xx3 = 0;
1358 
1359 	pmap_changeprot_local(idt_vaddr, VM_PROT_READ);
1360 	kpreempt_enable();
1361 }
1362 
1363 void
1364 unsetgate(struct gate_descriptor *gd)
1365 {
1366 
1367 	kpreempt_disable();
1368 	pmap_changeprot_local(idt_vaddr, VM_PROT_READ|VM_PROT_WRITE);
1369 
1370 	memset(gd, 0, sizeof (*gd));
1371 
1372 	pmap_changeprot_local(idt_vaddr, VM_PROT_READ);
1373 	kpreempt_enable();
1374 }
1375 
1376 void
1377 setregion(struct region_descriptor *rd, void *base, uint16_t limit)
1378 {
1379 	rd->rd_limit = limit;
1380 	rd->rd_base = (uint64_t)base;
1381 }
1382 
1383 /*
1384  * Note that the base and limit fields are ignored in long mode.
1385  */
1386 void
1387 set_mem_segment(struct mem_segment_descriptor *sd, void *base, size_t limit,
1388 	int type, int dpl, int gran, int def32, int is64)
1389 {
1390 	sd->sd_lolimit = (unsigned)limit;
1391 	sd->sd_lobase = (unsigned long)base;
1392 	sd->sd_type = type;
1393 	sd->sd_dpl = dpl;
1394 	sd->sd_p = 1;
1395 	sd->sd_hilimit = (unsigned)limit >> 16;
1396 	sd->sd_avl = 0;
1397 	sd->sd_long = is64;
1398 	sd->sd_def32 = def32;
1399 	sd->sd_gran = gran;
1400 	sd->sd_hibase = (unsigned long)base >> 24;
1401 }
1402 
1403 void
1404 set_sys_segment(struct sys_segment_descriptor *sd, void *base, size_t limit,
1405 	int type, int dpl, int gran)
1406 {
1407 	memset(sd, 0, sizeof *sd);
1408 	sd->sd_lolimit = (unsigned)limit;
1409 	sd->sd_lobase = (uint64_t)base;
1410 	sd->sd_type = type;
1411 	sd->sd_dpl = dpl;
1412 	sd->sd_p = 1;
1413 	sd->sd_hilimit = (unsigned)limit >> 16;
1414 	sd->sd_gran = gran;
1415 	sd->sd_hibase = (uint64_t)base >> 24;
1416 }
1417 
1418 void
1419 cpu_init_idt(void)
1420 {
1421 #ifndef XEN
1422 	struct region_descriptor region;
1423 
1424 	setregion(&region, idt, NIDT * sizeof(idt[0]) - 1);
1425 	lidt(&region);
1426 #else
1427 	if (HYPERVISOR_set_trap_table(xen_idt))
1428 		panic("HYPERVISOR_set_trap_table() failed");
1429 #endif
1430 }
1431 
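/* Exception and interrupt entry points in locore are named with an "X" prefix; IDTVEC() forms that name. */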
1432 #define	IDTVEC(name)	__CONCAT(X, name)
1433 typedef void (vector)(void);
1434 extern vector IDTVEC(syscall);
1435 extern vector IDTVEC(syscall32);
1436 extern vector IDTVEC(osyscall);
1437 extern vector IDTVEC(oosyscall);
1438 extern vector *IDTVEC(exceptions)[];
1439 
1440 static void
1441 init_x86_64_msgbuf(void)
1442 {
1443 	/* Message buffer is located at end of core. */
1444 	struct vm_physseg *vps;
1445 	psize_t sz = round_page(MSGBUFSIZE);
1446 	psize_t reqsz = sz;
1447 	int x;
1448 
1449  search_again:
1450 	vps = NULL;
1451 
1452 	for (x = 0; x < vm_nphysseg; x++) {
1453 		vps = VM_PHYSMEM_PTR(x);
1454 		if (ctob(vps->avail_end) == avail_end)
1455 			break;
1456 	}
1457 	if (x == vm_nphysseg)
1458 		panic("init_x86_64: can't find end of memory");
1459 
1460 	/* Shrink so it'll fit in the last segment. */
1461 	if ((vps->avail_end - vps->avail_start) < atop(sz))
1462 		sz = ctob(vps->avail_end - vps->avail_start);
1463 
1464 	vps->avail_end -= atop(sz);
1465 	vps->end -= atop(sz);
1466 	msgbuf_p_seg[msgbuf_p_cnt].sz = sz;
1467 	msgbuf_p_seg[msgbuf_p_cnt++].paddr = ctob(vps->avail_end);
1468 
1469 	/* Remove the last segment if it now has no pages. */
1470 	if (vps->start == vps->end) {
1471 		for (vm_nphysseg--; x < vm_nphysseg; x++)
1472 			VM_PHYSMEM_PTR_SWAP(x, x + 1);
1473 	}
1474 
1475 	/* Now find where the new avail_end is. */
1476 	for (avail_end = 0, x = 0; x < vm_nphysseg; x++)
1477 		if (VM_PHYSMEM_PTR(x)->avail_end > avail_end)
1478 			avail_end = VM_PHYSMEM_PTR(x)->avail_end;
1479 	avail_end = ctob(avail_end);
1480 
1481 	if (sz == reqsz)
1482 		return;
1483 
1484 	reqsz -= sz;
1485 	if (msgbuf_p_cnt == VM_PHYSSEG_MAX) {
1486 		/* No more segments available, bail out. */
1487 		printf("WARNING: MSGBUFSIZE (%zu) too large, using %zu.\n",
1488 		    (size_t)MSGBUFSIZE, (size_t)(MSGBUFSIZE - reqsz));
1489 		return;
1490 	}
1491 
1492 	sz = reqsz;
1493 	goto search_again;
1494 }
1495 
1496 static void
1497 init_x86_64_ksyms(void)
1498 {
1499 #if NKSYMS || defined(DDB) || defined(MODULAR)
1500 	extern int end;
1501 	extern int *esym;
1502 #ifndef XEN
1503 	struct btinfo_symtab *symtab;
1504 	vaddr_t tssym, tesym;
1505 #endif
1506 
1507 #ifdef DDB
1508 	db_machine_init();
1509 #endif
1510 
1511 #ifndef XEN
1512 	symtab = lookup_bootinfo(BTINFO_SYMTAB);
1513 	if (symtab) {
1514 		tssym = (vaddr_t)symtab->ssym + KERNBASE;
1515 		tesym = (vaddr_t)symtab->esym + KERNBASE;
1516 		ksyms_addsyms_elf(symtab->nsym, (void *)tssym, (void *)tesym);
1517 	} else
1518 		ksyms_addsyms_elf(*(long *)(void *)&end,
1519 		    ((long *)(void *)&end) + 1, esym);
1520 #else  /* XEN */
1521 	esym = xen_start_info.mod_start ?
1522 	    (void *)xen_start_info.mod_start :
1523 	    (void *)xen_start_info.mfn_list;
1524 	ksyms_addsyms_elf(*(int *)(void *)&end,
1525 	    ((int *)(void *)&end) + 1, esym);
1526 #endif /* XEN */
1527 #endif
1528 }
1529 
1530 void
1531 init_x86_64(paddr_t first_avail)
1532 {
1533 	extern void consinit(void);
1534 	struct region_descriptor region;
1535 	struct mem_segment_descriptor *ldt_segp;
1536 	int x;
1537 #ifndef XEN
1538 	int ist;
1539 #endif /* !XEN */
1540 
1541 #ifdef XEN
1542 	KASSERT(HYPERVISOR_shared_info != NULL);
1543 	cpu_info_primary.ci_vcpu = &HYPERVISOR_shared_info->vcpu_info[0];
1544 
1545 	__PRINTK(("init_x86_64(0x%lx)\n", first_avail));
1546 #endif /* XEN */
1547 
1548 	cpu_probe(&cpu_info_primary);
1549 	cpu_init_msrs(&cpu_info_primary, true);
1550 
1551 	use_pae = 1; /* PAE always enabled in long mode */
1552 
1553 #ifdef XEN
1554 	struct pcb *pcb = lwp_getpcb(&lwp0);
1555 	mutex_init(&pte_lock, MUTEX_DEFAULT, IPL_VM);
1556 	pcb->pcb_cr3 = xen_start_info.pt_base - KERNBASE;
1557 	__PRINTK(("pcb_cr3 0x%lx\n", xen_start_info.pt_base - KERNBASE));
1558 #endif
1559 
1560 #if NISA > 0 || NPCI > 0
1561 	x86_bus_space_init();
1562 #endif
1563 
1564 	consinit();	/* XXX SHOULD NOT BE DONE HERE */
1565 
1566 	/*
1567 	 * Initialize PAGE_SIZE-dependent variables.
1568 	 */
1569 	uvm_setpagesize();
1570 
1571 	uvmexp.ncolors = 2;
1572 
1573 #ifndef XEN
1574 	/*
1575 	 * Low memory reservations:
1576 	 * Page 0:	BIOS data
1577 	 * Page 1:	BIOS callback (not used yet, for symmetry with i386)
1578 	 * Page 2:	MP bootstrap code (MP_TRAMPOLINE)
1579 	 * Page 3:	ACPI wakeup code (ACPI_WAKEUP_ADDR)
1580 	 * Page 4:	Temporary page table for 0MB-4MB
1581 	 * Page 5:	Temporary page directory
1582 	 * Page 6:	Temporary page map level 3
1583 	 * Page 7:	Temporary page map level 4
1584 	 */
1585 	avail_start = 8 * PAGE_SIZE;
1586 
1587 	/* Initialize the memory clusters (needed in pmap_bootstrap). */
1588 	init_x86_clusters();
1589 #else	/* XEN */
1590 	/* Parse Xen command line (replace bootinfo) */
1591 	xen_parse_cmdline(XEN_PARSE_BOOTFLAGS, NULL);
1592 
1593 	/* Determine physical address space */
1594 	avail_start = first_avail;
1595 	avail_end = ctob(xen_start_info.nr_pages);
1596 	pmap_pa_start = (KERNTEXTOFF - KERNBASE);
1597 	pmap_pa_end = avail_end;
1598 	__PRINTK(("pmap_pa_start 0x%lx avail_start 0x%lx avail_end 0x%lx\n",
1599 	    pmap_pa_start, avail_start, avail_end));
1600 #endif	/* !XEN */
1601 
1602 	/*
1603 	 * Call pmap initialization to make new kernel address space.
1604 	 * We must do this before loading pages into the VM system.
1605 	 */
1606 	pmap_bootstrap(VM_MIN_KERNEL_ADDRESS);
1607 
1608 #ifndef XEN
1609 	/* Internalize the physical pages into the VM system. */
1610 	init_x86_vm(first_avail);
1611 #else	/* XEN */
1612 	kern_end = KERNBASE + first_avail;
1613 	physmem = xen_start_info.nr_pages;
1614 
1615 	uvm_page_physload(atop(avail_start),
1616 		atop(avail_end), atop(avail_start),
1617 		atop(avail_end), VM_FREELIST_DEFAULT);
1618 #endif	/* !XEN */
1619 
1620 	init_x86_64_msgbuf();
1621 
1622 	pmap_growkernel(VM_MIN_KERNEL_ADDRESS + 32 * 1024 * 1024);
1623 
1624 	kpreempt_disable();
1625 
1626 	pmap_kenter_pa(idt_vaddr, idt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
1627 	pmap_update(pmap_kernel());
1628 	memset((void *)idt_vaddr, 0, PAGE_SIZE);
1629 
1630 #ifndef XEN
1631 	pmap_changeprot_local(idt_vaddr, VM_PROT_READ);
1632 #endif
1633 	pmap_kenter_pa(idt_vaddr + PAGE_SIZE, idt_paddr + PAGE_SIZE,
1634 	    VM_PROT_READ|VM_PROT_WRITE, 0);
1635 #ifdef XEN
1636 	/* Steal one more page for LDT */
1637 	pmap_kenter_pa(idt_vaddr + 2 * PAGE_SIZE, idt_paddr + 2 * PAGE_SIZE,
1638 	    VM_PROT_READ|VM_PROT_WRITE, 0);
1639 #endif
1640 	pmap_update(pmap_kernel());
1641 
1642 #ifndef XEN
1643 	idt_init();
1644 	idt = (struct gate_descriptor *)idt_vaddr;
1645 	gdtstore = (char *)(idt + NIDT);
1646 	ldtstore = gdtstore + DYNSEL_START;
1647 #else
1648 	xen_idt = (struct trap_info *)idt_vaddr;
1649 	xen_idt_idx = 0;
1650 	/* Xen wants page-aligned GDT/LDT in separate pages */
1651 	ldtstore = (char *) roundup((vaddr_t) (xen_idt + NIDT), PAGE_SIZE);
1652 	gdtstore = (char *) (ldtstore + PAGE_SIZE);
1653 #endif /* XEN */
1654 
1655 	/* make gdt gates and memory segments */
1656 	set_mem_segment(GDT_ADDR_MEM(gdtstore, GCODE_SEL), 0,
1657 	    0xfffff, SDT_MEMERA, SEL_KPL, 1, 0, 1);
1658 
1659 	set_mem_segment(GDT_ADDR_MEM(gdtstore, GDATA_SEL), 0,
1660 	    0xfffff, SDT_MEMRWA, SEL_KPL, 1, 0, 1);
1661 
1662 #ifndef XEN
1663 	set_sys_segment(GDT_ADDR_SYS(gdtstore, GLDT_SEL), ldtstore,
1664 	    LDT_SIZE - 1, SDT_SYSLDT, SEL_KPL, 0);
1665 #endif
1666 
1667 	set_mem_segment(GDT_ADDR_MEM(gdtstore, GUCODE_SEL), 0,
1668 	    x86_btop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMERA, SEL_UPL, 1, 0, 1);
1669 
1670 	set_mem_segment(GDT_ADDR_MEM(gdtstore, GUDATA_SEL), 0,
1671 	    x86_btop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMRWA, SEL_UPL, 1, 0, 1);
1672 
1673 	/* make ldt gates and memory segments */
1674 	setgate((struct gate_descriptor *)(ldtstore + LSYS5CALLS_SEL),
1675 	    &IDTVEC(oosyscall), 0, SDT_SYS386CGT, SEL_UPL,
1676 	    GSEL(GCODE_SEL, SEL_KPL));
1677 	*(struct mem_segment_descriptor *)(ldtstore + LUCODE_SEL) =
1678 	    *GDT_ADDR_MEM(gdtstore, GUCODE_SEL);
1679 	*(struct mem_segment_descriptor *)(ldtstore + LUDATA_SEL) =
1680 	    *GDT_ADDR_MEM(gdtstore, GUDATA_SEL);
1681 
1682 	/*
1683 	 * 32 bit GDT entries.
1684 	 */
1685 	set_mem_segment(GDT_ADDR_MEM(gdtstore, GUCODE32_SEL), 0,
1686 	    x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMERA, SEL_UPL, 1, 1, 0);
1687 
1688 	set_mem_segment(GDT_ADDR_MEM(gdtstore, GUDATA32_SEL), 0,
1689 	    x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0);
1690 
1691 	set_mem_segment(GDT_ADDR_MEM(gdtstore, GUFS_SEL), 0,
1692 	    x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0);
1693 
1694 	set_mem_segment(GDT_ADDR_MEM(gdtstore, GUGS_SEL), 0,
1695 	    x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0);
1696 
1697 	/*
1698 	 * 32 bit LDT entries.
1699 	 */
1700 	ldt_segp = (struct mem_segment_descriptor *)(ldtstore + LUCODE32_SEL);
1701 	set_mem_segment(ldt_segp, 0, x86_btop(VM_MAXUSER_ADDRESS32) - 1,
1702 	    SDT_MEMERA, SEL_UPL, 1, 1, 0);
1703 	ldt_segp = (struct mem_segment_descriptor *)(ldtstore + LUDATA32_SEL);
1704 	set_mem_segment(ldt_segp, 0, x86_btop(VM_MAXUSER_ADDRESS32) - 1,
1705 	    SDT_MEMRWA, SEL_UPL, 1, 1, 0);
1706 
1707 	/*
1708 	 * Other entries.
1709 	 */
1710 	memcpy((struct gate_descriptor *)(ldtstore + LSOL26CALLS_SEL),
1711 	    (struct gate_descriptor *)(ldtstore + LSYS5CALLS_SEL),
1712 	    sizeof (struct gate_descriptor));
1713 	memcpy((struct gate_descriptor *)(ldtstore + LBSDICALLS_SEL),
1714 	    (struct gate_descriptor *)(ldtstore + LSYS5CALLS_SEL),
1715 	    sizeof (struct gate_descriptor));
1716 
1717 	/* exceptions */
1718 	for (x = 0; x < 32; x++) {
1719 #ifndef XEN
1720 		idt_vec_reserve(x);
1721 		switch (x) {
1722 		case 2:	/* NMI */
1723 			ist = 3;
1724 			break;
1725 		case 8:	/* double fault */
1726 			ist = 2;
1727 			break;
1728 		default:
1729 			ist = 0;
1730 			break;
1731 		}
1732 		setgate(&idt[x], IDTVEC(exceptions)[x], ist, SDT_SYS386IGT,
1733 		    (x == 3 || x == 4) ? SEL_UPL : SEL_KPL,
1734 		    GSEL(GCODE_SEL, SEL_KPL));
1735 #else /* XEN */
1736 		pmap_changeprot_local(idt_vaddr, VM_PROT_READ|VM_PROT_WRITE);
1737 		xen_idt[xen_idt_idx].vector = x;
1738 
1739 		switch (x) {
1740 		case 2:  /* NMI */
1741 		case 18: /* MCA */
1742 			TI_SET_IF(&(xen_idt[xen_idt_idx]), 2);
1743 			break;
1744 		case 3:
1745 		case 4:
1746 			xen_idt[xen_idt_idx].flags = SEL_UPL;
1747 			break;
1748 		default:
1749 			xen_idt[xen_idt_idx].flags = SEL_KPL;
1750 			break;
1751 		}
1752 
1753 		xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL);
1754 		xen_idt[xen_idt_idx].address =
1755 		    (unsigned long)IDTVEC(exceptions)[x];
1756 		xen_idt_idx++;
1757 #endif /* XEN */
1758 	}
1759 
1760 	/* new-style interrupt gate for syscalls */
1761 #ifndef XEN
1762 	idt_vec_reserve(128);
1763 	setgate(&idt[128], &IDTVEC(osyscall), 0, SDT_SYS386IGT, SEL_UPL,
1764 	    GSEL(GCODE_SEL, SEL_KPL));
1765 #else
1766 	xen_idt[xen_idt_idx].vector = 128;
1767 	xen_idt[xen_idt_idx].flags = SEL_KPL;
1768 	xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL);
1769 	xen_idt[xen_idt_idx].address =  (unsigned long) &IDTVEC(osyscall);
1770 	xen_idt_idx++;
1771 	pmap_changeprot_local(idt_vaddr, VM_PROT_READ);
1772 #endif /* XEN */
1773 	kpreempt_enable();
1774 
1775 	setregion(&region, gdtstore, DYNSEL_START - 1);
1776 	lgdt(&region);
1777 
1778 #ifdef XEN
1779 	/* Init Xen callbacks and syscall handlers */
1780 	if (HYPERVISOR_set_callbacks(
1781 	    (unsigned long) hypervisor_callback,
1782 	    (unsigned long) failsafe_callback,
1783 	    (unsigned long) Xsyscall))
1784 		panic("HYPERVISOR_set_callbacks() failed");
1785 #endif /* XEN */
1786 	cpu_init_idt();
1787 
1788 	init_x86_64_ksyms();
1789 
1790 #ifndef XEN
1791 	intr_default_setup();
1792 #else
1793 	events_default_setup();
1794 #endif
1795 
1796 	splraise(IPL_HIGH);
1797 	x86_enable_intr();
1798 
1799 #ifdef DDB
1800 	if (boothowto & RB_KDB)
1801 		Debugger();
1802 #endif
1803 #ifdef KGDB
1804 	kgdb_port_init();
1805 	if (boothowto & RB_KDB) {
1806 		kgdb_debug_init = 1;
1807 		kgdb_connect(1);
1808 	}
1809 #endif
1810 }
1811 
1812 void
1813 cpu_reset(void)
1814 {
1815 	x86_disable_intr();
1816 
1817 #ifdef XEN
1818 	HYPERVISOR_reboot();
1819 #else
1820 
1821 	x86_reset();
1822 
1823 	/*
1824 	 * Try to cause a triple fault and watchdog reset by making the IDT
1825 	 * invalid and causing a fault.
1826 	 */
1827 	kpreempt_disable();
1828 	pmap_changeprot_local(idt_vaddr, VM_PROT_READ|VM_PROT_WRITE);
1829 	pmap_changeprot_local(idt_vaddr + PAGE_SIZE,
1830 	    VM_PROT_READ|VM_PROT_WRITE);
1831 	memset((void *)idt, 0, NIDT * sizeof(idt[0]));
1832 	kpreempt_enable();
1833 	breakpoint();
1834 
1835 #if 0
1836 	/*
1837 	 * Try to cause a triple fault and watchdog reset by unmapping the
1838 	 * entire address space and doing a TLB flush.
1839 	 */
1840 	memset((void *)PTD, 0, PAGE_SIZE);
1841 	tlbflush();
1842 #endif
1843 #endif	/* XEN */
1844 
1845 	for (;;);
1846 }
1847 
1848 void
1849 cpu_getmcontext(struct lwp *l, mcontext_t *mcp, unsigned int *flags)
1850 {
1851 	const struct trapframe *tf = l->l_md.md_regs;
1852 	__greg_t ras_rip;
1853 
1854 	/* Copy general registers member by member */
1855 #define copy_from_tf(reg, REG, idx) mcp->__gregs[_REG_##REG] = tf->tf_##reg;
1856 	_FRAME_GREG(copy_from_tf)
1857 #undef copy_from_tf
1858 
1859 	if ((ras_rip = (__greg_t)ras_lookup(l->l_proc,
1860 	    (void *) mcp->__gregs[_REG_RIP])) != -1)
1861 		mcp->__gregs[_REG_RIP] = ras_rip;
1862 
1863 	*flags |= _UC_CPU;
1864 
1865 	mcp->_mc_tlsbase = (uintptr_t)l->l_private;
1866 	*flags |= _UC_TLSBASE;
1867 
1868 	process_read_fpregs_xmm(l, (struct fxsave *)&mcp->__fpregs);
1869 	*flags |= _UC_FPU;
1870 }
1871 
1872 int
1873 cpu_setmcontext(struct lwp *l, const mcontext_t *mcp, unsigned int flags)
1874 {
1875 	struct trapframe *tf = l->l_md.md_regs;
1876 	const __greg_t *gr = mcp->__gregs;
1877 	struct proc *p = l->l_proc;
1878 	int error;
1879 	int err, trapno;
1880 	int64_t rflags;
1881 
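	/* 26 general registers, the TLS base, and the 512-byte FPU area. */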
1882 	CTASSERT(sizeof (mcontext_t) == 26 * 8 + 8 + 512);
1883 
1884 	if ((flags & _UC_CPU) != 0) {
1885 		error = cpu_mcontext_validate(l, mcp);
1886 		if (error != 0)
1887 			return error;
1888 		/*
1889 		 * Save and restore some values we don't want to change;
1890 		 * _FRAME_GREG(copy_to_tf) below overwrites them.
1891 		 *
1892 		 * XXX maybe inline this.
1893 		 */
1894 		rflags = tf->tf_rflags;
1895 		err = tf->tf_err;
1896 		trapno = tf->tf_trapno;
1897 
1898 		/* Copy general registers member by member */
1899 #define copy_to_tf(reg, REG, idx) tf->tf_##reg = gr[_REG_##REG];
1900 		_FRAME_GREG(copy_to_tf)
1901 #undef copy_to_tf
1902 
1903 #ifdef XEN
1904 		/*
1905 		 * Xen has its own way of dealing with %cs and %ss;
1906 		 * reset them to the proper values.
1907 		 */
1908 		tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
1909 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
1910 #endif
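		/*
		 * Only the user-modifiable PSL_USER flag bits are taken
		 * from the supplied context; the privileged bits keep the
		 * values saved above.
		 */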
1911 		rflags &= ~PSL_USER;
1912 		tf->tf_rflags = rflags | (gr[_REG_RFLAGS] & PSL_USER);
1913 		tf->tf_err = err;
1914 		tf->tf_trapno = trapno;
1915 
1916 		l->l_md.md_flags |= MDL_IRET;
1917 	}
1918 
1919 	if ((flags & _UC_FPU) != 0)
1920 		process_write_fpregs_xmm(l, (const struct fxsave *)&mcp->__fpregs);
1921 
1922 	if ((flags & _UC_TLSBASE) != 0)
1923 		lwp_setprivate(l, (void *)(uintptr_t)mcp->_mc_tlsbase);
1924 
1925 	mutex_enter(p->p_lock);
1926 	if (flags & _UC_SETSTACK)
1927 		l->l_sigstk.ss_flags |= SS_ONSTACK;
1928 	if (flags & _UC_CLRSTACK)
1929 		l->l_sigstk.ss_flags &= ~SS_ONSTACK;
1930 	mutex_exit(p->p_lock);
1931 
1932 	return 0;
1933 }
1934 
1935 int
1936 cpu_mcontext_validate(struct lwp *l, const mcontext_t *mcp)
1937 {
1938 	const __greg_t *gr;
1939 	uint16_t sel;
1940 	int error;
1941 	struct pmap *pmap = l->l_proc->p_vmspace->vm_map.pmap;
1942 	struct proc *p = l->l_proc;
1943 	struct trapframe *tf = l->l_md.md_regs;
1944 
1945 	gr = mcp->__gregs;
1946 
1947 	if (((gr[_REG_RFLAGS] ^ tf->tf_rflags) & PSL_USERSTATIC) != 0)
1948 		return EINVAL;
1949 
1950 	if (__predict_false(pmap->pm_ldt != NULL)) {
1951 		error = valid_user_selector(l, gr[_REG_ES]);
1952 		if (error != 0)
1953 			return error;
1954 
1955 		error = valid_user_selector(l, gr[_REG_FS]);
1956 		if (error != 0)
1957 			return error;
1958 
1959 		error = valid_user_selector(l, gr[_REG_GS]);
1960 		if (error != 0)
1961 			return error;
1962 
1963 		if ((gr[_REG_DS] & 0xffff) == 0)
1964 			return EINVAL;
1965 		error = valid_user_selector(l, gr[_REG_DS]);
1966 		if (error != 0)
1967 			return error;
1968 
1969 #ifndef XEN
1970 		if ((gr[_REG_SS] & 0xffff) == 0)
1971 			return EINVAL;
1972 		error = valid_user_selector(l, gr[_REG_SS]);
1973 		if (error != 0)
1974 			return error;
1975 #endif
1976 	} else {
1977 #define VUD(sel) \
1978     ((p->p_flag & PK_32) ? VALID_USER_DSEL32(sel) : VALID_USER_DSEL(sel))
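		/*
		 * Without a private LDT, only the standard GDT user data
		 * selectors (the 32-bit variants for PK_32 processes) are
		 * acceptable; %es, %fs and %gs may also be null.
		 */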
1979 		sel = gr[_REG_ES] & 0xffff;
1980 		if (sel != 0 && !VUD(sel))
1981 			return EINVAL;
1982 
1983 /* XXX: Shouldn't this be FSEL32? */
1984 #define VUF(sel) \
1985     ((p->p_flag & PK_32) ? VALID_USER_DSEL32(sel) : VALID_USER_DSEL(sel))
1986 		sel = gr[_REG_FS] & 0xffff;
1987 		if (sel != 0 && !VUF(sel))
1988 			return EINVAL;
1989 
1990 #define VUG(sel) \
1991     ((p->p_flag & PK_32) ? VALID_USER_GSEL32(sel) : VALID_USER_DSEL(sel))
1992 		sel = gr[_REG_GS] & 0xffff;
1993 		if (sel != 0 && !VUG(sel))
1994 			return EINVAL;
1995 
1996 		sel = gr[_REG_DS] & 0xffff;
1997 		if (!VUD(sel))
1998 			return EINVAL;
1999 
2000 #ifndef XEN
2001 		sel = gr[_REG_SS] & 0xffff;
2002 		if (!VUD(sel))
2003 			return EINVAL;
2004 #endif
2005 
2006 	}
2007 
2008 #ifndef XEN
2009 #define VUC(sel) \
2010     ((p->p_flag & PK_32) ? VALID_USER_CSEL32(sel) : VALID_USER_CSEL(sel))
2011 	sel = gr[_REG_CS] & 0xffff;
2012 	if (!VUC(sel))
2013 		return EINVAL;
2014 #endif
2015 
2016 	if (gr[_REG_RIP] >= VM_MAXUSER_ADDRESS)
2017 		return EINVAL;
2018 	return 0;
2019 }
2020 
2021 void
2022 cpu_initclocks(void)
2023 {
2024 	(*initclock_func)();
2025 }
2026 
2027 static int
2028 valid_user_selector(struct lwp *l, uint64_t seg)
2029 {
2030 	int off, len;
2031 	char *dt;
2032 	struct mem_segment_descriptor *sdp;
2033 	struct proc *p = l->l_proc;
2034 	struct pmap *pmap = p->p_vmspace->vm_map.pmap;
2035 	uint64_t base;
2036 
2037 	seg &= 0xffff;
2038 
2039 	if (seg == 0)
2040 		return 0;
2041 
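	/*
	 * Only LDT selectors get a descriptor lookup here; anything
	 * else is rejected below.
	 */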
2042 	off = (seg & 0xfff8);
2043 	if (seg & SEL_LDT) {
2044 		if (pmap->pm_ldt != NULL) {
2045 			len = pmap->pm_ldt_len; /* XXX broken */
2046 			dt = (char *)pmap->pm_ldt;
2047 		} else {
2048 			dt = ldtstore;
2049 			len = LDT_SIZE;
2050 		}
2051 
2052 		if (off > (len - 8))
2053 			return EINVAL;
2054 	} else {
2055 		CTASSERT(GUDATA_SEL & SEL_LDT);
2056 		KASSERT(seg != GUDATA_SEL);
2057 		CTASSERT(GUDATA32_SEL & SEL_LDT);
2058 		KASSERT(seg != GUDATA32_SEL);
2059 		return EINVAL;
2060 	}
2061 
2062 	sdp = (struct mem_segment_descriptor *)(dt + off);
2063 	if (sdp->sd_type < SDT_MEMRO || sdp->sd_p == 0)
2064 		return EINVAL;
2065 
2066 	base = ((uint64_t)sdp->sd_hibase << 32) | ((uint64_t)sdp->sd_lobase);
2067 	if (sdp->sd_gran == 1)
2068 		base <<= PAGE_SHIFT;
2069 
2070 	if (base >= VM_MAXUSER_ADDRESS)
2071 		return EINVAL;
2072 
2073 	return 0;
2074 }
2075 
2076 int
2077 mm_md_kernacc(void *ptr, vm_prot_t prot, bool *handled)
2078 {
2079 	extern int start, __data_start;
2080 	const vaddr_t v = (vaddr_t)ptr;
2081 
2082 	if (v >= (vaddr_t)&start && v < (vaddr_t)kern_end) {
2083 		*handled = true;
2084 		/* Either the text or rodata segment */
2085 		if (v < (vaddr_t)&__data_start && (prot & VM_PROT_WRITE))
2086 			return EFAULT;
2087 
2088 	} else if (v >= module_start && v < module_end) {
2089 		*handled = true;
2090 		if (!uvm_map_checkprot(module_map, v, v + 1, prot))
2091 			return EFAULT;
2092 	} else {
2093 		*handled = false;
2094 	}
2095 	return 0;
2096 }
2097 
2098 /*
2099  * Zero out an LWP's TLS context (the %fs/%gs selectors and their bases).
2100  * Used when exec'ing a new program.
2101  */
2102 
2103 void
2104 cpu_fsgs_zero(struct lwp *l)
2105 {
2106 	struct trapframe * const tf = l->l_md.md_regs;
2107 	struct pcb *pcb;
2108 	uint64_t zero = 0;
2109 
2110 	pcb = lwp_getpcb(l);
2111 	if (l == curlwp) {
2112 		kpreempt_disable();
2113 		tf->tf_fs = 0;
2114 		tf->tf_gs = 0;
2115 		setfs(0);
2116 #ifndef XEN
2117 		setusergs(0);
2118 #else
2119 		HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, 0);
2120 #endif
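		/*
		 * For 64-bit processes the segment bases live in MSRs; the
		 * user %gs base sits in KERNELGSBASE while in the kernel
		 * and is swapped in by swapgs on return to user mode.
		 */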
2121 		if ((l->l_proc->p_flag & PK_32) == 0) {
2122 #ifndef XEN
2123 			wrmsr(MSR_FSBASE, 0);
2124 			wrmsr(MSR_KERNELGSBASE, 0);
2125 #else
2126 			HYPERVISOR_set_segment_base(SEGBASE_FS, 0);
2127 			HYPERVISOR_set_segment_base(SEGBASE_GS_USER, 0);
2128 #endif
2129 		}
2130 		pcb->pcb_fs = 0;
2131 		pcb->pcb_gs = 0;
2132 		update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &zero);
2133 		update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &zero);
2134 		kpreempt_enable();
2135 	} else {
2136 		tf->tf_fs = 0;
2137 		tf->tf_gs = 0;
2138 		pcb->pcb_fs = 0;
2139 		pcb->pcb_gs = 0;
2140 	}
2141 
2142 }
2143 
2144 /*
2145  * Load an LWP's TLS context, possibly changing the %fs and %gs selectors.
2146  * Used only for 32-bit processes.
2147  */
2148 
2149 void
2150 cpu_fsgs_reload(struct lwp *l, int fssel, int gssel)
2151 {
2152 	struct trapframe *tf;
2153 	struct pcb *pcb;
2154 
2155 	KASSERT(l->l_proc->p_flag & PK_32);
2156 	tf = l->l_md.md_regs;
2157 	if (l == curlwp) {
2158 		pcb = lwp_getpcb(l);
2159 		kpreempt_disable();
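		/*
		 * Refresh this CPU's GDT slots from the PCB before
		 * reloading the selectors, so the saved 32-bit bases take
		 * effect when %fs and %gs are loaded.
		 */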
2160 		update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &pcb->pcb_fs);
2161 		update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &pcb->pcb_gs);
2162 		setfs(fssel);
2163 #ifndef XEN
2164 		setusergs(gssel);
2165 #else
2166 		HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gssel);
2167 #endif
2168 		tf->tf_fs = fssel;
2169 		tf->tf_gs = gssel;
2170 		kpreempt_enable();
2171 	} else {
2172 		tf->tf_fs = fssel;
2173 		tf->tf_gs = gssel;
2174 	}
2175 }
2176 
2177 
2178 #ifdef __HAVE_DIRECT_MAP
2179 bool
2180 mm_md_direct_mapped_io(void *addr, paddr_t *paddr)
2181 {
2182 	vaddr_t va = (vaddr_t)addr;
2183 
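	/* Direct-mapped addresses translate to physical by a constant offset. */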
2184 	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
2185 		*paddr = PMAP_DIRECT_UNMAP(va);
2186 		return true;
2187 	}
2188 	return false;
2189 }
2190 
2191 bool
2192 mm_md_direct_mapped_phys(paddr_t paddr, vaddr_t *vaddr)
2193 {
2194 	*vaddr = PMAP_DIRECT_MAP(paddr);
2195 	return true;
2196 }
2197 #endif
2198