/*	$NetBSD: machdep.c,v 1.759 2016/07/16 17:02:34 maxv Exp $	*/

/*-
 * Copyright (c) 1996, 1997, 1998, 2000, 2004, 2006, 2008, 2009
 *     The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Charles M. Hannum, by Jason R. Thorpe of the Numerical Aerospace
 * Simulation Facility NASA Ames Research Center, by Julio M. Merino Vidal,
 * and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.759 2016/07/16 17:02:34 maxv Exp $");

#include "opt_beep.h"
#include "opt_compat_ibcs2.h"
#include "opt_compat_freebsd.h"
#include "opt_compat_netbsd.h"
#include "opt_compat_svr4.h"
#include "opt_cpureset_delay.h"
#include "opt_ddb.h"
#include "opt_ipkdb.h"
#include "opt_kgdb.h"
#include "opt_mtrr.h"
#include "opt_modular.h"
#include "opt_multiboot.h"
#include "opt_multiprocessor.h"
#include "opt_physmem.h"
#include "opt_realmem.h"
#include "opt_user_ldt.h"
#include "opt_vm86.h"
#include "opt_xen.h"
#include "isa.h"
#include "pci.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/signalvar.h>
#include <sys/kernel.h>
#include <sys/cpu.h>
#include <sys/exec.h>
#include <sys/fcntl.h>
#include <sys/reboot.h>
#include <sys/conf.h>
#include <sys/kauth.h>
#include <sys/mbuf.h>
#include <sys/msgbuf.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/core.h>
#include <sys/kcore.h>
#include <sys/ucontext.h>
#include <sys/ras.h>
#include <sys/ksyms.h>
#include <sys/device.h>

#ifdef IPKDB
#include <ipkdb/ipkdb.h>
#endif

#ifdef KGDB
#include <sys/kgdb.h>
#endif

#include <dev/cons.h>
#include <dev/mm.h>

#include <uvm/uvm.h>
#include <uvm/uvm_page.h>

#include <sys/sysctl.h>

#include <machine/cpu.h>
#include <machine/cpufunc.h>
#include <machine/cpuvar.h>
#include <machine/gdt.h>
#include <machine/intr.h>
#include <machine/kcore.h>
#include <machine/pio.h>
#include <machine/psl.h>
#include <machine/reg.h>
#include <machine/specialreg.h>
#include <machine/bootinfo.h>
#include <machine/mtrr.h>
#include <x86/x86/tsc.h>

#include <x86/fpu.h>
#include <x86/machdep.h>

#include <machine/multiboot.h>
#ifdef XEN
#include <xen/evtchn.h>
#include <xen/xen.h>
#include <xen/hypervisor.h>

/* #define	XENDEBUG */
/* #define	XENDEBUG_LOW */

#ifdef XENDEBUG
#define	XENPRINTF(x) printf x
#define	XENPRINTK(x) printk x
#else
#define	XENPRINTF(x)
#define	XENPRINTK(x)
#endif
#define	PRINTK(x) printf x
#endif /* XEN */

#include <dev/isa/isareg.h>
#include <machine/isa_machdep.h>
#include <dev/ic/i8042reg.h>

#ifdef DDB
#include <machine/db_machdep.h>
#include <ddb/db_extern.h>
#endif

#ifdef VM86
#include <machine/vm86.h>
#endif

#include "acpica.h"
#include "bioscall.h"

#if NBIOSCALL > 0
#include <machine/bioscall.h>
#endif

#if NACPICA > 0
#include <dev/acpi/acpivar.h>
#define ACPI_MACHDEP_PRIVATE
#include <machine/acpi_machdep.h>
#endif

#include "isa.h"
#include "isadma.h"
#include "ksyms.h"

#include "cardbus.h"
#if NCARDBUS > 0
/* For rbus_min_start hint. */
#include <sys/bus.h>
#include <dev/cardbus/rbus.h>
#include <machine/rbus_machdep.h>
#endif

#include "mca.h"
#if NMCA > 0
#include <machine/mca_machdep.h>	/* for mca_busprobe() */
#endif

#ifdef MULTIPROCESSOR		/* XXX */
#include <machine/mpbiosvar.h>	/* XXX */
#endif				/* XXX */

/* the following is used externally (sysctl_hw) */
char machine[] = "i386";		/* CPU "architecture" */
char machine_arch[] = "i386";		/* machine == machine_arch */

#ifdef CPURESET_DELAY
int cpureset_delay = CPURESET_DELAY;
#else
int cpureset_delay = 2000; /* default to 2s */
#endif

#ifdef MTRR
struct mtrr_funcs *mtrr_funcs;
#endif

int cpu_class;
int use_pae;
int i386_fpu_present = 1;
int i386_fpu_fdivbug;

int i386_use_fxsave;
int i386_has_sse;
int i386_has_sse2;

vaddr_t msgbuf_vaddr;
struct {
	paddr_t paddr;
	psize_t sz;
} msgbuf_p_seg[VM_PHYSSEG_MAX];
unsigned int msgbuf_p_cnt = 0;

vaddr_t idt_vaddr;
paddr_t idt_paddr;
vaddr_t pentium_idt_vaddr;

struct vm_map *phys_map = NULL;

extern paddr_t avail_start, avail_end;
#ifdef XEN
extern paddr_t pmap_pa_start, pmap_pa_end;
void hypervisor_callback(void);
void failsafe_callback(void);
#endif

#ifdef XEN
void (*delay_func)(unsigned int) = xen_delay;
void (*initclock_func)(void) = xen_initclocks;
#else
void (*delay_func)(unsigned int) = i8254_delay;
void (*initclock_func)(void) = i8254_initclocks;
#endif
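
/*
 * Editor's note: an illustrative sketch, not part of the original file.
 * Callers reach the delay/clock backends only through the function
 * pointers above, so the same MD code runs unchanged on native (i8254)
 * and Xen kernels.  The function name below is hypothetical.
 */
#if 0
static void
example_spin(void)
{
	/* Spin for ~100 microseconds via whichever backend was chosen. */
	(*delay_func)(100);
}
#endif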

/*
 * Size of memory segments, before any memory is stolen.
 */
phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX];
int mem_cluster_cnt = 0;

void init386(paddr_t);
void initgdt(union descriptor *);

extern int time_adjusted;

int *esym;
int *eblob;
extern int boothowto;

#ifndef XEN

/* Base memory reported by BIOS. */
#ifndef REALBASEMEM
int biosbasemem = 0;
#else
int biosbasemem = REALBASEMEM;
#endif

/* Extended memory reported by BIOS. */
#ifndef REALEXTMEM
int biosextmem = 0;
#else
int biosextmem = REALEXTMEM;
#endif

/* Set if any boot-loader set biosbasemem/biosextmem. */
int biosmem_implicit;

/*
 * Representation of the bootinfo structure constructed by a NetBSD native
 * boot loader.  Only used by native_loader().
 */
struct bootinfo_source {
	uint32_t bs_naddrs;
	void *bs_addrs[1]; /* Actually longer. */
};
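
/*
 * Editor's note, not in the original source: bs_addrs[1] is the
 * traditional pre-C99 "flexible array" idiom.  The boot loader is
 * presumably expected to allocate room for bs_naddrs pointers, i.e.
 * roughly sizeof(struct bootinfo_source) +
 * (bs_naddrs - 1) * sizeof(void *).
 */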

/* Only called by locore.S; no need to be in a header file. */
void native_loader(int, int, struct bootinfo_source *, paddr_t, int, int);

/*
 * Called as one of the very first things during system startup (just after
 * the boot loader gave control to the kernel image), this routine is in
 * charge of retrieving the parameters passed in by the boot loader and
 * storing them in the appropriate kernel variables.
 *
 * WARNING: Because the kernel has not yet relocated itself to KERNBASE,
 * special care has to be taken when accessing memory because absolute
 * addresses (referring to kernel symbols) do not work.  So:
 *
 *     1) Avoid jumps to absolute addresses (such as gotos and switches).
 *     2) To access global variables use their physical address, which
 *        can be obtained using the RELOC macro.
 */
void
native_loader(int bl_boothowto, int bl_bootdev,
    struct bootinfo_source *bl_bootinfo, paddr_t bl_esym,
    int bl_biosextmem, int bl_biosbasemem)
{
#define RELOC(type, x) ((type)((vaddr_t)(x) - KERNBASE))

	*RELOC(int *, &boothowto) = bl_boothowto;

#ifdef COMPAT_OLDBOOT
	/*
	 * Pre-1.3 boot loaders gave the boot device as a parameter
	 * (instead of a bootinfo entry).
	 */
	*RELOC(int *, &bootdev) = bl_bootdev;
#endif

	/*
	 * The boot loader provides a physical, non-relocated address
	 * for the symbols table's end.  We need to convert it to a
	 * virtual address.
	 */
	if (bl_esym != 0)
		*RELOC(int **, &esym) = (int *)((vaddr_t)bl_esym + KERNBASE);
	else
		*RELOC(int **, &esym) = 0;

	/*
	 * Copy bootinfo entries (if any) from the boot loader's
	 * representation to the kernel's bootinfo space.
	 */
	if (bl_bootinfo != NULL) {
		size_t i;
		uint8_t *data;
		struct bootinfo *bidest;
		struct btinfo_modulelist *bi;

		bidest = RELOC(struct bootinfo *, &bootinfo);

		data = &bidest->bi_data[0];

		for (i = 0; i < bl_bootinfo->bs_naddrs; i++) {
			struct btinfo_common *bc;

			bc = bl_bootinfo->bs_addrs[i];

			if ((data + bc->len) >
			    (&bidest->bi_data[0] + BOOTINFO_MAXSIZE))
				break;

			memcpy(data, bc, bc->len);
			/*
			 * If any modules were loaded, record where they
			 * end.  We'll need to skip over them.
			 */
			bi = (struct btinfo_modulelist *)data;
			if (bi->common.type == BTINFO_MODULELIST) {
				*RELOC(int **, &eblob) =
				    (int *)(bi->endpa + KERNBASE);
			}
			data += bc->len;
		}
		bidest->bi_nentries = i;
	}

	/*
	 * Configure biosbasemem and biosextmem only if they were not
	 * explicitly given during the kernel's build.
	 */
	if (*RELOC(int *, &biosbasemem) == 0) {
		*RELOC(int *, &biosbasemem) = bl_biosbasemem;
		*RELOC(int *, &biosmem_implicit) = 1;
	}
	if (*RELOC(int *, &biosextmem) == 0) {
		*RELOC(int *, &biosextmem) = bl_biosextmem;
		*RELOC(int *, &biosmem_implicit) = 1;
	}
#undef RELOC
}
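
/*
 * Editor's note, an illustrative expansion and not part of the original
 * file: RELOC simply rewrites a link-time virtual address into the
 * physical address the kernel is still executing from, e.g.
 *
 *	*RELOC(int *, &boothowto) = bl_boothowto;
 *
 * expands to
 *
 *	*(int *)((vaddr_t)&boothowto - KERNBASE) = bl_boothowto;
 */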

#endif /* !XEN */

/*
 * Machine-dependent startup code
 */
void
cpu_startup(void)
{
	int x, y;
	vaddr_t minaddr, maxaddr;
	psize_t sz;

	/*
	 * For console drivers that require uvm and pmap to be initialized,
	 * we'll give them one more chance here...
	 */
	consinit();

	/*
	 * Initialize error message buffer (at end of core).
	 */
	if (msgbuf_p_cnt == 0)
		panic("msgbuf paddr map has not been set up");
	for (x = 0, sz = 0; x < msgbuf_p_cnt; sz += msgbuf_p_seg[x++].sz)
		continue;

	msgbuf_vaddr = uvm_km_alloc(kernel_map, sz, 0, UVM_KMF_VAONLY);
	if (msgbuf_vaddr == 0)
		panic("failed to valloc msgbuf_vaddr");

	for (y = 0, sz = 0; y < msgbuf_p_cnt; y++) {
		for (x = 0; x < btoc(msgbuf_p_seg[y].sz); x++, sz += PAGE_SIZE)
			pmap_kenter_pa((vaddr_t)msgbuf_vaddr + sz,
			    msgbuf_p_seg[y].paddr + x * PAGE_SIZE,
			    VM_PROT_READ|VM_PROT_WRITE, 0);
	}

	pmap_update(pmap_kernel());

	initmsgbuf((void *)msgbuf_vaddr, sz);

#ifdef MULTIBOOT
	multiboot_print_info();
#endif

#ifdef TRAPLOG
	/*
	 * Enable recording of branch from/to in MSR's
	 */
	wrmsr(MSR_DEBUGCTLMSR, 0x1);
#endif

#if NCARDBUS > 0
	/* Tell RBUS how much RAM we have, so it can use heuristics. */
	rbus_min_start_hint(ctob((psize_t)physmem));
#endif

	minaddr = 0;

	/*
	 * Allocate a submap for physio
	 */
	phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
	    VM_PHYS_SIZE, 0, false, NULL);

	/* Say hello. */
	banner();

	/* Safe for i/o port / memory space allocation to use malloc now. */
#if NISA > 0 || NPCI > 0
	x86_bus_space_mallocok();
#endif

	gdt_init();
	i386_proc0_tss_ldt_init();

#ifndef XEN
	cpu_init_tss(&cpu_info_primary);
	ltr(cpu_info_primary.ci_tss_sel);
#endif

	x86_startup();
}

/*
 * Set up proc0's TSS and LDT.
 */
void
i386_proc0_tss_ldt_init(void)
{
	struct lwp *l;
	struct pcb *pcb __diagused;

	l = &lwp0;
	pcb = lwp_getpcb(l);

	pmap_kernel()->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL);
	pcb->pcb_cr0 = rcr0() & ~CR0_TS;
	pcb->pcb_esp0 = uvm_lwp_getuarea(l) + USPACE - 16;
	pcb->pcb_iopl = SEL_KPL;
	l->l_md.md_regs = (struct trapframe *)pcb->pcb_esp0 - 1;
	memcpy(&pcb->pcb_fsd, &gdt[GUDATA_SEL], sizeof(pcb->pcb_fsd));
	memcpy(&pcb->pcb_gsd, &gdt[GUDATA_SEL], sizeof(pcb->pcb_gsd));

#ifndef XEN
	lldt(pmap_kernel()->pm_ldt_sel);
#else
	HYPERVISOR_fpu_taskswitch(1);
	XENPRINTF(("lwp tss sp %p ss %04x/%04x\n",
	    (void *)pcb->pcb_esp0,
	    GSEL(GDATA_SEL, SEL_KPL),
	    IDXSEL(GSEL(GDATA_SEL, SEL_KPL))));
	HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), pcb->pcb_esp0);
#endif
}

#ifdef XEN
/* used in assembly */
void i386_switch_context(lwp_t *);
void i386_tls_switch(lwp_t *);

/*
 * Switch context:
 * - switch stack pointer for user->kernel transition
 */
void
i386_switch_context(lwp_t *l)
{
	struct pcb *pcb;
	struct physdev_op physop;

	pcb = lwp_getpcb(l);

	HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), pcb->pcb_esp0);

	physop.cmd = PHYSDEVOP_SET_IOPL;
	physop.u.set_iopl.iopl = pcb->pcb_iopl;
	HYPERVISOR_physdev_op(&physop);
}

void
i386_tls_switch(lwp_t *l)
{
	struct cpu_info *ci = curcpu();
	struct pcb *pcb = lwp_getpcb(l);

	/*
	 * Raise the IPL to IPL_HIGH.
	 * FPU IPIs can alter the LWP's saved cr0.  Dropping the priority
	 * is deferred until mi_switch(), when cpu_switchto() returns.
	 */
	(void)splhigh();

	/*
	 * If our floating point registers are on a different CPU,
	 * set CR0_TS so we'll trap rather than reuse bogus state.
	 */
	if (l != ci->ci_fpcurlwp) {
		HYPERVISOR_fpu_taskswitch(1);
	}

	/* Update TLS segment pointers */
	update_descriptor(&ci->ci_gdt[GUFS_SEL],
	    (union descriptor *)&pcb->pcb_fsd);
	update_descriptor(&ci->ci_gdt[GUGS_SEL],
	    (union descriptor *)&pcb->pcb_gsd);
}
#endif /* XEN */

#ifndef XEN
/*
 * Set up TSS and I/O bitmap.
 */
void
cpu_init_tss(struct cpu_info *ci)
{
	struct i386tss *tss = &ci->ci_tss;

	tss->tss_iobase = IOMAP_INVALOFF << 16;
	tss->tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
	tss->tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
	tss->tss_cr3 = rcr3();
	ci->ci_tss_sel = tss_alloc(tss);
}
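
/*
 * Editor's note, not in the original source: IOMAP_INVALOFF << 16 puts
 * the I/O bitmap base (the upper 16 bits of tss_iobase) beyond the TSS
 * segment limit, so the TSS carries no I/O permission bitmap and user
 * mode in/out instructions fault unless PSL_IOPL grants access.
 */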
#endif /* !XEN */

void *
getframe(struct lwp *l, int sig, int *onstack)
{
	struct proc *p = l->l_proc;
	struct trapframe *tf = l->l_md.md_regs;

	/* Do we need to jump onto the signal stack? */
	*onstack = (l->l_sigstk.ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0
	    && (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0;
	if (*onstack)
		return (char *)l->l_sigstk.ss_sp + l->l_sigstk.ss_size;
#ifdef VM86
	if (tf->tf_eflags & PSL_VM)
		return (void *)(tf->tf_esp + (tf->tf_ss << 4));
	else
#endif
		return (void *)tf->tf_esp;
}
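
/*
 * Editor's note, not in the original source: in VM86 mode the user stack
 * is a real-mode segment:offset pair, so the flat address computed above
 * is tf_ss * 16 + tf_esp.
 */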

/*
 * Build context to run handler in.  We invoke the handler
 * directly, only returning via the trampoline.  Note the
 * trampoline version numbers are coordinated with machine-
 * dependent code in libc.
 */
void
buildcontext(struct lwp *l, int sel, void *catcher, void *fp)
{
	struct trapframe *tf = l->l_md.md_regs;

	tf->tf_gs = GSEL(GUGS_SEL, SEL_UPL);
	tf->tf_fs = GSEL(GUFS_SEL, SEL_UPL);
	tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
	tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
	tf->tf_eip = (int)catcher;
	tf->tf_cs = GSEL(sel, SEL_UPL);
	tf->tf_eflags &= ~PSL_CLEARSIG;
	tf->tf_esp = (int)fp;
	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);

	/* Ensure FP state is reset. */
	fpu_save_area_reset(l);
}

void
sendsig_siginfo(const ksiginfo_t *ksi, const sigset_t *mask)
{
	struct lwp *l = curlwp;
	struct proc *p = l->l_proc;
	struct pmap *pmap = vm_map_pmap(&p->p_vmspace->vm_map);
	int sel = pmap->pm_hiexec > I386_MAX_EXE_ADDR ?
	    GUCODEBIG_SEL : GUCODE_SEL;
	struct sigacts *ps = p->p_sigacts;
	int onstack, error;
	int sig = ksi->ksi_signo;
	struct sigframe_siginfo *fp = getframe(l, sig, &onstack), frame;
	sig_t catcher = SIGACTION(p, sig).sa_handler;
	struct trapframe *tf = l->l_md.md_regs;

	KASSERT(mutex_owned(p->p_lock));

	fp--;

	frame.sf_ra = (int)ps->sa_sigdesc[sig].sd_tramp;
	frame.sf_signum = sig;
	frame.sf_sip = &fp->sf_si;
	frame.sf_ucp = &fp->sf_uc;
	frame.sf_si._info = ksi->ksi_info;
	frame.sf_uc.uc_flags = _UC_SIGMASK|_UC_VM;
	frame.sf_uc.uc_sigmask = *mask;
	frame.sf_uc.uc_link = l->l_ctxlink;
	frame.sf_uc.uc_flags |= (l->l_sigstk.ss_flags & SS_ONSTACK)
	    ? _UC_SETSTACK : _UC_CLRSTACK;
	memset(&frame.sf_uc.uc_stack, 0, sizeof(frame.sf_uc.uc_stack));

	if (tf->tf_eflags & PSL_VM)
		(*p->p_emul->e_syscall_intern)(p);
	sendsig_reset(l, sig);

	mutex_exit(p->p_lock);
	cpu_getmcontext(l, &frame.sf_uc.uc_mcontext, &frame.sf_uc.uc_flags);
	error = copyout(&frame, fp, sizeof(frame));
	mutex_enter(p->p_lock);

	if (error != 0) {
		/*
		 * Process has trashed its stack; give it an illegal
		 * instruction to halt it in its tracks.
		 */
		sigexit(l, SIGILL);
		/* NOTREACHED */
	}

	buildcontext(l, sel, catcher, fp);

	/* Remember that we're now on the signal stack. */
	if (onstack)
		l->l_sigstk.ss_flags |= SS_ONSTACK;
}

static void
maybe_dump(int howto)
{
	int s;

	/* Disable interrupts. */
	s = splhigh();

	/* Do a dump if requested. */
	if ((howto & (RB_DUMP | RB_HALT)) == RB_DUMP)
		dumpsys();

	splx(s);
}

void
cpu_reboot(int howto, char *bootstr)
{
	static bool syncdone = false;
	int s = IPL_NONE;

	if (cold) {
		howto |= RB_HALT;
		goto haltsys;
	}

	boothowto = howto;

	/* XXX used to dump after vfs_shutdown() and before
	 * detaching devices / shutdown hooks / pmf_system_shutdown().
	 */
	maybe_dump(howto);

	/*
	 * If we've panic'd, don't make the situation potentially
	 * worse by syncing or unmounting the file systems.
	 */
	if ((howto & RB_NOSYNC) == 0 && panicstr == NULL) {
		if (!syncdone) {
			syncdone = true;
			/* XXX used to force unmount as well, here */
			vfs_sync_all(curlwp);
			/*
			 * If we've been adjusting the clock, the todr
			 * will be out of synch; adjust it now.
			 *
			 * XXX used to do this after unmounting all
			 * filesystems with vfs_shutdown().
			 */
			if (time_adjusted != 0)
				resettodr();
		}

		while (vfs_unmountall1(curlwp, false, false) ||
		       config_detach_all(boothowto) ||
		       vfs_unmount_forceone(curlwp))
			;	/* do nothing */
	} else
		suspendsched();

	pmf_system_shutdown(boothowto);

	s = splhigh();

	/* amd64 maybe_dump() */

haltsys:
	doshutdownhooks();

	if ((howto & RB_POWERDOWN) == RB_POWERDOWN) {
#if NACPICA > 0
		if (s != IPL_NONE)
			splx(s);

		acpi_enter_sleep_state(ACPI_STATE_S5);
#else
		__USE(s);
#endif
#ifdef XEN
		HYPERVISOR_shutdown();
		for (;;);
#endif
	}

#ifdef MULTIPROCESSOR
	cpu_broadcast_halt();
#endif /* MULTIPROCESSOR */

	if (howto & RB_HALT) {
#if NACPICA > 0
		acpi_disable();
#endif

		printf("\n");
		printf("The operating system has halted.\n");
		printf("Please press any key to reboot.\n\n");

#ifdef BEEP_ONHALT
		{
			int c;
			for (c = BEEP_ONHALT_COUNT; c > 0; c--) {
				sysbeep(BEEP_ONHALT_PITCH,
					BEEP_ONHALT_PERIOD * hz / 1000);
				delay(BEEP_ONHALT_PERIOD * 1000);
				sysbeep(0, BEEP_ONHALT_PERIOD * hz / 1000);
				delay(BEEP_ONHALT_PERIOD * 1000);
			}
		}
#endif

		cnpollc(1);	/* for proper keyboard command handling */
		if (cngetc() == 0) {
			/* no console attached, so just hlt */
			printf("No keyboard - cannot reboot after all.\n");
			for (;;) {
				x86_hlt();
			}
		}
		cnpollc(0);
	}

	printf("rebooting...\n");
	if (cpureset_delay > 0)
		delay(cpureset_delay * 1000);
	cpu_reset();
	for (;;);
	/* NOTREACHED */
}

/*
 * Clear registers on exec
 */
void
setregs(struct lwp *l, struct exec_package *pack, vaddr_t stack)
{
	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
	struct pcb *pcb = lwp_getpcb(l);
	struct trapframe *tf;

#ifdef USER_LDT
	pmap_ldt_cleanup(l);
#endif

	fpu_save_area_clear(l, pack->ep_osversion >= 699002600
	    ? __INITIAL_NPXCW__ : __NetBSD_COMPAT_NPXCW__);

	memcpy(&pcb->pcb_fsd, &gdt[GUDATA_SEL], sizeof(pcb->pcb_fsd));
	memcpy(&pcb->pcb_gsd, &gdt[GUDATA_SEL], sizeof(pcb->pcb_gsd));

	tf = l->l_md.md_regs;
	tf->tf_gs = GSEL(GUGS_SEL, SEL_UPL);
	tf->tf_fs = GSEL(GUFS_SEL, SEL_UPL);
	tf->tf_es = LSEL(LUDATA_SEL, SEL_UPL);
	tf->tf_ds = LSEL(LUDATA_SEL, SEL_UPL);
	tf->tf_edi = 0;
	tf->tf_esi = 0;
	tf->tf_ebp = 0;
	tf->tf_ebx = l->l_proc->p_psstrp;
	tf->tf_edx = 0;
	tf->tf_ecx = 0;
	tf->tf_eax = 0;
	tf->tf_eip = pack->ep_entry;
	tf->tf_cs = pmap->pm_hiexec > I386_MAX_EXE_ADDR ?
	    LSEL(LUCODEBIG_SEL, SEL_UPL) : LSEL(LUCODE_SEL, SEL_UPL);
	tf->tf_eflags = PSL_USERSET;
	tf->tf_esp = stack;
	tf->tf_ss = LSEL(LUDATA_SEL, SEL_UPL);
}

/*
 * Initialize segments and descriptor tables
 */

union	descriptor *gdt, *ldt;
union	descriptor *pentium_idt;
extern vaddr_t lwp0uarea;

void
setgate(struct gate_descriptor *gd, void *func, int args, int type, int dpl,
    int sel)
{

	gd->gd_looffset = (int)func;
	gd->gd_selector = sel;
	gd->gd_stkcpy = args;
	gd->gd_xx = 0;
	gd->gd_type = type;
	gd->gd_dpl = dpl;
	gd->gd_p = 1;
	gd->gd_hioffset = (int)func >> 16;
}
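
/*
 * Editor's note, not in the original source: the handler address is
 * split across the descriptor, so reassembling it inverts setgate():
 *
 *	func = (void *)((gd->gd_hioffset << 16) | gd->gd_looffset);
 */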

void
unsetgate(struct gate_descriptor *gd)
{
	gd->gd_p = 0;
	gd->gd_hioffset = 0;
	gd->gd_looffset = 0;
	gd->gd_selector = 0;
	gd->gd_xx = 0;
	gd->gd_stkcpy = 0;
	gd->gd_type = 0;
	gd->gd_dpl = 0;
}

void
setregion(struct region_descriptor *rd, void *base, size_t limit)
{

	rd->rd_limit = (int)limit;
	rd->rd_base = (int)base;
}

void
setsegment(struct segment_descriptor *sd, const void *base, size_t limit,
    int type, int dpl, int def32, int gran)
{

	sd->sd_lolimit = (int)limit;
	sd->sd_lobase = (int)base;
	sd->sd_type = type;
	sd->sd_dpl = dpl;
	sd->sd_p = 1;
	sd->sd_hilimit = (int)limit >> 16;
	sd->sd_xx = 0;
	sd->sd_def32 = def32;
	sd->sd_gran = gran;
	sd->sd_hibase = (int)base >> 24;
}
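
/*
 * Editor's note, not in the original source: with gran=1 the 20-bit
 * limit is counted in 4KB units, so the flat segments built in initgdt()
 * below pass limit 0xfffff, i.e. (0xfffff + 1) * 4KB = 4GB from base 0:
 *
 *	setsegment(&gdt[GCODE_SEL].sd, 0, 0xfffff, SDT_MEMERA, SEL_KPL, 1, 1);
 */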

#define	IDTVEC(name)	__CONCAT(X, name)
typedef void (vector)(void);
extern vector IDTVEC(syscall);
extern vector IDTVEC(osyscall);
extern vector *IDTVEC(exceptions)[];
extern vector IDTVEC(svr4_fasttrap);
void (*svr4_fasttrap_vec)(void) = (void (*)(void))nullop;
krwlock_t svr4_fasttrap_lock;
#ifdef XEN
#define MAX_XEN_IDT 128
trap_info_t xen_idt[MAX_XEN_IDT];
int xen_idt_idx;
extern union descriptor tmpgdt[];
#endif

void
cpu_init_idt(void)
{
#ifndef XEN
	struct region_descriptor region;

	setregion(&region, pentium_idt, NIDT * sizeof(idt[0]) - 1);
	lidt(&region);
#else /* XEN */
	XENPRINTF(("HYPERVISOR_set_trap_table %p\n", xen_idt));
	if (HYPERVISOR_set_trap_table(xen_idt))
		panic("HYPERVISOR_set_trap_table %p failed\n", xen_idt);
#endif /* !XEN */
}

void
initgdt(union descriptor *tgdt)
{
	KASSERT(tgdt != NULL);

	gdt = tgdt;
#ifdef XEN
	u_long	frames[16];
#else
	struct region_descriptor region;
	memset(gdt, 0, NGDT * sizeof(*gdt));
#endif /* XEN */
	/* make gdt gates and memory segments */
	setsegment(&gdt[GCODE_SEL].sd, 0, 0xfffff, SDT_MEMERA, SEL_KPL, 1, 1);
	setsegment(&gdt[GDATA_SEL].sd, 0, 0xfffff, SDT_MEMRWA, SEL_KPL, 1, 1);
	setsegment(&gdt[GUCODE_SEL].sd, 0, x86_btop(I386_MAX_EXE_ADDR) - 1,
	    SDT_MEMERA, SEL_UPL, 1, 1);
	setsegment(&gdt[GUCODEBIG_SEL].sd, 0, 0xfffff,
	    SDT_MEMERA, SEL_UPL, 1, 1);
	setsegment(&gdt[GUDATA_SEL].sd, 0, 0xfffff,
	    SDT_MEMRWA, SEL_UPL, 1, 1);
#if NBIOSCALL > 0
	/* bios trampoline GDT entries */
	setsegment(&gdt[GBIOSCODE_SEL].sd, 0, 0xfffff, SDT_MEMERA, SEL_KPL, 0,
	    0);
	setsegment(&gdt[GBIOSDATA_SEL].sd, 0, 0xfffff, SDT_MEMRWA, SEL_KPL, 0,
	    0);
#endif
	setsegment(&gdt[GCPU_SEL].sd, &cpu_info_primary, 0xfffff,
	    SDT_MEMRWA, SEL_KPL, 1, 1);

#ifndef XEN
	setregion(&region, gdt, NGDT * sizeof(gdt[0]) - 1);
	lgdt(&region);
#else /* XEN */
	/*
	 * We jumpstart the bootstrap process a bit so we can update
	 * page permissions. This is done redundantly later from
	 * x86_xpmap.c:xen_pmap_bootstrap() - harmless.
	 */
	xpmap_phys_to_machine_mapping =
	    (unsigned long *)xen_start_info.mfn_list;

	frames[0] = xpmap_ptom((uint32_t)gdt - KERNBASE) >> PAGE_SHIFT;
	{	/*
		 * Enter the gdt page RO into the kernel map. We can't
		 * use pmap_kenter_pa() here, because %fs is not
		 * usable until the gdt is loaded, and %fs is used as
		 * the base pointer for curcpu() and curlwp(), both of
		 * which are in the callpath of pmap_kenter_pa().
		 * So we mash up our own - this is MD code anyway.
		 */
		pt_entry_t pte;
		pt_entry_t pg_nx = (cpu_feature[2] & CPUID_NOX ? PG_NX : 0);

		pte = pmap_pa2pte((vaddr_t)gdt - KERNBASE);
		pte |= PG_k | PG_RO | pg_nx | PG_V;

		if (HYPERVISOR_update_va_mapping((vaddr_t)gdt, pte,
		    UVMF_INVLPG) < 0) {
			panic("gdt page RO update failed.\n");
		}
	}

	XENPRINTK(("loading gdt %lx, %d entries\n", frames[0] << PAGE_SHIFT,
	    NGDT));
	if (HYPERVISOR_set_gdt(frames, NGDT /* XXX is it right? */))
		panic("HYPERVISOR_set_gdt failed!\n");

	lgdt_finish();
#endif /* !XEN */
}

static void
init386_msgbuf(void)
{
	/* Message buffer is located at end of core. */
	struct vm_physseg *vps;
	psize_t sz = round_page(MSGBUFSIZE);
	psize_t reqsz = sz;
	unsigned int x;

 search_again:
	vps = NULL;
	for (x = 0; x < vm_nphysseg; ++x) {
		vps = VM_PHYSMEM_PTR(x);
		if (ctob(vps->avail_end) == avail_end) {
			break;
		}
	}
	if (x == vm_nphysseg)
		panic("init386: can't find end of memory");

	/* Shrink so it'll fit in the last segment. */
	if (vps->avail_end - vps->avail_start < atop(sz))
		sz = ctob(vps->avail_end - vps->avail_start);

	vps->avail_end -= atop(sz);
	vps->end -= atop(sz);
	msgbuf_p_seg[msgbuf_p_cnt].sz = sz;
	msgbuf_p_seg[msgbuf_p_cnt++].paddr = ctob(vps->avail_end);

	/* Remove the last segment if it now has no pages. */
	if (vps->start == vps->end) {
		for (--vm_nphysseg; x < vm_nphysseg; x++)
			VM_PHYSMEM_PTR_SWAP(x, x + 1);
	}

	/* Now find where the new avail_end is. */
	for (avail_end = 0, x = 0; x < vm_nphysseg; x++)
		if (VM_PHYSMEM_PTR(x)->avail_end > avail_end)
			avail_end = VM_PHYSMEM_PTR(x)->avail_end;
	avail_end = ctob(avail_end);

	if (sz == reqsz)
		return;

	reqsz -= sz;
	if (msgbuf_p_cnt == VM_PHYSSEG_MAX) {
		/* No more segments available, bail out. */
		printf("WARNING: MSGBUFSIZE (%zu) too large, using %zu.\n",
		    (size_t)MSGBUFSIZE, (size_t)(MSGBUFSIZE - reqsz));
		return;
	}

	sz = reqsz;
	goto search_again;
}

#ifndef XEN
static void
init386_pte0(void)
{
	paddr_t paddr;
	vaddr_t vaddr;

	paddr = 4 * PAGE_SIZE;
	vaddr = (vaddr_t)vtopte(0);
	pmap_kenter_pa(vaddr, paddr, VM_PROT_ALL, 0);
	pmap_update(pmap_kernel());
	/* make sure it is clean before using */
	memset((void *)vaddr, 0, PAGE_SIZE);
}
#endif /* !XEN */

static void
init386_ksyms(void)
{
#if NKSYMS || defined(DDB) || defined(MODULAR)
	extern int end;
	struct btinfo_symtab *symtab;

#ifdef DDB
	db_machine_init();
#endif

#if defined(MULTIBOOT)
	if (multiboot_ksyms_addsyms_elf())
		return;
#endif

	if ((symtab = lookup_bootinfo(BTINFO_SYMTAB)) == NULL) {
		ksyms_addsyms_elf(*(int *)&end, ((int *)&end) + 1, esym);
		return;
	}

	symtab->ssym += KERNBASE;
	symtab->esym += KERNBASE;
	ksyms_addsyms_elf(symtab->nsym, (int *)symtab->ssym,
	    (int *)symtab->esym);
#endif
}

void
init386(paddr_t first_avail)
{
	extern void consinit(void);
	int x;
#ifndef XEN
	union descriptor *tgdt;
	struct region_descriptor region;
#endif
#if NBIOSCALL > 0
	extern int biostramp_image_size;
	extern u_char biostramp_image[];
#endif

#ifdef XEN
	XENPRINTK(("HYPERVISOR_shared_info %p (%x)\n", HYPERVISOR_shared_info,
	    xen_start_info.shared_info));
	KASSERT(HYPERVISOR_shared_info != NULL);
	cpu_info_primary.ci_vcpu = &HYPERVISOR_shared_info->vcpu_info[0];
#endif

	uvm_lwp_setuarea(&lwp0, lwp0uarea);

	cpu_probe(&cpu_info_primary);
	cpu_init_msrs(&cpu_info_primary, true);

#ifdef PAE
	use_pae = 1;
#else
	use_pae = 0;
#endif

#ifdef XEN
	struct pcb *pcb = lwp_getpcb(&lwp0);
	pcb->pcb_cr3 = PDPpaddr;
	__PRINTK(("pcb_cr3 0x%lx cr3 0x%lx\n",
	    PDPpaddr, xpmap_ptom(PDPpaddr)));
	XENPRINTK(("lwp0uarea %p first_avail %p\n",
	    lwp0uarea, (void *)(long)first_avail));
	XENPRINTK(("ptdpaddr %p atdevbase %p\n", (void *)PDPpaddr,
	    (void *)atdevbase));
#endif

#if defined(PAE) && !defined(XEN)
	/*
	 * Save VA and PA of L3 PD of boot processor (for Xen, this is done
	 * in xen_pmap_bootstrap())
	 */
	cpu_info_primary.ci_pae_l3_pdirpa = rcr3();
	cpu_info_primary.ci_pae_l3_pdir = (pd_entry_t *)(rcr3() + KERNBASE);
#endif /* PAE && !XEN */

	/*
	 * Initialize PAGE_SIZE-dependent variables.
	 */
	uvm_setpagesize();

	/*
	 * Start with 2 color bins -- this is just a guess to get us
	 * started.  We'll recolor when we determine the largest cache
	 * sizes on the system.
	 */
	uvmexp.ncolors = 2;

#ifndef XEN
	/*
	 * Low memory reservations:
	 * Page 0:	BIOS data
	 * Page 1:	BIOS callback
	 * Page 2:	MP bootstrap code (MP_TRAMPOLINE)
	 * Page 3:	ACPI wakeup code (ACPI_WAKEUP_ADDR)
	 * Page 4:	Temporary page table for 0MB-4MB
	 * Page 5:	Temporary page directory
	 */
	avail_start = 6 * PAGE_SIZE;
#else /* XEN */
	/* Parse Xen command line (replace bootinfo) */
	xen_parse_cmdline(XEN_PARSE_BOOTFLAGS, NULL);

	/* Steal one page for gdt */
	gdt = (void *)((u_long)first_avail + KERNBASE);
	first_avail += PAGE_SIZE;

	/* Determine physical address space */
	first_avail = round_page(first_avail);
	avail_start = first_avail;
	avail_end = ctob((paddr_t)xen_start_info.nr_pages);
	pmap_pa_start = (KERNTEXTOFF - KERNBASE);
	pmap_pa_end = pmap_pa_start + ctob((paddr_t)xen_start_info.nr_pages);
	mem_clusters[0].start = avail_start;
	mem_clusters[0].size = avail_end - avail_start;
	mem_cluster_cnt++;
	physmem += xen_start_info.nr_pages;
	uvmexp.wired += atop(avail_start);

	/*
	 * initgdt() has to be done before consinit(), so that %fs is properly
	 * initialised. initgdt() uses pmap_kenter_pa so it can't be called
	 * before the above variables are set.
	 */
	initgdt(gdt);

	mutex_init(&pte_lock, MUTEX_DEFAULT, IPL_VM);
#endif /* XEN */

#if NISA > 0 || NPCI > 0
	x86_bus_space_init();
#endif /* NISA > 0 || NPCI > 0 */

	consinit();	/* XXX SHOULD NOT BE DONE HERE */

#ifdef DEBUG_MEMLOAD
	printf("mem_cluster_count: %d\n", mem_cluster_cnt);
#endif

	/*
	 * Call pmap initialization to make new kernel address space.
	 * We must do this before loading pages into the VM system.
	 */
	pmap_bootstrap((vaddr_t)atdevbase + IOM_SIZE);

#ifndef XEN
	/* Initialize the memory clusters. */
	init_x86_clusters();

	/* Internalize the physical pages into the VM system. */
	init_x86_vm(first_avail);
#else /* XEN */
	XENPRINTK(("load the memory cluster 0x%" PRIx64 " (%" PRId64 ") - "
	    "0x%" PRIx64 " (%" PRId64 ")\n",
	    (uint64_t)avail_start, (uint64_t)atop(avail_start),
	    (uint64_t)avail_end, (uint64_t)atop(avail_end)));
	uvm_page_physload(atop(avail_start), atop(avail_end),
	    atop(avail_start), atop(avail_end),
	    VM_FREELIST_DEFAULT);

	/* Reclaim the boot gdt page - see locore.s */
	{
		pt_entry_t pte;
		pt_entry_t pg_nx = (cpu_feature[2] & CPUID_NOX ? PG_NX : 0);

		pte = pmap_pa2pte((vaddr_t)tmpgdt - KERNBASE);
		pte |= PG_k | PG_RW | pg_nx | PG_V;

		if (HYPERVISOR_update_va_mapping((vaddr_t)tmpgdt, pte,
		    UVMF_INVLPG) < 0) {
			panic("tmpgdt page reclaim RW update failed.\n");
		}
	}
#endif /* !XEN */

	init386_msgbuf();

#ifndef XEN
	/*
	 * XXX Remove this
	 *
	 * Setup a temporary Page Table Entry to allow identity mappings of
	 * the real mode address. This is required by:
	 * - bioscall
	 * - MP bootstrap
	 * - ACPI wakecode
	 */
	init386_pte0();

#if NBIOSCALL > 0
	KASSERT(biostramp_image_size <= PAGE_SIZE);
	pmap_kenter_pa((vaddr_t)BIOSTRAMP_BASE, (paddr_t)BIOSTRAMP_BASE,
	    VM_PROT_ALL, 0);
	pmap_update(pmap_kernel());
	memcpy((void *)BIOSTRAMP_BASE, biostramp_image, biostramp_image_size);

	/* Needed early, for bioscall() */
	cpu_info_primary.ci_pmap = pmap_kernel();
#endif
#endif /* !XEN */

	pmap_kenter_pa(idt_vaddr, idt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
	pmap_update(pmap_kernel());
	memset((void *)idt_vaddr, 0, PAGE_SIZE);

#ifndef XEN
	idt_init();

	idt = (struct gate_descriptor *)idt_vaddr;
	pmap_kenter_pa(pentium_idt_vaddr, idt_paddr, VM_PROT_READ, 0);
	pmap_update(pmap_kernel());
	pentium_idt = (union descriptor *)pentium_idt_vaddr;

	tgdt = gdt;
	gdt = (union descriptor *)
		    ((char *)idt + NIDT * sizeof(struct gate_descriptor));
	ldt = gdt + NGDT;

	memcpy(gdt, tgdt, NGDT * sizeof(*gdt));

	setsegment(&gdt[GLDT_SEL].sd, ldt, NLDT * sizeof(ldt[0]) - 1,
	    SDT_SYSLDT, SEL_KPL, 0, 0);
#else
	HYPERVISOR_set_callbacks(
	    GSEL(GCODE_SEL, SEL_KPL), (unsigned long)hypervisor_callback,
	    GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback);

	ldt = (union descriptor *)idt_vaddr;
#endif /* XEN */

	/* make ldt gates and memory segments */
	setgate(&ldt[LSYS5CALLS_SEL].gd, &IDTVEC(osyscall), 1,
	    SDT_SYS386CGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));

	ldt[LUCODE_SEL] = gdt[GUCODE_SEL];
	ldt[LUCODEBIG_SEL] = gdt[GUCODEBIG_SEL];
	ldt[LUDATA_SEL] = gdt[GUDATA_SEL];
	ldt[LSOL26CALLS_SEL] = ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL];

#ifndef XEN
	/* exceptions */
	for (x = 0; x < 32; x++) {
		idt_vec_reserve(x);
		setgate(&idt[x], IDTVEC(exceptions)[x], 0, SDT_SYS386IGT,
		    (x == 3 || x == 4) ? SEL_UPL : SEL_KPL,
		    GSEL(GCODE_SEL, SEL_KPL));
	}

	/* new-style interrupt gate for syscalls */
	idt_vec_reserve(128);
	setgate(&idt[128], &IDTVEC(syscall), 0, SDT_SYS386IGT, SEL_UPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	idt_vec_reserve(0xd2);
	setgate(&idt[0xd2], &IDTVEC(svr4_fasttrap), 0, SDT_SYS386IGT,
	    SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));

	setregion(&region, gdt, NGDT * sizeof(gdt[0]) - 1);
	lgdt(&region);

	cpu_init_idt();
#else /* XEN */
	memset(xen_idt, 0, sizeof(trap_info_t) * MAX_XEN_IDT);
	xen_idt_idx = 0;
	for (x = 0; x < 32; x++) {
		KASSERT(xen_idt_idx < MAX_XEN_IDT);
		xen_idt[xen_idt_idx].vector = x;

		switch (x) {
		case 2:  /* NMI */
		case 18: /* MCA */
			TI_SET_IF(&(xen_idt[xen_idt_idx]), 2);
			break;
		case 3:
		case 4:
			xen_idt[xen_idt_idx].flags = SEL_UPL;
			break;
		default:
			xen_idt[xen_idt_idx].flags = SEL_XEN;
			break;
		}

		xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL);
		xen_idt[xen_idt_idx].address =
			(uint32_t)IDTVEC(exceptions)[x];
		xen_idt_idx++;
	}
	KASSERT(xen_idt_idx < MAX_XEN_IDT);
	xen_idt[xen_idt_idx].vector = 128;
	xen_idt[xen_idt_idx].flags = SEL_UPL;
	xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL);
	xen_idt[xen_idt_idx].address = (uint32_t)&IDTVEC(syscall);
	xen_idt_idx++;
	KASSERT(xen_idt_idx < MAX_XEN_IDT);
	xen_idt[xen_idt_idx].vector = 0xd2;
	xen_idt[xen_idt_idx].flags = SEL_UPL;
	xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL);
	xen_idt[xen_idt_idx].address = (uint32_t)&IDTVEC(svr4_fasttrap);
	xen_idt_idx++;
	lldt(GSEL(GLDT_SEL, SEL_KPL));
	cpu_init_idt();
#endif /* XEN */

	init386_ksyms();

#if NMCA > 0
	/* check for MCA bus, needed to be done before ISA stuff - if
	 * MCA is detected, ISA needs to use level triggered interrupts
	 * by default */
	mca_busprobe();
#endif

#ifdef XEN
	XENPRINTF(("events_default_setup\n"));
	events_default_setup();
#else
	intr_default_setup();
#endif

	splraise(IPL_HIGH);
	x86_enable_intr();

#ifdef DDB
	if (boothowto & RB_KDB)
		Debugger();
#endif
#ifdef IPKDB
	ipkdb_init();
	if (boothowto & RB_KDB)
		ipkdb_connect(0);
#endif
#ifdef KGDB
	kgdb_port_init();
	if (boothowto & RB_KDB) {
		kgdb_debug_init = 1;
		kgdb_connect(1);
	}
#endif

	if (physmem < btoc(2 * 1024 * 1024)) {
		printf("warning: too little memory available; "
		       "have %lu bytes, want %lu bytes\n"
		       "running in degraded mode\n"
		       "press a key to confirm\n\n",
		       (unsigned long)ptoa(physmem), 2*1024*1024UL);
		cngetc();
	}

	rw_init(&svr4_fasttrap_lock);
}

#include <dev/ic/mc146818reg.h>		/* for NVRAM POST */
#include <i386/isa/nvram.h>		/* for NVRAM POST */

void
cpu_reset(void)
{
#ifdef XEN
	HYPERVISOR_reboot();
	for (;;);
#else /* !XEN */
	struct region_descriptor region;

	x86_disable_intr();

	/*
	 * Ensure the NVRAM reset byte contains something vaguely sane.
	 */
	outb(IO_RTC, NVRAM_RESET);
	outb(IO_RTC+1, NVRAM_RESET_RST);

	/*
	 * Reset AMD Geode SC1100.
	 *
	 * 1) Write PCI Configuration Address Register (0xcf8) to
	 *    select Function 0, Register 0x44: Bridge Configuration,
	 *    GPIO and LPC Configuration Register Space, Reset
	 *    Control Register.
	 *
	 * 2) Write 0xf to PCI Configuration Data Register (0xcfc)
	 *    to reset IDE controller, IDE bus, and PCI bus, and
	 *    to trigger a system-wide reset.
	 *
	 * See AMD Geode SC1100 Processor Data Book, Revision 2.0,
	 * sections 6.3.1, 6.3.2, and 6.4.1.
	 */
	if (cpu_info_primary.ci_signature == 0x540) {
		outl(0xcf8, 0x80009044);
		outl(0xcfc, 0xf);
	}

	x86_reset();

	/*
	 * Try to cause a triple fault and watchdog reset by making the IDT
	 * invalid and causing a fault.
	 */
	memset((void *)idt, 0, NIDT * sizeof(idt[0]));
	setregion(&region, idt, NIDT * sizeof(idt[0]) - 1);
	lidt(&region);
	breakpoint();

#if 0
	/*
	 * Try to cause a triple fault and watchdog reset by unmapping the
	 * entire address space and doing a TLB flush.
	 */
	memset((void *)PTD, 0, PAGE_SIZE);
	tlbflush();
#endif

	for (;;);
#endif /* XEN */
}
void
cpu_getmcontext(struct lwp *l, mcontext_t *mcp, unsigned int *flags)
{
	const struct trapframe *tf = l->l_md.md_regs;
	__greg_t *gr = mcp->__gregs;
	__greg_t ras_eip;

	/* Save register context. */
#ifdef VM86
	if (tf->tf_eflags & PSL_VM) {
		gr[_REG_GS]  = tf->tf_vm86_gs;
		gr[_REG_FS]  = tf->tf_vm86_fs;
		gr[_REG_ES]  = tf->tf_vm86_es;
		gr[_REG_DS]  = tf->tf_vm86_ds;
		gr[_REG_EFL] = get_vflags(l);
	} else
#endif
	{
		gr[_REG_GS]  = tf->tf_gs;
		gr[_REG_FS]  = tf->tf_fs;
		gr[_REG_ES]  = tf->tf_es;
		gr[_REG_DS]  = tf->tf_ds;
		gr[_REG_EFL] = tf->tf_eflags;
	}
	gr[_REG_EDI]    = tf->tf_edi;
	gr[_REG_ESI]    = tf->tf_esi;
	gr[_REG_EBP]    = tf->tf_ebp;
	gr[_REG_EBX]    = tf->tf_ebx;
	gr[_REG_EDX]    = tf->tf_edx;
	gr[_REG_ECX]    = tf->tf_ecx;
	gr[_REG_EAX]    = tf->tf_eax;
	gr[_REG_EIP]    = tf->tf_eip;
	gr[_REG_CS]     = tf->tf_cs;
	gr[_REG_ESP]    = tf->tf_esp;
	gr[_REG_UESP]   = tf->tf_esp;
	gr[_REG_SS]     = tf->tf_ss;
	gr[_REG_TRAPNO] = tf->tf_trapno;
	gr[_REG_ERR]    = tf->tf_err;

	if ((ras_eip = (__greg_t)ras_lookup(l->l_proc,
	    (void *) gr[_REG_EIP])) != -1)
		gr[_REG_EIP] = ras_eip;

	*flags |= _UC_CPU;

	mcp->_mc_tlsbase = (uintptr_t)l->l_private;
	*flags |= _UC_TLSBASE;

	/*
	 * Save floating point register context.
	 *
	 * If the cpu doesn't support fxsave we must still write to
	 * the entire 512 byte area - otherwise we leak kernel memory
	 * contents to userspace.
	 * It wouldn't matter if we were doing the copyout here.
	 * So we might as well convert to fxsave format.
	 */
	__CTASSERT(sizeof (struct fxsave) ==
	    sizeof mcp->__fpregs.__fp_reg_set.__fp_xmm_state);
	process_read_fpregs_xmm(l, (struct fxsave *)
	    &mcp->__fpregs.__fp_reg_set.__fp_xmm_state);
	memset(&mcp->__fpregs.__fp_pad, 0, sizeof mcp->__fpregs.__fp_pad);
	*flags |= _UC_FXSAVE | _UC_FPU;
}

int
cpu_mcontext_validate(struct lwp *l, const mcontext_t *mcp)
{
	const __greg_t *gr = mcp->__gregs;
	struct trapframe *tf = l->l_md.md_regs;

	/*
	 * Check for security violations.  If we're returning
	 * to protected mode, the CPU will validate the segment
	 * registers automatically and generate a trap on
	 * violations.  We handle the trap, rather than doing
	 * all of the checking here.
	 */
	if (((gr[_REG_EFL] ^ tf->tf_eflags) & PSL_USERSTATIC) ||
	    !USERMODE(gr[_REG_CS], gr[_REG_EFL]))
		return EINVAL;

	return 0;
}

int
cpu_setmcontext(struct lwp *l, const mcontext_t *mcp, unsigned int flags)
{
	struct trapframe *tf = l->l_md.md_regs;
	const __greg_t *gr = mcp->__gregs;
	struct proc *p = l->l_proc;
	int error;

	/* Restore register context, if any. */
	if ((flags & _UC_CPU) != 0) {
#ifdef VM86
		if (gr[_REG_EFL] & PSL_VM) {
			tf->tf_vm86_gs = gr[_REG_GS];
			tf->tf_vm86_fs = gr[_REG_FS];
			tf->tf_vm86_es = gr[_REG_ES];
			tf->tf_vm86_ds = gr[_REG_DS];
			set_vflags(l, gr[_REG_EFL]);
			if (flags & _UC_VM) {
				void syscall_vm86(struct trapframe *);
				l->l_proc->p_md.md_syscall = syscall_vm86;
			}
		} else
#endif
		{
			error = cpu_mcontext_validate(l, mcp);
			if (error)
				return error;

			tf->tf_gs = gr[_REG_GS];
			tf->tf_fs = gr[_REG_FS];
			tf->tf_es = gr[_REG_ES];
			tf->tf_ds = gr[_REG_DS];
			/* Only change the user-alterable part of eflags */
			tf->tf_eflags &= ~PSL_USER;
			tf->tf_eflags |= (gr[_REG_EFL] & PSL_USER);
		}
		tf->tf_edi    = gr[_REG_EDI];
		tf->tf_esi    = gr[_REG_ESI];
		tf->tf_ebp    = gr[_REG_EBP];
		tf->tf_ebx    = gr[_REG_EBX];
		tf->tf_edx    = gr[_REG_EDX];
		tf->tf_ecx    = gr[_REG_ECX];
		tf->tf_eax    = gr[_REG_EAX];
		tf->tf_eip    = gr[_REG_EIP];
		tf->tf_cs     = gr[_REG_CS];
		tf->tf_esp    = gr[_REG_UESP];
		tf->tf_ss     = gr[_REG_SS];
	}

	if ((flags & _UC_TLSBASE) != 0)
		lwp_setprivate(l, (void *)(uintptr_t)mcp->_mc_tlsbase);

	/* Restore floating point register context, if given. */
	if ((flags & _UC_FPU) != 0) {
		__CTASSERT(sizeof (struct fxsave) ==
		    sizeof mcp->__fpregs.__fp_reg_set.__fp_xmm_state);
		__CTASSERT(sizeof (struct save87) ==
		    sizeof mcp->__fpregs.__fp_reg_set.__fpchip_state);

		if (flags & _UC_FXSAVE) {
			process_write_fpregs_xmm(l, (const struct fxsave *)
			    &mcp->__fpregs.__fp_reg_set.__fp_xmm_state);
		} else {
			process_write_fpregs_s87(l, (const struct save87 *)
			    &mcp->__fpregs.__fp_reg_set.__fpchip_state);
		}
	}

	mutex_enter(p->p_lock);
	if (flags & _UC_SETSTACK)
		l->l_sigstk.ss_flags |= SS_ONSTACK;
	if (flags & _UC_CLRSTACK)
		l->l_sigstk.ss_flags &= ~SS_ONSTACK;
	mutex_exit(p->p_lock);
	return (0);
}

void
cpu_initclocks(void)
{

	(*initclock_func)();
}

#define	DEV_IO 14		/* iopl for compat_10 */

int
mm_md_open(dev_t dev, int flag, int mode, struct lwp *l)
{

	switch (minor(dev)) {
	case DEV_IO:
		/*
		 * This is done by i386_iopl(3) now.
		 *
		 * #if defined(COMPAT_10) || defined(COMPAT_FREEBSD)
		 */
		if (flag & FWRITE) {
			struct trapframe *fp;
			int error;

			error = kauth_authorize_machdep(l->l_cred,
			    KAUTH_MACHDEP_IOPL, NULL, NULL, NULL, NULL);
			if (error)
				return (error);
			fp = curlwp->l_md.md_regs;
			fp->tf_eflags |= PSL_IOPL;
		}
		break;
	default:
		break;
	}
	return 0;
}

#ifdef PAE
void
cpu_alloc_l3_page(struct cpu_info *ci)
{
	int ret;
	struct pglist pg;
	struct vm_page *vmap;

	KASSERT(ci != NULL);
	/*
	 * Allocate a page for the per-CPU L3 PD.  cr3 being 32 bits, the
	 * PA must reside below the 4GB boundary.
	 */
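	/*
	 * Editor's note, not in the original source: the "32" below is
	 * uvm_pglistalloc()'s alignment argument; in PAE mode %cr3 holds
	 * a 32-byte-aligned pointer to the page directory pointer table,
	 * hence the constraint.
	 */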
	ret = uvm_pglistalloc(PAGE_SIZE, 0, 0x100000000ULL, 32, 0, &pg, 1, 0);
	vmap = TAILQ_FIRST(&pg);

	if (ret != 0 || vmap == NULL)
		panic("%s: failed to allocate L3 pglist for CPU %d (ret %d)\n",
		    __func__, cpu_index(ci), ret);

	ci->ci_pae_l3_pdirpa = vmap->phys_addr;

	ci->ci_pae_l3_pdir = (paddr_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
	    UVM_KMF_VAONLY | UVM_KMF_NOWAIT);
	if (ci->ci_pae_l3_pdir == NULL)
		panic("%s: failed to allocate L3 PD for CPU %d\n",
		    __func__, cpu_index(ci));

	pmap_kenter_pa((vaddr_t)ci->ci_pae_l3_pdir, ci->ci_pae_l3_pdirpa,
	    VM_PROT_READ | VM_PROT_WRITE, 0);

	pmap_update(pmap_kernel());
}
#endif /* PAE */