1 /* $NetBSD: machdep.c,v 1.759 2016/07/16 17:02:34 maxv Exp $ */
2
3 /*-
4 * Copyright (c) 1996, 1997, 1998, 2000, 2004, 2006, 2008, 2009
5 * The NetBSD Foundation, Inc.
6 * All rights reserved.
7 *
8 * This code is derived from software contributed to The NetBSD Foundation
9 * by Charles M. Hannum, by Jason R. Thorpe of the Numerical Aerospace
10 * Simulation Facility NASA Ames Research Center, by Julio M. Merino Vidal,
11 * and by Andrew Doran.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
24 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
26 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
29 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
30 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
31 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 * POSSIBILITY OF SUCH DAMAGE.
33 */
34
35 /*-
36 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
37 * All rights reserved.
38 *
39 * This code is derived from software contributed to Berkeley by
40 * William Jolitz.
41 *
42 * Redistribution and use in source and binary forms, with or without
43 * modification, are permitted provided that the following conditions
44 * are met:
45 * 1. Redistributions of source code must retain the above copyright
46 * notice, this list of conditions and the following disclaimer.
47 * 2. Redistributions in binary form must reproduce the above copyright
48 * notice, this list of conditions and the following disclaimer in the
49 * documentation and/or other materials provided with the distribution.
50 * 3. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)machdep.c 7.4 (Berkeley) 6/3/91
67 */
68
69 #include <sys/cdefs.h>
70 __KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.759 2016/07/16 17:02:34 maxv Exp $");
71
72 #include "opt_beep.h"
73 #include "opt_compat_ibcs2.h"
74 #include "opt_compat_freebsd.h"
75 #include "opt_compat_netbsd.h"
76 #include "opt_compat_svr4.h"
77 #include "opt_cpureset_delay.h"
78 #include "opt_ddb.h"
79 #include "opt_ipkdb.h"
80 #include "opt_kgdb.h"
81 #include "opt_mtrr.h"
82 #include "opt_modular.h"
83 #include "opt_multiboot.h"
84 #include "opt_multiprocessor.h"
85 #include "opt_physmem.h"
86 #include "opt_realmem.h"
87 #include "opt_user_ldt.h"
88 #include "opt_vm86.h"
89 #include "opt_xen.h"
90 #include "isa.h"
91 #include "pci.h"
92
93 #include <sys/param.h>
94 #include <sys/systm.h>
95 #include <sys/signal.h>
96 #include <sys/signalvar.h>
97 #include <sys/kernel.h>
98 #include <sys/cpu.h>
99 #include <sys/exec.h>
100 #include <sys/fcntl.h>
101 #include <sys/reboot.h>
102 #include <sys/conf.h>
103 #include <sys/kauth.h>
104 #include <sys/mbuf.h>
105 #include <sys/msgbuf.h>
106 #include <sys/mount.h>
107 #include <sys/syscallargs.h>
108 #include <sys/core.h>
109 #include <sys/kcore.h>
110 #include <sys/ucontext.h>
111 #include <sys/ras.h>
112 #include <sys/ksyms.h>
113 #include <sys/device.h>
114
115 #ifdef IPKDB
116 #include <ipkdb/ipkdb.h>
117 #endif
118
119 #ifdef KGDB
120 #include <sys/kgdb.h>
121 #endif
122
123 #include <dev/cons.h>
124 #include <dev/mm.h>
125
126 #include <uvm/uvm.h>
127 #include <uvm/uvm_page.h>
128
129 #include <sys/sysctl.h>
130
131 #include <machine/cpu.h>
132 #include <machine/cpufunc.h>
133 #include <machine/cpuvar.h>
134 #include <machine/gdt.h>
135 #include <machine/intr.h>
136 #include <machine/kcore.h>
137 #include <machine/pio.h>
138 #include <machine/psl.h>
139 #include <machine/reg.h>
140 #include <machine/specialreg.h>
141 #include <machine/bootinfo.h>
142 #include <machine/mtrr.h>
143 #include <x86/x86/tsc.h>
144
145 #include <x86/fpu.h>
146 #include <x86/machdep.h>
147
148 #include <machine/multiboot.h>
149 #ifdef XEN
150 #include <xen/evtchn.h>
151 #include <xen/xen.h>
152 #include <xen/hypervisor.h>
153
154 /* #define XENDEBUG */
155 /* #define XENDEBUG_LOW */
156
157 #ifdef XENDEBUG
158 #define XENPRINTF(x) printf x
159 #define XENPRINTK(x) printk x
160 #else
161 #define XENPRINTF(x)
162 #define XENPRINTK(x)
163 #endif
164 #define PRINTK(x) printf x
165 #endif /* XEN */
166
167 #include <dev/isa/isareg.h>
168 #include <machine/isa_machdep.h>
169 #include <dev/ic/i8042reg.h>
170
171 #ifdef DDB
172 #include <machine/db_machdep.h>
173 #include <ddb/db_extern.h>
174 #endif
175
176 #ifdef VM86
177 #include <machine/vm86.h>
178 #endif
179
180 #include "acpica.h"
181 #include "bioscall.h"
182
183 #if NBIOSCALL > 0
184 #include <machine/bioscall.h>
185 #endif
186
187 #if NACPICA > 0
188 #include <dev/acpi/acpivar.h>
189 #define ACPI_MACHDEP_PRIVATE
190 #include <machine/acpi_machdep.h>
191 #endif
192
193 #include "isa.h"
194 #include "isadma.h"
195 #include "ksyms.h"
196
197 #include "cardbus.h"
198 #if NCARDBUS > 0
199 /* For rbus_min_start hint. */
200 #include <sys/bus.h>
201 #include <dev/cardbus/rbus.h>
202 #include <machine/rbus_machdep.h>
203 #endif
204
205 #include "mca.h"
206 #if NMCA > 0
207 #include <machine/mca_machdep.h> /* for mca_busprobe() */
208 #endif
209
210 #ifdef MULTIPROCESSOR /* XXX */
211 #include <machine/mpbiosvar.h> /* XXX */
212 #endif /* XXX */
213
/* the following is used externally (sysctl_hw) */
char machine[] = "i386";		/* CPU "architecture" */
char machine_arch[] = "i386";		/* machine == machine_arch */

/*
 * Milliseconds to wait between printing "rebooting..." and actually
 * resetting the CPU; overridable at build time with options CPURESET_DELAY.
 */
#ifdef CPURESET_DELAY
int cpureset_delay = CPURESET_DELAY;
#else
int cpureset_delay = 2000;		/* default to 2s */
#endif

#ifdef MTRR
struct mtrr_funcs *mtrr_funcs;		/* MTRR backend ops, set by cpu code */
#endif

int cpu_class;				/* CPU class (386/486/586/686) */
int use_pae;				/* nonzero if kernel built with PAE */
int i386_fpu_present = 1;		/* cleared if no FPU is detected */
int i386_fpu_fdivbug;			/* set on Pentium FDIV-bug parts */

int i386_use_fxsave;			/* fxsave/fxrstor available */
int i386_has_sse;			/* SSE supported */
int i386_has_sse2;			/* SSE2 supported */

/*
 * Kernel message buffer.  The buffer may be split across up to
 * VM_PHYSSEG_MAX physical segments (see init386_msgbuf()); cpu_startup()
 * maps them into one contiguous VA range at msgbuf_vaddr.
 */
vaddr_t msgbuf_vaddr;
struct {
	paddr_t paddr;			/* physical start of this piece */
	psize_t sz;			/* size of this piece, in bytes */
} msgbuf_p_seg[VM_PHYSSEG_MAX];
unsigned int msgbuf_p_cnt = 0;		/* number of pieces in use */

vaddr_t idt_vaddr;			/* VA of the IDT */
paddr_t idt_paddr;			/* PA of the IDT */
vaddr_t pentium_idt_vaddr;		/* read-only alias (F00F workaround) */

struct vm_map *phys_map = NULL;		/* submap for physio, see cpu_startup() */

extern paddr_t avail_start, avail_end;
#ifdef XEN
extern paddr_t pmap_pa_start, pmap_pa_end;
/* Entry points installed with the hypervisor for event/failsafe upcalls. */
void hypervisor_callback(void);
void failsafe_callback(void);
#endif

/*
 * Indirect delay/clock entry points: Xen domains must not touch the
 * i8254, so the backends are chosen at build time.
 */
#ifdef XEN
void (*delay_func)(unsigned int) = xen_delay;
void (*initclock_func)(void) = xen_initclocks;
#else
void (*delay_func)(unsigned int) = i8254_delay;
void (*initclock_func)(void) = i8254_initclocks;
#endif
264
265
/*
 * Size of memory segments, before any memory is stolen.
 */
phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX];
int mem_cluster_cnt = 0;

void init386(paddr_t);
void initgdt(union descriptor *);

extern int time_adjusted;		/* set if todr needs resync on reboot */

int *esym;				/* end of kernel symbol table (VA) */
int *eblob;				/* end of boot-loaded modules (VA) */
extern int boothowto;

#ifndef XEN

/* Base memory reported by BIOS. */
#ifndef REALBASEMEM
int biosbasemem = 0;
#else
int biosbasemem = REALBASEMEM;
#endif

/* Extended memory reported by BIOS. */
#ifndef REALEXTMEM
int biosextmem = 0;
#else
int biosextmem = REALEXTMEM;
#endif

/* Set if any boot-loader set biosbasemem/biosextmem. */
int biosmem_implicit;

/*
 * Representation of the bootinfo structure constructed by a NetBSD native
 * boot loader.  Only be used by native_loader().
 */
struct bootinfo_source {
	uint32_t bs_naddrs;		/* number of entries in bs_addrs */
	void *bs_addrs[1];		/* Actually longer. */
};

/* Only called by locore.S; no need to be in a header file. */
void native_loader(int, int, struct bootinfo_source *, paddr_t, int, int);
311
/*
 * Called as one of the very first things during system startup (just after
 * the boot loader gave control to the kernel image), this routine is in
 * charge of retrieving the parameters passed in by the boot loader and
 * storing them in the appropriate kernel variables.
 *
 * WARNING: Because the kernel has not yet relocated itself to KERNBASE,
 * special care has to be taken when accessing memory because absolute
 * addresses (referring to kernel symbols) do not work. So:
 *
 *    1) Avoid jumps to absolute addresses (such as gotos and switches).
 *    2) To access global variables use their physical address, which
 *       can be obtained using the RELOC macro.
 */
void
native_loader(int bl_boothowto, int bl_bootdev,
    struct bootinfo_source *bl_bootinfo, paddr_t bl_esym,
    int bl_biosextmem, int bl_biosbasemem)
{
	/* Translate a kernel symbol's VA to its pre-relocation PA. */
#define RELOC(type, x) ((type)((vaddr_t)(x) - KERNBASE))

	*RELOC(int *, &boothowto) = bl_boothowto;

#ifdef COMPAT_OLDBOOT
	/*
	 * Pre-1.3 boot loaders gave the boot device as a parameter
	 * (instead of a bootinfo entry).
	 */
	*RELOC(int *, &bootdev) = bl_bootdev;
#endif

	/*
	 * The boot loader provides a physical, non-relocated address
	 * for the symbols table's end.  We need to convert it to a
	 * virtual address.
	 */
	if (bl_esym != 0)
		*RELOC(int **, &esym) = (int *)((vaddr_t)bl_esym + KERNBASE);
	else
		*RELOC(int **, &esym) = 0;

	/*
	 * Copy bootinfo entries (if any) from the boot loader's
	 * representation to the kernel's bootinfo space.
	 */
	if (bl_bootinfo != NULL) {
		size_t i;
		uint8_t *data;
		struct bootinfo *bidest;
		struct btinfo_modulelist *bi;

		bidest = RELOC(struct bootinfo *, &bootinfo);

		data = &bidest->bi_data[0];

		for (i = 0; i < bl_bootinfo->bs_naddrs; i++) {
			struct btinfo_common *bc;

			bc = bl_bootinfo->bs_addrs[i];

			/* Stop (silently truncate) when out of space. */
			if ((data + bc->len) >
			    (&bidest->bi_data[0] + BOOTINFO_MAXSIZE))
				break;

			memcpy(data, bc, bc->len);
			/*
			 * If any modules were loaded, record where they
			 * end.  We'll need to skip over them.
			 */
			bi = (struct btinfo_modulelist *)data;
			if (bi->common.type == BTINFO_MODULELIST) {
				*RELOC(int **, &eblob) =
				    (int *)(bi->endpa + KERNBASE);
			}
			data += bc->len;
		}
		bidest->bi_nentries = i;
	}

	/*
	 * Configure biosbasemem and biosextmem only if they were not
	 * explicitly given during the kernel's build.
	 */
	if (*RELOC(int *, &biosbasemem) == 0) {
		*RELOC(int *, &biosbasemem) = bl_biosbasemem;
		*RELOC(int *, &biosmem_implicit) = 1;
	}
	if (*RELOC(int *, &biosextmem) == 0) {
		*RELOC(int *, &biosextmem) = bl_biosextmem;
		*RELOC(int *, &biosmem_implicit) = 1;
	}
#undef RELOC
}
405
406 #endif /* XEN */
407
/*
 * Machine-dependent startup code: map the message buffer, create the
 * physio submap, and finish GDT/TSS setup now that UVM is available.
 */
void
cpu_startup(void)
{
	int x, y;
	vaddr_t minaddr, maxaddr;
	psize_t sz;

	/*
	 * For console drivers that require uvm and pmap to be initialized,
	 * we'll give them one more chance here...
	 */
	consinit();

	/*
	 * Initialize error message buffer (at end of core).  The physical
	 * pages were reserved earlier (init386_msgbuf()); map them into
	 * one contiguous virtual range here.
	 */
	if (msgbuf_p_cnt == 0)
		panic("msgbuf paddr map has not been set up");
	/* Sum the sizes of all reserved msgbuf segments. */
	for (x = 0, sz = 0; x < msgbuf_p_cnt; sz += msgbuf_p_seg[x++].sz)
		continue;

	msgbuf_vaddr = uvm_km_alloc(kernel_map, sz, 0, UVM_KMF_VAONLY);
	if (msgbuf_vaddr == 0)
		panic("failed to valloc msgbuf_vaddr");

	/* Enter each physical segment, page by page, back to back. */
	for (y = 0, sz = 0; y < msgbuf_p_cnt; y++) {
		for (x = 0; x < btoc(msgbuf_p_seg[y].sz); x++, sz += PAGE_SIZE)
			pmap_kenter_pa((vaddr_t)msgbuf_vaddr + sz,
			    msgbuf_p_seg[y].paddr + x * PAGE_SIZE,
			    VM_PROT_READ|VM_PROT_WRITE, 0);
	}

	pmap_update(pmap_kernel());

	initmsgbuf((void *)msgbuf_vaddr, sz);

#ifdef MULTIBOOT
	multiboot_print_info();
#endif

#ifdef TRAPLOG
	/*
	 * Enable recording of branch from/to in MSR's
	 */
	wrmsr(MSR_DEBUGCTLMSR, 0x1);
#endif

#if NCARDBUS > 0
	/* Tell RBUS how much RAM we have, so it can use heuristics. */
	rbus_min_start_hint(ctob((psize_t)physmem));
#endif

	minaddr = 0;

	/*
	 * Allocate a submap for physio
	 */
	phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
	    VM_PHYS_SIZE, 0, false, NULL);

	/* Say hello. */
	banner();

	/* Safe for i/o port / memory space allocation to use malloc now. */
#if NISA > 0 || NPCI > 0
	x86_bus_space_mallocok();
#endif

	gdt_init();
	i386_proc0_tss_ldt_init();

#ifndef XEN
	cpu_init_tss(&cpu_info_primary);
	ltr(cpu_info_primary.ci_tss_sel);	/* load boot CPU's task reg */
#endif

	x86_startup();
}
489
/*
 * Set up proc0's TSS and LDT: initialize lwp0's PCB with the default
 * LDT selector, kernel stack pointer and segment registers, then load
 * the LDT (native) or hand the kernel stack to the hypervisor (Xen).
 */
void
i386_proc0_tss_ldt_init(void)
{
	struct lwp *l;
	struct pcb *pcb __diagused;

	l = &lwp0;
	pcb = lwp_getpcb(l);

	pmap_kernel()->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL);
	pcb->pcb_cr0 = rcr0() & ~CR0_TS;	/* FPU not "task switched" */
	/* Kernel stack top; leave 16 bytes of slop above the trapframe. */
	pcb->pcb_esp0 = uvm_lwp_getuarea(l) + USPACE - 16;
	pcb->pcb_iopl = SEL_KPL;
	/* Trapframe sits just below the kernel stack top. */
	l->l_md.md_regs = (struct trapframe *)pcb->pcb_esp0 - 1;
	memcpy(&pcb->pcb_fsd, &gdt[GUDATA_SEL], sizeof(pcb->pcb_fsd));
	memcpy(&pcb->pcb_gsd, &gdt[GUDATA_SEL], sizeof(pcb->pcb_gsd));

#ifndef XEN
	lldt(pmap_kernel()->pm_ldt_sel);
#else
	/* Ask the hypervisor to set CR0_TS (lazy FPU). */
	HYPERVISOR_fpu_taskswitch(1);
	XENPRINTF(("lwp tss sp %p ss %04x/%04x\n",
	    (void *)pcb->pcb_esp0,
	    GSEL(GDATA_SEL, SEL_KPL),
	    IDXSEL(GSEL(GDATA_SEL, SEL_KPL))));
	/* Register the kernel stack used on user->kernel transitions. */
	HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), pcb->pcb_esp0);
#endif
}
521
#ifdef XEN
/* used in assembly */
void i386_switch_context(lwp_t *);
void i386_tls_switch(lwp_t *);

/*
 * Switch context:
 * - switch stack pointer for user->kernel transition
 * Also propagates the incoming lwp's I/O privilege level to the
 * hypervisor, since Xen domains cannot set IOPL in eflags directly.
 */
void
i386_switch_context(lwp_t *l)
{
	struct pcb *pcb;
	struct physdev_op physop;

	pcb = lwp_getpcb(l);

	/* Tell Xen which kernel stack to use on the next trap. */
	HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), pcb->pcb_esp0);

	physop.cmd = PHYSDEVOP_SET_IOPL;
	physop.u.set_iopl.iopl = pcb->pcb_iopl;
	HYPERVISOR_physdev_op(&physop);
}
545
/*
 * Per-lwp TLS/FPU context switch (Xen): mark the FPU lazy if the
 * incoming lwp's FP state lives on another CPU, and refresh the
 * per-CPU GDT slots that back the %fs/%gs TLS segments.
 */
void
i386_tls_switch(lwp_t *l)
{
	struct cpu_info *ci = curcpu();
	struct pcb *pcb = lwp_getpcb(l);
	/*
	 * Raise the IPL to IPL_HIGH.
	 * FPU IPIs can alter the LWP's saved cr0.  Dropping the priority
	 * is deferred until mi_switch(), when cpu_switchto() returns.
	 */
	(void)splhigh();

	/*
	 * If our floating point registers are on a different CPU,
	 * set CR0_TS so we'll trap rather than reuse bogus state.
	 */

	if (l != ci->ci_fpcurlwp) {
		HYPERVISOR_fpu_taskswitch(1);
	}

	/* Update TLS segment pointers */
	update_descriptor(&ci->ci_gdt[GUFS_SEL],
	    (union descriptor *)&pcb->pcb_fsd);
	update_descriptor(&ci->ci_gdt[GUGS_SEL],
	    (union descriptor *)&pcb->pcb_gsd);

}
#endif /* XEN */
575
#ifndef XEN
/*
 * Set up TSS and I/O bitmap for one CPU: no valid I/O bitmap (all port
 * access traps from user level), kernel stack segment, default LDT and
 * the current page directory; then allocate a GDT slot for the TSS.
 */
void
cpu_init_tss(struct cpu_info *ci)
{
	struct i386tss *tss = &ci->ci_tss;

	/* Point the I/O bitmap past the TSS limit => no ports allowed. */
	tss->tss_iobase = IOMAP_INVALOFF << 16;
	tss->tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
	tss->tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
	tss->tss_cr3 = rcr3();
	ci->ci_tss_sel = tss_alloc(tss);
}
#endif /* XEN */
592
593 void *
getframe(struct lwp * l,int sig,int * onstack)594 getframe(struct lwp *l, int sig, int *onstack)
595 {
596 struct proc *p = l->l_proc;
597 struct trapframe *tf = l->l_md.md_regs;
598
599 /* Do we need to jump onto the signal stack? */
600 *onstack = (l->l_sigstk.ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0
601 && (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0;
602 if (*onstack)
603 return (char *)l->l_sigstk.ss_sp + l->l_sigstk.ss_size;
604 #ifdef VM86
605 if (tf->tf_eflags & PSL_VM)
606 return (void *)(tf->tf_esp + (tf->tf_ss << 4));
607 else
608 #endif
609 return (void *)tf->tf_esp;
610 }
611
/*
 * Build context to run handler in.  We invoke the handler
 * directly, only returning via the trampoline.  Note the
 * trampoline version numbers are coordinated with machine-
 * dependent code in libc.
 *
 * `sel' is the user code segment selector (GUCODE_SEL or
 * GUCODEBIG_SEL), `catcher' the handler entry point, and `fp' the
 * signal frame already copied out to the user stack.
 */
void
buildcontext(struct lwp *l, int sel, void *catcher, void *fp)
{
	struct trapframe *tf = l->l_md.md_regs;

	/* Reset all segment registers to the standard user selectors. */
	tf->tf_gs = GSEL(GUGS_SEL, SEL_UPL);
	tf->tf_fs = GSEL(GUFS_SEL, SEL_UPL);
	tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
	tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
	tf->tf_eip = (int)catcher;		/* resume in the handler */
	tf->tf_cs = GSEL(sel, SEL_UPL);
	tf->tf_eflags &= ~PSL_CLEARSIG;		/* drop trace/vm86 bits */
	tf->tf_esp = (int)fp;			/* stack -> signal frame */
	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);

	/* Ensure FP state is reset. */
	fpu_save_area_reset(l);
}
636
/*
 * Deliver a signal with siginfo: build a sigframe_siginfo on the chosen
 * stack, copy it out to user space, and point the lwp's trapframe at
 * the handler via buildcontext().  Called with p->p_lock held.
 */
void
sendsig_siginfo(const ksiginfo_t *ksi, const sigset_t *mask)
{
	struct lwp *l = curlwp;
	struct proc *p = l->l_proc;
	struct pmap *pmap = vm_map_pmap(&p->p_vmspace->vm_map);
	/* Pick the code segment that covers the process's highest exec VA. */
	int sel = pmap->pm_hiexec > I386_MAX_EXE_ADDR ?
	    GUCODEBIG_SEL : GUCODE_SEL;
	struct sigacts *ps = p->p_sigacts;
	int onstack, error;
	int sig = ksi->ksi_signo;
	struct sigframe_siginfo *fp = getframe(l, sig, &onstack), frame;
	sig_t catcher = SIGACTION(p, sig).sa_handler;
	struct trapframe *tf = l->l_md.md_regs;

	KASSERT(mutex_owned(p->p_lock));

	fp--;				/* make room for the frame itself */

	/* Return address is the libc signal trampoline. */
	frame.sf_ra = (int)ps->sa_sigdesc[sig].sd_tramp;
	frame.sf_signum = sig;
	frame.sf_sip = &fp->sf_si;
	frame.sf_ucp = &fp->sf_uc;
	frame.sf_si._info = ksi->ksi_info;
	frame.sf_uc.uc_flags = _UC_SIGMASK|_UC_VM;
	frame.sf_uc.uc_sigmask = *mask;
	frame.sf_uc.uc_link = l->l_ctxlink;
	frame.sf_uc.uc_flags |= (l->l_sigstk.ss_flags & SS_ONSTACK)
	    ? _UC_SETSTACK : _UC_CLRSTACK;
	memset(&frame.sf_uc.uc_stack, 0, sizeof(frame.sf_uc.uc_stack));

	if (tf->tf_eflags & PSL_VM)
		(*p->p_emul->e_syscall_intern)(p);
	sendsig_reset(l, sig);

	/* copyout may sleep/fault; drop the proc lock around it. */
	mutex_exit(p->p_lock);
	cpu_getmcontext(l, &frame.sf_uc.uc_mcontext, &frame.sf_uc.uc_flags);
	error = copyout(&frame, fp, sizeof(frame));
	mutex_enter(p->p_lock);

	if (error != 0) {
		/*
		 * Process has trashed its stack; give it an illegal
		 * instruction to halt it in its tracks.
		 */
		sigexit(l, SIGILL);
		/* NOTREACHED */
	}

	buildcontext(l, sel, catcher, fp);

	/* Remember that we're now on the signal stack. */
	if (onstack)
		l->l_sigstk.ss_flags |= SS_ONSTACK;
}
692
693 static void
maybe_dump(int howto)694 maybe_dump(int howto)
695 {
696 int s;
697
698 /* Disable interrupts. */
699 s = splhigh();
700
701 /* Do a dump if requested. */
702 if ((howto & (RB_DUMP | RB_HALT)) == RB_DUMP)
703 dumpsys();
704
705 splx(s);
706 }
707
/*
 * Halt, reboot or power down the machine according to `howto'.
 * Syncs filesystems and detaches devices unless panicking or RB_NOSYNC,
 * then performs the requested shutdown action.  Does not return.
 */
void
cpu_reboot(int howto, char *bootstr)
{
	static bool syncdone = false;
	int s = IPL_NONE;

	if (cold) {
		/* Too early to sync or detach anything; just halt. */
		howto |= RB_HALT;
		goto haltsys;
	}

	boothowto = howto;

	/* XXX used to dump after vfs_shutdown() and before
	 * detaching devices / shutdown hooks / pmf_system_shutdown().
	 */
	maybe_dump(howto);

	/*
	 * If we've panic'd, don't make the situation potentially
	 * worse by syncing or unmounting the file systems.
	 */
	if ((howto & RB_NOSYNC) == 0 && panicstr == NULL) {
		if (!syncdone) {
			syncdone = true;
			/* XXX used to force unmount as well, here */
			vfs_sync_all(curlwp);
			/*
			 * If we've been adjusting the clock, the todr
			 * will be out of synch; adjust it now.
			 *
			 * XXX used to do this after unmounting all
			 * filesystems with vfs_shutdown().
			 */
			if (time_adjusted != 0)
				resettodr();
		}

		/* Keep retrying until nothing is left to unmount/detach. */
		while (vfs_unmountall1(curlwp, false, false) ||
		       config_detach_all(boothowto) ||
		       vfs_unmount_forceone(curlwp))
			;	/* do nothing */
	} else
		suspendsched();

	pmf_system_shutdown(boothowto);

	s = splhigh();

	/* amd64 maybe_dump() */

haltsys:
	doshutdownhooks();

	if ((howto & RB_POWERDOWN) == RB_POWERDOWN) {
#if NACPICA > 0
		/* ACPI sleep-state entry must not run at splhigh. */
		if (s != IPL_NONE)
			splx(s);

		acpi_enter_sleep_state(ACPI_STATE_S5);
#else
		__USE(s);
#endif
#ifdef XEN
		HYPERVISOR_shutdown();
		for (;;);
#endif
	}

#ifdef MULTIPROCESSOR
	cpu_broadcast_halt();
#endif /* MULTIPROCESSOR */

	if (howto & RB_HALT) {
#if NACPICA > 0
		acpi_disable();
#endif

		printf("\n");
		printf("The operating system has halted.\n");
		printf("Please press any key to reboot.\n\n");

#ifdef BEEP_ONHALT
		{
			int c;
			for (c = BEEP_ONHALT_COUNT; c > 0; c--) {
				sysbeep(BEEP_ONHALT_PITCH,
				    BEEP_ONHALT_PERIOD * hz / 1000);
				delay(BEEP_ONHALT_PERIOD * 1000);
				sysbeep(0, BEEP_ONHALT_PERIOD * hz / 1000);
				delay(BEEP_ONHALT_PERIOD * 1000);
			}
		}
#endif

		cnpollc(1);	/* for proper keyboard command handling */
		if (cngetc() == 0) {
			/* no console attached, so just hlt */
			printf("No keyboard - cannot reboot after all.\n");
			for(;;) {
				x86_hlt();
			}
		}
		cnpollc(0);
	}

	printf("rebooting...\n");
	if (cpureset_delay > 0)
		delay(cpureset_delay * 1000);
	cpu_reset();
	for(;;) ;
	/*NOTREACHED*/
}
821
/*
 * Clear registers on exec: reset FPU state and build a fresh user
 * trapframe so the new image starts at pack->ep_entry with a clean
 * register set.
 */
void
setregs(struct lwp *l, struct exec_package *pack, vaddr_t stack)
{
	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
	struct pcb *pcb = lwp_getpcb(l);
	struct trapframe *tf;

#ifdef USER_LDT
	/* Discard any private LDT the previous image installed. */
	pmap_ldt_cleanup(l);
#endif

	/*
	 * Choose the x87 control word by binary version: old binaries
	 * get the historical NetBSD default, new ones the standard one.
	 */
	fpu_save_area_clear(l, pack->ep_osversion >= 699002600
	    ? __INITIAL_NPXCW__ : __NetBSD_COMPAT_NPXCW__);

	/* Default TLS descriptors: plain user data segments. */
	memcpy(&pcb->pcb_fsd, &gdt[GUDATA_SEL], sizeof(pcb->pcb_fsd));
	memcpy(&pcb->pcb_gsd, &gdt[GUDATA_SEL], sizeof(pcb->pcb_gsd));

	tf = l->l_md.md_regs;
	tf->tf_gs = GSEL(GUGS_SEL, SEL_UPL);
	tf->tf_fs = GSEL(GUFS_SEL, SEL_UPL);
	tf->tf_es = LSEL(LUDATA_SEL, SEL_UPL);
	tf->tf_ds = LSEL(LUDATA_SEL, SEL_UPL);
	tf->tf_edi = 0;
	tf->tf_esi = 0;
	tf->tf_ebp = 0;
	tf->tf_ebx = l->l_proc->p_psstrp;	/* ps_strings for csu code */
	tf->tf_edx = 0;
	tf->tf_ecx = 0;
	tf->tf_eax = 0;
	tf->tf_eip = pack->ep_entry;		/* image entry point */
	/* Big code segment only if the image maps above I386_MAX_EXE_ADDR. */
	tf->tf_cs = pmap->pm_hiexec > I386_MAX_EXE_ADDR ?
	    LSEL(LUCODEBIG_SEL, SEL_UPL) : LSEL(LUCODE_SEL, SEL_UPL);
	tf->tf_eflags = PSL_USERSET;
	tf->tf_esp = stack;
	tf->tf_ss = LSEL(LUDATA_SEL, SEL_UPL);
}
861
/*
 * Initialize segments and descriptor tables
 */

union descriptor *gdt, *ldt;		/* global and local descriptor tables */
union descriptor *pentium_idt;		/* RO IDT alias (Pentium F00F) */
extern vaddr_t lwp0uarea;
869
/*
 * Fill in an i386 gate descriptor: handler `func' reached through
 * selector `sel', gate `type' (interrupt/trap/task), privilege `dpl',
 * copying `args' words for call gates.  The 32-bit offset is split
 * across the low and high descriptor words as the hardware requires.
 */
void
setgate(struct gate_descriptor *gd, void *func, int args, int type, int dpl,
    int sel)
{

	gd->gd_looffset = (int)func;		/* offset bits 0..15 */
	gd->gd_selector = sel;
	gd->gd_stkcpy = args;
	gd->gd_xx = 0;				/* reserved, must be zero */
	gd->gd_type = type;
	gd->gd_dpl = dpl;
	gd->gd_p = 1;				/* mark present */
	gd->gd_hioffset = (int)func >> 16;	/* offset bits 16..31 */
}
884
885 void
unsetgate(struct gate_descriptor * gd)886 unsetgate(struct gate_descriptor *gd)
887 {
888 gd->gd_p = 0;
889 gd->gd_hioffset = 0;
890 gd->gd_looffset = 0;
891 gd->gd_selector = 0;
892 gd->gd_xx = 0;
893 gd->gd_stkcpy = 0;
894 gd->gd_type = 0;
895 gd->gd_dpl = 0;
896 }
897
898
/*
 * Fill in a region descriptor (the operand of lgdt/lidt): base linear
 * address and limit (size in bytes minus one).
 */
void
setregion(struct region_descriptor *rd, void *base, size_t limit)
{

	rd->rd_limit = (int)limit;
	rd->rd_base = (int)base;
}
906
/*
 * Fill in an i386 segment descriptor: base, 20-bit limit, segment
 * `type', privilege `dpl', default operand size (`def32': 1 = 32-bit)
 * and `gran' (1 = limit counted in 4 KiB pages).  Base and limit are
 * split across the descriptor words as the hardware format requires.
 */
void
setsegment(struct segment_descriptor *sd, const void *base, size_t limit,
    int type, int dpl, int def32, int gran)
{

	sd->sd_lolimit = (int)limit;		/* limit bits 0..15 */
	sd->sd_lobase = (int)base;		/* base bits 0..23 */
	sd->sd_type = type;
	sd->sd_dpl = dpl;
	sd->sd_p = 1;				/* mark present */
	sd->sd_hilimit = (int)limit >> 16;	/* limit bits 16..19 */
	sd->sd_xx = 0;				/* reserved/avl, zero */
	sd->sd_def32 = def32;
	sd->sd_gran = gran;
	sd->sd_hibase = (int)base >> 24;	/* base bits 24..31 */
}
923
/* Name of the assembly stub for IDT vector `name' (Xname). */
#define IDTVEC(name) __CONCAT(X, name)
typedef void (vector)(void);
extern vector IDTVEC(syscall);
extern vector IDTVEC(osyscall);
extern vector *IDTVEC(exceptions)[];
extern vector IDTVEC(svr4_fasttrap);
/* SVR4 fast-trap hook; nullop until the emulation module installs one. */
void (*svr4_fasttrap_vec)(void) = (void (*)(void))nullop;
krwlock_t svr4_fasttrap_lock;
#ifdef XEN
/* Under Xen the IDT is expressed as a trap_info table for the hypervisor. */
#define MAX_XEN_IDT 128
trap_info_t xen_idt[MAX_XEN_IDT];
int xen_idt_idx;			/* next free slot in xen_idt */
extern union descriptor tmpgdt[];
#endif
938
/*
 * Load the interrupt descriptor table: lidt on native hardware (via the
 * read-only pentium_idt alias), or hand the trap table to the
 * hypervisor under Xen.
 */
void
cpu_init_idt(void)
{
#ifndef XEN
	struct region_descriptor region;
	setregion(&region, pentium_idt, NIDT * sizeof(idt[0]) - 1);
	lidt(&region);
#else /* XEN */
	XENPRINTF(("HYPERVISOR_set_trap_table %p\n", xen_idt));
	if (HYPERVISOR_set_trap_table(xen_idt))
		panic("HYPERVISOR_set_trap_table %p failed\n", xen_idt);
#endif /* !XEN */
}
952
/*
 * Build the initial (bootstrap) GDT in `tgdt' and make it active:
 * lgdt on native hardware, or remap the page read-only and register it
 * with the hypervisor under Xen.
 */
void
initgdt(union descriptor *tgdt)
{
	KASSERT(tgdt != NULL);

	gdt = tgdt;
#ifdef XEN
	u_long frames[16];		/* machine frame numbers of gdt pages */
#else
	struct region_descriptor region;
	memset(gdt, 0, NGDT*sizeof(*gdt));
#endif /* XEN */
	/* make gdt gates and memory segments */
	setsegment(&gdt[GCODE_SEL].sd, 0, 0xfffff, SDT_MEMERA, SEL_KPL, 1, 1);
	setsegment(&gdt[GDATA_SEL].sd, 0, 0xfffff, SDT_MEMRWA, SEL_KPL, 1, 1);
	/* User code limited to I386_MAX_EXE_ADDR; "big" variant covers 4G. */
	setsegment(&gdt[GUCODE_SEL].sd, 0, x86_btop(I386_MAX_EXE_ADDR) - 1,
	    SDT_MEMERA, SEL_UPL, 1, 1);
	setsegment(&gdt[GUCODEBIG_SEL].sd, 0, 0xfffff,
	    SDT_MEMERA, SEL_UPL, 1, 1);
	setsegment(&gdt[GUDATA_SEL].sd, 0, 0xfffff,
	    SDT_MEMRWA, SEL_UPL, 1, 1);
#if NBIOSCALL > 0
	/* bios trampoline GDT entries (16-bit, byte granular) */
	setsegment(&gdt[GBIOSCODE_SEL].sd, 0, 0xfffff, SDT_MEMERA, SEL_KPL, 0,
	    0);
	setsegment(&gdt[GBIOSDATA_SEL].sd, 0, 0xfffff, SDT_MEMRWA, SEL_KPL, 0,
	    0);
#endif
	/* Per-CPU segment: base points at the boot CPU's cpu_info. */
	setsegment(&gdt[GCPU_SEL].sd, &cpu_info_primary, 0xfffff,
	    SDT_MEMRWA, SEL_KPL, 1, 1);

#ifndef XEN
	setregion(&region, gdt, NGDT * sizeof(gdt[0]) - 1);
	lgdt(&region);
#else /* !XEN */
	/*
	 * We jumpstart the bootstrap process a bit so we can update
	 * page permissions.  This is done redundantly later from
	 * x86_xpmap.c:xen_pmap_bootstrap() - harmless.
	 */
	xpmap_phys_to_machine_mapping =
	    (unsigned long *)xen_start_info.mfn_list;

	frames[0] = xpmap_ptom((uint32_t)gdt - KERNBASE) >> PAGE_SHIFT;
	{	/*
		 * Enter the gdt page RO into the kernel map.  We can't
		 * use pmap_kenter_pa() here, because %fs is not
		 * usable until the gdt is loaded, and %fs is used as
		 * the base pointer for curcpu() and curlwp(), both of
		 * which are in the callpath of pmap_kenter_pa().
		 * So we mash up our own - this is MD code anyway.
		 */
		pt_entry_t pte;
		pt_entry_t pg_nx = (cpu_feature[2] & CPUID_NOX ? PG_NX : 0);

		pte = pmap_pa2pte((vaddr_t)gdt - KERNBASE);
		pte |= PG_k | PG_RO | pg_nx | PG_V;

		/* Xen requires the GDT page to be mapped read-only. */
		if (HYPERVISOR_update_va_mapping((vaddr_t)gdt, pte, UVMF_INVLPG) < 0) {
			panic("gdt page RO update failed.\n");
		}

	}

	XENPRINTK(("loading gdt %lx, %d entries\n", frames[0] << PAGE_SHIFT,
	    NGDT));
	if (HYPERVISOR_set_gdt(frames, NGDT /* XXX is it right ? */))
		panic("HYPERVISOR_set_gdt failed!\n");

	lgdt_finish();
#endif /* !XEN */
}
1025
/*
 * Steal physical memory for the kernel message buffer from the end of
 * the highest physical segment.  If the last segment is too small, the
 * remainder is taken from the (new) highest segment on the next pass,
 * up to VM_PHYSSEG_MAX pieces; otherwise the buffer is silently
 * truncated with a warning.
 */
static void
init386_msgbuf(void)
{
	/* Message buffer is located at end of core. */
	struct vm_physseg *vps;
	psize_t sz = round_page(MSGBUFSIZE);
	psize_t reqsz = sz;		/* total still wanted */
	unsigned int x;

search_again:
	/* Find the segment that ends at the current avail_end. */
	vps = NULL;
	for (x = 0; x < vm_nphysseg; ++x) {
		vps = VM_PHYSMEM_PTR(x);
		if (ctob(vps->avail_end) == avail_end) {
			break;
		}
	}
	if (x == vm_nphysseg)
		panic("init386: can't find end of memory");

	/* Shrink so it'll fit in the last segment. */
	if (vps->avail_end - vps->avail_start < atop(sz))
		sz = ctob(vps->avail_end - vps->avail_start);

	/* Carve the piece off the tail of the segment. */
	vps->avail_end -= atop(sz);
	vps->end -= atop(sz);
	msgbuf_p_seg[msgbuf_p_cnt].sz = sz;
	msgbuf_p_seg[msgbuf_p_cnt++].paddr = ctob(vps->avail_end);

	/* Remove the last segment if it now has no pages. */
	if (vps->start == vps->end) {
		for (--vm_nphysseg; x < vm_nphysseg; x++)
			VM_PHYSMEM_PTR_SWAP(x, x + 1);
	}

	/* Now find where the new avail_end is. */
	for (avail_end = 0, x = 0; x < vm_nphysseg; x++)
		if (VM_PHYSMEM_PTR(x)->avail_end > avail_end)
			avail_end = VM_PHYSMEM_PTR(x)->avail_end;
	avail_end = ctob(avail_end);

	if (sz == reqsz)
		return;			/* got everything we asked for */

	reqsz -= sz;
	if (msgbuf_p_cnt == VM_PHYSSEG_MAX) {
		/* No more segments available, bail out. */
		printf("WARNING: MSGBUFSIZE (%zu) too large, using %zu.\n",
		    (size_t)MSGBUFSIZE, (size_t)(MSGBUFSIZE - reqsz));
		return;
	}

	sz = reqsz;
	goto search_again;
}
1081
1082 #ifndef XEN
1083 static void
init386_pte0(void)1084 init386_pte0(void)
1085 {
1086 paddr_t paddr;
1087 vaddr_t vaddr;
1088
1089 paddr = 4 * PAGE_SIZE;
1090 vaddr = (vaddr_t)vtopte(0);
1091 pmap_kenter_pa(vaddr, paddr, VM_PROT_ALL, 0);
1092 pmap_update(pmap_kernel());
1093 /* make sure it is clean before using */
1094 memset((void *)vaddr, 0, PAGE_SIZE);
1095 }
1096 #endif /* !XEN */
1097
/*
 * Register the kernel symbol table with ksyms(9).  Sources, in order of
 * preference: multiboot-provided symbols, a BTINFO_SYMTAB bootinfo
 * entry from the boot loader, or the symbols appended after `end'.
 */
static void
init386_ksyms(void)
{
#if NKSYMS || defined(DDB) || defined(MODULAR)
	extern int end;			/* linker-provided end of kernel */
	struct btinfo_symtab *symtab;

#ifdef DDB
	db_machine_init();
#endif

#if defined(MULTIBOOT)
	if (multiboot_ksyms_addsyms_elf())
		return;			/* multiboot supplied the symbols */
#endif

	if ((symtab = lookup_bootinfo(BTINFO_SYMTAB)) == NULL) {
		/* No bootinfo entry: symbols follow the kernel image. */
		ksyms_addsyms_elf(*(int *)&end, ((int *)&end) + 1, esym);
		return;
	}

	/* Boot loader gave physical addresses; relocate to KVA. */
	symtab->ssym += KERNBASE;
	symtab->esym += KERNBASE;
	ksyms_addsyms_elf(symtab->nsym, (int *)symtab->ssym, (int *)symtab->esym);
#endif
}
1124
/*
 * init386: machine-dependent early initialization for i386, called from
 * locore with interrupts disabled.  'first_avail' is the first physical
 * address not occupied by the loaded kernel image.
 *
 * In order: probe the boot CPU and set up MSRs, reserve low memory
 * (native) or adopt the Xen-supplied memory map, initialize the console,
 * bootstrap the pmap and load physical memory into UVM, set up the
 * message buffer, build the GDT/LDT/IDT (or the Xen trap table),
 * register kernel symbols, configure interrupt defaults, and finally
 * enable interrupts with the IPL held at IPL_HIGH.
 */
void
init386(paddr_t first_avail)
{
	extern void consinit(void);
	int x;
#ifndef XEN
	union descriptor *tgdt;
	struct region_descriptor region;
#endif
#if NBIOSCALL > 0
	/* Real-mode trampoline, copied to BIOSTRAMP_BASE below. */
	extern int biostramp_image_size;
	extern u_char biostramp_image[];
#endif

#ifdef XEN
	XENPRINTK(("HYPERVISOR_shared_info %p (%x)\n", HYPERVISOR_shared_info,
	    xen_start_info.shared_info));
	KASSERT(HYPERVISOR_shared_info != NULL);
	cpu_info_primary.ci_vcpu = &HYPERVISOR_shared_info->vcpu_info[0];
#endif

	/* Attach the statically allocated uarea to lwp0. */
	uvm_lwp_setuarea(&lwp0, lwp0uarea);

	cpu_probe(&cpu_info_primary);
	cpu_init_msrs(&cpu_info_primary, true);

#ifdef PAE
	use_pae = 1;
#else
	use_pae = 0;
#endif

#ifdef XEN
	struct pcb *pcb = lwp_getpcb(&lwp0);
	pcb->pcb_cr3 = PDPpaddr;
	__PRINTK(("pcb_cr3 0x%lx cr3 0x%lx\n",
	    PDPpaddr, xpmap_ptom(PDPpaddr)));
	XENPRINTK(("lwp0uarea %p first_avail %p\n",
	    lwp0uarea, (void *)(long)first_avail));
	XENPRINTK(("ptdpaddr %p atdevbase %p\n", (void *)PDPpaddr,
	    (void *)atdevbase));
#endif

#if defined(PAE) && !defined(XEN)
	/*
	 * Save VA and PA of L3 PD of boot processor (for Xen, this is done
	 * in xen_pmap_bootstrap())
	 */
	cpu_info_primary.ci_pae_l3_pdirpa = rcr3();
	cpu_info_primary.ci_pae_l3_pdir = (pd_entry_t *)(rcr3() + KERNBASE);
#endif /* PAE && !XEN */

	/*
	 * Initialize PAGE_SIZE-dependent variables.
	 */
	uvm_setpagesize();

	/*
	 * Start with 2 color bins -- this is just a guess to get us
	 * started.  We'll recolor when we determine the largest cache
	 * sizes on the system.
	 */
	uvmexp.ncolors = 2;

#ifndef XEN
	/*
	 * Low memory reservations:
	 * Page 0:	BIOS data
	 * Page 1:	BIOS callback
	 * Page 2:	MP bootstrap code (MP_TRAMPOLINE)
	 * Page 3:	ACPI wakeup code (ACPI_WAKEUP_ADDR)
	 * Page 4:	Temporary page table for 0MB-4MB
	 * Page 5:	Temporary page directory
	 */
	avail_start = 6 * PAGE_SIZE;
#else /* !XEN */
	/* Parse Xen command line (replace bootinfo) */
	xen_parse_cmdline(XEN_PARSE_BOOTFLAGS, NULL);

	/* Steal one page for gdt */
	gdt = (void *)((u_long)first_avail + KERNBASE);
	first_avail += PAGE_SIZE;

	/* Determine physical address space */
	first_avail = round_page(first_avail);
	avail_start = first_avail;
	avail_end = ctob((paddr_t)xen_start_info.nr_pages);
	pmap_pa_start = (KERNTEXTOFF - KERNBASE);
	pmap_pa_end = pmap_pa_start + ctob((paddr_t)xen_start_info.nr_pages);
	mem_clusters[0].start = avail_start;
	mem_clusters[0].size = avail_end - avail_start;
	mem_cluster_cnt++;
	physmem += xen_start_info.nr_pages;
	uvmexp.wired += atop(avail_start);

	/*
	 * initgdt() has to be done before consinit(), so that %fs is properly
	 * initialised. initgdt() uses pmap_kenter_pa so it can't be called
	 * before the above variables are set.
	 */
	initgdt(gdt);

	mutex_init(&pte_lock, MUTEX_DEFAULT, IPL_VM);
#endif /* XEN */

#if NISA > 0 || NPCI > 0
	x86_bus_space_init();
#endif /* NISA > 0 || NPCI > 0 */

	consinit();	/* XXX SHOULD NOT BE DONE HERE */

#ifdef DEBUG_MEMLOAD
	printf("mem_cluster_count: %d\n", mem_cluster_cnt);
#endif

	/*
	 * Call pmap initialization to make new kernel address space.
	 * We must do this before loading pages into the VM system.
	 */
	pmap_bootstrap((vaddr_t)atdevbase + IOM_SIZE);

#ifndef XEN
	/* Initialize the memory clusters. */
	init_x86_clusters();

	/* Internalize the physical pages into the VM system. */
	init_x86_vm(first_avail);
#else /* !XEN */
	XENPRINTK(("load the memory cluster 0x%" PRIx64 " (%" PRId64 ") - "
	    "0x%" PRIx64 " (%" PRId64 ")\n",
	    (uint64_t)avail_start, (uint64_t)atop(avail_start),
	    (uint64_t)avail_end, (uint64_t)atop(avail_end)));
	uvm_page_physload(atop(avail_start), atop(avail_end),
	    atop(avail_start), atop(avail_end),
	    VM_FREELIST_DEFAULT);

	/* Reclaim the boot gdt page - see locore.s */
	{
		pt_entry_t pte;
		pt_entry_t pg_nx = (cpu_feature[2] & CPUID_NOX ? PG_NX : 0);

		/* Remap the boot-time gdt page read/write so it can be reused. */
		pte = pmap_pa2pte((vaddr_t)tmpgdt - KERNBASE);
		pte |= PG_k | PG_RW | pg_nx | PG_V;

		/* XXX: "relaim" below is a typo for "reclaim". */
		if (HYPERVISOR_update_va_mapping((vaddr_t)tmpgdt, pte, UVMF_INVLPG) < 0) {
			panic("tmpgdt page relaim RW update failed.\n");
		}
	}

#endif /* !XEN */

	init386_msgbuf();

#ifndef XEN
	/*
	 * XXX Remove this
	 *
	 * Setup a temporary Page Table Entry to allow identity mappings of
	 * the real mode address. This is required by:
	 * - bioscall
	 * - MP bootstrap
	 * - ACPI wakecode
	 */
	init386_pte0();

#if NBIOSCALL > 0
	KASSERT(biostramp_image_size <= PAGE_SIZE);
	/* Identity-map the trampoline page and copy the image into it. */
	pmap_kenter_pa((vaddr_t)BIOSTRAMP_BASE,	/* virtual */
	    (paddr_t)BIOSTRAMP_BASE,	/* physical */
	    VM_PROT_ALL, 0);		/* protection */
	pmap_update(pmap_kernel());
	memcpy((void *)BIOSTRAMP_BASE, biostramp_image, biostramp_image_size);

	/* Needed early, for bioscall() */
	cpu_info_primary.ci_pmap = pmap_kernel();
#endif
#endif /* !XEN */

	/* Map and zero the page backing the IDT. */
	pmap_kenter_pa(idt_vaddr, idt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
	pmap_update(pmap_kernel());
	memset((void *)idt_vaddr, 0, PAGE_SIZE);


#ifndef XEN
	idt_init();

	idt = (struct gate_descriptor *)idt_vaddr;
	/* Read-only alias of the IDT (used as descriptors on pentiums). */
	pmap_kenter_pa(pentium_idt_vaddr, idt_paddr, VM_PROT_READ, 0);
	pmap_update(pmap_kernel());
	pentium_idt = (union descriptor *)pentium_idt_vaddr;

	/*
	 * Relocate the GDT to live right after the IDT, followed by the
	 * LDT, and copy the temporary boot GDT contents over.
	 */
	tgdt = gdt;
	gdt = (union descriptor *)
	    ((char *)idt + NIDT * sizeof (struct gate_descriptor));
	ldt = gdt + NGDT;

	memcpy(gdt, tgdt, NGDT * sizeof (*gdt));

	setsegment(&gdt[GLDT_SEL].sd, ldt, NLDT * sizeof (ldt[0]) - 1,
	    SDT_SYSLDT, SEL_KPL, 0, 0);
#else
	HYPERVISOR_set_callbacks(
	    GSEL(GCODE_SEL, SEL_KPL), (unsigned long)hypervisor_callback,
	    GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback);

	ldt = (union descriptor *)idt_vaddr;
#endif /* XEN */

	/* make ldt gates and memory segments */
	setgate(&ldt[LSYS5CALLS_SEL].gd, &IDTVEC(osyscall), 1,
	    SDT_SYS386CGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));

	ldt[LUCODE_SEL] = gdt[GUCODE_SEL];
	ldt[LUCODEBIG_SEL] = gdt[GUCODEBIG_SEL];
	ldt[LUDATA_SEL] = gdt[GUDATA_SEL];
	ldt[LSOL26CALLS_SEL] = ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL];

#ifndef XEN
	/*
	 * Exceptions 0-31.  Breakpoint (3) and overflow (4) gates are
	 * reachable from user mode (SEL_UPL); the rest are kernel-only.
	 */
	for (x = 0; x < 32; x++) {
		idt_vec_reserve(x);
		setgate(&idt[x], IDTVEC(exceptions)[x], 0, SDT_SYS386IGT,
		    (x == 3 || x == 4) ? SEL_UPL : SEL_KPL,
		    GSEL(GCODE_SEL, SEL_KPL));
	}

	/* new-style interrupt gate for syscalls */
	idt_vec_reserve(128);
	setgate(&idt[128], &IDTVEC(syscall), 0, SDT_SYS386IGT, SEL_UPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	/* Vector 0xd2: SVR4 compat fast trap. */
	idt_vec_reserve(0xd2);
	setgate(&idt[0xd2], &IDTVEC(svr4_fasttrap), 0, SDT_SYS386IGT,
	    SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));

	/* Load the relocated GDT and the final IDT. */
	setregion(&region, gdt, NGDT * sizeof(gdt[0]) - 1);
	lgdt(&region);

	cpu_init_idt();
#else /* !XEN */
	/*
	 * Build the Xen trap table instead of a hardware IDT; same
	 * vector/privilege layout as the native case above.
	 */
	memset(xen_idt, 0, sizeof(trap_info_t) * MAX_XEN_IDT);
	xen_idt_idx = 0;
	for (x = 0; x < 32; x++) {
		KASSERT(xen_idt_idx < MAX_XEN_IDT);
		xen_idt[xen_idt_idx].vector = x;

		switch (x) {
		case 2:  /* NMI */
		case 18: /* MCA */
			/*
			 * NOTE(review): flags value 2 via TI_SET_IF()
			 * presumably requests special event-mask handling
			 * for these traps -- confirm against the Xen
			 * trap_info interface.
			 */
			TI_SET_IF(&(xen_idt[xen_idt_idx]), 2);
			break;
		case 3:
		case 4:
			/* breakpoint/overflow: callable from user mode */
			xen_idt[xen_idt_idx].flags = SEL_UPL;
			break;
		default:
			xen_idt[xen_idt_idx].flags = SEL_XEN;
			break;
		}

		xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL);
		xen_idt[xen_idt_idx].address =
		    (uint32_t)IDTVEC(exceptions)[x];
		xen_idt_idx++;
	}
	KASSERT(xen_idt_idx < MAX_XEN_IDT);
	/* Syscall trap, vector 128 (0x80). */
	xen_idt[xen_idt_idx].vector = 128;
	xen_idt[xen_idt_idx].flags = SEL_UPL;
	xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL);
	xen_idt[xen_idt_idx].address = (uint32_t)&IDTVEC(syscall);
	xen_idt_idx++;
	KASSERT(xen_idt_idx < MAX_XEN_IDT);
	/* SVR4 compat fast trap, vector 0xd2. */
	xen_idt[xen_idt_idx].vector = 0xd2;
	xen_idt[xen_idt_idx].flags = SEL_UPL;
	xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL);
	xen_idt[xen_idt_idx].address = (uint32_t)&IDTVEC(svr4_fasttrap);
	xen_idt_idx++;
	lldt(GSEL(GLDT_SEL, SEL_KPL));
	cpu_init_idt();
#endif /* XEN */

	init386_ksyms();

#if NMCA > 0
	/* check for MCA bus, needed to be done before ISA stuff - if
	 * MCA is detected, ISA needs to use level triggered interrupts
	 * by default */
	mca_busprobe();
#endif

#ifdef XEN
	XENPRINTF(("events_default_setup\n"));
	events_default_setup();
#else
	intr_default_setup();
#endif

	/* Interrupts on, but everything masked until configure is done. */
	splraise(IPL_HIGH);
	x86_enable_intr();

#ifdef DDB
	if (boothowto & RB_KDB)
		Debugger();
#endif
#ifdef IPKDB
	ipkdb_init();
	if (boothowto & RB_KDB)
		ipkdb_connect(0);
#endif
#ifdef KGDB
	kgdb_port_init();
	if (boothowto & RB_KDB) {
		kgdb_debug_init = 1;
		kgdb_connect(1);
	}
#endif

	/* Warn (and wait for acknowledgement) on systems below 2MB RAM. */
	if (physmem < btoc(2 * 1024 * 1024)) {
		printf("warning: too little memory available; "
		    "have %lu bytes, want %lu bytes\n"
		    "running in degraded mode\n"
		    "press a key to confirm\n\n",
		    (unsigned long)ptoa(physmem), 2*1024*1024UL);
		cngetc();
	}

	rw_init(&svr4_fasttrap_lock);
}
1451
1452 #include <dev/ic/mc146818reg.h> /* for NVRAM POST */
1453 #include <i386/isa/nvram.h> /* for NVRAM POST */
1454
/*
 * cpu_reset: reboot the machine; never returns.
 *
 * Under Xen, simply ask the hypervisor.  Natively, escalate through
 * several mechanisms: set the NVRAM reset byte, apply a chipset-specific
 * reset for the AMD Geode SC1100, call x86_reset() (mechanism not
 * visible here), and finally force a triple fault via an empty IDT.
 */
void
cpu_reset(void)
{
#ifdef XEN
	HYPERVISOR_reboot();
	for (;;);
#else /* XEN */
	struct region_descriptor region;

	x86_disable_intr();

	/*
	 * Ensure the NVRAM reset byte contains something vaguely sane.
	 */

	outb(IO_RTC, NVRAM_RESET);
	outb(IO_RTC+1, NVRAM_RESET_RST);

	/*
	 * Reset AMD Geode SC1100.
	 *
	 * 1) Write PCI Configuration Address Register (0xcf8) to
	 *    select Function 0, Register 0x44: Bridge Configuration,
	 *    GPIO and LPC Configuration Register Space, Reset
	 *    Control Register.
	 *
	 * 2) Write 0xf to PCI Configuration Data Register (0xcfc)
	 *    to reset IDE controller, IDE bus, and PCI bus, and
	 *    to trigger a system-wide reset.
	 *
	 * See AMD Geode SC1100 Processor Data Book, Revision 2.0,
	 * sections 6.3.1, 6.3.2, and 6.4.1.
	 */
	if (cpu_info_primary.ci_signature == 0x540) {
		outl(0xcf8, 0x80009044);
		outl(0xcfc, 0xf);
	}

	x86_reset();

	/*
	 * Try to cause a triple fault and watchdog reset by making the IDT
	 * invalid and causing a fault.
	 */
	memset((void *)idt, 0, NIDT * sizeof(idt[0]));
	setregion(&region, idt, NIDT * sizeof(idt[0]) - 1);
	lidt(&region);
	breakpoint();

#if 0
	/*
	 * Try to cause a triple fault and watchdog reset by unmapping the
	 * entire address space and doing a TLB flush.
	 */
	memset((void *)PTD, 0, PAGE_SIZE);
	tlbflush();
#endif

	/* Spin until one of the mechanisms above takes effect. */
	for (;;);
#endif /* XEN */
}
1516
/*
 * cpu_getmcontext: capture lwp 'l's machine context into 'mcp' for
 * getcontext(2)/signal delivery.  Marks _UC_CPU, _UC_TLSBASE and
 * _UC_FXSAVE|_UC_FPU in *flags to record which parts were filled in.
 */
void
cpu_getmcontext(struct lwp *l, mcontext_t *mcp, unsigned int *flags)
{
	const struct trapframe *tf = l->l_md.md_regs;
	__greg_t *gr = mcp->__gregs;
	__greg_t ras_eip;

	/* Save register context. */
#ifdef VM86
	if (tf->tf_eflags & PSL_VM) {
		/* vm86 mode keeps segments in separate trapframe fields. */
		gr[_REG_GS]  = tf->tf_vm86_gs;
		gr[_REG_FS]  = tf->tf_vm86_fs;
		gr[_REG_ES]  = tf->tf_vm86_es;
		gr[_REG_DS]  = tf->tf_vm86_ds;
		gr[_REG_EFL] = get_vflags(l);
	} else
#endif
	{
		gr[_REG_GS]  = tf->tf_gs;
		gr[_REG_FS]  = tf->tf_fs;
		gr[_REG_ES]  = tf->tf_es;
		gr[_REG_DS]  = tf->tf_ds;
		gr[_REG_EFL] = tf->tf_eflags;
	}
	gr[_REG_EDI]    = tf->tf_edi;
	gr[_REG_ESI]    = tf->tf_esi;
	gr[_REG_EBP]    = tf->tf_ebp;
	gr[_REG_EBX]    = tf->tf_ebx;
	gr[_REG_EDX]    = tf->tf_edx;
	gr[_REG_ECX]    = tf->tf_ecx;
	gr[_REG_EAX]    = tf->tf_eax;
	gr[_REG_EIP]    = tf->tf_eip;
	gr[_REG_CS]     = tf->tf_cs;
	gr[_REG_ESP]    = tf->tf_esp;
	gr[_REG_UESP]   = tf->tf_esp;
	gr[_REG_SS]     = tf->tf_ss;
	gr[_REG_TRAPNO] = tf->tf_trapno;
	gr[_REG_ERR]    = tf->tf_err;

	/*
	 * If the PC lies inside a registered restartable atomic
	 * sequence, report the sequence's restart address instead.
	 */
	if ((ras_eip = (__greg_t)ras_lookup(l->l_proc,
	    (void *) gr[_REG_EIP])) != -1)
		gr[_REG_EIP] = ras_eip;

	*flags |= _UC_CPU;

	mcp->_mc_tlsbase = (uintptr_t)l->l_private;
	*flags |= _UC_TLSBASE;

	/*
	 * Save floating point register context.
	 *
	 * If the cpu doesn't support fxsave we must still write to
	 * the entire 512 byte area - otherwise we leak kernel memory
	 * contents to userspace.
	 * It wouldn't matter if we were doing the copyout here.
	 * So we might as well convert to fxsave format.
	 */
	__CTASSERT(sizeof (struct fxsave) ==
	    sizeof mcp->__fpregs.__fp_reg_set.__fp_xmm_state);
	process_read_fpregs_xmm(l, (struct fxsave *)
	    &mcp->__fpregs.__fp_reg_set.__fp_xmm_state);
	memset(&mcp->__fpregs.__fp_pad, 0, sizeof mcp->__fpregs.__fp_pad);
	*flags |= _UC_FXSAVE | _UC_FPU;
}
1581
1582 int
cpu_mcontext_validate(struct lwp * l,const mcontext_t * mcp)1583 cpu_mcontext_validate(struct lwp *l, const mcontext_t *mcp)
1584 {
1585 const __greg_t *gr = mcp->__gregs;
1586 struct trapframe *tf = l->l_md.md_regs;
1587
1588 /*
1589 * Check for security violations. If we're returning
1590 * to protected mode, the CPU will validate the segment
1591 * registers automatically and generate a trap on
1592 * violations. We handle the trap, rather than doing
1593 * all of the checking here.
1594 */
1595 if (((gr[_REG_EFL] ^ tf->tf_eflags) & PSL_USERSTATIC) ||
1596 !USERMODE(gr[_REG_CS], gr[_REG_EFL]))
1597 return EINVAL;
1598
1599 return 0;
1600 }
1601
/*
 * cpu_setmcontext: restore lwp 'l's machine context from 'mcp' for
 * setcontext(2)/signal return.  'flags' selects which parts (_UC_CPU,
 * _UC_TLSBASE, _UC_FPU, stack flags) are applied.  Returns 0 or EINVAL
 * if the supplied register context fails validation.
 */
int
cpu_setmcontext(struct lwp *l, const mcontext_t *mcp, unsigned int flags)
{
	struct trapframe *tf = l->l_md.md_regs;
	const __greg_t *gr = mcp->__gregs;
	struct proc *p = l->l_proc;
	int error;

	/* Restore register context, if any. */
	if ((flags & _UC_CPU) != 0) {
#ifdef VM86
		if (gr[_REG_EFL] & PSL_VM) {
			tf->tf_vm86_gs = gr[_REG_GS];
			tf->tf_vm86_fs = gr[_REG_FS];
			tf->tf_vm86_es = gr[_REG_ES];
			tf->tf_vm86_ds = gr[_REG_DS];
			set_vflags(l, gr[_REG_EFL]);
			if (flags & _UC_VM) {
				/* Route syscalls through the vm86 path. */
				void syscall_vm86(struct trapframe *);
				l->l_proc->p_md.md_syscall = syscall_vm86;
			}
		} else
#endif
		{
			/* Reject contexts with bad eflags/CS before use. */
			error = cpu_mcontext_validate(l, mcp);
			if (error)
				return error;

			tf->tf_gs = gr[_REG_GS];
			tf->tf_fs = gr[_REG_FS];
			tf->tf_es = gr[_REG_ES];
			tf->tf_ds = gr[_REG_DS];
			/* Only change the user-alterable part of eflags */
			tf->tf_eflags &= ~PSL_USER;
			tf->tf_eflags |= (gr[_REG_EFL] & PSL_USER);
		}
		tf->tf_edi    = gr[_REG_EDI];
		tf->tf_esi    = gr[_REG_ESI];
		tf->tf_ebp    = gr[_REG_EBP];
		tf->tf_ebx    = gr[_REG_EBX];
		tf->tf_edx    = gr[_REG_EDX];
		tf->tf_ecx    = gr[_REG_ECX];
		tf->tf_eax    = gr[_REG_EAX];
		tf->tf_eip    = gr[_REG_EIP];
		tf->tf_cs     = gr[_REG_CS];
		tf->tf_esp    = gr[_REG_UESP];
		tf->tf_ss     = gr[_REG_SS];
	}

	if ((flags & _UC_TLSBASE) != 0)
		lwp_setprivate(l, (void *)(uintptr_t)mcp->_mc_tlsbase);

	/* Restore floating point register context, if given. */
	if ((flags & _UC_FPU) != 0) {
		__CTASSERT(sizeof (struct fxsave) ==
		    sizeof mcp->__fpregs.__fp_reg_set.__fp_xmm_state);
		__CTASSERT(sizeof (struct save87) ==
		    sizeof mcp->__fpregs.__fp_reg_set.__fpchip_state);

		/* _UC_FXSAVE selects fxsave format over legacy 387 format. */
		if (flags & _UC_FXSAVE) {
			process_write_fpregs_xmm(l, (const struct fxsave *)
			    &mcp->__fpregs.__fp_reg_set.__fp_xmm_state);
		} else {
			process_write_fpregs_s87(l, (const struct save87 *)
			    &mcp->__fpregs.__fp_reg_set.__fpchip_state);
		}
	}

	/* Update the signal-stack state under the proc lock. */
	mutex_enter(p->p_lock);
	if (flags & _UC_SETSTACK)
		l->l_sigstk.ss_flags |= SS_ONSTACK;
	if (flags & _UC_CLRSTACK)
		l->l_sigstk.ss_flags &= ~SS_ONSTACK;
	mutex_exit(p->p_lock);
	return (0);
}
1678
/*
 * cpu_initclocks: start the system clocks by dispatching to whichever
 * clock initialization routine was selected earlier at boot.
 */
void
cpu_initclocks(void)
{
	initclock_func();
}
1685
1686 #define DEV_IO 14 /* iopl for compat_10 */
1687
1688 int
mm_md_open(dev_t dev,int flag,int mode,struct lwp * l)1689 mm_md_open(dev_t dev, int flag, int mode, struct lwp *l)
1690 {
1691
1692 switch (minor(dev)) {
1693 case DEV_IO:
1694 /*
1695 * This is done by i386_iopl(3) now.
1696 *
1697 * #if defined(COMPAT_10) || defined(COMPAT_FREEBSD)
1698 */
1699 if (flag & FWRITE) {
1700 struct trapframe *fp;
1701 int error;
1702
1703 error = kauth_authorize_machdep(l->l_cred,
1704 KAUTH_MACHDEP_IOPL, NULL, NULL, NULL, NULL);
1705 if (error)
1706 return (error);
1707 fp = curlwp->l_md.md_regs;
1708 fp->tf_eflags |= PSL_IOPL;
1709 }
1710 break;
1711 default:
1712 break;
1713 }
1714 return 0;
1715 }
1716
#ifdef PAE
/*
 * cpu_alloc_l3_page: allocate and map the per-CPU PAE L3 page directory
 * for 'ci'.  Panics on allocation failure (called during CPU attach).
 * Sets ci_pae_l3_pdirpa (physical) and ci_pae_l3_pdir (kernel virtual).
 */
void
cpu_alloc_l3_page(struct cpu_info *ci)
{
	int ret;
	struct pglist pg;
	struct vm_page *vmap;

	KASSERT(ci != NULL);
	/*
	 * Allocate a page for the per-CPU L3 PD.  cr3 being 32 bits, the
	 * PA must reside below the 4GB boundary.
	 */
	ret = uvm_pglistalloc(PAGE_SIZE, 0, 0x100000000ULL, 32, 0, &pg, 1, 0);
	vmap = TAILQ_FIRST(&pg);

	if (ret != 0 || vmap == NULL)
		panic("%s: failed to allocate L3 pglist for CPU %d (ret %d)\n",
			__func__, cpu_index(ci), ret);

	ci->ci_pae_l3_pdirpa = vmap->phys_addr;

	/* Reserve a VA for it (backing PA entered just below). */
	ci->ci_pae_l3_pdir = (paddr_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
		UVM_KMF_VAONLY | UVM_KMF_NOWAIT);
	if (ci->ci_pae_l3_pdir == NULL)
		panic("%s: failed to allocate L3 PD for CPU %d\n",
			__func__, cpu_index(ci));

	pmap_kenter_pa((vaddr_t)ci->ci_pae_l3_pdir, ci->ci_pae_l3_pdirpa,
		VM_PROT_READ | VM_PROT_WRITE, 0);

	pmap_update(pmap_kernel());
}
#endif /* PAE */
1751