/*-
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 2008-2017 The DragonFly Project.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
 */

#include "use_isa.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_maxmem.h"
#include "opt_msgbuf.h"
#include "opt_swap.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysmsg.h>
#include <sys/signalvar.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/caps.h>
#include <sys/buf.h>
#include <sys/reboot.h>
#include <sys/mbuf.h>
#include <sys/msgbuf.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/bus.h>
#include <sys/usched.h>
#include <sys/reg.h>
#include <sys/sbuf.h>
#include <sys/ctype.h>
#include <sys/serialize.h>
#include <sys/systimer.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>

#include <sys/thread2.h>
#include <sys/mplock2.h>

#include <sys/exec.h>
#include <sys/cons.h>

#include <sys/efi.h>

#include <ddb/ddb.h>

#include <machine/cpu.h>
#include <machine/clock.h>
#include <machine/specialreg.h>
#if 0 /* JG */
#include <machine/bootinfo.h>
#endif
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/pc/bios.h>
#include <machine/pcb_ext.h>
#include <machine/globaldata.h>		/* CPU_prvspace */
#include <machine/smp.h>
#include <machine/cputypes.h>
#include <machine/intr_machdep.h>
#include <machine/framebuffer.h>

#ifdef OLD_BUS_ARCH
#include <bus/isa/isa_device.h>
#endif
#include <machine_base/isa/isa_intr.h>
#include <bus/isa/rtc.h>
#include <sys/random.h>
#include <sys/ptrace.h>
#include <machine/sigframe.h>

#include <sys/machintr.h>
#include <machine_base/icu/icu_abi.h>
#include <machine_base/icu/elcr_var.h>
#include <machine_base/apic/lapic.h>
#include <machine_base/apic/ioapic.h>
#include <machine_base/apic/ioapic_abi.h>
#include <machine/mptable.h>

#define PHYSMAP_ENTRIES		10
#define MAXBUFSTRUCTSIZE	((size_t)512 * 1024 * 1024)

extern u_int64_t hammer_time(u_int64_t, u_int64_t);

extern void printcpuinfo(void);	/* XXX header file */
extern void identify_cpu(void);
extern void panicifcpuunsupported(void);

static void cpu_startup(void *);
static void pic_finish(void *);
static void cpu_finish(void *);

static void set_fpregs_xmm(struct save87 *, struct savexmm *);
static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
static void init_locks(void);

extern void pcpu_timer_always(struct intrframe *);

SYSINIT(cpu, SI_BOOT2_START_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
SYSINIT(pic_finish, SI_BOOT2_FINISH_PIC, SI_ORDER_FIRST, pic_finish, NULL);
SYSINIT(cpu_finish, SI_BOOT2_FINISH_CPU, SI_ORDER_FIRST, cpu_finish, NULL);

#ifdef DDB
extern vm_offset_t ksym_start, ksym_end;
#endif

struct privatespace CPU_prvspace_bsp __aligned(4096);
struct privatespace *CPU_prvspace[MAXCPU] = { &CPU_prvspace_bsp };

vm_paddr_t efi_systbl_phys;
int	_udatasel, _ucodesel, _ucode32sel;
u_long	atdevbase;
int64_t tsc_offsets[MAXCPU];
cpumask_t smp_idleinvl_mask;
cpumask_t smp_idleinvl_reqs;

/* MWAIT hint (EAX) or CPU_MWAIT_HINT_ */
__read_mostly static int cpu_mwait_halt_global;
__read_mostly static int clock_debug1;
__read_mostly static int flame_poll_debug;

SYSCTL_INT(_debug, OID_AUTO, flame_poll_debug,
	CTLFLAG_RW, &flame_poll_debug, 0, "");
TUNABLE_INT("debug.flame_poll_debug", &flame_poll_debug);

#if defined(SWTCH_OPTIM_STATS)
extern int swtch_optim_stats;
SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats,
	CTLFLAG_RD, &swtch_optim_stats, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count,
	CTLFLAG_RD, &tlb_flush_count, 0, "");
#endif
SYSCTL_INT(_debug, OID_AUTO, clock_debug1,
	CTLFLAG_RW, &clock_debug1, 0, "");
SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_halt,
	CTLFLAG_RD, &cpu_mwait_halt_global, 0, "");
SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_spin,
	CTLFLAG_RD, &cpu_mwait_spin, 0, "monitor/mwait target state");

#define CPU_MWAIT_HAS_CX	\
	((cpu_feature2 & CPUID2_MON) && \
	 (cpu_mwait_feature & CPUID_MWAIT_EXT))

#define CPU_MWAIT_CX_NAMELEN	16

#define CPU_MWAIT_C1		1
#define CPU_MWAIT_C2		2
#define CPU_MWAIT_C3		3
#define CPU_MWAIT_CX_MAX	8

#define CPU_MWAIT_HINT_AUTO	-1	/* C1 and C2 */
#define CPU_MWAIT_HINT_AUTODEEP	-2	/* C3+ */

SYSCTL_NODE(_machdep, OID_AUTO, mwait, CTLFLAG_RW, 0, "MWAIT features");
SYSCTL_NODE(_machdep_mwait, OID_AUTO, CX, CTLFLAG_RW, 0, "MWAIT Cx settings");

struct cpu_mwait_cx {
	int	subcnt;
	char	name[4];
	struct sysctl_ctx_list sysctl_ctx;
	struct sysctl_oid *sysctl_tree;
};
static struct cpu_mwait_cx cpu_mwait_cx_info[CPU_MWAIT_CX_MAX];
static char cpu_mwait_cx_supported[256];

static int cpu_mwait_c1_hints_cnt;
static int cpu_mwait_hints_cnt;
static int *cpu_mwait_hints;

static int cpu_mwait_deep_hints_cnt;
static int *cpu_mwait_deep_hints;

#define CPU_IDLE_REPEAT_DEFAULT	750

static u_int cpu_idle_repeat = CPU_IDLE_REPEAT_DEFAULT;
static u_long cpu_idle_repeat_max = CPU_IDLE_REPEAT_DEFAULT;
static u_int cpu_mwait_repeat_shift = 1;

#define CPU_MWAIT_C3_PREAMBLE_BM_ARB	0x1
#define CPU_MWAIT_C3_PREAMBLE_BM_STS	0x2

static int cpu_mwait_c3_preamble =
	CPU_MWAIT_C3_PREAMBLE_BM_ARB |
	CPU_MWAIT_C3_PREAMBLE_BM_STS;

SYSCTL_STRING(_machdep_mwait_CX, OID_AUTO, supported, CTLFLAG_RD,
	cpu_mwait_cx_supported, 0, "MWAIT supported C states");
SYSCTL_INT(_machdep_mwait_CX, OID_AUTO, c3_preamble, CTLFLAG_RD,
	&cpu_mwait_c3_preamble, 0, "C3+ preamble mask");

static int cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS,
	int *, boolean_t);
static int cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS);
static int cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS);
static int cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS);

SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, idle, CTLTYPE_STRING|CTLFLAG_RW,
	NULL, 0, cpu_mwait_cx_idle_sysctl, "A", "");
SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, spin, CTLTYPE_STRING|CTLFLAG_RW,
	NULL, 0, cpu_mwait_cx_spin_sysctl, "A", "");
SYSCTL_UINT(_machdep_mwait_CX, OID_AUTO, repeat_shift, CTLFLAG_RW,
	&cpu_mwait_repeat_shift, 0, "");
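
/*
 * Illustrative usage (assuming the usual string forms accepted by
 * cpu_mwait_cx_select_sysctl(), which is defined elsewhere): the idle
 * C-state selection is runtime-tunable, e.g.
 *
 *	sysctl machdep.mwait.CX.idle=AUTODEEP
 *
 * which corresponds to the CPU_MWAIT_HINT_AUTODEEP (-2) hint consumed
 * by cpu_mwait_cx_hint() further below.
 */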

long physmem = 0;

u_long ebda_addr = 0;

int imcr_present = 0;

int naps = 0;			/* # of Application Processors (APs) */

u_int base_memory;

static int
sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
{
	u_long pmem = ctob(physmem);
	int error;

	error = sysctl_handle_long(oidp, &pmem, 0, req);

	return (error);
}

SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_ULONG|CTLFLAG_RD,
	0, 0, sysctl_hw_physmem, "LU",
	"Total system memory in bytes (number of pages * page size)");
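
/*
 * Illustrative userland usage: "sysctl hw.physmem" reads through the
 * handler above.  On a hypothetical machine with 4GB of RAM it would
 * report ctob(physmem) ~= 4294967296 bytes, physmem being counted in
 * pages.
 */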

static int
sysctl_hw_usermem(SYSCTL_HANDLER_ARGS)
{
	u_long usermem = ctob(physmem - vmstats.v_wire_count);
	int error;

	error = sysctl_handle_long(oidp, &usermem, 0, req);

	return (error);
}

SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_ULONG|CTLFLAG_RD,
	0, 0, sysctl_hw_usermem, "LU", "");

static int
sysctl_hw_availpages(SYSCTL_HANDLER_ARGS)
{
	int error;
	u_long availpages;

	availpages = x86_64_btop(avail_end - avail_start);
	error = sysctl_handle_long(oidp, &availpages, 0, req);

	return (error);
}

SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_ULONG|CTLFLAG_RD,
	0, 0, sysctl_hw_availpages, "LU", "");

vm_paddr_t Maxmem;
vm_paddr_t Realmem;

/*
 * The number of PHYSMAP entries must be one less than the number of
 * PHYSSEG entries because the PHYSMAP entry that spans the largest
 * physical address that is accessible by ISA DMA is split into two
 * PHYSSEG entries.
 */
vm_phystable_t phys_avail[VM_PHYSSEG_MAX + 1];
vm_phystable_t dump_avail[VM_PHYSSEG_MAX + 1];

/* must be 1 less so an all-zero (0/0) entry can signal the end of chunks */
#define PHYS_AVAIL_ARRAY_END	(NELEM(phys_avail) - 1)
#define DUMP_AVAIL_ARRAY_END	(NELEM(dump_avail) - 1)

static vm_offset_t buffer_sva, buffer_eva;
vm_offset_t clean_sva, clean_eva;
static vm_offset_t pager_sva, pager_eva;
static struct trapframe proc0_tf;

static void cpu_implement_smap(void);

static void
cpu_startup(void *dummy)
{
	caddr_t v;
	vm_size_t size = 0;
	vm_offset_t firstaddr;

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	kprintf("%s", version);
	startrtclock();
	printcpuinfo();
	panicifcpuunsupported();
	if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
		cpu_implement_smap();

	kprintf("real memory = %ju (%ju MB)\n",
		(intmax_t)Realmem,
		(intmax_t)Realmem / 1024 / 1024);
	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		kprintf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx].phys_end != 0; ++indx) {
			vm_paddr_t size1;

			size1 = phys_avail[indx].phys_end -
				phys_avail[indx].phys_beg;

			kprintf("0x%08jx - 0x%08jx, %ju bytes (%ju pages)\n",
				(intmax_t)phys_avail[indx].phys_beg,
				(intmax_t)phys_avail[indx].phys_end - 1,
				(intmax_t)size1,
				(intmax_t)(size1 / PAGE_SIZE));
		}
	}

	/*
	 * Allocate space for system data structures.
	 * The first available kernel virtual address is in "v".
	 * As pages of kernel virtual memory are allocated, "v" is incremented.
	 * As pages of memory are allocated and cleared,
	 * "firstaddr" is incremented.
	 * An index into the kernel page table corresponding to the
	 * virtual memory address maintained in "v" is kept in "mapaddr".
	 */

	/*
	 * Make two passes.  The first pass calculates how much memory is
	 * needed and allocates it.  The second pass assigns virtual
	 * addresses to the various data structures.
	 */
	firstaddr = 0;
again:
	v = (caddr_t)firstaddr;

#define	valloc(name, type, num) \
	    (name) = (type *)v; v = (caddr_t)((name)+(num))
#define	valloclim(name, type, num, lim) \
	    (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num)))
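
	/*
	 * For reference, a call such as valloc(buf, struct buf, nbuf)
	 * expands to:
	 *
	 *	buf = (struct buf *)v; v = (caddr_t)(buf + nbuf);
	 *
	 * i.e. it carves the array out of the linear region at "v" and
	 * advances the cursor.  On the first pass (firstaddr == 0) this
	 * only measures the total size; no memory is touched.
	 */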

	/*
	 * Calculate nbuf such that maxbufspace uses approximately 1/20
	 * of physical memory by default, with a minimum of 50 buffers.
	 *
	 * The calculation is made after discounting 128MB.
	 *
	 * NOTE: maxbufspace is (nbuf * NBUFCALCSIZE) (NBUFCALCSIZE ~= 16KB).
	 *	 nbuf = (kbytes / factor) would cover all of memory.
	 */
	if (nbuf == 0) {
		long factor = NBUFCALCSIZE / 1024;		/* KB/nbuf */
		long kbytes = physmem * (PAGE_SIZE / 1024);	/* physmem */

		nbuf = 50;
		if (kbytes > 128 * 1024)
			nbuf += (kbytes - 128 * 1024) / (factor * 20);
		if (maxbcache && nbuf > maxbcache / NBUFCALCSIZE)
			nbuf = maxbcache / NBUFCALCSIZE;
		if ((size_t)nbuf * sizeof(struct buf) > MAXBUFSTRUCTSIZE) {
			kprintf("Warning: nbuf capped at %ld due to the "
				"reasonability limit\n", nbuf);
			nbuf = MAXBUFSTRUCTSIZE / sizeof(struct buf);
		}
	}
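
	/*
	 * Worked example with hypothetical numbers: on a 4GB machine with
	 * the ~16KB NBUFCALCSIZE noted above, factor = 16 and
	 * kbytes = 4194304, giving
	 *
	 *	nbuf = 50 + (4194304 - 131072) / (16 * 20) ~= 12747
	 *
	 * before the maxbcache and MAXBUFSTRUCTSIZE caps are applied.
	 */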

	/*
	 * Do not allow the buffer_map to be more than 1/2 the size of the
	 * kernel_map.
	 */
	if (nbuf > (virtual_end - virtual_start +
		    virtual2_end - virtual2_start) / (MAXBSIZE * 2)) {
		nbuf = (virtual_end - virtual_start +
			virtual2_end - virtual2_start) / (MAXBSIZE * 2);
		kprintf("Warning: nbufs capped at %ld due to kvm\n", nbuf);
	}

	/*
	 * Do not allow the buffer_map to use more than 50% of available
	 * physical-equivalent memory.  Since the VM pages which back
	 * individual buffers are typically wired, having too many bufs
	 * can prevent the system from paging properly.
	 */
	if (nbuf > physmem * PAGE_SIZE / (NBUFCALCSIZE * 2)) {
		nbuf = physmem * PAGE_SIZE / (NBUFCALCSIZE * 2);
		kprintf("Warning: nbufs capped at %ld due to physmem\n", nbuf);
	}

	/*
	 * Do not allow the sizeof(struct buf) * nbuf to exceed 1/4 of
	 * the valloc space which is just the virtual_end - virtual_start
	 * section.  This is typically ~2GB regardless of the amount of
	 * memory, so we use 500MB as a metric.
	 *
	 * This is because we use valloc() to allocate the buf header array.
	 *
	 * NOTE: buffer space in bytes is limited by vfs.*bufspace sysctls.
	 */
	if (nbuf > (virtual_end - virtual_start) / (sizeof(struct buf) * 4)) {
		nbuf = (virtual_end - virtual_start) /
		       (sizeof(struct buf) * 4);
		kprintf("Warning: nbufs capped at %ld due to "
			"valloc considerations\n",
			nbuf);
	}

	nswbuf_mem = lmax(lmin(nbuf / 32, 512), 8);
#ifdef NSWBUF_MIN
	if (nswbuf_mem < NSWBUF_MIN)
		nswbuf_mem = NSWBUF_MIN;
#endif
	nswbuf_kva = lmax(lmin(nbuf / 4, 512), 16);
#ifdef NSWBUF_MIN
	if (nswbuf_kva < NSWBUF_MIN)
		nswbuf_kva = NSWBUF_MIN;
#endif

	valloc(swbuf_mem, struct buf, nswbuf_mem);
	valloc(swbuf_kva, struct buf, nswbuf_kva);
	valloc(buf, struct buf, nbuf);

	/*
	 * End of first pass, size has been calculated so allocate memory
	 */
	if (firstaddr == 0) {
		size = (vm_size_t)(v - firstaddr);
		firstaddr = kmem_alloc(kernel_map, round_page(size),
				       VM_SUBSYS_BUF);
		if (firstaddr == 0)
			panic("startup: no room for tables");
		goto again;
	}

	/*
	 * End of second pass, addresses have been assigned
	 *
	 * nbuf is an int, make sure we don't overflow the field.
	 *
	 * On 64-bit systems we always reserve maximal allocations for
	 * buffer cache buffers and there are no fragmentation issues,
	 * so the KVA segment does not have to be excessively oversized.
	 */
	if ((vm_size_t)(v - firstaddr) != size)
		panic("startup: table size inconsistency");

	kmem_suballoc(kernel_map, clean_map, &clean_sva, &clean_eva,
		      ((vm_offset_t)(nbuf + 16) * MAXBSIZE) +
		      ((nswbuf_mem + nswbuf_kva) * MAXPHYS) + pager_map_size);
	kmem_suballoc(clean_map, buffer_map, &buffer_sva, &buffer_eva,
		      ((vm_offset_t)(nbuf + 16) * MAXBSIZE));
	buffer_map->system_map = 1;
	kmem_suballoc(clean_map, pager_map, &pager_sva, &pager_eva,
		      ((vm_offset_t)(nswbuf_mem + nswbuf_kva) * MAXPHYS) +
		      pager_map_size);
	pager_map->system_map = 1;
	kprintf("avail memory = %ju (%ju MB)\n",
		(uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages),
		(uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages) /
		1024 / 1024);
}

struct cpu_idle_stat {
	int	hint;
	int	reserved;
	u_long	halt;
	u_long	spin;
	u_long	repeat;
	u_long	repeat_last;
	u_long	repeat_delta;
	u_long	mwait_cx[CPU_MWAIT_CX_MAX];
} __cachealign;

#define CPU_IDLE_STAT_HALT	-1
#define CPU_IDLE_STAT_SPIN	-2

static struct cpu_idle_stat cpu_idle_stats[MAXCPU];

static int
sysctl_cpu_idle_cnt(SYSCTL_HANDLER_ARGS)
{
	int idx = arg2, cpu, error;
	u_long val = 0;

	if (idx == CPU_IDLE_STAT_HALT) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].halt;
	} else if (idx == CPU_IDLE_STAT_SPIN) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].spin;
	} else {
		KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
		    ("invalid index %d", idx));
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].mwait_cx[idx];
	}

	error = sysctl_handle_quad(oidp, &val, 0, req);
	if (error || req->newptr == NULL)
		return error;

	if (idx == CPU_IDLE_STAT_HALT) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].halt = 0;
		cpu_idle_stats[0].halt = val;
	} else if (idx == CPU_IDLE_STAT_SPIN) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].spin = 0;
		cpu_idle_stats[0].spin = val;
	} else {
		KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
		    ("invalid index %d", idx));
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].mwait_cx[idx] = 0;
		cpu_idle_stats[0].mwait_cx[idx] = val;
	}
	return 0;
}

static void
cpu_mwait_attach(void)
{
	struct sbuf sb;
	int hint_idx, i;

	if (!CPU_MWAIT_HAS_CX)
		return;

	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
	    (CPUID_TO_FAMILY(cpu_id) > 0xf ||
	     (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
	      CPUID_TO_MODEL(cpu_id) >= 0xf))) {
		int bm_sts = 1;

		/*
		 * Pentium dual-core, Core 2 and beyond do not need any
		 * additional activities to enter deep C-state, i.e. C3(+).
		 */
		cpu_mwait_cx_no_bmarb();

		TUNABLE_INT_FETCH("machdep.cpu.mwait.bm_sts", &bm_sts);
		if (!bm_sts)
			cpu_mwait_cx_no_bmsts();
	}

	sbuf_new(&sb, cpu_mwait_cx_supported,
	    sizeof(cpu_mwait_cx_supported), SBUF_FIXEDLEN);

	for (i = 0; i < CPU_MWAIT_CX_MAX; ++i) {
		struct cpu_mwait_cx *cx = &cpu_mwait_cx_info[i];
		int sub;

		ksnprintf(cx->name, sizeof(cx->name), "C%d", i);

		sysctl_ctx_init(&cx->sysctl_ctx);
		cx->sysctl_tree = SYSCTL_ADD_NODE(&cx->sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_machdep_mwait), OID_AUTO,
		    cx->name, CTLFLAG_RW, NULL, "Cx control/info");
		if (cx->sysctl_tree == NULL)
			continue;

		cx->subcnt = CPUID_MWAIT_CX_SUBCNT(cpu_mwait_extemu, i);
		SYSCTL_ADD_INT(&cx->sysctl_ctx,
		    SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
		    "subcnt", CTLFLAG_RD, &cx->subcnt, 0,
		    "sub-state count");
		SYSCTL_ADD_PROC(&cx->sysctl_ctx,
		    SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
		    "entered", (CTLTYPE_QUAD | CTLFLAG_RW), 0,
		    i, sysctl_cpu_idle_cnt, "Q", "# of times entered");

		for (sub = 0; sub < cx->subcnt; ++sub)
			sbuf_printf(&sb, "C%d/%d ", i, sub);
	}
	sbuf_trim(&sb);
	sbuf_finish(&sb);

	/*
	 * Non-deep C-states
	 */
	cpu_mwait_c1_hints_cnt = cpu_mwait_cx_info[CPU_MWAIT_C1].subcnt;
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i)
		cpu_mwait_hints_cnt += cpu_mwait_cx_info[i].subcnt;
	cpu_mwait_hints = kmalloc(sizeof(int) * cpu_mwait_hints_cnt,
	    M_DEVBUF, M_WAITOK);

	hint_idx = 0;
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i) {
		int j, subcnt;

		subcnt = cpu_mwait_cx_info[i].subcnt;
		for (j = 0; j < subcnt; ++j) {
			KASSERT(hint_idx < cpu_mwait_hints_cnt,
			    ("invalid mwait hint index %d", hint_idx));
			cpu_mwait_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
			++hint_idx;
		}
	}
	KASSERT(hint_idx == cpu_mwait_hints_cnt,
	    ("mwait hint count %d != index %d",
	     cpu_mwait_hints_cnt, hint_idx));

	if (bootverbose) {
		kprintf("MWAIT hints (%d C1 hints):\n", cpu_mwait_c1_hints_cnt);
		for (i = 0; i < cpu_mwait_hints_cnt; ++i) {
			int hint = cpu_mwait_hints[i];

			kprintf("  C%d/%d hint 0x%04x\n",
			    MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),
			    hint);
		}
	}

	/*
	 * Deep C-states
	 */
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i)
		cpu_mwait_deep_hints_cnt += cpu_mwait_cx_info[i].subcnt;
	cpu_mwait_deep_hints = kmalloc(sizeof(int) * cpu_mwait_deep_hints_cnt,
	    M_DEVBUF, M_WAITOK);

	hint_idx = 0;
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i) {
		int j, subcnt;

		subcnt = cpu_mwait_cx_info[i].subcnt;
		for (j = 0; j < subcnt; ++j) {
			KASSERT(hint_idx < cpu_mwait_deep_hints_cnt,
			    ("invalid mwait deep hint index %d", hint_idx));
			cpu_mwait_deep_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
			++hint_idx;
		}
	}
	KASSERT(hint_idx == cpu_mwait_deep_hints_cnt,
	    ("mwait deep hint count %d != index %d",
	     cpu_mwait_deep_hints_cnt, hint_idx));

	if (bootverbose) {
		kprintf("MWAIT deep hints:\n");
		for (i = 0; i < cpu_mwait_deep_hints_cnt; ++i) {
			int hint = cpu_mwait_deep_hints[i];

			kprintf("  C%d/%d hint 0x%04x\n",
			    MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),
			    hint);
		}
	}
	cpu_idle_repeat_max = 256 * cpu_mwait_deep_hints_cnt;

	for (i = 0; i < ncpus; ++i) {
		char name[16];

		ksnprintf(name, sizeof(name), "idle%d", i);
		SYSCTL_ADD_PROC(NULL,
		    SYSCTL_STATIC_CHILDREN(_machdep_mwait_CX), OID_AUTO,
		    name, (CTLTYPE_STRING | CTLFLAG_RW), &cpu_idle_stats[i],
		    0, cpu_mwait_cx_pcpu_idle_sysctl, "A", "");
	}
}

static void
cpu_finish(void *dummy __unused)
{
	cpu_setregs();
	cpu_mwait_attach();
}

static void
pic_finish(void *dummy __unused)
{
	/* Log ELCR information */
	elcr_dump();

	/* Log MPTABLE information */
	mptable_pci_int_dump();

	/* Finalize PCI */
	MachIntrABI.finalize();
}

/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by kcall
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 */
void
sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
{
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = lp->lwp_proc;
	struct trapframe *regs;
	struct sigacts *psp = p->p_sigacts;
	struct sigframe sf, *sfp;
	int oonstack;
	char *sp;

	regs = lp->lwp_md.md_regs;
	oonstack = (lp->lwp_sigstk.ss_flags & SS_ONSTACK) ? 1 : 0;

	/* Save user context */
	bzero(&sf, sizeof(struct sigframe));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = lp->lwp_sigstk;
	sf.sf_uc.uc_mcontext.mc_onstack = oonstack;
	KKASSERT(__offsetof(struct trapframe, tf_rdi) == 0);
	/* gcc errors out on optimized bcopy */
	_bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(struct trapframe));

	/* Make the size of the saved context visible to userland */
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext);

	/* Allocate and validate space for the signal handler context. */
	if ((lp->lwp_flags & LWP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)lp->lwp_sigstk.ss_sp + lp->lwp_sigstk.ss_size -
		     sizeof(struct sigframe);
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
	} else {
		/* We take red zone into account */
		sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128;
	}

	/*
	 * XXX AVX needs 64-byte alignment but sigframe has other fields and
	 * the embedded ucontext is not at the front, so aligning this won't
	 * help us.  Fortunately we bcopy in/out of the sigframe, so the
	 * kernel is ok.
	 *
	 * The problem though is if userland winds up trying to use the
	 * context directly.
	 */
	sfp = (struct sigframe *)((intptr_t)sp & ~(intptr_t)0xF);

	/* Translate the signal if appropriate */
	if (p->p_sysent->sv_sigtbl) {
		if (sig <= p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
	}

	/*
	 * Build the argument list for the signal handler.
	 *
	 * Arguments are in registers (%rdi, %rsi, %rdx, %rcx)
	 */
	regs->tf_rdi = sig;				/* argument 1 */
	regs->tf_rdx = (register_t)&sfp->sf_uc;		/* argument 3 */

	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/*
		 * Signal handler installed with SA_SIGINFO.
		 *
		 * action(signo, siginfo, ucontext)
		 */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* argument 2 */
		regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* fill siginfo structure */
		sf.sf_si.si_signo = sig;
		sf.sf_si.si_pid = psp->ps_frominfo[sig].pid;
		sf.sf_si.si_uid = psp->ps_frominfo[sig].uid;
		sf.sf_si.si_code = code;
		sf.sf_si.si_addr = (void *)regs->tf_addr;
	} else {
		/*
		 * Old FreeBSD-style arguments.
		 *
		 * handler (signo, code, [uc], addr)
		 */
		regs->tf_rsi = (register_t)code;	/* argument 2 */
		regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
		sf.sf_ahu.sf_handler = catcher;
	}

	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 * eflags.
	 */
#if 0 /* JG */
	if (regs->tf_eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;

		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;

		if (vm86->vm86_has_vme == 0)
			sf.sf_uc.uc_mcontext.mc_eflags =
			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/*
		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
		 * syscalls made by the signal handler.  This just avoids
		 * wasting time for our lazy fixup of such faults.  PSL_NT
		 * does nothing in vm86 mode, but vm86 programs can set it
		 * almost legitimately in probes for old cpu types.
		 */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
	}
#endif

	/*
	 * Save the FPU state and reinit the FP unit
	 */
	npxpush(&sf.sf_uc.uc_mcontext);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(struct sigframe)) != 0) {
		/*
		 * Something is wrong with the stack pointer.
		 * ...Kill the process.
		 */
		sigexit(lp, SIGILL);
	}

	regs->tf_rsp = (register_t)sfp;
	regs->tf_rip = trunc_page64(PS_STRINGS - *(p->p_sysent->sv_szsigcode));
	regs->tf_rip -= SZSIGCODE_EXTRA_BYTES;

	/*
	 * x86 abi specifies that the direction flag must be cleared
	 * on function entry
	 */
	regs->tf_rflags &= ~(PSL_T | PSL_D);

	/*
	 * 64 bit mode has a code and stack selector but
	 * no data or extra selector.  %fs and %gs are not
	 * stored in-context.
	 */
	regs->tf_cs = _ucodesel;
	regs->tf_ss = _udatasel;
	clear_quickret();
}

/*
 * Sanitize the trapframe for a virtual kernel passing control to a custom
 * VM context.  Remove any items that would otherwise create a privilege
 * issue.
 *
 * XXX at the moment we allow userland to set the resume flag.  Is this a
 * bad idea?
 */
int
cpu_sanitize_frame(struct trapframe *frame)
{
	frame->tf_cs = _ucodesel;
	frame->tf_ss = _udatasel;
	/* XXX VM (8086) mode not supported? */
	frame->tf_rflags &= (PSL_RF | PSL_USERCHANGE | PSL_VM_UNSUPP);
	frame->tf_rflags |= PSL_RESERVED_DEFAULT | PSL_I;

	return(0);
}

/*
 * Sanitize the tls so loading the descriptor does not blow up
 * on us.  For x86_64 we don't have to do anything.
 */
int
cpu_sanitize_tls(struct savetls *tls)
{
	return(0);
}

/*
 * sigreturn(ucontext_t *sigcntxp)
 *
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 *
 * MPSAFE
 */
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)

int
sys_sigreturn(struct sysmsg *sysmsg, const struct sigreturn_args *uap)
{
	struct lwp *lp = curthread->td_lwp;
	struct trapframe *regs;
	ucontext_t uc;
	ucontext_t *ucp;
	register_t rflags;
	int cs;
	int error;

	/*
	 * We have to copy the information into kernel space so userland
	 * can't modify it while we are sniffing it.
	 */
	regs = lp->lwp_md.md_regs;
	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error)
		return (error);
	ucp = &uc;
	rflags = ucp->uc_mcontext.mc_rflags;

	/* VM (8086) mode not supported */
	rflags &= ~PSL_VM_UNSUPP;

#if 0 /* JG */
	if (eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (lp->lwp_thread->td_pcb->pcb_ext == 0)
			return (EINVAL);
		vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			return (EINVAL);

		/* go back to user mode if both flags are set */
		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
			trapsignal(lp, SIGBUS, 0);

		if (vm86->vm86_has_vme) {
			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
			    (eflags & VME_USERCHANGE) | PSL_VM;
		} else {
			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
			    (eflags & VM_USERCHANGE) | PSL_VM;
		}
		bcopy(&ucp->uc_mcontext.mc_gs, tf, sizeof(struct trapframe));
		tf->tf_eflags = eflags;
		tf->tf_vm86_ds = tf->tf_ds;
		tf->tf_vm86_es = tf->tf_es;
		tf->tf_vm86_fs = tf->tf_fs;
		tf->tf_vm86_gs = tf->tf_gs;
		tf->tf_ds = _udatasel;
		tf->tf_es = _udatasel;
		tf->tf_fs = _udatasel;
		tf->tf_gs = _udatasel;
	} else
#endif
	{
		/*
		 * Don't allow users to change privileged or reserved flags.
		 */
		/*
		 * XXX do allow users to change the privileged flag PSL_RF.
		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
		 * should sometimes set it there too.  tf_eflags is kept in
		 * the signal context during signal handling and there is no
		 * other place to remember it, so the PSL_RF bit may be
		 * corrupted by the signal handler without us knowing.
		 * Corruption of the PSL_RF bit at worst causes one more or
		 * one less debugger trap, so allowing it is fairly harmless.
		 */
		if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
			kprintf("sigreturn: rflags = 0x%lx\n", (long)rflags);
			return(EINVAL);
		}

		/*
		 * Don't allow users to load a valid privileged %cs.  Let the
		 * hardware check for invalid selectors, excess privilege in
		 * other selectors, invalid %eip's and invalid %esp's.
		 */
		cs = ucp->uc_mcontext.mc_cs;
		if (!CS_SECURE(cs)) {
			kprintf("sigreturn: cs = 0x%x\n", cs);
			trapsignal(lp, SIGBUS, T_PROTFLT);
			return(EINVAL);
		}
		/* gcc errors out on optimized bcopy */
		_bcopy(&ucp->uc_mcontext.mc_rdi, regs,
		       sizeof(struct trapframe));
	}

	/*
	 * Restore the FPU state from the frame
	 */
	crit_enter();
	npxpop(&ucp->uc_mcontext);

	if (ucp->uc_mcontext.mc_onstack & 1)
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
	else
		lp->lwp_sigstk.ss_flags &= ~SS_ONSTACK;

	lp->lwp_sigmask = ucp->uc_sigmask;
	SIG_CANTMASK(lp->lwp_sigmask);
	clear_quickret();
	crit_exit();
	return(EJUSTRETURN);
}

/*
 * Machine dependent boot() routine
 *
 * I haven't seen anything to put here yet
 * Possibly some stuff might be grafted back here from boot()
 */
void
cpu_boot(int howto)
{
}

/*
 * Shutdown the CPU as much as possible
 */
void
cpu_halt(void)
{
	for (;;)
		__asm__ __volatile("hlt");
}

/*
 * cpu_idle() represents the idle LWKT.  You cannot return from this function
 * (unless you want to blow things up!).  Instead we look for runnable threads
 * and loop or halt as appropriate.  Giant is not held on entry to the thread.
 *
 * The main loop is entered with a critical section held, we must release
 * the critical section before doing anything else.  lwkt_switch() will
 * check for pending interrupts due to entering and exiting its own
 * critical section.
 *
 * NOTE: On an SMP system we rely on a scheduler IPI to wake a HLTed cpu up.
 *	 However, there are cases where the idlethread will be entered with
 *	 the possibility that no IPI will occur and in such cases
 *	 lwkt_switch() sets TDF_IDLE_NOHLT.
 *
 * NOTE: cpu_idle_repeat determines how many entries into the idle thread
 *	 must occur before it starts using ACPI halt.
 *
 * NOTE: Value overridden in hammer_time().
 */
static int	cpu_idle_hlt = 2;
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
    &cpu_idle_hlt, 0, "Idle loop HLT enable");
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_repeat, CTLFLAG_RW,
    &cpu_idle_repeat, 0, "Idle entries before acpi hlt");
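
/*
 * Illustrative tuning: e.g. "sysctl machdep.cpu_idle_hlt=3" forces the
 * ACPI halt path (see the mode table in cpu_idle() below), while
 * machdep.cpu_idle_repeat controls how quickly mode 2 falls back from
 * HLT/MONITOR/MWAIT to the ACPI halt.
 */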

SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_hltcnt, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, CPU_IDLE_STAT_HALT, sysctl_cpu_idle_cnt, "Q", "Idle loop entry halts");
SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_spincnt, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, CPU_IDLE_STAT_SPIN, sysctl_cpu_idle_cnt, "Q", "Idle loop entry spins");

static void
cpu_idle_default_hook(void)
{
	/*
	 * We must guarantee that hlt is exactly the instruction
	 * following the sti.
	 */
	__asm __volatile("sti; hlt");
}

/* Other subsystems (e.g., ACPI) can hook this later. */
void (*cpu_idle_hook)(void) = cpu_idle_default_hook;

static __inline int
cpu_mwait_cx_hint(struct cpu_idle_stat *stat)
{
	int hint, cx_idx;
	u_int idx;

	hint = stat->hint;
	if (hint >= 0)
		goto done;

	idx = (stat->repeat + stat->repeat_last + stat->repeat_delta) >>
	    cpu_mwait_repeat_shift;
	if (idx >= cpu_mwait_c1_hints_cnt) {
		/* Step up faster, once we walked through all C1 states */
		stat->repeat_delta += 1 << (cpu_mwait_repeat_shift + 1);
	}
	if (hint == CPU_MWAIT_HINT_AUTODEEP) {
		if (idx >= cpu_mwait_deep_hints_cnt)
			idx = cpu_mwait_deep_hints_cnt - 1;
		hint = cpu_mwait_deep_hints[idx];
	} else {
		if (idx >= cpu_mwait_hints_cnt)
			idx = cpu_mwait_hints_cnt - 1;
		hint = cpu_mwait_hints[idx];
	}
done:
	cx_idx = MWAIT_EAX_TO_CX(hint);
	if (cx_idx >= 0 && cx_idx < CPU_MWAIT_CX_MAX)
		stat->mwait_cx[cx_idx]++;
	return hint;
}
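
/*
 * Example walk-through (hypothetical counts): with the default
 * cpu_mwait_repeat_shift of 1 and repeat + repeat_last + repeat_delta
 * summing to 8, idx = 8 >> 1 = 4.  Once idx passes the number of C1
 * sub-states, repeat_delta is inflated so successive idle passes index
 * progressively deeper hints, clamping at the last entry of the
 * selected hint array.
 */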

void
cpu_idle(void)
{
	globaldata_t gd = mycpu;
	struct cpu_idle_stat *stat = &cpu_idle_stats[gd->gd_cpuid];
	struct thread *td __debugvar = gd->gd_curthread;
	int reqflags;

	stat->repeat = stat->repeat_last = cpu_idle_repeat_max;

	crit_exit();
	KKASSERT(td->td_critcount == 0);

	for (;;) {
		/*
		 * See if there are any LWKTs ready to go.
		 */
		lwkt_switch();

		/*
		 * When halting inside a cli we must check for reqflags
		 * races, particularly [re]schedule requests.  Running
		 * splz() does the job.
		 *
		 * cpu_idle_hlt:
		 *	0	Never halt, just spin
		 *
		 *	1	Always use MONITOR/MWAIT if avail, HLT
		 *		otherwise.
		 *
		 *		Better default for modern (Haswell+) Intel
		 *		cpus.
		 *
		 *	2	Use HLT/MONITOR/MWAIT up to a point and then
		 *		use the ACPI halt (default).  This is a hybrid
		 *		approach.  See machdep.cpu_idle_repeat.
		 *
		 *		Better default for modern AMD cpus and older
		 *		Intel cpus.
		 *
		 *	3	Always use the ACPI halt.  This typically
		 *		eats the least amount of power but the cpu
		 *		will be slow waking up.  Slows down e.g.
		 *		compiles and other pipe/event oriented stuff.
		 *
		 *		Usually the best default for AMD cpus.
		 *
		 *	4	Always use HLT.
		 *
		 *	5	Always spin.
		 *
		 * NOTE: Interrupts are enabled and we are not in a critical
		 *	 section.
		 *
		 * NOTE: Preemptions do not reset gd_idle_repeat.  Also we
		 *	 don't bother capping gd_idle_repeat, it is ok if
		 *	 it overflows (we do make it unsigned, however).
		 *
		 * Implement optimized invltlb operations when halted
		 * in idle.  By setting the bit in smp_idleinvl_mask
		 * we inform other cpus that they can set _reqs to
		 * request an invltlb.  Currently the code to do that
		 * sets the bits in _reqs anyway, but then checks _mask
		 * to determine if they can assume the invltlb will execute.
		 *
		 * A critical section is required to ensure that interrupts
		 * do not fully run until after we've had a chance to execute
		 * the request.
		 */
		if (gd->gd_idle_repeat == 0) {
			stat->repeat = (stat->repeat + stat->repeat_last) >> 1;
			if (stat->repeat > cpu_idle_repeat_max)
				stat->repeat = cpu_idle_repeat_max;
			stat->repeat_last = 0;
			stat->repeat_delta = 0;
		}
		++stat->repeat_last;

		/*
		 * General idle thread halt code
		 *
		 * IBRS NOTES - IBRS is a SPECTRE mitigation.  When going
		 *		idle, disable IBRS to reduce hyperthread
		 *		overhead.
		 */
		++gd->gd_idle_repeat;

		switch(cpu_idle_hlt) {
		default:
		case 0:
			/*
			 * Always spin
			 */
			;
do_spin:
			splz();
			__asm __volatile("sti");
			stat->spin++;
			crit_enter_gd(gd);
			crit_exit_gd(gd);
			break;
		case 2:
			/*
			 * Use MONITOR/MWAIT (or HLT) for a few cycles,
			 * then start using the ACPI halt code if we
			 * continue to be idle.
			 */
			if (gd->gd_idle_repeat >= cpu_idle_repeat)
				goto do_acpi;
			/* FALL THROUGH */
		case 1:
			/*
			 * Always use MONITOR/MWAIT (will use HLT if
			 * MONITOR/MWAIT not available).
			 */
			if (cpu_mi_feature & CPU_MI_MONITOR) {
				splz(); /* XXX */
				reqflags = gd->gd_reqflags;
				if (reqflags & RQF_IDLECHECK_WK_MASK)
					goto do_spin;
				crit_enter_gd(gd);
				ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask,
						     gd->gd_cpuid);
				/*
				 * IBRS/STIBP
				 */
				if (pscpu->trampoline.tr_pcb_spec_ctrl[1] &
				    SPEC_CTRL_DUMMY_ENABLE) {
					wrmsr(MSR_SPEC_CTRL,
					      pscpu->trampoline.tr_pcb_spec_ctrl[1] &
					      (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP));
				}
				cpu_mmw_pause_int(&gd->gd_reqflags, reqflags,
						  cpu_mwait_cx_hint(stat), 0);
				if (pscpu->trampoline.tr_pcb_spec_ctrl[0] &
				    SPEC_CTRL_DUMMY_ENABLE) {
					wrmsr(MSR_SPEC_CTRL,
					      pscpu->trampoline.tr_pcb_spec_ctrl[0] &
					      (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP));
				}
				stat->halt++;
				ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask,
						       gd->gd_cpuid);
				if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
							      gd->gd_cpuid)) {
					cpu_invltlb();
					cpu_mfence();
				}
				crit_exit_gd(gd);
				break;
			}
			/* FALLTHROUGH */
		case 4:
			/*
			 * Use HLT
			 */
			__asm __volatile("cli");
			splz();
			crit_enter_gd(gd);
			if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
				ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask,
						     gd->gd_cpuid);
				if (pscpu->trampoline.tr_pcb_spec_ctrl[1] &
				    SPEC_CTRL_DUMMY_ENABLE) {
					wrmsr(MSR_SPEC_CTRL,
					      pscpu->trampoline.tr_pcb_spec_ctrl[1] &
					      (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP));
				}
				cpu_idle_default_hook();
				if (pscpu->trampoline.tr_pcb_spec_ctrl[0] &
				    SPEC_CTRL_DUMMY_ENABLE) {
					wrmsr(MSR_SPEC_CTRL,
					      pscpu->trampoline.tr_pcb_spec_ctrl[0] &
					      (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP));
				}
				ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask,
						       gd->gd_cpuid);
				if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
							      gd->gd_cpuid)) {
					cpu_invltlb();
					cpu_mfence();
				}
			}
			__asm __volatile("sti");
			stat->halt++;
			crit_exit_gd(gd);
			break;
		case 3:
			/*
			 * Use ACPI halt
			 */
			;
do_acpi:
			__asm __volatile("cli");
			splz();
			crit_enter_gd(gd);
			if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
				ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask,
						     gd->gd_cpuid);
				if (pscpu->trampoline.tr_pcb_spec_ctrl[1] &
				    SPEC_CTRL_DUMMY_ENABLE) {
					wrmsr(MSR_SPEC_CTRL,
					      pscpu->trampoline.tr_pcb_spec_ctrl[1] &
					      (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP));
				}
				cpu_idle_hook();
				if (pscpu->trampoline.tr_pcb_spec_ctrl[0] &
				    SPEC_CTRL_DUMMY_ENABLE) {
					wrmsr(MSR_SPEC_CTRL,
					      pscpu->trampoline.tr_pcb_spec_ctrl[0] &
					      (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP));
				}
				ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask,
						       gd->gd_cpuid);
				if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
							      gd->gd_cpuid)) {
					cpu_invltlb();
					cpu_mfence();
				}
			}
			__asm __volatile("sti");
			stat->halt++;
			crit_exit_gd(gd);
			break;
		}
	}
}

/*
 * Called from deep ACPI via cpu_idle_hook() (see above) to actually halt
 * the cpu in C1.  ACPI might use other halt methods for deeper states
 * and not reach here.
 *
 * For now we always use HLT as we are not sure what ACPI may have actually
 * done.  MONITOR/MWAIT might not be appropriate.
 *
 * NOTE: MONITOR/MWAIT does not appear to throttle AMD cpus, while HLT
 *	 does.  On Intel, MONITOR/MWAIT does appear to throttle the cpu.
 */
void
cpu_idle_halt(void)
{
	globaldata_t gd;

	gd = mycpu;
#if 0
	/* DISABLED FOR NOW */
	struct cpu_idle_stat *stat;
	int reqflags;


	if ((cpu_idle_hlt == 1 || cpu_idle_hlt == 2) &&
	    (cpu_mi_feature & CPU_MI_MONITOR) &&
	    cpu_vendor_id != CPU_VENDOR_AMD) {
		/*
		 * Use MONITOR/MWAIT
		 *
		 * (NOTE: On ryzen, MWAIT does not throttle clocks, so we
		 *  have to use HLT)
		 */
		stat = &cpu_idle_stats[gd->gd_cpuid];
		reqflags = gd->gd_reqflags;
		if ((reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
			__asm __volatile("sti");
			cpu_mmw_pause_int(&gd->gd_reqflags, reqflags,
					  cpu_mwait_cx_hint(stat), 0);
		} else {
			__asm __volatile("sti; pause");
		}
	} else
#endif
	{
		/*
		 * Use HLT
		 */
		if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0)
			__asm __volatile("sti; hlt");
		else
			__asm __volatile("sti; pause");
	}
}


/*
 * Called in a loop indirectly via Xcpustop
 */
void
cpu_smp_stopped(void)
{
	globaldata_t gd = mycpu;
	volatile __uint64_t *ptr;
	__uint64_t ovalue;

	ptr = CPUMASK_ADDR(started_cpus, gd->gd_cpuid);
	ovalue = *ptr;
	if ((ovalue & CPUMASK_SIMPLE(gd->gd_cpuid & 63)) == 0) {
		if (cpu_mi_feature & CPU_MI_MONITOR) {
			if (cpu_mwait_hints) {
				cpu_mmw_pause_long(__DEVOLATILE(void *, ptr),
						   ovalue,
						   cpu_mwait_hints[
						       cpu_mwait_hints_cnt - 1],
						   0);
			} else {
				cpu_mmw_pause_long(__DEVOLATILE(void *, ptr),
						   ovalue, 0, 0);
			}
		} else {
			cpu_halt();	/* depend on lapic timer */
		}
	}
}

/*
 * This routine is called if a spinlock has been held through the
 * exponential backoff period and is seriously contested.  On a real cpu
 * we let it spin.
 */
void
cpu_spinlock_contested(void)
{
	cpu_pause();
}

/*
 * Clear registers on exec
 */
void
exec_setregs(u_long entry, u_long stack, u_long ps_strings)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct pcb *pcb = td->td_pcb;
	struct trapframe *regs = lp->lwp_md.md_regs;

	user_ldt_free(pcb);

	clear_quickret();
	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = entry;
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; /* align the stack */
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_rbx = ps_strings;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0; /* JG set bit 10? */
		if (pcb == td->td_pcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		pcb->pcb_flags &= ~PCB_DBREGS;
	}

	/*
	 * Initialize the math emulator (if any) for the current process.
	 * Actually, just clear the bit that says that the emulator has
	 * been initialized.  Initialization is delayed until the process
	 * traps to the emulator (if it is done at all) mainly because
	 * emulators don't provide an entry point for initialization.
	 */
	pcb->pcb_flags &= ~FP_SOFTFP;

	/*
	 * NOTE: do not set CR0_TS here.  npxinit() must do it after clearing
	 *	 gd_npxthread.  Otherwise a preemptive interrupt thread
	 *	 may panic in npxdna().
	 */
	crit_enter();
	load_cr0(rcr0() | CR0_MP);

	/*
	 * NOTE: The MSR values must be correct so we can return to
	 *	 userland.  gd_user_fs/gs must be correct so the switch
	 *	 code knows what the current MSR values are.
	 */
	pcb->pcb_fsbase = 0;	/* Values loaded from PCB on switch */
	pcb->pcb_gsbase = 0;
	mdcpu->gd_user_fs = 0;	/* Cache of current MSR values */
	mdcpu->gd_user_gs = 0;
	wrmsr(MSR_FSBASE, 0);	/* Set MSR values for return to userland */
	wrmsr(MSR_KGSBASE, 0);

	/* Initialize the npx (if any) for the current process. */
	npxinit();
	crit_exit();

	pcb->pcb_ds = _udatasel;
	pcb->pcb_es = _udatasel;
	pcb->pcb_fs = _udatasel;
	pcb->pcb_gs = _udatasel;
}

void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	cr0 |= CR0_NE;			/* Done by npxinit() */
	cr0 |= CR0_MP | CR0_TS;		/* Done at every execve() too. */
	cr0 |= CR0_WP | CR0_AM;
	load_cr0(cr0);
	load_gs(_udatasel);
}

static int
sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
{
	int error;
	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
		req);
	if (!error && req->newptr)
		resettodr();
	return (error);
}

SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
	&adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");

SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set,
	CTLFLAG_RW, &disable_rtc_set, 0, "");

#if 0 /* JG */
SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo,
	CTLFLAG_RD, &bootinfo, bootinfo, "");
#endif

SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock,
	CTLFLAG_RW, &wall_cmos_clock, 0, "");

static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct efi_map_header *efihdr;
	caddr_t kmdp;
	uint32_t efisize;

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	if (efihdr == NULL)
		return (0);
	efisize = *((uint32_t *)efihdr - 1);
	return (SYSCTL_OUT(req, efihdr, efisize));
}
SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
    efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");

/*
 * Initialize x86 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

int _default_ldt;
struct user_segment_descriptor gdt_cpu0[MAXGDT_COUNT];
struct gate_descriptor idt_arr[MAXCPU][NIDT];
#if 0 /* JG */
union descriptor ldt[NLDT];		/* local descriptor table */
#endif

/* table descriptors - used to load tables by cpu */
struct region_descriptor r_gdt;
struct region_descriptor r_idt_arr[MAXCPU];

/* JG proc0paddr is a virtual address */
void *proc0paddr;
/* JG alignment? */
char proc0paddr_buff[LWKT_THREAD_STACK];


/* software prototypes -- in more palatable form */
struct soft_segment_descriptor gdt_segs[] = {
	/* GNULL_SEL	0 Null Descriptor */
	{	0x0,			/* segment base address  */
		0x0,			/* length */
		0,			/* segment type */
		0,			/* segment descriptor priority level */
		0,			/* segment descriptor present */
		0,			/* long */
		0,			/* default 32 vs 16 bit size */
		0			/* limit granularity (byte/page units)*/ },
	/* GCODE_SEL	1 Code Descriptor for kernel */
	{	0x0,			/* segment base address  */
		0xfffff,		/* length - all address space */
		SDT_MEMERA,		/* segment type */
		SEL_KPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		1,			/* long */
		0,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
	/* GDATA_SEL	2 Data Descriptor for kernel */
	{	0x0,			/* segment base address  */
		0xfffff,		/* length - all address space */
		SDT_MEMRWA,		/* segment type */
		SEL_KPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		1,			/* long */
		0,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
	/* GUCODE32_SEL	3 32 bit Code Descriptor for user */
	{	0x0,			/* segment base address  */
		0xfffff,		/* length - all address space */
		SDT_MEMERA,		/* segment type */
		SEL_UPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		0,			/* long */
		1,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
	/* GUDATA_SEL	4 32/64 bit Data Descriptor for user */
	{	0x0,			/* segment base address  */
		0xfffff,		/* length - all address space */
		SDT_MEMRWA,		/* segment type */
		SEL_UPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		0,			/* long */
		1,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
	/* GUCODE_SEL	5 64 bit Code Descriptor for user */
	{	0x0,			/* segment base address  */
		0xfffff,		/* length - all address space */
		SDT_MEMERA,		/* segment type */
		SEL_UPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		1,			/* long */
		0,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
	/* GPROC0_SEL	6 Proc 0 Tss Descriptor */
	{
		0x0,			/* segment base address */
		sizeof(struct x86_64tss)-1,/* length - all address space */
		SDT_SYSTSS,		/* segment type */
		SEL_KPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		0,			/* long */
		0,			/* unused - default 32 vs 16 bit size */
		0			/* limit granularity (byte/page units)*/ },
	/* Actually, the TSS is a system descriptor which is double size */
	{	0x0,			/* segment base address  */
		0x0,			/* length */
		0,			/* segment type */
		0,			/* segment descriptor priority level */
		0,			/* segment descriptor present */
		0,			/* long */
		0,			/* default 32 vs 16 bit size */
		0			/* limit granularity (byte/page units)*/ },
	/* GUGS32_SEL	8 32 bit GS Descriptor for user */
	{	0x0,			/* segment base address  */
		0xfffff,		/* length - all address space */
		SDT_MEMRWA,		/* segment type */
		SEL_UPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		0,			/* long */
		1,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
};

void
setidt_global(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	int cpu;

	for (cpu = 0; cpu < MAXCPU; ++cpu) {
		struct gate_descriptor *ip = &idt_arr[cpu][idx];

		ip->gd_looffset = (uintptr_t)func;
		ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
		ip->gd_ist = ist;
		ip->gd_xx = 0;
		ip->gd_type = typ;
		ip->gd_dpl = dpl;
		ip->gd_p = 1;
		ip->gd_hioffset = ((uintptr_t)func) >> 16;
	}
}

void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist, int cpu)
{
	struct gate_descriptor *ip;

	KASSERT(cpu >= 0 && cpu < ncpus, ("invalid cpu %d", cpu));

	ip = &idt_arr[cpu][idx];
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func) >> 16;
}
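
/*
 * Note on the gate layout assumed by both setidt() variants above: the
 * 64-bit handler address is split across the descriptor, with the low
 * 16 bits stored in gd_looffset and the remaining bits in gd_hioffset
 * (the ">> 16" above), so no separate store of the high 32 bits appears
 * here.
 */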

#define IDTVEC(name)	__CONCAT(X,name)

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32);

extern inthand_t
	IDTVEC(rsvd00), IDTVEC(rsvd01), IDTVEC(rsvd02), IDTVEC(rsvd03),
	IDTVEC(rsvd04), IDTVEC(rsvd05), IDTVEC(rsvd06), IDTVEC(rsvd07),
	IDTVEC(rsvd08), IDTVEC(rsvd09), IDTVEC(rsvd0a), IDTVEC(rsvd0b),
	IDTVEC(rsvd0c), IDTVEC(rsvd0d), IDTVEC(rsvd0e), IDTVEC(rsvd0f),
	IDTVEC(rsvd10), IDTVEC(rsvd11), IDTVEC(rsvd12), IDTVEC(rsvd13),
	IDTVEC(rsvd14), IDTVEC(rsvd15), IDTVEC(rsvd16), IDTVEC(rsvd17),
	IDTVEC(rsvd18), IDTVEC(rsvd19), IDTVEC(rsvd1a), IDTVEC(rsvd1b),
	IDTVEC(rsvd1c), IDTVEC(rsvd1d), IDTVEC(rsvd1e), IDTVEC(rsvd1f),
	IDTVEC(rsvd20), IDTVEC(rsvd21), IDTVEC(rsvd22), IDTVEC(rsvd23),
	IDTVEC(rsvd24), IDTVEC(rsvd25), IDTVEC(rsvd26), IDTVEC(rsvd27),
	IDTVEC(rsvd28), IDTVEC(rsvd29), IDTVEC(rsvd2a), IDTVEC(rsvd2b),
	IDTVEC(rsvd2c), IDTVEC(rsvd2d), IDTVEC(rsvd2e), IDTVEC(rsvd2f),
	IDTVEC(rsvd30), IDTVEC(rsvd31), IDTVEC(rsvd32), IDTVEC(rsvd33),
	IDTVEC(rsvd34), IDTVEC(rsvd35), IDTVEC(rsvd36), IDTVEC(rsvd37),
	IDTVEC(rsvd38), IDTVEC(rsvd39), IDTVEC(rsvd3a), IDTVEC(rsvd3b),
	IDTVEC(rsvd3c), IDTVEC(rsvd3d), IDTVEC(rsvd3e), IDTVEC(rsvd3f),
	IDTVEC(rsvd40), IDTVEC(rsvd41), IDTVEC(rsvd42), IDTVEC(rsvd43),
	IDTVEC(rsvd44), IDTVEC(rsvd45), IDTVEC(rsvd46), IDTVEC(rsvd47),
	IDTVEC(rsvd48), IDTVEC(rsvd49), IDTVEC(rsvd4a), IDTVEC(rsvd4b),
	IDTVEC(rsvd4c), IDTVEC(rsvd4d), IDTVEC(rsvd4e), IDTVEC(rsvd4f),
	IDTVEC(rsvd50), IDTVEC(rsvd51), IDTVEC(rsvd52), IDTVEC(rsvd53),
	IDTVEC(rsvd54), IDTVEC(rsvd55), IDTVEC(rsvd56), IDTVEC(rsvd57),
	IDTVEC(rsvd58), IDTVEC(rsvd59), IDTVEC(rsvd5a), IDTVEC(rsvd5b),
	IDTVEC(rsvd5c), IDTVEC(rsvd5d), IDTVEC(rsvd5e), IDTVEC(rsvd5f),
	IDTVEC(rsvd60), IDTVEC(rsvd61), IDTVEC(rsvd62), IDTVEC(rsvd63),
	IDTVEC(rsvd64), IDTVEC(rsvd65), IDTVEC(rsvd66), IDTVEC(rsvd67),
	IDTVEC(rsvd68), IDTVEC(rsvd69), IDTVEC(rsvd6a), IDTVEC(rsvd6b),
	IDTVEC(rsvd6c), IDTVEC(rsvd6d), IDTVEC(rsvd6e), IDTVEC(rsvd6f),
	IDTVEC(rsvd70), IDTVEC(rsvd71), IDTVEC(rsvd72), IDTVEC(rsvd73),
	IDTVEC(rsvd74), IDTVEC(rsvd75), IDTVEC(rsvd76), IDTVEC(rsvd77),
	IDTVEC(rsvd78), IDTVEC(rsvd79), IDTVEC(rsvd7a), IDTVEC(rsvd7b),
	IDTVEC(rsvd7c), IDTVEC(rsvd7d), IDTVEC(rsvd7e), IDTVEC(rsvd7f),
	IDTVEC(rsvd80), IDTVEC(rsvd81), IDTVEC(rsvd82), IDTVEC(rsvd83),
	IDTVEC(rsvd84), IDTVEC(rsvd85), IDTVEC(rsvd86), IDTVEC(rsvd87),
	IDTVEC(rsvd88), IDTVEC(rsvd89), IDTVEC(rsvd8a), IDTVEC(rsvd8b),
	IDTVEC(rsvd8c), IDTVEC(rsvd8d), IDTVEC(rsvd8e), IDTVEC(rsvd8f),
	IDTVEC(rsvd90), IDTVEC(rsvd91), IDTVEC(rsvd92), IDTVEC(rsvd93),
	IDTVEC(rsvd94), IDTVEC(rsvd95), IDTVEC(rsvd96), IDTVEC(rsvd97),
	IDTVEC(rsvd98), IDTVEC(rsvd99), IDTVEC(rsvd9a), IDTVEC(rsvd9b),
	IDTVEC(rsvd9c), IDTVEC(rsvd9d), IDTVEC(rsvd9e), IDTVEC(rsvd9f),
	IDTVEC(rsvda0), IDTVEC(rsvda1), IDTVEC(rsvda2), IDTVEC(rsvda3),
	IDTVEC(rsvda4), IDTVEC(rsvda5), IDTVEC(rsvda6), IDTVEC(rsvda7),
	IDTVEC(rsvda8), IDTVEC(rsvda9), IDTVEC(rsvdaa), IDTVEC(rsvdab),
	IDTVEC(rsvdac), IDTVEC(rsvdad), IDTVEC(rsvdae), IDTVEC(rsvdaf),
	IDTVEC(rsvdb0), IDTVEC(rsvdb1), IDTVEC(rsvdb2), IDTVEC(rsvdb3),
	IDTVEC(rsvdb4), IDTVEC(rsvdb5), IDTVEC(rsvdb6), IDTVEC(rsvdb7),
	IDTVEC(rsvdb8), IDTVEC(rsvdb9), IDTVEC(rsvdba), IDTVEC(rsvdbb),
	IDTVEC(rsvdbc), IDTVEC(rsvdbd), IDTVEC(rsvdbe), IDTVEC(rsvdbf),
	IDTVEC(rsvdc0), IDTVEC(rsvdc1), IDTVEC(rsvdc2), IDTVEC(rsvdc3),
	IDTVEC(rsvdc4), IDTVEC(rsvdc5), IDTVEC(rsvdc6), IDTVEC(rsvdc7),
	IDTVEC(rsvdc8), IDTVEC(rsvdc9), IDTVEC(rsvdca), IDTVEC(rsvdcb),
	IDTVEC(rsvdcc), IDTVEC(rsvdcd), IDTVEC(rsvdce), IDTVEC(rsvdcf),
	IDTVEC(rsvdd0), IDTVEC(rsvdd1), IDTVEC(rsvdd2), IDTVEC(rsvdd3),
	IDTVEC(rsvdd4), IDTVEC(rsvdd5), IDTVEC(rsvdd6), IDTVEC(rsvdd7),
	IDTVEC(rsvdd8), IDTVEC(rsvdd9), IDTVEC(rsvdda), IDTVEC(rsvddb),
	IDTVEC(rsvddc), IDTVEC(rsvddd), IDTVEC(rsvdde), IDTVEC(rsvddf),
	IDTVEC(rsvde0), IDTVEC(rsvde1), IDTVEC(rsvde2), IDTVEC(rsvde3),
	IDTVEC(rsvde4), IDTVEC(rsvde5), IDTVEC(rsvde6), IDTVEC(rsvde7),
	IDTVEC(rsvde8), IDTVEC(rsvde9), IDTVEC(rsvdea), IDTVEC(rsvdeb),
	IDTVEC(rsvdec), IDTVEC(rsvded), IDTVEC(rsvdee), IDTVEC(rsvdef),
	IDTVEC(rsvdf0), IDTVEC(rsvdf1), IDTVEC(rsvdf2), IDTVEC(rsvdf3),
	IDTVEC(rsvdf4), IDTVEC(rsvdf5), IDTVEC(rsvdf6), IDTVEC(rsvdf7),
	IDTVEC(rsvdf8), IDTVEC(rsvdf9), IDTVEC(rsvdfa), IDTVEC(rsvdfb),
	IDTVEC(rsvdfc), IDTVEC(rsvdfd), IDTVEC(rsvdfe), IDTVEC(rsvdff);
1832
1833 inthand_t *rsvdary[NIDT] = {
1834 &IDTVEC(rsvd00), &IDTVEC(rsvd01), &IDTVEC(rsvd02), &IDTVEC(rsvd03),
1835 &IDTVEC(rsvd04), &IDTVEC(rsvd05), &IDTVEC(rsvd06), &IDTVEC(rsvd07),
1836 &IDTVEC(rsvd08), &IDTVEC(rsvd09), &IDTVEC(rsvd0a), &IDTVEC(rsvd0b),
1837 &IDTVEC(rsvd0c), &IDTVEC(rsvd0d), &IDTVEC(rsvd0e), &IDTVEC(rsvd0f),
1838 &IDTVEC(rsvd10), &IDTVEC(rsvd11), &IDTVEC(rsvd12), &IDTVEC(rsvd13),
1839 &IDTVEC(rsvd14), &IDTVEC(rsvd15), &IDTVEC(rsvd16), &IDTVEC(rsvd17),
1840 &IDTVEC(rsvd18), &IDTVEC(rsvd19), &IDTVEC(rsvd1a), &IDTVEC(rsvd1b),
1841 &IDTVEC(rsvd1c), &IDTVEC(rsvd1d), &IDTVEC(rsvd1e), &IDTVEC(rsvd1f),
1842 &IDTVEC(rsvd20), &IDTVEC(rsvd21), &IDTVEC(rsvd22), &IDTVEC(rsvd23),
1843 &IDTVEC(rsvd24), &IDTVEC(rsvd25), &IDTVEC(rsvd26), &IDTVEC(rsvd27),
1844 &IDTVEC(rsvd28), &IDTVEC(rsvd29), &IDTVEC(rsvd2a), &IDTVEC(rsvd2b),
1845 &IDTVEC(rsvd2c), &IDTVEC(rsvd2d), &IDTVEC(rsvd2e), &IDTVEC(rsvd2f),
1846 &IDTVEC(rsvd30), &IDTVEC(rsvd31), &IDTVEC(rsvd32), &IDTVEC(rsvd33),
1847 &IDTVEC(rsvd34), &IDTVEC(rsvd35), &IDTVEC(rsvd36), &IDTVEC(rsvd37),
1848 &IDTVEC(rsvd38), &IDTVEC(rsvd39), &IDTVEC(rsvd3a), &IDTVEC(rsvd3b),
1849 &IDTVEC(rsvd3c), &IDTVEC(rsvd3d), &IDTVEC(rsvd3e), &IDTVEC(rsvd3f),
1850 &IDTVEC(rsvd40), &IDTVEC(rsvd41), &IDTVEC(rsvd42), &IDTVEC(rsvd43),
1851 &IDTVEC(rsvd44), &IDTVEC(rsvd45), &IDTVEC(rsvd46), &IDTVEC(rsvd47),
1852 &IDTVEC(rsvd48), &IDTVEC(rsvd49), &IDTVEC(rsvd4a), &IDTVEC(rsvd4b),
1853 &IDTVEC(rsvd4c), &IDTVEC(rsvd4d), &IDTVEC(rsvd4e), &IDTVEC(rsvd4f),
1854 &IDTVEC(rsvd50), &IDTVEC(rsvd51), &IDTVEC(rsvd52), &IDTVEC(rsvd53),
1855 &IDTVEC(rsvd54), &IDTVEC(rsvd55), &IDTVEC(rsvd56), &IDTVEC(rsvd57),
1856 &IDTVEC(rsvd58), &IDTVEC(rsvd59), &IDTVEC(rsvd5a), &IDTVEC(rsvd5b),
1857 &IDTVEC(rsvd5c), &IDTVEC(rsvd5d), &IDTVEC(rsvd5e), &IDTVEC(rsvd5f),
1858 &IDTVEC(rsvd60), &IDTVEC(rsvd61), &IDTVEC(rsvd62), &IDTVEC(rsvd63),
1859 &IDTVEC(rsvd64), &IDTVEC(rsvd65), &IDTVEC(rsvd66), &IDTVEC(rsvd67),
1860 &IDTVEC(rsvd68), &IDTVEC(rsvd69), &IDTVEC(rsvd6a), &IDTVEC(rsvd6b),
1861 &IDTVEC(rsvd6c), &IDTVEC(rsvd6d), &IDTVEC(rsvd6e), &IDTVEC(rsvd6f),
1862 &IDTVEC(rsvd70), &IDTVEC(rsvd71), &IDTVEC(rsvd72), &IDTVEC(rsvd73),
1863 &IDTVEC(rsvd74), &IDTVEC(rsvd75), &IDTVEC(rsvd76), &IDTVEC(rsvd77),
1864 &IDTVEC(rsvd78), &IDTVEC(rsvd79), &IDTVEC(rsvd7a), &IDTVEC(rsvd7b),
1865 &IDTVEC(rsvd7c), &IDTVEC(rsvd7d), &IDTVEC(rsvd7e), &IDTVEC(rsvd7f),
1866 &IDTVEC(rsvd80), &IDTVEC(rsvd81), &IDTVEC(rsvd82), &IDTVEC(rsvd83),
1867 &IDTVEC(rsvd84), &IDTVEC(rsvd85), &IDTVEC(rsvd86), &IDTVEC(rsvd87),
1868 &IDTVEC(rsvd88), &IDTVEC(rsvd89), &IDTVEC(rsvd8a), &IDTVEC(rsvd8b),
1869 &IDTVEC(rsvd8c), &IDTVEC(rsvd8d), &IDTVEC(rsvd8e), &IDTVEC(rsvd8f),
1870 &IDTVEC(rsvd90), &IDTVEC(rsvd91), &IDTVEC(rsvd92), &IDTVEC(rsvd93),
1871 &IDTVEC(rsvd94), &IDTVEC(rsvd95), &IDTVEC(rsvd96), &IDTVEC(rsvd97),
1872 &IDTVEC(rsvd98), &IDTVEC(rsvd99), &IDTVEC(rsvd9a), &IDTVEC(rsvd9b),
1873 &IDTVEC(rsvd9c), &IDTVEC(rsvd9d), &IDTVEC(rsvd9e), &IDTVEC(rsvd9f),
1874 &IDTVEC(rsvda0), &IDTVEC(rsvda1), &IDTVEC(rsvda2), &IDTVEC(rsvda3),
1875 &IDTVEC(rsvda4), &IDTVEC(rsvda5), &IDTVEC(rsvda6), &IDTVEC(rsvda7),
1876 &IDTVEC(rsvda8), &IDTVEC(rsvda9), &IDTVEC(rsvdaa), &IDTVEC(rsvdab),
1877 &IDTVEC(rsvdac), &IDTVEC(rsvdad), &IDTVEC(rsvdae), &IDTVEC(rsvdaf),
1878 &IDTVEC(rsvdb0), &IDTVEC(rsvdb1), &IDTVEC(rsvdb2), &IDTVEC(rsvdb3),
1879 &IDTVEC(rsvdb4), &IDTVEC(rsvdb5), &IDTVEC(rsvdb6), &IDTVEC(rsvdb7),
1880 &IDTVEC(rsvdb8), &IDTVEC(rsvdb9), &IDTVEC(rsvdba), &IDTVEC(rsvdbb),
1881 &IDTVEC(rsvdbc), &IDTVEC(rsvdbd), &IDTVEC(rsvdbe), &IDTVEC(rsvdbf),
1882 &IDTVEC(rsvdc0), &IDTVEC(rsvdc1), &IDTVEC(rsvdc2), &IDTVEC(rsvdc3),
1883 &IDTVEC(rsvdc4), &IDTVEC(rsvdc5), &IDTVEC(rsvdc6), &IDTVEC(rsvdc7),
1884 &IDTVEC(rsvdc8), &IDTVEC(rsvdc9), &IDTVEC(rsvdca), &IDTVEC(rsvdcb),
1885 &IDTVEC(rsvdcc), &IDTVEC(rsvdcd), &IDTVEC(rsvdce), &IDTVEC(rsvdcf),
1886 &IDTVEC(rsvdd0), &IDTVEC(rsvdd1), &IDTVEC(rsvdd2), &IDTVEC(rsvdd3),
1887 &IDTVEC(rsvdd4), &IDTVEC(rsvdd5), &IDTVEC(rsvdd6), &IDTVEC(rsvdd7),
1888 &IDTVEC(rsvdd8), &IDTVEC(rsvdd9), &IDTVEC(rsvdda), &IDTVEC(rsvddb),
1889 &IDTVEC(rsvddc), &IDTVEC(rsvddd), &IDTVEC(rsvdde), &IDTVEC(rsvddf),
1890 &IDTVEC(rsvde0), &IDTVEC(rsvde1), &IDTVEC(rsvde2), &IDTVEC(rsvde3),
1891 &IDTVEC(rsvde4), &IDTVEC(rsvde5), &IDTVEC(rsvde6), &IDTVEC(rsvde7),
1892 &IDTVEC(rsvde8), &IDTVEC(rsvde9), &IDTVEC(rsvdea), &IDTVEC(rsvdeb),
1893 &IDTVEC(rsvdec), &IDTVEC(rsvded), &IDTVEC(rsvdee), &IDTVEC(rsvdef),
1894 &IDTVEC(rsvdf0), &IDTVEC(rsvdf1), &IDTVEC(rsvdf2), &IDTVEC(rsvdf3),
1895 &IDTVEC(rsvdf4), &IDTVEC(rsvdf5), &IDTVEC(rsvdf6), &IDTVEC(rsvdf7),
1896 &IDTVEC(rsvdf8), &IDTVEC(rsvdf9), &IDTVEC(rsvdfa), &IDTVEC(rsvdfb),
1897 &IDTVEC(rsvdfc), &IDTVEC(rsvdfd), &IDTVEC(rsvdfe), &IDTVEC(rsvdff)
1898 };
1899
1900 void
sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
1902 {
1903 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
1904 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
1905 ssd->ssd_type = sd->sd_type;
1906 ssd->ssd_dpl = sd->sd_dpl;
1907 ssd->ssd_p = sd->sd_p;
1908 ssd->ssd_def32 = sd->sd_def32;
1909 ssd->ssd_gran = sd->sd_gran;
1910 }
1911
1912 void
ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
1914 {
1915
1916 sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
1917 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
1918 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
1919 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
1920 sd->sd_type = ssd->ssd_type;
1921 sd->sd_dpl = ssd->ssd_dpl;
1922 sd->sd_p = ssd->ssd_p;
1923 sd->sd_long = ssd->ssd_long;
1924 sd->sd_def32 = ssd->ssd_def32;
1925 sd->sd_gran = ssd->ssd_gran;
1926 }
1927
1928 void
ssdtosyssd(struct soft_segment_descriptor *ssd,
	   struct system_segment_descriptor *sd)
1931 {
1932
1933 sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
1934 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
1935 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
1936 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
1937 sd->sd_type = ssd->ssd_type;
1938 sd->sd_dpl = ssd->ssd_dpl;
1939 sd->sd_p = ssd->ssd_p;
1940 sd->sd_gran = ssd->ssd_gran;
1941 }
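
/*
 * Worked example (illustrative values): the soft segment descriptor is an
 * unpacked form of the hardware descriptor, so the conversions above just
 * slice ssd_base/ssd_limit into their hardware bitfields.  A 32-bit base
 * of 0x12345678 splits into sd_lobase = 0x345678 and sd_hibase = 0x12,
 * and a 20-bit limit of 0xabcde splits into sd_lolimit = 0xbcde and
 * sd_hilimit = 0xa.
 */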
1942
1943 /*
1944 * Populate the (physmap) array with base/bound pairs describing the
1945 * available physical memory in the system, then test this memory and
1946 * build the phys_avail array describing the actually-available memory.
1947 *
 * If we cannot accurately determine the physical memory map, then use
 * the value from the 0xE801 call, and failing that, the RTC.
1950 *
1951 * Total memory size may be set by the kernel environment variable
1952 * hw.physmem or the compile-time define MAXMEM.
1953 *
1954 * Memory is aligned to PHYSMAP_ALIGN which must be a multiple
1955 * of PAGE_SIZE. This also greatly reduces the memory test time
1956 * which would otherwise be excessive on machines with > 8G of ram.
1957 *
1958 * XXX first should be vm_paddr_t.
1959 */
1960
1961 #define PHYSMAP_ALIGN (vm_paddr_t)(128 * 1024)
1962 #define PHYSMAP_ALIGN_MASK (vm_paddr_t)(PHYSMAP_ALIGN - 1)
1963 #define PHYSMAP_SIZE VM_PHYSSEG_MAX
1964
1965 vm_paddr_t physmap[PHYSMAP_SIZE];
1966 struct bios_smap *smapbase, *smap, *smapend;
1967 struct efi_map_header *efihdrbase;
1968 u_int32_t smapsize;
1969
1970 #define PHYSMAP_HANDWAVE (vm_paddr_t)(2 * 1024 * 1024)
1971 #define PHYSMAP_HANDWAVE_MASK (PHYSMAP_HANDWAVE - 1)
1972
1973 static void
add_smap_entries(int *physmap_idx)
1975 {
1976 int i;
1977
1978 smapsize = *((u_int32_t *)smapbase - 1);
1979 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
1980
1981 for (smap = smapbase; smap < smapend; smap++) {
1982 if (boothowto & RB_VERBOSE)
1983 kprintf("SMAP type=%02x base=%016lx len=%016lx\n",
1984 smap->type, smap->base, smap->length);
1985
1986 if (smap->type != SMAP_TYPE_MEMORY)
1987 continue;
1988
1989 if (smap->length == 0)
1990 continue;
1991
1992 for (i = 0; i <= *physmap_idx; i += 2) {
1993 if (smap->base < physmap[i + 1]) {
1994 if (boothowto & RB_VERBOSE) {
1995 kprintf("Overlapping or non-monotonic "
1996 "memory region, ignoring "
1997 "second region\n");
1998 }
1999 break;
2000 }
2001 }
2002 if (i <= *physmap_idx)
2003 continue;
2004
2005 Realmem += smap->length;
2006
2007 /*
2008 * NOTE: This little bit of code initially expands
2009 * physmap[1] as well as later entries.
2010 */
2011 if (smap->base == physmap[*physmap_idx + 1]) {
2012 physmap[*physmap_idx + 1] += smap->length;
2013 continue;
2014 }
2015
2016 *physmap_idx += 2;
2017 if (*physmap_idx == PHYSMAP_SIZE) {
2018 kprintf("Too many segments in the physical "
2019 "address map, giving up\n");
2020 break;
2021 }
2022 physmap[*physmap_idx] = smap->base;
2023 physmap[*physmap_idx + 1] = smap->base + smap->length;
2024 }
2025 }
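
/*
 * Example (hypothetical SMAP contents): a machine reporting the
 * SMAP_TYPE_MEMORY ranges [0, 0x9fc00) and [0x100000, 0x100000000)
 * leaves the loop above with
 *
 *	physmap[0] = 0x0	physmap[1] = 0x9fc00
 *	physmap[2] = 0x100000	physmap[3] = 0x100000000
 *
 * i.e. even/odd index pairs holding the base/bound of each segment.
 */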
2026
2027 static void
add_efi_map_entries(int *physmap_idx)
2029 {
2030 struct efi_md *map, *p;
2031 const char *type;
2032 size_t efisz;
2033 int i, ndesc;
2034
2035 static const char *types[] = {
2036 "Reserved",
2037 "LoaderCode",
2038 "LoaderData",
2039 "BootServicesCode",
2040 "BootServicesData",
2041 "RuntimeServicesCode",
2042 "RuntimeServicesData",
2043 "ConventionalMemory",
2044 "UnusableMemory",
2045 "ACPIReclaimMemory",
2046 "ACPIMemoryNVS",
2047 "MemoryMappedIO",
2048 "MemoryMappedIOPortSpace",
2049 "PalCode"
2050 };
2051
2052 /*
2053 * Memory map data provided by UEFI via the GetMemoryMap
2054 * Boot Services API.
2055 */
2056 efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
2057 map = (struct efi_md *)((uint8_t *)efihdrbase + efisz);
2058
2059 if (efihdrbase->descriptor_size == 0)
2060 return;
2061 ndesc = efihdrbase->memory_size / efihdrbase->descriptor_size;
2062
2063 if (boothowto & RB_VERBOSE)
2064 kprintf("%23s %12s %12s %8s %4s\n",
2065 "Type", "Physical", "Virtual", "#Pages", "Attr");
2066
2067 for (i = 0, p = map; i < ndesc; i++,
2068 p = efi_next_descriptor(p, efihdrbase->descriptor_size)) {
2069 if (boothowto & RB_VERBOSE) {
2070 if (p->md_type <= EFI_MD_TYPE_PALCODE)
2071 type = types[p->md_type];
2072 else
2073 type = "<INVALID>";
2074 kprintf("%23s %012lx %12p %08lx ", type, p->md_phys,
2075 p->md_virt, p->md_pages);
2076 if (p->md_attr & EFI_MD_ATTR_UC)
2077 kprintf("UC ");
2078 if (p->md_attr & EFI_MD_ATTR_WC)
2079 kprintf("WC ");
2080 if (p->md_attr & EFI_MD_ATTR_WT)
2081 kprintf("WT ");
2082 if (p->md_attr & EFI_MD_ATTR_WB)
2083 kprintf("WB ");
2084 if (p->md_attr & EFI_MD_ATTR_UCE)
2085 kprintf("UCE ");
2086 if (p->md_attr & EFI_MD_ATTR_WP)
2087 kprintf("WP ");
2088 if (p->md_attr & EFI_MD_ATTR_RP)
2089 kprintf("RP ");
2090 if (p->md_attr & EFI_MD_ATTR_XP)
2091 kprintf("XP ");
2092 if (p->md_attr & EFI_MD_ATTR_RT)
2093 kprintf("RUNTIME");
2094 kprintf("\n");
2095 }
2096
2097 switch (p->md_type) {
2098 case EFI_MD_TYPE_CODE:
2099 case EFI_MD_TYPE_DATA:
2100 case EFI_MD_TYPE_BS_CODE:
2101 case EFI_MD_TYPE_BS_DATA:
2102 case EFI_MD_TYPE_FREE:
2103 /*
2104 * We're allowed to use any entry with these types.
2105 */
2106 break;
2107 default:
2108 continue;
2109 }
2110
2111 Realmem += p->md_pages * PAGE_SIZE;
2112
2113 /*
2114 * NOTE: This little bit of code initially expands
2115 * physmap[1] as well as later entries.
2116 */
2117 if (p->md_phys == physmap[*physmap_idx + 1]) {
2118 physmap[*physmap_idx + 1] += p->md_pages * PAGE_SIZE;
2119 continue;
2120 }
2121
2122 *physmap_idx += 2;
2123 if (*physmap_idx == PHYSMAP_SIZE) {
2124 kprintf("Too many segments in the physical "
2125 "address map, giving up\n");
2126 break;
2127 }
2128 physmap[*physmap_idx] = p->md_phys;
2129 physmap[*physmap_idx + 1] = p->md_phys + p->md_pages * PAGE_SIZE;
2130 }
2131 }
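
/*
 * With a verbose boot the loop above prints one line per EFI descriptor,
 * along the lines of (hypothetical values, see the kprintf formats):
 *
 *	     ConventionalMemory 000000100000 0000000000 00001000 WB
 *
 * Only the CODE/DATA/BS_CODE/BS_DATA/FREE descriptor types are folded
 * into physmap[].
 */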
2132
2133 struct fb_info efi_fb_info;
2134 static int have_efi_framebuffer = 0;
2135
2136 static void
efi_fb_init_vaddr(int direct_map)
2138 {
2139 uint64_t sz;
2140 vm_offset_t addr, v;
2141
2142 v = efi_fb_info.vaddr;
2143 sz = efi_fb_info.stride * efi_fb_info.height;
2144
2145 if (direct_map) {
2146 addr = PHYS_TO_DMAP(efi_fb_info.paddr);
2147 if (addr >= DMAP_MIN_ADDRESS && addr + sz <= DMapMaxAddress)
2148 efi_fb_info.vaddr = addr;
2149 } else {
2150 efi_fb_info.vaddr =
2151 (vm_offset_t)pmap_mapdev_attr(efi_fb_info.paddr,
2152 sz,
2153 PAT_WRITE_COMBINING);
2154 }
2155 }
2156
2157 static u_int
efifb_color_depth(struct efi_fb *efifb)
2159 {
2160 uint32_t mask;
2161 u_int depth;
2162
2163 mask = efifb->fb_mask_red | efifb->fb_mask_green |
2164 efifb->fb_mask_blue | efifb->fb_mask_reserved;
2165 if (mask == 0)
2166 return (0);
2167 for (depth = 1; mask != 1; depth++)
2168 mask >>= 1;
2169 return (depth);
2170 }
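
/*
 * Example: a typical 8:8:8:8 framebuffer has fb_mask_red = 0x00ff0000,
 * fb_mask_green = 0x0000ff00, fb_mask_blue = 0x000000ff and
 * fb_mask_reserved = 0xff000000.  The combined mask is 0xffffffff, which
 * the loop above shifts down 31 times, yielding a depth of 32.
 */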
2171
2172 int
probe_efi_fb(int early)
2174 {
2175 struct efi_fb *efifb;
2176 caddr_t kmdp;
2177 u_int depth;
2178
2179 if (have_efi_framebuffer) {
2180 if (!early &&
2181 (efi_fb_info.vaddr == 0 ||
2182 efi_fb_info.vaddr == PHYS_TO_DMAP(efi_fb_info.paddr)))
2183 efi_fb_init_vaddr(0);
2184 return 0;
2185 }
2186
2187 kmdp = preload_search_by_type("elf kernel");
2188 if (kmdp == NULL)
2189 kmdp = preload_search_by_type("elf64 kernel");
2190 efifb = (struct efi_fb *)preload_search_info(kmdp,
2191 MODINFO_METADATA | MODINFOMD_EFI_FB);
2192 if (efifb == NULL)
2193 return 1;
2194
2195 depth = efifb_color_depth(efifb);
2196 /*
	 * The bootloader should already have noticed if we will not be
	 * able to use the UEFI framebuffer.
2199 */
2200 if (depth != 24 && depth != 32)
2201 return 1;
2202
2203 have_efi_framebuffer = 1;
2204
2205 efi_fb_info.is_vga_boot_display = 1;
2206 efi_fb_info.width = efifb->fb_width;
2207 efi_fb_info.height = efifb->fb_height;
2208 efi_fb_info.depth = depth;
2209 efi_fb_info.stride = efifb->fb_stride * (depth / 8);
2210 efi_fb_info.paddr = efifb->fb_addr;
2211 if (early) {
2212 efi_fb_info.vaddr = 0;
2213 } else {
2214 efi_fb_init_vaddr(0);
2215 }
2216 efi_fb_info.fbops.fb_set_par = NULL;
2217 efi_fb_info.fbops.fb_blank = NULL;
2218 efi_fb_info.fbops.fb_debug_enter = NULL;
2219 efi_fb_info.device = NULL;
2220
2221 return 0;
2222 }
2223
2224 static void
efifb_startup(void *arg)
2226 {
2227 probe_efi_fb(0);
2228 }
2229
2230 SYSINIT(efi_fb_info, SI_BOOT1_POST, SI_ORDER_FIRST, efifb_startup, NULL);
2231
2232 static void
getmemsize(caddr_t kmdp, u_int64_t first)
2234 {
2235 int off, physmap_idx, pa_indx, da_indx;
2236 int i, j;
2237 vm_paddr_t pa;
2238 vm_paddr_t msgbuf_size;
2239 u_long physmem_tunable;
2240 pt_entry_t *pte;
2241 quad_t dcons_addr, dcons_size;
2242
2243 bzero(physmap, sizeof(physmap));
2244 physmap_idx = 0;
2245
2246 /*
2247 * get memory map from INT 15:E820, kindly supplied by the loader.
2248 *
2249 * subr_module.c says:
2250 * "Consumer may safely assume that size value precedes data."
2251 * ie: an int32_t immediately precedes smap.
2252 */
2253 efihdrbase = (struct efi_map_header *)preload_search_info(kmdp,
2254 MODINFO_METADATA | MODINFOMD_EFI_MAP);
2255 smapbase = (struct bios_smap *)preload_search_info(kmdp,
2256 MODINFO_METADATA | MODINFOMD_SMAP);
2257 if (smapbase == NULL && efihdrbase == NULL)
2258 panic("No BIOS smap or EFI map info from loader!");
2259
2260 if (efihdrbase == NULL)
2261 add_smap_entries(&physmap_idx);
2262 else
2263 add_efi_map_entries(&physmap_idx);
2264
2265 base_memory = physmap[1] / 1024;
2266 /* make hole for AP bootstrap code */
2267 physmap[1] = mp_bootaddress(base_memory);
2268
2269 /* Save EBDA address, if any */
2270 ebda_addr = (u_long)(*(u_short *)(KERNBASE + 0x40e));
2271 ebda_addr <<= 4;
2272
2273 /*
2274 * Maxmem isn't the "maximum memory", it's one larger than the
2275 * highest page of the physical address space. It should be
2276 * called something like "Maxphyspage". We may adjust this
2277 * based on ``hw.physmem'' and the results of the memory test.
2278 */
2279 Maxmem = atop(physmap[physmap_idx + 1]);
2280
2281 #ifdef MAXMEM
2282 Maxmem = MAXMEM / 4;
2283 #endif
2284
2285 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
2286 Maxmem = atop(physmem_tunable);
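
	/*
	 * For example (administrator knob, value is a raw byte count):
	 * setting hw.physmem="1073741824" in the loader environment caps
	 * usable memory at 1GB regardless of what the SMAP/EFI map
	 * reports.
	 */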
2287
2288 /*
2289 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
2290 * in the system.
2291 */
2292 if (Maxmem > atop(physmap[physmap_idx + 1]))
2293 Maxmem = atop(physmap[physmap_idx + 1]);
2294
2295 /*
2296 * Blowing out the DMAP will blow up the system.
2297 */
2298 if (Maxmem > atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS)) {
2299 kprintf("Limiting Maxmem due to DMAP size\n");
2300 Maxmem = atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS);
2301 }
2302
2303 if (atop(physmap[physmap_idx + 1]) != Maxmem &&
2304 (boothowto & RB_VERBOSE)) {
2305 kprintf("Physical memory use set to %ldK\n", Maxmem * 4);
2306 }
2307
2308 /*
2309 * Call pmap initialization to make new kernel address space
2310 *
2311 * Mask off page 0.
2312 */
2313 pmap_bootstrap(&first);
2314 physmap[0] = PAGE_SIZE;
2315
2316 /*
2317 * Align the physmap to PHYSMAP_ALIGN and cut out anything
2318 * exceeding Maxmem.
2319 */
2320 for (i = j = 0; i <= physmap_idx; i += 2) {
2321 if (physmap[i+1] > ptoa(Maxmem))
2322 physmap[i+1] = ptoa(Maxmem);
2323 physmap[i] = (physmap[i] + PHYSMAP_ALIGN_MASK) &
2324 ~PHYSMAP_ALIGN_MASK;
2325 physmap[i+1] = physmap[i+1] & ~PHYSMAP_ALIGN_MASK;
2326
2327 physmap[j] = physmap[i];
2328 physmap[j+1] = physmap[i+1];
2329
2330 if (physmap[i] < physmap[i+1])
2331 j += 2;
2332 }
2333 physmap_idx = j - 2;
2334
2335 /*
2336 * Align anything else used in the validation loop.
2337 *
2338 * Also make sure that our 2MB kernel text+data+bss mappings
2339 * do not overlap potentially allocatable space.
2340 */
2341 first = (first + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;
2342
2343 /*
2344 * Size up each available chunk of physical memory.
2345 */
2346 pa_indx = 0;
2347 da_indx = 0;
2348 phys_avail[pa_indx].phys_beg = physmap[0];
2349 phys_avail[pa_indx].phys_end = physmap[0];
2350 dump_avail[da_indx].phys_beg = 0;
2351 dump_avail[da_indx].phys_end = physmap[0];
2352 pte = CMAP1;
2353
2354 /*
2355 * Get dcons buffer address
2356 */
2357 if (kgetenv_quad("dcons.addr", &dcons_addr) == 0 ||
2358 kgetenv_quad("dcons.size", &dcons_size) == 0)
2359 dcons_addr = 0;
2360
2361 /*
2362 * Validate the physical memory. The physical memory segments
2363 * have already been aligned to PHYSMAP_ALIGN which is a multiple
2364 * of PAGE_SIZE.
2365 *
2366 * We no longer perform an exhaustive memory test. Instead we
2367 * simply test the first and last word in each physmap[]
2368 * segment.
2369 */
2370 for (i = 0; i <= physmap_idx; i += 2) {
2371 vm_paddr_t end;
2372 vm_paddr_t incr;
2373
2374 end = physmap[i + 1];
2375
2376 for (pa = physmap[i]; pa < end; pa += incr) {
2377 int page_bad, full;
2378 volatile uint64_t *ptr = (uint64_t *)CADDR1;
2379 uint64_t tmp;
2380
2381 full = FALSE;
2382
2383 /*
2384 * Calculate incr. Just test the first and
2385 * last page in each physmap[] segment.
2386 */
2387 if (pa == end - PAGE_SIZE)
2388 incr = PAGE_SIZE;
2389 else
2390 incr = end - pa - PAGE_SIZE;
2391
2392 /*
2393 * Make sure we don't skip blacked out areas.
2394 */
2395 if (pa < 0x200000 && 0x200000 < end) {
2396 incr = 0x200000 - pa;
2397 }
2398 if (dcons_addr > 0 &&
2399 pa < dcons_addr &&
2400 dcons_addr < end) {
2401 incr = dcons_addr - pa;
2402 }
2403
2404 /*
2405 * Block out kernel memory as not available.
2406 */
2407 if (pa >= 0x200000 && pa < first) {
2408 incr = first - pa;
2409 if (pa + incr > end)
2410 incr = end - pa;
2411 goto do_dump_avail;
2412 }
2413
2414 /*
2415 * Block out the dcons buffer if it exists.
2416 */
2417 if (dcons_addr > 0 &&
2418 pa >= trunc_page(dcons_addr) &&
2419 pa < dcons_addr + dcons_size) {
2420 incr = dcons_addr + dcons_size - pa;
2421 incr = (incr + PAGE_MASK) &
2422 ~(vm_paddr_t)PAGE_MASK;
2423 if (pa + incr > end)
2424 incr = end - pa;
2425 goto do_dump_avail;
2426 }
2427
2428 page_bad = FALSE;
2429
2430 /*
2431 * Map the page non-cacheable for the memory
2432 * test.
2433 */
2434 *pte = pa |
2435 kernel_pmap->pmap_bits[PG_V_IDX] |
2436 kernel_pmap->pmap_bits[PG_RW_IDX] |
2437 kernel_pmap->pmap_bits[PG_N_IDX];
2438 cpu_invlpg(__DEVOLATILE(void *, ptr));
2439 cpu_mfence();
2440
2441 /*
2442 * Save original value for restoration later.
2443 */
2444 tmp = *ptr;
2445
2446 /*
2447 * Test for alternating 1's and 0's
2448 */
2449 *ptr = 0xaaaaaaaaaaaaaaaaLLU;
2450 cpu_mfence();
2451 if (*ptr != 0xaaaaaaaaaaaaaaaaLLU)
2452 page_bad = TRUE;
2453 /*
2454 * Test for alternating 0's and 1's
2455 */
2456 *ptr = 0x5555555555555555LLU;
2457 cpu_mfence();
2458 if (*ptr != 0x5555555555555555LLU)
2459 page_bad = TRUE;
2460 /*
2461 * Test for all 1's
2462 */
2463 *ptr = 0xffffffffffffffffLLU;
2464 cpu_mfence();
2465 if (*ptr != 0xffffffffffffffffLLU)
2466 page_bad = TRUE;
2467 /*
2468 * Test for all 0's
2469 */
2470 *ptr = 0x0;
2471 cpu_mfence();
2472 if (*ptr != 0x0)
2473 page_bad = TRUE;
2474
2475 /*
2476 * Restore original value.
2477 */
2478 *ptr = tmp;
2479
2480 /*
2481 * Adjust array of valid/good pages.
2482 */
2483 if (page_bad == TRUE) {
2484 incr = PAGE_SIZE;
2485 continue;
2486 }
2487
2488 /*
2489 * Collapse page address into phys_avail[]. Do a
2490 * continuation of the current phys_avail[] index
2491 * when possible.
2492 */
2493 if (phys_avail[pa_indx].phys_end == pa) {
2494 /*
2495 * Continuation
2496 */
2497 phys_avail[pa_indx].phys_end += incr;
2498 } else if (phys_avail[pa_indx].phys_beg ==
2499 phys_avail[pa_indx].phys_end) {
2500 /*
2501 * Current phys_avail is completely empty,
2502 * reuse the index.
2503 */
2504 phys_avail[pa_indx].phys_beg = pa;
2505 phys_avail[pa_indx].phys_end = pa + incr;
2506 } else {
2507 /*
2508 * Allocate next phys_avail index.
2509 */
2510 ++pa_indx;
2511 if (pa_indx == PHYS_AVAIL_ARRAY_END) {
2512 kprintf(
2513 "Too many holes in the physical address space, giving up\n");
2514 --pa_indx;
2515 full = TRUE;
2516 goto do_dump_avail;
2517 }
2518 phys_avail[pa_indx].phys_beg = pa;
2519 phys_avail[pa_indx].phys_end = pa + incr;
2520 }
2521 physmem += incr / PAGE_SIZE;
2522
2523 /*
2524 * pa available for dumping
2525 */
2526 do_dump_avail:
2527 if (dump_avail[da_indx].phys_end == pa) {
2528 dump_avail[da_indx].phys_end += incr;
2529 } else {
2530 ++da_indx;
2531 if (da_indx == DUMP_AVAIL_ARRAY_END) {
2532 --da_indx;
2533 goto do_next;
2534 }
2535 dump_avail[da_indx].phys_beg = pa;
2536 dump_avail[da_indx].phys_end = pa + incr;
2537 }
2538 do_next:
2539 if (full)
2540 break;
2541 }
2542 }
2543 *pte = 0;
2544 cpu_invltlb();
2545 cpu_mfence();
2546
2547 /*
2548 * The last chunk must contain at least one page plus the message
2549 * buffer to avoid complicating other code (message buffer address
2550 * calculation, etc.).
2551 */
2552 msgbuf_size = (MSGBUF_SIZE + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;
2553
2554 while (phys_avail[pa_indx].phys_beg + PHYSMAP_ALIGN + msgbuf_size >=
2555 phys_avail[pa_indx].phys_end) {
2556 physmem -= atop(phys_avail[pa_indx].phys_end -
2557 phys_avail[pa_indx].phys_beg);
2558 phys_avail[pa_indx].phys_beg = 0;
2559 phys_avail[pa_indx].phys_end = 0;
2560 --pa_indx;
2561 }
2562
2563 Maxmem = atop(phys_avail[pa_indx].phys_end);
2564
2565 /* Trim off space for the message buffer. */
2566 phys_avail[pa_indx].phys_end -= msgbuf_size;
2567
2568 avail_end = phys_avail[pa_indx].phys_end;
2569
2570 /* Map the message buffer. */
2571 for (off = 0; off < msgbuf_size; off += PAGE_SIZE) {
2572 pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);
2573 }
2574
2575 /*
2576 * Try to get EFI framebuffer working as early as possible.
2577 *
	 * WARN: Some BIOSes do not list the EFI framebuffer memory, causing
	 *	 the pmap probe code to create a DMAP that does not cover its
	 *	 physical address space.  In that case efi_fb_init_vaddr(1)
	 *	 might not return an initialized framebuffer base pointer and
	 *	 the later efi_fb_init_vaddr(0) call will deal with it.
2583 */
2584 if (have_efi_framebuffer)
2585 efi_fb_init_vaddr(1);
2586 }
2587
2588 struct machintr_abi MachIntrABI;
2589
2590 /*
2591 * IDT VECTORS:
2592 * 0 Divide by zero
2593 * 1 Debug
2594 * 2 NMI
2595 * 3 BreakPoint
2596 * 4 OverFlow
2597 * 5 Bound-Range
2598 * 6 Invalid OpCode
2599 * 7 Device Not Available (x87)
2600 * 8 Double-Fault
2601 * 9 Coprocessor Segment overrun (unsupported, reserved)
2602 * 10 Invalid-TSS
2603 * 11 Segment not present
2604 * 12 Stack
2605 * 13 General Protection
2606 * 14 Page Fault
2607 * 15 Reserved
2608 * 16 x87 FP Exception pending
2609 * 17 Alignment Check
2610 * 18 Machine Check
2611 * 19 SIMD floating point
2612 * 20-31 reserved
2613 * 32-255 INTn/external sources
2614 */
2615 u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
2617 {
2618 caddr_t kmdp;
2619 int gsel_tss, x, cpu;
2620 #if 0 /* JG */
2621 int metadata_missing, off;
2622 #endif
2623 struct mdglobaldata *gd;
2624 struct privatespace *ps;
2625 u_int64_t msr;
2626
2627 /*
2628 * Prevent lowering of the ipl if we call tsleep() early.
2629 */
2630 gd = &CPU_prvspace[0]->mdglobaldata;
2631 ps = (struct privatespace *)gd;
2632 bzero(gd, sizeof(*gd));
2633 bzero(&ps->common_tss, sizeof(ps->common_tss));
2634
2635 /*
2636 * Note: on both UP and SMP curthread must be set non-NULL
2637 * early in the boot sequence because the system assumes
2638 * that 'curthread' is never NULL.
2639 */
2640
2641 gd->mi.gd_curthread = &thread0;
2642 thread0.td_gd = &gd->mi;
2643
2644 atdevbase = ISA_HOLE_START + PTOV_OFFSET;
2645
2646 #if 0 /* JG */
2647 metadata_missing = 0;
2648 if (bootinfo.bi_modulep) {
2649 preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
2650 preload_bootstrap_relocate(KERNBASE);
2651 } else {
2652 metadata_missing = 1;
2653 }
2654 if (bootinfo.bi_envp)
2655 kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;
2656 #endif
2657
2658 preload_metadata = (caddr_t)(uintptr_t)(modulep + PTOV_OFFSET);
2659 preload_bootstrap_relocate(PTOV_OFFSET);
2660 kmdp = preload_search_by_type("elf kernel");
2661 if (kmdp == NULL)
2662 kmdp = preload_search_by_type("elf64 kernel");
2663 boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
2664 kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + PTOV_OFFSET;
2665 #ifdef DDB
2666 ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
2667 ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
2668 #endif
2669 efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
2670
2671 if (boothowto & RB_VERBOSE)
2672 bootverbose++;
2673
2674 /*
2675 * Default MachIntrABI to ICU
2676 */
2677 MachIntrABI = MachIntrABI_ICU;
2678
2679 /*
	 * Start with one cpu.  Note: with one cpu, ncpus_fit_mask remains 0.
2681 */
2682 ncpus = 1;
2683 ncpus_fit = 1;
2684 /* Init basic tunables, hz etc */
2685 init_param1();
2686
2687 /*
2688 * make gdt memory segments
2689 */
2690 gdt_segs[GPROC0_SEL].ssd_base =
2691 (uintptr_t) &CPU_prvspace[0]->common_tss;
2692
2693 gd->mi.gd_prvspace = CPU_prvspace[0];
2694
2695 for (x = 0; x < NGDT; x++) {
2696 if (x != GPROC0_SEL && x != (GPROC0_SEL + 1))
2697 ssdtosd(&gdt_segs[x], &gdt_cpu0[x]);
2698 }
2699 ssdtosyssd(&gdt_segs[GPROC0_SEL],
2700 (struct system_segment_descriptor *)&gdt_cpu0[GPROC0_SEL]);
2701
2702 /*
2703 * WARNING! Due to an Intel quirk, VMX exits set the gdt[] table
2704 * limit to 0xFFFF. To avoid having to do a heavy-weight
2705 * reload, we just make ours maximally sized.
2706 */
2707 r_gdt.rd_limit = MAXGDT_LIMIT - 1;
2708 r_gdt.rd_base = (long)gdt_cpu0;
2709 lgdt(&r_gdt);
2710
2711 wrmsr(MSR_FSBASE, 0); /* User value */
2712 wrmsr(MSR_GSBASE, (u_int64_t)&gd->mi);
2713 wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */
2714
2715 mi_gdinit(&gd->mi, 0);
2716 cpu_gdinit(gd, 0);
2717 proc0paddr = proc0paddr_buff;
2718 mi_proc0init(&gd->mi, proc0paddr);
2719 safepri = TDPRI_MAX;
2720
2721 /* spinlocks and the BGL */
2722 init_locks();
2723
2724 /* exceptions */
2725 for (x = 0; x < NIDT; x++)
2726 setidt_global(x, rsvdary[x], SDT_SYSIGT, SEL_KPL, 0);
2727 setidt_global(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0);
2728 setidt_global(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 2);
2729 setidt_global(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 1);
2730 setidt_global(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0);
2731 setidt_global(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0);
2732 setidt_global(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0);
2733 setidt_global(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0);
2734 setidt_global(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0);
2735 setidt_global(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
2736 setidt_global(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0);
2737 setidt_global(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0);
2738 setidt_global(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0);
2739 setidt_global(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0);
2740 setidt_global(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0);
2741 setidt_global(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0);
2742 setidt_global(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0);
2743 setidt_global(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
2744 setidt_global(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0);
2745 setidt_global(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);
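
	/*
	 * NOTE: The last argument above selects an IST (Interrupt Stack
	 *	 Table) slot in the TSS: 0 means no stack switch, while
	 *	 NMI/#DF use ist 1 and #DB uses ist 2, matching the
	 *	 tss_ist1/tss_ist2 trampoline stacks set up further below.
	 */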
2746
2747 for (cpu = 0; cpu < MAXCPU; ++cpu) {
2748 r_idt_arr[cpu].rd_limit = sizeof(idt_arr[cpu]) - 1;
2749 r_idt_arr[cpu].rd_base = (long) &idt_arr[cpu][0];
2750 }
2751
2752 lidt(&r_idt_arr[0]);
2753
2754 /*
2755 * Initialize the console before we print anything out.
2756 */
2757 cninit();
2758
2759 #if 0 /* JG */
2760 if (metadata_missing)
2761 kprintf("WARNING: loader(8) metadata is missing!\n");
2762 #endif
2763
#if NISA > 0
2765 elcr_probe();
2766 isa_defaultirq();
2767 #endif
2768 rand_initialize();
2769
2770 /*
2771 * Initialize IRQ mapping
2772 *
2773 * NOTE:
2774 * SHOULD be after elcr_probe()
2775 */
2776 MachIntrABI_ICU.initmap();
2777 MachIntrABI_IOAPIC.initmap();
2778
2779 #ifdef DDB
2780 kdb_init();
2781 if (boothowto & RB_KDB)
2782 Debugger("Boot flags requested debugger");
2783 #endif
2784
2785 identify_cpu(); /* Final stage of CPU initialization */
2786 initializecpu(0); /* Initialize CPU registers */
2787
2788 /*
	 * On modern Intel cpus, Haswell or later, cpu_idle_hlt=1 is better
2790 * because the cpu does significant power management in MWAIT
2791 * (also suggested is to set sysctl machdep.mwait.CX.idle=AUTODEEP).
2792 *
2793 * On many AMD cpus cpu_idle_hlt=3 is better, because the cpu does
2794 * significant power management only when using ACPI halt mode.
2795 * (However, on Ryzen, mode 4 (HLT) also does power management).
2796 *
2797 * On older AMD or Intel cpus, cpu_idle_hlt=2 is better because ACPI
2798 * is needed to reduce power consumption, but wakeup times are often
2799 * too long.
2800 */
2801 if (cpu_vendor_id == CPU_VENDOR_INTEL &&
2802 CPUID_TO_MODEL(cpu_id) >= 0x3C) { /* Haswell or later */
2803 cpu_idle_hlt = 1;
2804 }
2805 if (cpu_vendor_id == CPU_VENDOR_AMD) {
2806 if (CPUID_TO_FAMILY(cpu_id) >= 0x17) {
2807 /* Ryzen or later */
2808 cpu_idle_hlt = 3;
2809 } else if (CPUID_TO_FAMILY(cpu_id) >= 0x14) {
2810 /* Bobcat or later */
2811 cpu_idle_hlt = 3;
2812 }
2813 }
2814
2815 TUNABLE_INT_FETCH("hw.apic_io_enable", &ioapic_enable); /* for compat */
2816 TUNABLE_INT_FETCH("hw.ioapic_enable", &ioapic_enable);
2817 TUNABLE_INT_FETCH("hw.lapic_enable", &lapic_enable);
2818 TUNABLE_INT_FETCH("machdep.cpu_idle_hlt", &cpu_idle_hlt);
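
	/*
	 * For example, the heuristic defaults chosen above can be
	 * overridden from the loader environment, e.g. in loader.conf
	 * (illustrative):
	 *
	 *	machdep.cpu_idle_hlt="2"
	 */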
2819
2820 /*
	 * By default always enable the ioapic.  Certain virtual machines
	 * may not work with the I/O apic enabled and can be listed in
	 * the case statement below.  On the other hand, disabling the
	 * I/O apic for a virtual machine which DOES work with it can
	 * make that virtual machine implode.
2826 *
2827 * For now enable the ioapic for all guests.
2828 *
2829 * NOTE: This must be done after identify_cpu(), which sets
2830 * 'cpu_feature2'.
2831 */
2832 if (ioapic_enable < 0) {
2833 ioapic_enable = 1;
2834 switch(vmm_guest) {
2835 case VMM_GUEST_NONE: /* should be enabled on real HW */
2836 case VMM_GUEST_KVM: /* must be enabled or VM implodes */
2837 ioapic_enable = 1;
2838 break;
2839 default: /* enable by default for other VMs */
2840 ioapic_enable = 1;
2841 break;
2842 }
2843 }
2844
2845 /*
2846 * TSS entry point for interrupts, traps, and exceptions
2847 * (sans NMI). This will always go to near the top of the pcpu
2848 * trampoline area. Hardware-pushed data will be copied into
2849 * the trap-frame on entry, and (if necessary) returned to the
2850 * trampoline on exit.
2851 *
2852 * We store some pcb data for the trampoline code above the
2853 * stack the cpu hw pushes into, and arrange things so the
2854 * address of tr_pcb_rsp is the same as the desired top of
2855 * stack.
2856 */
2857 ps->common_tss.tss_rsp0 = (register_t)&ps->trampoline.tr_pcb_rsp;
2858 ps->trampoline.tr_pcb_rsp = ps->common_tss.tss_rsp0;
2859 ps->trampoline.tr_pcb_gs_kernel = (register_t)gd;
2860 ps->trampoline.tr_pcb_cr3 = KPML4phys; /* adj to user cr3 live */
2861 ps->dbltramp.tr_pcb_gs_kernel = (register_t)gd;
2862 ps->dbltramp.tr_pcb_cr3 = KPML4phys;
2863 ps->dbgtramp.tr_pcb_gs_kernel = (register_t)gd;
2864 ps->dbgtramp.tr_pcb_cr3 = KPML4phys;
2865
2866 /* double fault stack */
2867 ps->common_tss.tss_ist1 = (register_t)&ps->dbltramp.tr_pcb_rsp;
2868 /* #DB debugger needs its own stack */
2869 ps->common_tss.tss_ist2 = (register_t)&ps->dbgtramp.tr_pcb_rsp;
2870
2871 /* Set the IO permission bitmap (empty due to tss seg limit) */
2872 ps->common_tss.tss_iobase = sizeof(struct x86_64tss);
2873
2874 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
2875 gd->gd_gdt = &gdt_cpu0[0];
2876 gd->gd_tss_gdt = &gd->gd_gdt[GPROC0_SEL];
2877 gd->gd_common_tssd = *gd->gd_tss_gdt;
2878 ltr(gsel_tss);
2879
2880 /* Set up the fast syscall stuff */
2881 msr = rdmsr(MSR_EFER) | EFER_SCE;
2882 wrmsr(MSR_EFER, msr);
2883 wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
2884 wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
2885 msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
2886 ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
2887 wrmsr(MSR_STAR, msr);
2888 wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_IOPL|PSL_AC);
2889
2890 getmemsize(kmdp, physfree);
2891 init_param2(physmem);
2892
	/* now running on new page tables, configured, and u/iom is accessible */
2894
2895 /* Map the message buffer. */
2896 #if 0 /* JG */
2897 for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
2898 pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);
2899 #endif
2900
2901 msgbufinit(msgbufp, MSGBUF_SIZE);
2902
2903
2904 /* transfer to user mode */
2905
2906 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
2907 _udatasel = GSEL(GUDATA_SEL, SEL_UPL);
2908 _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
2909
2910 load_ds(_udatasel);
2911 load_es(_udatasel);
2912 load_fs(_udatasel);
2913
2914 /* setup proc 0's pcb */
2915 thread0.td_pcb->pcb_flags = 0;
2916 thread0.td_pcb->pcb_cr3 = KPML4phys;
2917 thread0.td_pcb->pcb_cr3_iso = 0;
2918 thread0.td_pcb->pcb_ext = NULL;
2919 lwp0.lwp_md.md_regs = &proc0_tf; /* XXX needed? */
2920
2921 /* Location of kernel stack for locore */
2922 return ((u_int64_t)thread0.td_pcb);
2923 }
2924
2925 /*
 * Initialize machine-dependent portions of the global data structure.
2927 * Note that the global data area and cpu0's idlestack in the private
2928 * data space were allocated in locore.
2929 *
2930 * Note: the idlethread's cpl is 0
2931 *
2932 * WARNING! Called from early boot, 'mycpu' may not work yet.
2933 */
2934 void
cpu_gdinit(struct mdglobaldata *gd, int cpu)
2936 {
2937 if (cpu)
2938 gd->mi.gd_curthread = &gd->mi.gd_idlethread;
2939
2940 lwkt_init_thread(&gd->mi.gd_idlethread,
2941 gd->mi.gd_prvspace->idlestack,
2942 sizeof(gd->mi.gd_prvspace->idlestack),
2943 0, &gd->mi);
2944 lwkt_set_comm(&gd->mi.gd_idlethread, "idle_%d", cpu);
2945 gd->mi.gd_idlethread.td_switch = cpu_lwkt_switch;
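	/*
	 * Push a fake return address so the idle thread's first switch
	 * "returns" into cpu_idle_restore.
	 */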
2946 gd->mi.gd_idlethread.td_sp -= sizeof(void *);
2947 *(void **)gd->mi.gd_idlethread.td_sp = cpu_idle_restore;
2948 }
2949
2950 /*
2951 * We only have to check for DMAP bounds, the globaldata space is
2952 * actually part of the kernel_map so we don't have to waste time
2953 * checking CPU_prvspace[*].
2954 */
2955 int
is_globaldata_space(vm_offset_t saddr, vm_offset_t eaddr)
2957 {
2958 #if 0
2959 if (saddr >= (vm_offset_t)&CPU_prvspace[0] &&
2960 eaddr <= (vm_offset_t)&CPU_prvspace[MAXCPU]) {
2961 return (TRUE);
2962 }
2963 #endif
2964 if (saddr >= DMAP_MIN_ADDRESS && eaddr <= DMAP_MAX_ADDRESS)
2965 return (TRUE);
2966 return (FALSE);
2967 }
2968
2969 struct globaldata *
globaldata_find(int cpu)
2971 {
2972 KKASSERT(cpu >= 0 && cpu < ncpus);
2973 return(&CPU_prvspace[cpu]->mdglobaldata.mi);
2974 }
2975
2976 /*
2977 * This path should be safe from the SYSRET issue because only stopped threads
2978 * can have their %rip adjusted this way (and all heavy weight thread switches
2979 * clear QUICKREF and thus do not use SYSRET). However, the code path is
 * convoluted, so add a safety check by forcing %rip to be canonical.
2981 */
2982 int
ptrace_set_pc(struct lwp *lp, unsigned long addr)
2984 {
2985 if (addr & 0x0000800000000000LLU)
2986 lp->lwp_md.md_regs->tf_rip = addr | 0xFFFF000000000000LLU;
2987 else
2988 lp->lwp_md.md_regs->tf_rip = addr & 0x0000FFFFFFFFFFFFLLU;
2989 return (0);
2990 }
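
/*
 * Example: an addr of 0x0000900000001000 has bit 47 set and is
 * sign-extended to the canonical 0xffff900000001000, while
 * 0x00007fffffff0000 already lies in the canonical low half and is
 * left (masked) as-is.
 */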
2991
2992 int
ptrace_single_step(struct lwp *lp)
2994 {
2995 lp->lwp_md.md_regs->tf_rflags |= PSL_T;
2996 return (0);
2997 }
2998
2999 int
fill_regs(struct lwp *lp, struct reg *regs)
3001 {
3002 struct trapframe *tp;
3003
3004 if ((tp = lp->lwp_md.md_regs) == NULL)
3005 return EINVAL;
	bcopy(&tp->tf_rdi, &regs->r_rdi, sizeof(*regs));
3007 return (0);
3008 }
3009
3010 int
set_regs(struct lwp *lp, struct reg *regs)
3012 {
3013 struct trapframe *tp;
3014
3015 tp = lp->lwp_md.md_regs;
3016 if (!EFL_SECURE(regs->r_rflags, tp->tf_rflags) ||
3017 !CS_SECURE(regs->r_cs))
3018 return (EINVAL);
	bcopy(&regs->r_rdi, &tp->tf_rdi, sizeof(*regs));
3020 clear_quickret();
3021 return (0);
3022 }
3023
3024 static void
fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)
3026 {
3027 struct env87 *penv_87 = &sv_87->sv_env;
3028 struct envxmm *penv_xmm = &sv_xmm->sv_env;
3029 int i;
3030
3031 /* FPU control/status */
3032 penv_87->en_cw = penv_xmm->en_cw;
3033 penv_87->en_sw = penv_xmm->en_sw;
3034 penv_87->en_tw = penv_xmm->en_tw;
3035 penv_87->en_fip = penv_xmm->en_fip;
3036 penv_87->en_fcs = penv_xmm->en_fcs;
3037 penv_87->en_opcode = penv_xmm->en_opcode;
3038 penv_87->en_foo = penv_xmm->en_foo;
3039 penv_87->en_fos = penv_xmm->en_fos;
3040
3041 /* FPU registers */
3042 for (i = 0; i < 8; ++i)
3043 sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
3044 }
3045
3046 static void
set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm)
3048 {
3049 struct env87 *penv_87 = &sv_87->sv_env;
3050 struct envxmm *penv_xmm = &sv_xmm->sv_env;
3051 int i;
3052
3053 /* FPU control/status */
3054 penv_xmm->en_cw = penv_87->en_cw;
3055 penv_xmm->en_sw = penv_87->en_sw;
3056 penv_xmm->en_tw = penv_87->en_tw;
3057 penv_xmm->en_fip = penv_87->en_fip;
3058 penv_xmm->en_fcs = penv_87->en_fcs;
3059 penv_xmm->en_opcode = penv_87->en_opcode;
3060 penv_xmm->en_foo = penv_87->en_foo;
3061 penv_xmm->en_fos = penv_87->en_fos;
3062
3063 /* FPU registers */
3064 for (i = 0; i < 8; ++i)
3065 sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
3066 }
3067
3068 int
fill_fpregs(struct lwp *lp, struct fpreg *fpregs)
3070 {
3071 if (lp->lwp_thread == NULL || lp->lwp_thread->td_pcb == NULL)
3072 return EINVAL;
3073 if (cpu_fxsr) {
3074 fill_fpregs_xmm(&lp->lwp_thread->td_pcb->pcb_save.sv_xmm,
3075 (struct save87 *)fpregs);
3076 return (0);
3077 }
3078 bcopy(&lp->lwp_thread->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs);
3079 return (0);
3080 }
3081
3082 int
set_fpregs(struct lwp *lp, struct fpreg *fpregs)
3084 {
3085 if (cpu_fxsr) {
3086 set_fpregs_xmm((struct save87 *)fpregs,
3087 &lp->lwp_thread->td_pcb->pcb_save.sv_xmm);
3088 return (0);
3089 }
3090 bcopy(fpregs, &lp->lwp_thread->td_pcb->pcb_save.sv_87, sizeof *fpregs);
3091 return (0);
3092 }
3093
3094 int
fill_dbregs(struct lwp *lp, struct dbreg *dbregs)
3096 {
3097 struct pcb *pcb;
3098
3099 if (lp == NULL) {
3100 dbregs->dr[0] = rdr0();
3101 dbregs->dr[1] = rdr1();
3102 dbregs->dr[2] = rdr2();
3103 dbregs->dr[3] = rdr3();
3104 dbregs->dr[4] = rdr4();
3105 dbregs->dr[5] = rdr5();
3106 dbregs->dr[6] = rdr6();
3107 dbregs->dr[7] = rdr7();
3108 return (0);
3109 }
3110 if (lp->lwp_thread == NULL || (pcb = lp->lwp_thread->td_pcb) == NULL)
3111 return EINVAL;
3112 dbregs->dr[0] = pcb->pcb_dr0;
3113 dbregs->dr[1] = pcb->pcb_dr1;
3114 dbregs->dr[2] = pcb->pcb_dr2;
3115 dbregs->dr[3] = pcb->pcb_dr3;
3116 dbregs->dr[4] = 0;
3117 dbregs->dr[5] = 0;
3118 dbregs->dr[6] = pcb->pcb_dr6;
3119 dbregs->dr[7] = pcb->pcb_dr7;
3120 return (0);
3121 }
3122
3123 int
set_dbregs(struct lwp *lp, struct dbreg *dbregs)
3125 {
3126 if (lp == NULL) {
3127 load_dr0(dbregs->dr[0]);
3128 load_dr1(dbregs->dr[1]);
3129 load_dr2(dbregs->dr[2]);
3130 load_dr3(dbregs->dr[3]);
3131 load_dr4(dbregs->dr[4]);
3132 load_dr5(dbregs->dr[5]);
3133 load_dr6(dbregs->dr[6]);
3134 load_dr7(dbregs->dr[7]);
3135 } else {
3136 struct pcb *pcb;
3137 struct ucred *ucred;
3138 int i;
3139 uint64_t mask1, mask2;
3140
		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * results in undefined behaviour and can lead to an
		 * unexpected TRCTRAP.
		 */
		/* JG this loop looks unreadable */
		/*
		 * Check the 4 2-bit fields for invalid patterns.  These
		 * fields are R/Wi, for i = 0..3.
		 */
		/* Is 10 in LENi allowed when running in compatibility mode? */
		/*
		 * Pattern 10 in R/Wi might be used to indicate a breakpoint
		 * on I/O.  Further analysis should be carried out to decide
		 * whether it is safe and useful to provide access to that
		 * capability.
		 */
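		/*
		 * Sketch of the loop below: for i = 0..3, mask1 selects
		 * the 2-bit R/Wi field at bits 16+4*i and mask2 is the
		 * undefined pattern 10 within it; e.g. i = 0 rejects a
		 * dr7 with bits 17:16 == 10.
		 */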
3157 for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 4;
3158 i++, mask1 <<= 4, mask2 <<= 4)
3159 if ((dbregs->dr[7] & mask1) == mask2)
3160 return (EINVAL);
3161
3162 pcb = lp->lwp_thread->td_pcb;
3163 ucred = lp->lwp_proc->p_ucred;
3164
3165 /*
3166 * Don't let a process set a breakpoint that is not within the
3167 * process's address space. If a process could do this, it
3168 * could halt the system by setting a breakpoint in the kernel
3169 * (if ddb was enabled). Thus, we need to check to make sure
3170 * that no breakpoints are being enabled for addresses outside
3171 * process's address space, unless, perhaps, we were called by
3172 * uid 0.
3173 *
3174 * XXX - what about when the watched area of the user's
3175 * address space is written into from within the kernel
3176 * ... wouldn't that still cause a breakpoint to be generated
3177 * from within kernel mode?
3178 */
3179
3180 if (caps_priv_check(ucred, SYSCAP_RESTRICTEDROOT) != 0) {
3181 if (dbregs->dr[7] & 0x3) {
3182 /* dr0 is enabled */
3183 if (dbregs->dr[0] >= VM_MAX_USER_ADDRESS)
3184 return (EINVAL);
3185 }
3186
3187 if (dbregs->dr[7] & (0x3<<2)) {
3188 /* dr1 is enabled */
3189 if (dbregs->dr[1] >= VM_MAX_USER_ADDRESS)
3190 return (EINVAL);
3191 }
3192
3193 if (dbregs->dr[7] & (0x3<<4)) {
3194 /* dr2 is enabled */
3195 if (dbregs->dr[2] >= VM_MAX_USER_ADDRESS)
3196 return (EINVAL);
3197 }
3198
3199 if (dbregs->dr[7] & (0x3<<6)) {
3200 /* dr3 is enabled */
3201 if (dbregs->dr[3] >= VM_MAX_USER_ADDRESS)
3202 return (EINVAL);
3203 }
3204 }
3205
3206 pcb->pcb_dr0 = dbregs->dr[0];
3207 pcb->pcb_dr1 = dbregs->dr[1];
3208 pcb->pcb_dr2 = dbregs->dr[2];
3209 pcb->pcb_dr3 = dbregs->dr[3];
3210 pcb->pcb_dr6 = dbregs->dr[6];
3211 pcb->pcb_dr7 = dbregs->dr[7];
3212
3213 pcb->pcb_flags |= PCB_DBREGS;
3214 }
3215
3216 return (0);
3217 }
3218
3219 /*
 * Return > 0 if a hardware breakpoint has been hit and the breakpoint
 * was in user space.  Return 0 otherwise.
3222 */
3223 int
user_dbreg_trap(void)
3225 {
3226 u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */
3227 u_int64_t bp; /* breakpoint bits extracted from dr6 */
3228 int nbp; /* number of breakpoints that triggered */
3229 caddr_t addr[4]; /* breakpoint addresses */
3230 int i;
3231
3232 dr7 = rdr7();
3233 if ((dr7 & 0xff) == 0) {
3234 /*
		 * all enable bits (L0-L3/G0-G3) in the dr7 register are
		 * zero, thus the trap couldn't have been caused by the
		 * hardware debug registers
3238 */
3239 return 0;
3240 }
3241
3242 nbp = 0;
3243 dr6 = rdr6();
3244 bp = dr6 & 0xf;
3245
3246 if (bp == 0) {
3247 /*
		 * None of the breakpoint bits are set, meaning this
		 * trap was not caused by any of the debug registers
3250 */
3251 return 0;
3252 }
3253
3254 /*
	 * At least one of the breakpoints was hit; check to see
	 * which ones and whether any of them are user space addresses
3257 */
3258
3259 if (bp & 0x01) {
3260 addr[nbp++] = (caddr_t)rdr0();
3261 }
3262 if (bp & 0x02) {
3263 addr[nbp++] = (caddr_t)rdr1();
3264 }
3265 if (bp & 0x04) {
3266 addr[nbp++] = (caddr_t)rdr2();
3267 }
3268 if (bp & 0x08) {
3269 addr[nbp++] = (caddr_t)rdr3();
3270 }
3271
3272 for (i = 0; i < nbp; i++) {
3273 if (addr[i] < (caddr_t)VM_MAX_USER_ADDRESS) {
3274 /*
3275 * addr[i] is in user space
3276 */
3277 return nbp;
3278 }
3279 }
3280
3281 /*
3282 * None of the breakpoints are in user space.
3283 */
3284 return 0;
3285 }
3286
3287
3288 #ifndef DDB
3289 void
Debugger(const char *msg)
3291 {
3292 kprintf("Debugger(\"%s\") called.\n", msg);
3293 }
3294 #endif /* no DDB */
3295
3296 #ifdef DDB
3297
3298 /*
3299 * Provide inb() and outb() as functions. They are normally only
3300 * available as macros calling inlined functions, thus cannot be
3301 * called inside DDB.
3302 *
3303 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
3304 */
3305
3306 #undef inb
3307 #undef outb
3308
3309 /* silence compiler warnings */
3310 u_char inb(u_int);
3311 void outb(u_int, u_char);
3312
3313 u_char
inb(u_int port)
3315 {
3316 u_char data;
3317 /*
3318 * We use %%dx and not %1 here because i/o is done at %dx and not at
3319 * %edx, while gcc generates inferior code (movw instead of movl)
3320 * if we tell it to load (u_short) port.
3321 */
3322 __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port));
3323 return (data);
3324 }
3325
3326 void
outb(u_int port, u_char data)
3328 {
3329 u_char al;
3330 /*
3331 * Use an unnecessary assignment to help gcc's register allocator.
	 * This makes a large difference for gcc-1.40 and a tiny difference
3333 * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for
3334 * best results. gcc-2.6.0 can't handle this.
3335 */
3336 al = data;
3337 __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port));
3338 }
3339
3340 #endif /* DDB */
3341
3342
3343
3344 /*
3345 * initialize all the SMP locks
3346 */
3347
/* critical region when masking or unmasking interrupts */
3349 struct spinlock_deprecated imen_spinlock;
3350
3351 /* locks com (tty) data/hardware accesses: a FASTINTR() */
3352 struct spinlock_deprecated com_spinlock;
3353
3354 /* lock regions around the clock hardware */
3355 struct spinlock_deprecated clock_spinlock;
3356
3357 static void
init_locks(void)
3359 {
3360 /*
3361 * Get the initial mplock with a count of 1 for the BSP.
3362 * This uses a LOGICAL cpu ID, ie BSP == 0.
3363 */
3364 cpu_get_initial_mplock();
3365 /* DEPRECATED */
3366 spin_init_deprecated(&imen_spinlock);
3367 spin_init_deprecated(&com_spinlock);
3368 spin_init_deprecated(&clock_spinlock);
3369
3370 /* our token pool needs to work early */
3371 lwkt_token_pool_init();
3372 }
3373
3374 boolean_t
cpu_mwait_hint_valid(uint32_t hint)
3376 {
3377 int cx_idx, sub;
3378
3379 cx_idx = MWAIT_EAX_TO_CX(hint);
3380 if (cx_idx >= CPU_MWAIT_CX_MAX)
3381 return FALSE;
3382
3383 sub = MWAIT_EAX_TO_CX_SUB(hint);
3384 if (sub >= cpu_mwait_cx_info[cx_idx].subcnt)
3385 return FALSE;
3386
3387 return TRUE;
3388 }
3389
3390 void
cpu_mwait_cx_no_bmsts(void)
3392 {
3393 atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_STS);
3394 }
3395
3396 void
cpu_mwait_cx_no_bmarb(void)
3398 {
3399 atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_ARB);
3400 }
3401
3402 static int
cpu_mwait_cx_hint2name(int hint, char *name, int namelen, boolean_t allow_auto)
3404 {
3405 int old_cx_idx, sub = 0;
3406
3407 if (hint >= 0) {
3408 old_cx_idx = MWAIT_EAX_TO_CX(hint);
3409 sub = MWAIT_EAX_TO_CX_SUB(hint);
3410 } else if (hint == CPU_MWAIT_HINT_AUTO) {
3411 old_cx_idx = allow_auto ? CPU_MWAIT_C2 : CPU_MWAIT_CX_MAX;
3412 } else if (hint == CPU_MWAIT_HINT_AUTODEEP) {
3413 old_cx_idx = allow_auto ? CPU_MWAIT_C3 : CPU_MWAIT_CX_MAX;
3414 } else {
3415 old_cx_idx = CPU_MWAIT_CX_MAX;
3416 }
3417
3418 if (!CPU_MWAIT_HAS_CX)
3419 strlcpy(name, "NONE", namelen);
3420 else if (allow_auto && hint == CPU_MWAIT_HINT_AUTO)
3421 strlcpy(name, "AUTO", namelen);
3422 else if (allow_auto && hint == CPU_MWAIT_HINT_AUTODEEP)
3423 strlcpy(name, "AUTODEEP", namelen);
3424 else if (old_cx_idx >= CPU_MWAIT_CX_MAX ||
3425 sub >= cpu_mwait_cx_info[old_cx_idx].subcnt)
3426 strlcpy(name, "INVALID", namelen);
3427 else
3428 ksnprintf(name, namelen, "C%d/%d", old_cx_idx, sub);
3429
3430 return old_cx_idx;
3431 }
3432
3433 static int
cpu_mwait_cx_name2hint(char *name, int *hint0, boolean_t allow_auto)
3435 {
3436 int cx_idx, sub, hint;
3437 char *ptr, *start;
3438
3439 if (allow_auto && strcmp(name, "AUTO") == 0) {
3440 hint = CPU_MWAIT_HINT_AUTO;
3441 cx_idx = CPU_MWAIT_C2;
3442 goto done;
3443 }
3444 if (allow_auto && strcmp(name, "AUTODEEP") == 0) {
3445 hint = CPU_MWAIT_HINT_AUTODEEP;
3446 cx_idx = CPU_MWAIT_C3;
3447 goto done;
3448 }
3449
3450 if (strlen(name) < 4 || toupper(name[0]) != 'C')
3451 return -1;
3452 start = &name[1];
3453 ptr = NULL;
3454
3455 cx_idx = strtol(start, &ptr, 10);
3456 if (ptr == start || *ptr != '/')
3457 return -1;
3458 if (cx_idx < 0 || cx_idx >= CPU_MWAIT_CX_MAX)
3459 return -1;
3460
3461 start = ptr + 1;
3462 ptr = NULL;
3463
3464 sub = strtol(start, &ptr, 10);
3465 if (*ptr != '\0')
3466 return -1;
3467 if (sub < 0 || sub >= cpu_mwait_cx_info[cx_idx].subcnt)
3468 return -1;
3469
3470 hint = MWAIT_EAX_HINT(cx_idx, sub);
3471 done:
3472 *hint0 = hint;
3473 return cx_idx;
3474 }
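
/*
 * Example: the string "C2/1" parses to cx_idx 2 / sub-state 1, i.e.
 * hint = MWAIT_EAX_HINT(2, 1), while "AUTO" and "AUTODEEP" (when
 * allow_auto is set) map back to the special CPU_MWAIT_HINT_AUTO*
 * values handled by the idle loop itself.
 */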
3475
3476 static int
cpu_mwait_cx_transit(int old_cx_idx, int cx_idx)
3478 {
3479 if (cx_idx >= CPU_MWAIT_C3 && cpu_mwait_c3_preamble)
3480 return EOPNOTSUPP;
3481 if (old_cx_idx < CPU_MWAIT_C3 && cx_idx >= CPU_MWAIT_C3) {
3482 int error;
3483
3484 error = cputimer_intr_powersave_addreq();
3485 if (error)
3486 return error;
3487 } else if (old_cx_idx >= CPU_MWAIT_C3 && cx_idx < CPU_MWAIT_C3) {
3488 cputimer_intr_powersave_remreq();
3489 }
3490 return 0;
3491 }
3492
3493 static int
cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS, int *hint0,
    boolean_t allow_auto)
3496 {
3497 int error, cx_idx, old_cx_idx, hint;
3498 char name[CPU_MWAIT_CX_NAMELEN];
3499
3500 hint = *hint0;
3501 old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name),
3502 allow_auto);
3503
3504 error = sysctl_handle_string(oidp, name, sizeof(name), req);
3505 if (error != 0 || req->newptr == NULL)
3506 return error;
3507
3508 if (!CPU_MWAIT_HAS_CX)
3509 return EOPNOTSUPP;
3510
3511 cx_idx = cpu_mwait_cx_name2hint(name, &hint, allow_auto);
3512 if (cx_idx < 0)
3513 return EINVAL;
3514
3515 error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
3516 if (error)
3517 return error;
3518
3519 *hint0 = hint;
3520 return 0;
3521 }
3522
3523 static int
cpu_mwait_cx_setname(struct cpu_idle_stat *stat, const char *cx_name)
3525 {
3526 int error, cx_idx, old_cx_idx, hint;
3527 char name[CPU_MWAIT_CX_NAMELEN];
3528
3529 KASSERT(CPU_MWAIT_HAS_CX, ("cpu does not support mwait CX extension"));
3530
3531 hint = stat->hint;
3532 old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);
3533
3534 strlcpy(name, cx_name, sizeof(name));
3535 cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
3536 if (cx_idx < 0)
3537 return EINVAL;
3538
3539 error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
3540 if (error)
3541 return error;
3542
3543 stat->hint = hint;
3544 return 0;
3545 }
3546
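#if 0
/*
 * Illustrative sketch only: roughly how a string handler such as
 * cpu_mwait_cx_idle_sysctl gets attached to the sysctl tree.  The real
 * SYSCTL_PROC declarations (and the exact OID path) live elsewhere in
 * this file.
 */
SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, idle,
    CTLTYPE_STRING | CTLFLAG_RW, NULL, 0,
    cpu_mwait_cx_idle_sysctl, "A", "global idle cpu CX state");
#endif

/*
 * Sysctl handler for the machine-global idle CX level.  A successful
 * write updates every cpu's per-cpu configuration and then the global
 * default.  From userland this would be driven by something like
 * "sysctl machdep.mwait.CX.idle=AUTODEEP" (exact OID path per the
 * SYSCTL_PROC declaration noted above).
 */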
static int
cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS)
{
	int hint = cpu_mwait_halt_global;
	int error, cx_idx, cpu;
	char name[CPU_MWAIT_CX_NAMELEN], cx_name[CPU_MWAIT_CX_NAMELEN];

	cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);

	error = sysctl_handle_string(oidp, name, sizeof(name), req);
	if (error != 0 || req->newptr == NULL)
		return error;

	if (!CPU_MWAIT_HAS_CX)
		return EOPNOTSUPP;

	/* Save name for later per-cpu CX configuration */
	strlcpy(cx_name, name, sizeof(cx_name));

	cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
	if (cx_idx < 0)
		return EINVAL;

	/* Change per-cpu CX configuration */
	for (cpu = 0; cpu < ncpus; ++cpu) {
		error = cpu_mwait_cx_setname(&cpu_idle_stats[cpu], cx_name);
		if (error)
			return error;
	}

	cpu_mwait_halt_global = hint;
	return 0;
}

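/*
 * Sysctl handler for a single cpu's idle CX level; arg1 points at that
 * cpu's cpu_idle_stat.  AUTO and AUTODEEP are accepted.
 */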
static int
cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
{
	struct cpu_idle_stat *stat = arg1;
	int error;

	error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
	    &stat->hint, TRUE);
	return error;
}

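/*
 * Sysctl handler for the CX level used when spin-waiting.  The AUTO and
 * AUTODEEP pseudo-levels are not accepted here.
 */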
static int
cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
	    &cpu_mwait_spin, FALSE);
	return error;
}

/*
 * This manual debugging code is called unconditionally from Xtimer
 * (the per-cpu timer interrupt), whether or not the current thread
 * is in a critical section, and can be useful in tracking down lockups.
 *
 * When enabled via flame_poll_debug it prints, at most once per second
 * per cpu, the current RIP followed by a short frame-pointer backtrace.
 *
 * NOTE: MANUAL DEBUG CODE
 */
#if 0
static int saveticks[SMP_MAXCPU];
static int savecounts[SMP_MAXCPU];
#endif
static tsc_uclock_t last_tsc[SMP_MAXCPU];

void
pcpu_timer_always(struct intrframe *frame)
{
	globaldata_t gd;
	thread_t td;
	char *top;
	char *bot;
	char *rbp;
	char *rip;
	int n;
	tsc_uclock_t tsc;

	if (flame_poll_debug == 0)
		return;

	/* Rate-limit output to roughly once per second per cpu via TSC */
	gd = mycpu;
	tsc = rdtsc() - last_tsc[gd->gd_cpuid];
	if (tsc_frequency == 0 || tsc < tsc_frequency)
		return;
	last_tsc[gd->gd_cpuid] = rdtsc();

	td = gd->gd_curthread;
	if (td == NULL)
		return;
	bot = (char *)td->td_kstack + PAGE_SIZE;	/* skip guard */
	top = (char *)td->td_kstack + td->td_kstack_size;
	if (bot >= top)
		return;

	rip = (char *)(intptr_t)frame->if_rip;
	kprintf("POLL%02d %016lx", gd->gd_cpuid, (intptr_t)rip);
	rbp = (char *)(intptr_t)frame->if_rbp;

	/*
	 * Walk the frame-pointer chain while it stays within the kernel
	 * stack; each frame's return address sits at (rbp + 8).
	 */
	for (n = 1; n < 8; ++n) {
		if (rbp < bot || rbp > top - 8 || ((intptr_t)rbp & 7))
			break;
		kprintf("<-%016lx", (intptr_t)*(char **)(rbp + 8));
		if (*(char **)rbp <= rbp)
			break;
		rbp = *(char **)rbp;
	}
	kprintf("\n");
	cpu_sfence();
}

SET_DECLARE(smap_open, char);
SET_DECLARE(smap_close, char);

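/*
 * Activate SMAP by patching code in place.  Each smap_open/smap_close
 * linker-set entry points at a 3-byte NOP; when the cpu supports SMAP
 * these are rewritten to STAC (0F 01 CB) and CLAC (0F 01 CA)
 * respectively, bracketing the regions that may touch user memory.
 */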
static void
cpu_implement_smap(void)
{
	char **scan;

	for (scan = SET_BEGIN(smap_open);		/* nop -> stac */
	     scan < SET_LIMIT(smap_open); ++scan) {
		(*scan)[0] = 0x0F;
		(*scan)[1] = 0x01;
		(*scan)[2] = 0xCB;
	}
	for (scan = SET_BEGIN(smap_close);		/* nop -> clac */
	     scan < SET_LIMIT(smap_close); ++scan) {
		(*scan)[0] = 0x0F;
		(*scan)[1] = 0x01;
		(*scan)[2] = 0xCA;
	}
}

/*
 * Called from a hard interrupt.  Returns non-zero if td is an interrupt
 * thread or if any hard interrupts are still pending delivery on this
 * cpu.
 */
int
cpu_interrupt_running(struct thread *td)
{
	struct mdglobaldata *gd = mdcpu;

	if (clock_debug1 > 0) {
		--clock_debug1;
		kprintf("%d %016lx %016lx %016lx\n",
		    ((td->td_flags & TDF_INTTHREAD) != 0),
		    gd->gd_ipending[0],
		    gd->gd_ipending[1],
		    gd->gd_ipending[2]);
		if (td->td_flags & TDF_CLKTHREAD) {
			kprintf("CLKTD %s PREEMPT %s\n",
			    td->td_comm,
			    (td->td_preempted ?
			     td->td_preempted->td_comm : ""));
		} else {
			kprintf("NORTD %s\n", td->td_comm);
		}
	}
	if ((td->td_flags & TDF_INTTHREAD) ||
	    gd->gd_ipending[0] ||
	    gd->gd_ipending[1] ||
	    gd->gd_ipending[2]) {
		return 1;
	} else {
		return 0;
	}
}
