/*	$NetBSD: x86_machdep.c,v 1.44 2010/10/21 11:17:54 yamt Exp $	*/

/*-
 * Copyright (c) 2002, 2006, 2007 YAMAMOTO Takashi,
 * Copyright (c) 2005, 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Julio M. Merino Vidal.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: x86_machdep.c,v 1.44 2010/10/21 11:17:54 yamt Exp $");

#include "opt_modular.h"
#include "opt_physmem.h"

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kcore.h>
#include <sys/errno.h>
#include <sys/kauth.h>
#include <sys/mutex.h>
#include <sys/cpu.h>
#include <sys/intr.h>
#include <sys/atomic.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/extent.h>

#include <x86/cpuvar.h>
#include <x86/cputypes.h>
#include <x86/machdep.h>
#include <x86/nmi.h>
#include <x86/pio.h>

#include <dev/isa/isareg.h>
#include <dev/ic/i8042reg.h>

#include <machine/bootinfo.h>
#include <machine/vmparam.h>

#include <uvm/uvm_extern.h>

void (*x86_cpu_idle)(void);
static bool x86_cpu_idle_ipi;
static char x86_cpu_idle_text[16];

int check_pa_acc(paddr_t, vm_prot_t);

/* --------------------------------------------------------------------- */

/*
 * Main bootinfo structure.  This is filled in by the bootstrap process
 * done in locore.S based on the information passed by the boot loader.
 */
struct bootinfo bootinfo;

/* --------------------------------------------------------------------- */

static kauth_listener_t x86_listener;

/*
 * Given the type of a bootinfo entry, looks for a matching item inside
 * the bootinfo structure.  If found, returns a pointer to it (which must
 * then be cast to the appropriate bootinfo_* type); otherwise, returns
 * NULL.
 */
void *
lookup_bootinfo(int type)
{
	bool found;
	int i;
	struct btinfo_common *bic;

	bic = (struct btinfo_common *)(bootinfo.bi_data);
	found = FALSE;
	for (i = 0; i < bootinfo.bi_nentries && !found; i++) {
		if (bic->type == type)
			found = TRUE;
		else
			bic = (struct btinfo_common *)
			    ((uint8_t *)bic + bic->len);
	}

	return found ? bic : NULL;
}
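
/*
 * Illustrative sketch only (editorial addition, not part of the original
 * file): a typical caller fetches an entry by type and casts the result,
 * for example
 *
 *	struct btinfo_memmap *bim;
 *
 *	bim = lookup_bootinfo(BTINFO_MEMMAP);
 *	if (bim != NULL)
 *		(void)initx86_parse_memmap(bim, iomem_ex);
 *
 * where iomem_ex is assumed to be the I/O memory extent map built by the
 * port-specific machdep code.
 */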

/*
 * check_pa_acc: check if given pa is accessible.
 */
int
check_pa_acc(paddr_t pa, vm_prot_t prot)
{
	extern phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX];
	extern int mem_cluster_cnt;
	int i;

	for (i = 0; i < mem_cluster_cnt; i++) {
		const phys_ram_seg_t *seg = &mem_clusters[i];
		paddr_t lstart = seg->start;

		if (lstart <= pa && pa - lstart <= seg->size) {
			return 0;
		}
	}

	return kauth_authorize_machdep(kauth_cred_get(),
	    KAUTH_MACHDEP_UNMANAGEDMEM, NULL, NULL, NULL, NULL);
}
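
/*
 * Illustrative sketch only (an assumption about typical callers, such as
 * /dev/mem style drivers, not something stated in this file): before
 * mapping an arbitrary physical address on behalf of a user request, a
 * caller would do
 *
 *	error = check_pa_acc(pa, VM_PROT_READ | VM_PROT_WRITE);
 *	if (error != 0)
 *		return error;
 *
 * A pa inside a managed RAM cluster is always allowed; anything else is
 * deferred to the kauth(9) machdep scope.
 */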

#ifdef MODULAR
/*
 * Push any modules loaded by the boot loader.
 */
void
module_init_md(void)
{
	struct btinfo_modulelist *biml;
	struct bi_modulelist_entry *bi, *bimax;

	biml = lookup_bootinfo(BTINFO_MODULELIST);
	if (biml == NULL) {
		aprint_debug("No module info at boot\n");
		return;
	}

	bi = (struct bi_modulelist_entry *)((uint8_t *)biml + sizeof(*biml));
	bimax = bi + biml->num;
	for (; bi < bimax; bi++) {
		if (bi->type != BI_MODULE_ELF) {
			aprint_debug("Skipping non-ELF module\n");
			continue;
		}
		aprint_debug("Prep module path=%s len=%d pa=%x\n", bi->path,
		    bi->len, bi->base);
		KASSERT(trunc_page(bi->base) == bi->base);
		(void)module_prime((void *)((uintptr_t)bi->base + KERNBASE),
		    bi->len);
	}
}
#endif	/* MODULAR */

void
cpu_need_resched(struct cpu_info *ci, int flags)
{
	struct cpu_info *cur;
	lwp_t *l;

	KASSERT(kpreempt_disabled());
	cur = curcpu();
	l = ci->ci_data.cpu_onproc;
	ci->ci_want_resched |= flags;

	if (__predict_false((l->l_pflag & LP_INTR) != 0)) {
		/*
		 * No point doing anything, it will switch soon.
		 * Also here to prevent an assertion failure in
		 * kpreempt() due to preemption being set on a
		 * soft interrupt LWP.
		 */
		return;
	}

	if (l == ci->ci_data.cpu_idlelwp) {
		if (ci == cur)
			return;
#ifndef XEN /* XXX review when Xen gets MP support */
		if (x86_cpu_idle_ipi != false)
			x86_send_ipi(ci, 0);
#endif
		return;
	}

	if ((flags & RESCHED_KPREEMPT) != 0) {
#ifdef __HAVE_PREEMPTION
		atomic_or_uint(&l->l_dopreempt, DOPREEMPT_ACTIVE);
		if (ci == cur) {
			softint_trigger(1 << SIR_PREEMPT);
		} else {
			x86_send_ipi(ci, X86_IPI_KPREEMPT);
		}
#endif
	} else {
		aston(l, X86_AST_PREEMPT);
		if (ci == cur) {
			return;
		}
		if ((flags & RESCHED_IMMED) != 0) {
			x86_send_ipi(ci, 0);
		}
	}
}

void
cpu_signotify(struct lwp *l)
{

	KASSERT(kpreempt_disabled());
	aston(l, X86_AST_GENERIC);
	if (l->l_cpu != curcpu())
		x86_send_ipi(l->l_cpu, 0);
}

void
cpu_need_proftick(struct lwp *l)
{

	KASSERT(kpreempt_disabled());
	KASSERT(l->l_cpu == curcpu());

	l->l_pflag |= LP_OWEUPC;
	aston(l, X86_AST_GENERIC);
}

bool
cpu_intr_p(void)
{
	int idepth;

	kpreempt_disable();
	idepth = curcpu()->ci_idepth;
	kpreempt_enable();
	return (idepth >= 0);
}
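
/*
 * Illustrative usage note (editorial addition): code that must not run in
 * interrupt context can assert this, e.g.
 *
 *	KASSERT(!cpu_intr_p());
 *
 * ci_idepth is expected to be -1 outside interrupt context, hence the
 * ">= 0" test above.
 */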

#ifdef __HAVE_PREEMPTION
/*
 * Called to check MD conditions that would prevent preemption, and to
 * arrange for those conditions to be rechecked later.
 */
bool
cpu_kpreempt_enter(uintptr_t where, int s)
{
	struct cpu_info *ci;
	struct pcb *pcb;
	lwp_t *l;

	KASSERT(kpreempt_disabled());

	l = curlwp;
	ci = curcpu();

	/*
	 * If SPL raised, can't go.  Note this implies that spin
	 * mutexes at IPL_NONE are _not_ valid to use.
	 */
	if (s > IPL_PREEMPT) {
		softint_trigger(1 << SIR_PREEMPT);
		aston(l, X86_AST_PREEMPT);	/* paranoid */
		return false;
	}

	/* Must save cr2 or it could be clobbered. */
	pcb = lwp_getpcb(l);
	pcb->pcb_cr2 = rcr2();

	return true;
}

/*
 * Called after returning from a kernel preemption, and called with
 * preemption disabled.
 */
void
cpu_kpreempt_exit(uintptr_t where)
{
	extern char x86_copyfunc_start, x86_copyfunc_end;
	struct pcb *pcb;

	KASSERT(kpreempt_disabled());

	/*
	 * If we interrupted any of the copy functions we must reload
	 * the pmap when resuming, as they cannot tolerate it being
	 * swapped out.
	 */
	if (where >= (uintptr_t)&x86_copyfunc_start &&
	    where < (uintptr_t)&x86_copyfunc_end) {
		pmap_load();
	}

	/* Restore cr2 only after the pmap, as pmap_load can block. */
	pcb = lwp_getpcb(curlwp);
	lcr2(pcb->pcb_cr2);
}

/*
 * Return true if preemption is disabled for MD reasons.  Must be called
 * with preemption disabled, and thus is only for diagnostic checks.
 */
bool
cpu_kpreempt_disabled(void)
{

	return curcpu()->ci_ilevel > IPL_NONE;
}
#endif	/* __HAVE_PREEMPTION */

SYSCTL_SETUP(sysctl_machdep_cpu_idle, "sysctl machdep cpu_idle")
{
	const struct sysctlnode	*mnode, *node;

	sysctl_createv(NULL, 0, NULL, &mnode,
	    CTLFLAG_PERMANENT, CTLTYPE_NODE, "machdep", NULL,
	    NULL, 0, NULL, 0, CTL_MACHDEP, CTL_EOL);

	sysctl_createv(NULL, 0, &mnode, &node,
		       CTLFLAG_PERMANENT, CTLTYPE_STRING, "idle-mechanism",
		       SYSCTL_DESCR("Mechanism used for the idle loop."),
		       NULL, 0, x86_cpu_idle_text, 0,
		       CTL_CREATE, CTL_EOL);
}
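
/*
 * Illustrative usage note (editorial addition): the string node created
 * above can be read from userland with
 *
 *	sysctl machdep.idle-mechanism
 *
 * and reports the text installed by x86_cpu_idle_set() below, i.e. one of
 * "halt", "mwait" or "xen".
 */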

void
x86_cpu_idle_init(void)
{

#ifndef XEN
	if ((cpu_feature[1] & CPUID2_MONITOR) == 0 ||
	    cpu_vendor == CPUVENDOR_AMD)
		x86_cpu_idle_set(x86_cpu_idle_halt, "halt", true);
	else
		x86_cpu_idle_set(x86_cpu_idle_mwait, "mwait", false);
#else
	x86_cpu_idle_set(x86_cpu_idle_xen, "xen", false);
#endif
}

void
x86_cpu_idle_get(void (**func)(void), char *text, size_t len)
{

	*func = x86_cpu_idle;

	(void)strlcpy(text, x86_cpu_idle_text, len);
}

void
x86_cpu_idle_set(void (*func)(void), const char *text, bool ipi)
{

	x86_cpu_idle = func;
	x86_cpu_idle_ipi = ipi;

	(void)strlcpy(x86_cpu_idle_text, text, sizeof(x86_cpu_idle_text));
}
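
/*
 * Illustrative sketch only (hypothetical caller, not part of this file):
 * an idle driver could install its own routine with
 *
 *	x86_cpu_idle_set(my_idle, "my-idle", true);
 *
 * where my_idle is a hypothetical void (*)(void) handler; passing true
 * asks cpu_need_resched() above to kick remote idle CPUs with an IPI.
 */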

#ifndef XEN

#define KBTOB(x)	((size_t)(x) * 1024UL)
#define MBTOB(x)	((size_t)(x) * 1024UL * 1024UL)
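
/*
 * Worked example (editorial note): KBTOB(640) == 655360 bytes and
 * MBTOB(16) == 16777216 bytes.
 */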

extern paddr_t avail_start, avail_end;

static int
add_mem_cluster(phys_ram_seg_t *seg_clusters, int seg_cluster_cnt,
	struct extent *iomem_ex,
	uint64_t seg_start, uint64_t seg_end, uint32_t type)
{
	uint64_t new_physmem = 0;
	phys_ram_seg_t *cluster;
	int i;

#ifdef i386
#ifdef PAE
#define TOPLIMIT	0x1000000000ULL	/* 64GB */
#else
#define TOPLIMIT	0x100000000ULL	/* 4GB */
#endif
#else
#define TOPLIMIT	0x100000000000ULL /* 16TB */
#endif

	if (seg_end > TOPLIMIT) {
		aprint_verbose("WARNING: skipping large memory map entry: "
		    "0x%"PRIx64"/0x%"PRIx64"/0x%x\n",
		    seg_start,
		    (seg_end - seg_start),
		    type);
		return seg_cluster_cnt;
	}

	/*
	 * XXX Chop the last page off the size so that
	 * XXX it can fit in avail_end.
	 */
	if (seg_end == TOPLIMIT)
		seg_end -= PAGE_SIZE;

	if (seg_end <= seg_start)
		return seg_cluster_cnt;

	for (i = 0; i < seg_cluster_cnt; i++) {
		cluster = &seg_clusters[i];
		if ((cluster->start == round_page(seg_start))
		    && (cluster->size == trunc_page(seg_end) - cluster->start))
		{
#ifdef DEBUG_MEMLOAD
			printf("WARNING: skipping duplicate segment entry\n");
#endif
			return seg_cluster_cnt;
		}
	}

	/*
	 * Allocate the physical addresses used by RAM
	 * from the iomem extent map.  This is done before
	 * the addresses are page rounded just to make
	 * sure we get them all.
	 */
	if (seg_start < 0x100000000ULL) {
		uint64_t io_end;

		if (seg_end > 0x100000000ULL)
			io_end = 0x100000000ULL;
		else
			io_end = seg_end;

		if (iomem_ex != NULL && extent_alloc_region(iomem_ex, seg_start,
		    io_end - seg_start, EX_NOWAIT)) {
			/* XXX What should we do? */
			printf("WARNING: CAN'T ALLOCATE MEMORY SEGMENT "
			    "(0x%"PRIx64"/0x%"PRIx64"/0x%x) FROM "
			    "IOMEM EXTENT MAP!\n",
			    seg_start, seg_end - seg_start, type);
			return seg_cluster_cnt;
		}
	}

	/*
	 * If it's not free memory, skip it.
	 */
	if (type != BIM_Memory)
		return seg_cluster_cnt;

	/* XXX XXX XXX */
	if (seg_cluster_cnt >= VM_PHYSSEG_MAX)
		panic("%s: too many memory segments (increase VM_PHYSSEG_MAX)",
			__func__);

#ifdef PHYSMEM_MAX_ADDR
	if (seg_start >= MBTOB(PHYSMEM_MAX_ADDR))
		return seg_cluster_cnt;
	if (seg_end > MBTOB(PHYSMEM_MAX_ADDR))
		seg_end = MBTOB(PHYSMEM_MAX_ADDR);
#endif

	seg_start = round_page(seg_start);
	seg_end = trunc_page(seg_end);

	if (seg_start == seg_end)
		return seg_cluster_cnt;

	cluster = &seg_clusters[seg_cluster_cnt];
	cluster->start = seg_start;
	if (iomem_ex != NULL)
		new_physmem = physmem + atop(seg_end - seg_start);

#ifdef PHYSMEM_MAX_SIZE
	if (iomem_ex != NULL) {
		if (physmem >= atop(MBTOB(PHYSMEM_MAX_SIZE)))
			return seg_cluster_cnt;
		if (new_physmem > atop(MBTOB(PHYSMEM_MAX_SIZE))) {
			seg_end = seg_start + MBTOB(PHYSMEM_MAX_SIZE) - ptoa(physmem);
			new_physmem = atop(MBTOB(PHYSMEM_MAX_SIZE));
		}
	}
#endif

	cluster->size = seg_end - seg_start;

	if (iomem_ex != NULL) {
		if (avail_end < seg_end)
			avail_end = seg_end;
		physmem = new_physmem;
	}
	seg_cluster_cnt++;

	return seg_cluster_cnt;
}

int
initx86_parse_memmap(struct btinfo_memmap *bim, struct extent *iomem_ex)
{
	uint64_t seg_start, seg_end;
	uint64_t addr, size;
	uint32_t type;
	int x;

	KASSERT(bim != NULL);
	KASSERT(bim->num > 0);

#ifdef DEBUG_MEMLOAD
	printf("BIOS MEMORY MAP (%d ENTRIES):\n", bim->num);
#endif
	for (x = 0; x < bim->num; x++) {
		addr = bim->entry[x].addr;
		size = bim->entry[x].size;
		type = bim->entry[x].type;
#ifdef DEBUG_MEMLOAD
		printf("    addr 0x%"PRIx64"  size 0x%"PRIx64"  type 0x%x\n",
			addr, size, type);
#endif

		/*
		 * If the segment is not memory, skip it.
		 */
		switch (type) {
		case BIM_Memory:
		case BIM_ACPI:
		case BIM_NVS:
			break;
		default:
			continue;
		}

		/*
		 * If the segment is smaller than a page, skip it.
		 */
		if (size < NBPG)
			continue;

		seg_start = addr;
		seg_end = addr + size;

		/*
		 *   Avoid Compatibility Holes.
		 * XXX  Holes within memory space that allow access
		 * XXX to be directed to the PC-compatible frame buffer
		 * XXX (0xa0000-0xbffff), to adapter ROM space
		 * XXX (0xc0000-0xdffff), and to system BIOS space
		 * XXX (0xe0000-0xfffff).
		 * XXX  Some laptops (for example, the Toshiba Satellite
		 * XXX 2550X) report this area in their memory map, which
		 * XXX has caused problems, so we avoid this area.
		 */
		if (seg_start < 0x100000 && seg_end > 0xa0000) {
			printf("WARNING: memory map entry overlaps "
			    "with ``Compatibility Holes'': "
			    "0x%"PRIx64"/0x%"PRIx64"/0x%x\n", seg_start,
			    seg_end - seg_start, type);
			mem_cluster_cnt = add_mem_cluster(
				mem_clusters, mem_cluster_cnt, iomem_ex,
				seg_start, 0xa0000, type);
			mem_cluster_cnt = add_mem_cluster(
				mem_clusters, mem_cluster_cnt, iomem_ex,
				0x100000, seg_end, type);
		} else
			mem_cluster_cnt = add_mem_cluster(
				mem_clusters, mem_cluster_cnt, iomem_ex,
				seg_start, seg_end, type);
	}

	return 0;
}

int
initx86_fake_memmap(struct extent *iomem_ex)
{
	phys_ram_seg_t *cluster;
	KASSERT(mem_cluster_cnt == 0);

	/*
	 * Allocate the physical addresses used by RAM from the iomem
	 * extent map.  This is done before the addresses are
	 * page rounded just to make sure we get them all.
	 */
	if (extent_alloc_region(iomem_ex, 0, KBTOB(biosbasemem),
	    EX_NOWAIT))
	{
		/* XXX What should we do? */
		printf("WARNING: CAN'T ALLOCATE BASE MEMORY FROM "
		    "IOMEM EXTENT MAP!\n");
	}

	cluster = &mem_clusters[0];
	cluster->start = 0;
	cluster->size = trunc_page(KBTOB(biosbasemem));
	physmem += atop(cluster->size);

	if (extent_alloc_region(iomem_ex, IOM_END, KBTOB(biosextmem),
	    EX_NOWAIT))
	{
		/* XXX What should we do? */
		printf("WARNING: CAN'T ALLOCATE EXTENDED MEMORY FROM "
		    "IOMEM EXTENT MAP!\n");
	}

#if NISADMA > 0
	/*
	 * Some motherboards/BIOSes remap the 384K of RAM that would
	 * normally be covered by the ISA hole to the end of memory
	 * so that it can be used.  However, on a 16M system, this
	 * would cause bounce buffers to be allocated and used.
	 * This is not desirable behaviour, as more than 384K of
	 * bounce buffers might be allocated.  As a work-around,
	 * we round memory down to the nearest 1M boundary if
	 * we're using any isadma devices and the remapped memory
	 * is what puts us over 16M.
	 */
	if (biosextmem > (15*1024) && biosextmem < (16*1024)) {
		char pbuf[9];

		format_bytes(pbuf, sizeof(pbuf),
		    biosextmem - (15*1024));
		printf("Warning: ignoring %s of remapped memory\n",
		    pbuf);
		biosextmem = (15*1024);
	}
#endif
	cluster = &mem_clusters[1];
	cluster->start = IOM_END;
	cluster->size = trunc_page(KBTOB(biosextmem));
	physmem += atop(cluster->size);

	mem_cluster_cnt = 2;

	avail_end = IOM_END + trunc_page(KBTOB(biosextmem));

	return 0;
}

#ifdef amd64
extern vaddr_t kern_end;
extern vaddr_t module_start, module_end;
#endif

int
initx86_load_memmap(paddr_t first_avail)
{
	uint64_t seg_start, seg_end;
	uint64_t seg_start1, seg_end1;
	int first16q, x;
#ifdef VM_FREELIST_FIRST4G
	int first4gq;
#endif

	/*
	 * If we have 16M of RAM or less, just put it all on
	 * the default free list.  Otherwise, put the first
	 * 16M of RAM on a lower priority free list (so that
	 * all of the ISA DMA'able memory won't be eaten up
	 * first-off).
	 */
#define ADDR_16M (16 * 1024 * 1024)

	if (avail_end <= ADDR_16M)
		first16q = VM_FREELIST_DEFAULT;
	else
		first16q = VM_FREELIST_FIRST16;

#ifdef VM_FREELIST_FIRST4G
	/*
	 * If we have 4G of RAM or less, just put it all on
	 * the default free list.  Otherwise, put the first
	 * 4G of RAM on a lower priority free list (so that
	 * all of the 32bit PCI DMA'able memory won't be eaten up
	 * first-off).
	 */
#define ADDR_4G (4ULL * 1024 * 1024 * 1024)
	if (avail_end <= ADDR_4G)
		first4gq = VM_FREELIST_DEFAULT;
	else
		first4gq = VM_FREELIST_FIRST4G;
#endif /* defined(VM_FREELIST_FIRST4G) */

	/* Make sure the end of the space used by the kernel is rounded. */
	first_avail = round_page(first_avail);

#ifdef amd64
	kern_end = KERNBASE + first_avail;
	module_start = kern_end;
	module_end = KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2;
#endif

	/*
	 * Now, load the memory clusters (which have already been
	 * rounded and truncated) into the VM system.
	 *
	 * NOTE: WE ASSUME THAT MEMORY STARTS AT 0 AND THAT THE KERNEL
	 * IS LOADED AT IOM_END (1M).
	 */
	for (x = 0; x < mem_cluster_cnt; x++) {
		const phys_ram_seg_t *cluster = &mem_clusters[x];

		seg_start = cluster->start;
		seg_end = cluster->start + cluster->size;
		seg_start1 = 0;
		seg_end1 = 0;

		/*
		 * Skip memory before our available starting point.
		 */
		if (seg_end <= avail_start)
			continue;

		if (avail_start >= seg_start && avail_start < seg_end) {
			if (seg_start != 0)
				panic("%s: memory doesn't start at 0", __func__);
			seg_start = avail_start;
			if (seg_start == seg_end)
				continue;
		}

		/*
		 * If this segment contains the kernel, split it
		 * in two, around the kernel.
		 */
		if (seg_start <= IOM_END && first_avail <= seg_end) {
			seg_start1 = first_avail;
			seg_end1 = seg_end;
			seg_end = IOM_END;
			KASSERT(seg_end < seg_end1);
		}

		/* First hunk */
		if (seg_start != seg_end) {
			if (seg_start < ADDR_16M &&
			    first16q != VM_FREELIST_DEFAULT) {
				uint64_t tmp;

				if (seg_end > ADDR_16M)
					tmp = ADDR_16M;
				else
					tmp = seg_end;

				if (tmp != seg_start) {
#ifdef DEBUG_MEMLOAD
					printf("loading first16q 0x%"PRIx64
					    "-0x%"PRIx64
					    " (0x%"PRIx64"-0x%"PRIx64")\n",
					    seg_start, tmp,
					    (uint64_t)atop(seg_start),
					    (uint64_t)atop(tmp));
#endif
					uvm_page_physload(atop(seg_start),
					    atop(tmp), atop(seg_start),
					    atop(tmp), first16q);
				}
				seg_start = tmp;
			}

#ifdef VM_FREELIST_FIRST4G
			if (seg_start < ADDR_4G &&
			    first4gq != VM_FREELIST_DEFAULT) {
				uint64_t tmp;

				if (seg_end > ADDR_4G)
					tmp = ADDR_4G;
				else
					tmp = seg_end;

				if (tmp != seg_start) {
#ifdef DEBUG_MEMLOAD
					printf("loading first4gq 0x%"PRIx64
					    "-0x%"PRIx64
					    " (0x%"PRIx64"-0x%"PRIx64")\n",
					    seg_start, tmp,
					    (uint64_t)atop(seg_start),
					    (uint64_t)atop(tmp));
#endif
					uvm_page_physload(atop(seg_start),
					    atop(tmp), atop(seg_start),
					    atop(tmp), first4gq);
				}
				seg_start = tmp;
			}
#endif /* defined(VM_FREELIST_FIRST4G) */

			if (seg_start != seg_end) {
#ifdef DEBUG_MEMLOAD
				printf("loading default 0x%"PRIx64"-0x%"PRIx64
				    " (0x%"PRIx64"-0x%"PRIx64")\n",
				    seg_start, seg_end,
				    (uint64_t)atop(seg_start),
				    (uint64_t)atop(seg_end));
#endif
				uvm_page_physload(atop(seg_start),
				    atop(seg_end), atop(seg_start),
				    atop(seg_end), VM_FREELIST_DEFAULT);
			}
		}

		/* Second hunk */
		if (seg_start1 != seg_end1) {
			if (seg_start1 < ADDR_16M &&
			    first16q != VM_FREELIST_DEFAULT) {
				uint64_t tmp;

				if (seg_end1 > ADDR_16M)
					tmp = ADDR_16M;
				else
					tmp = seg_end1;

				if (tmp != seg_start1) {
#ifdef DEBUG_MEMLOAD
					printf("loading first16q 0x%"PRIx64
					    "-0x%"PRIx64
					    " (0x%"PRIx64"-0x%"PRIx64")\n",
					    seg_start1, tmp,
					    (uint64_t)atop(seg_start1),
					    (uint64_t)atop(tmp));
#endif
					uvm_page_physload(atop(seg_start1),
					    atop(tmp), atop(seg_start1),
					    atop(tmp), first16q);
				}
				seg_start1 = tmp;
			}

#ifdef VM_FREELIST_FIRST4G
			if (seg_start1 < ADDR_4G &&
			    first4gq != VM_FREELIST_DEFAULT) {
				uint64_t tmp;

				if (seg_end1 > ADDR_4G)
					tmp = ADDR_4G;
				else
					tmp = seg_end1;

				if (tmp != seg_start1) {
#ifdef DEBUG_MEMLOAD
					printf("loading first4gq 0x%"PRIx64
					    "-0x%"PRIx64
					    " (0x%"PRIx64"-0x%"PRIx64")\n",
					    seg_start1, tmp,
					    (uint64_t)atop(seg_start1),
					    (uint64_t)atop(tmp));
#endif
					uvm_page_physload(atop(seg_start1),
					    atop(tmp), atop(seg_start1),
					    atop(tmp), first4gq);
				}
				seg_start1 = tmp;
			}
#endif /* defined(VM_FREELIST_FIRST4G) */

			if (seg_start1 != seg_end1) {
#ifdef DEBUG_MEMLOAD
				printf("loading default 0x%"PRIx64"-0x%"PRIx64
				    " (0x%"PRIx64"-0x%"PRIx64")\n",
				    seg_start1, seg_end1,
				    (uint64_t)atop(seg_start1),
				    (uint64_t)atop(seg_end1));
#endif
				uvm_page_physload(atop(seg_start1),
				    atop(seg_end1), atop(seg_start1),
				    atop(seg_end1), VM_FREELIST_DEFAULT);
			}
		}
	}

	return 0;
}
#endif

void
x86_reset(void)
{
	uint8_t b;
	/*
	 * The keyboard controller has 4 general-purpose output pins, one of which is
	 * connected to the RESET pin on the CPU in many PCs.  We tell the
	 * keyboard controller to pulse this line a couple of times.
	 */
	outb(IO_KBD + KBCMDP, KBC_PULSE0);
	delay(100000);
	outb(IO_KBD + KBCMDP, KBC_PULSE0);
	delay(100000);

	/*
	 * Attempt to force a reset via the Reset Control register at
	 * I/O port 0xcf9.  Bit 2 forces a system reset when it
	 * transitions from 0 to 1.  Bit 1 selects the type of reset
	 * to attempt: 0 selects a "soft" reset, and 1 selects a
	 * "hard" reset.  We try a "hard" reset.  The first write sets
	 * bit 1 to select a "hard" reset and clears bit 2.  The
	 * second write forces a 0 -> 1 transition in bit 2 to trigger
	 * a reset.
	 */
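	/*
	 * Worked out (editorial note): 0x2 is binary 010, i.e. bit 1 set
	 * ("hard" reset type) with bit 2 clear; 0x6 is binary 110, which
	 * additionally sets bit 2 and so provides the 0 -> 1 transition
	 * that triggers the reset.
	 */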
	outb(0xcf9, 0x2);
	outb(0xcf9, 0x6);
	DELAY(500000);  /* wait 0.5 sec to see if that did it */

	/*
	 * Attempt to force a reset via the Fast A20 and Init register
	 * at I/O port 0x92.  Bit 1 serves as an alternate A20 gate.
	 * Bit 0 asserts INIT# when set to 1.  We are careful to only
	 * preserve bit 1 while setting bit 0.  We also must clear bit
	 * 0 before setting it if it isn't already clear.
	 */
	b = inb(0x92);
	if (b != 0xff) {
		if ((b & 0x1) != 0)
			outb(0x92, b & 0xfe);
		outb(0x92, b | 0x1);
		DELAY(500000);  /* wait 0.5 sec to see if that did it */
	}
}

static int
x86_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
    void *arg0, void *arg1, void *arg2, void *arg3)
{
	int result;

	result = KAUTH_RESULT_DEFER;

	switch (action) {
	case KAUTH_MACHDEP_IOPERM_GET:
	case KAUTH_MACHDEP_LDT_GET:
	case KAUTH_MACHDEP_LDT_SET:
	case KAUTH_MACHDEP_MTRR_GET:
		result = KAUTH_RESULT_ALLOW;

		break;

	default:
		break;
	}

	return result;
}

void
machdep_init(void)
{

	x86_listener = kauth_listen_scope(KAUTH_SCOPE_MACHDEP,
	    x86_listener_cb, NULL);
}

/*
 * x86_startup: x86 common startup routine
 *
 * called by cpu_startup.
 */

void
x86_startup(void)
{

#if !defined(XEN)
	nmi_init();
#endif /* !defined(XEN) */
}