1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/machparam.h>
31 #include <sys/x86_archext.h>
32 #include <sys/systm.h>
33 #include <sys/mach_mmu.h>
34 #include <sys/multiboot.h>
35 
36 #if defined(__xpv)
37 
38 #include <sys/hypervisor.h>
39 uintptr_t xen_virt_start;
40 pfn_t *mfn_to_pfn_mapping;
41 
42 #else /* !__xpv */
43 
44 extern multiboot_header_t mb_header;
45 extern int have_cpuid(void);
46 
47 #endif /* !__xpv */
48 
49 #include <sys/inttypes.h>
50 #include <sys/bootinfo.h>
51 #include <sys/mach_mmu.h>
52 #include <sys/boot_console.h>
53 
54 #include "dboot_asm.h"
55 #include "dboot_printf.h"
56 #include "dboot_xboot.h"
57 #include "dboot_elfload.h"
58 
59 /*
60  * This file contains code that runs to transition us from either a multiboot
61  * compliant loader (32 bit non-paging) or a XPV domain loader to
62  * regular kernel execution. Its task is to setup the kernel memory image
63  * and page tables.
64  *
65  * The code executes as:
66  *	- 32 bits under GRUB (for 32 or 64 bit Solaris)
67  * 	- a 32 bit program for the 32-bit PV hypervisor
68  *	- a 64 bit program for the 64-bit PV hypervisor (at least for now)
69  *
70  * Under the PV hypervisor, we must create mappings for any memory beyond the
71  * initial start of day allocation (such as the kernel itself).
72  *
73  * When on the metal, the mapping between maddr_t and paddr_t is 1:1.
 * Since we are running in real mode, all such memory is accessible.
75  */
76 
77 /*
78  * Standard bits used in PTE (page level) and PTP (internal levels)
79  */
80 x86pte_t ptp_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_USER;
81 x86pte_t pte_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_MOD | PT_NOCONSIST;
82 
83 /*
 * This is the target address (physical) where the kernel text and data
85  * nucleus pages will be unpacked. On the hypervisor this is actually a
86  * virtual address.
87  */
88 paddr_t ktext_phys;
89 uint32_t ksize = 2 * FOUR_MEG;	/* kernel nucleus is 8Meg */
90 
91 static uint64_t target_kernel_text;	/* value to use for KERNEL_TEXT */
92 
93 /*
94  * The stack is setup in assembler before entering startup_kernel()
95  */
96 char stack_space[STACK_SIZE];
97 
98 /*
99  * Used to track physical memory allocation
100  */
101 static paddr_t next_avail_addr = 0;
102 
103 #if defined(__xpv)
104 /*
105  * Additional information needed for hypervisor memory allocation.
106  * Only memory up to scratch_end is mapped by page tables.
107  * mfn_base is the start of the hypervisor virtual image. It's ONE_GIG, so
108  * to derive a pfn from a pointer, you subtract mfn_base.
109  */
110 
111 static paddr_t scratch_end = 0;	/* we can't write all of mem here */
112 static paddr_t mfn_base;		/* addr corresponding to mfn_list[0] */
113 start_info_t *xen_info;
114 
115 #else	/* __xpv */
116 
117 /*
118  * If on the metal, then we have a multiboot loader.
119  */
120 multiboot_info_t *mb_info;
121 
122 #endif	/* __xpv */
123 
124 /*
125  * This contains information passed to the kernel
126  */
127 struct xboot_info boot_info[2];	/* extra space to fix alignement for amd64 */
128 struct xboot_info *bi;
129 
130 /*
131  * Page table and memory stuff.
132  */
133 static paddr_t max_mem;			/* maximum memory address */
134 
135 /*
136  * Information about processor MMU
137  */
138 int amd64_support = 0;
139 int largepage_support = 0;
140 int pae_support = 0;
141 int pge_support = 0;
142 int NX_support = 0;
143 
144 /*
145  * Low 32 bits of kernel entry address passed back to assembler.
146  * When running a 64 bit kernel, the high 32 bits are 0xffffffff.
147  */
148 uint32_t entry_addr_low;
149 
150 /*
151  * Memlists for the kernel. We shouldn't need a lot of these.
152  */
153 #define	MAX_MEMLIST (50)
154 struct boot_memlist memlists[MAX_MEMLIST];
155 uint_t memlists_used = 0;
156 struct boot_memlist pcimemlists[MAX_MEMLIST];
157 uint_t pcimemlists_used = 0;
158 
159 #define	MAX_MODULES (10)
160 struct boot_modules modules[MAX_MODULES];
161 uint_t modules_used = 0;
162 
163 /*
164  * Debugging macros
165  */
166 uint_t prom_debug = 0;
167 uint_t map_debug = 0;
168 
169 /*
170  * Either hypervisor-specific or grub-specific code builds the initial
171  * memlists. This code does the sort/merge/link for final use.
172  */
173 static void
174 sort_physinstall(void)
175 {
176 	int i;
177 #if !defined(__xpv)
178 	int j;
179 	struct boot_memlist tmp;
180 
181 	/*
182 	 * Now sort the memlists, in case they weren't in order.
183 	 * Yeah, this is a bubble sort; small, simple and easy to get right.
184 	 */
185 	DBG_MSG("Sorting phys-installed list\n");
186 	for (j = memlists_used - 1; j > 0; --j) {
187 		for (i = 0; i < j; ++i) {
188 			if (memlists[i].addr < memlists[i + 1].addr)
189 				continue;
190 			tmp = memlists[i];
191 			memlists[i] = memlists[i + 1];
192 			memlists[i + 1] = tmp;
193 		}
194 	}
195 
196 	/*
197 	 * Merge any memlists that don't have holes between them.
198 	 */
199 	for (i = 0; i <= memlists_used - 1; ++i) {
200 		if (memlists[i].addr + memlists[i].size != memlists[i + 1].addr)
201 			continue;
202 
203 		if (prom_debug)
204 			dboot_printf(
205 			    "merging mem segs %" PRIx64 "...%" PRIx64
206 			    " w/ %" PRIx64 "...%" PRIx64 "\n",
207 			    memlists[i].addr,
208 			    memlists[i].addr + memlists[i].size,
209 			    memlists[i + 1].addr,
210 			    memlists[i + 1].addr + memlists[i + 1].size);
211 
212 		memlists[i].size += memlists[i + 1].size;
213 		for (j = i + 1; j < memlists_used - 1; ++j)
214 			memlists[j] = memlists[j + 1];
215 		--memlists_used;
216 		DBG(memlists_used);
217 		--i;	/* after merging we need to reexamine, so do this */
218 	}
219 #endif	/* __xpv */
220 
221 	if (prom_debug) {
222 		dboot_printf("\nFinal memlists:\n");
223 		for (i = 0; i < memlists_used; ++i) {
224 			dboot_printf("\t%d: addr=%" PRIx64 " size=%"
225 			    PRIx64 "\n", i, memlists[i].addr, memlists[i].size);
226 		}
227 	}
228 
229 	/*
230 	 * link together the memlists with native size pointers
231 	 */
232 	memlists[0].next = 0;
233 	memlists[0].prev = 0;
234 	for (i = 1; i < memlists_used; ++i) {
235 		memlists[i].prev = (native_ptr_t)(uintptr_t)(memlists + i - 1);
236 		memlists[i].next = 0;
237 		memlists[i - 1].next = (native_ptr_t)(uintptr_t)(memlists + i);
238 	}
239 	bi->bi_phys_install = (native_ptr_t)memlists;
240 	DBG(bi->bi_phys_install);
241 }
242 
243 #if defined(__xpv)
244 
245 /*
246  * halt on the hypervisor after a delay to drain console output
247  */
248 void
249 dboot_halt(void)
250 {
251 	uint_t i = 10000;
252 
253 	while (--i)
254 		HYPERVISOR_yield();
255 	HYPERVISOR_shutdown(SHUTDOWN_poweroff);
256 }
257 
258 /*
259  * From a machine address, find the corresponding pseudo-physical address.
260  * Pseudo-physical address are contiguous and run from mfn_base in each VM.
261  * Machine addresses are the real underlying hardware addresses.
262  * These are needed for page table entries. Note that this routine is
263  * poorly protected. A bad value of "ma" will cause a page fault.
264  */
265 paddr_t
266 ma_to_pa(maddr_t ma)
267 {
268 	ulong_t pgoff = ma & MMU_PAGEOFFSET;
269 	ulong_t pfn = mfn_to_pfn_mapping[mmu_btop(ma)];
270 	paddr_t pa;
271 
272 	if (pfn >= xen_info->nr_pages)
273 		return (-(paddr_t)1);
274 	pa = mfn_base + mmu_ptob((paddr_t)pfn) + pgoff;
275 #ifdef DEBUG
276 	if (ma != pa_to_ma(pa))
277 		dboot_printf("ma_to_pa(%" PRIx64 ") got %" PRIx64 ", "
278 		    "pa_to_ma() says %" PRIx64 "\n", ma, pa, pa_to_ma(pa));
279 #endif
280 	return (pa);
281 }
282 
283 /*
284  * From a pseudo-physical address, find the corresponding machine address.
285  */
286 maddr_t
287 pa_to_ma(paddr_t pa)
288 {
289 	pfn_t pfn;
290 	ulong_t mfn;
291 
292 	pfn = mmu_btop(pa - mfn_base);
293 	if (pa < mfn_base || pfn >= xen_info->nr_pages)
294 		dboot_panic("pa_to_ma(): illegal address 0x%lx", (ulong_t)pa);
295 	mfn = ((ulong_t *)xen_info->mfn_list)[pfn];
296 #ifdef DEBUG
297 	if (mfn_to_pfn_mapping[mfn] != pfn)
298 		dboot_printf("pa_to_ma(pfn=%lx) got %lx ma_to_pa() says %lx\n",
299 		    pfn, mfn, mfn_to_pfn_mapping[mfn]);
300 #endif
301 	return (mfn_to_ma(mfn) | (pa & MMU_PAGEOFFSET));
302 }
303 
304 #endif	/* __xpv */
305 
306 x86pte_t
307 get_pteval(paddr_t table, uint_t index)
308 {
309 	if (pae_support)
310 		return (((x86pte_t *)(uintptr_t)table)[index]);
311 	return (((x86pte32_t *)(uintptr_t)table)[index]);
312 }
313 
314 /*ARGSUSED*/
315 void
316 set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
317 {
318 #ifdef __xpv
319 	mmu_update_t t;
320 	maddr_t mtable = pa_to_ma(table);
321 	int retcnt;
322 
323 	t.ptr = (mtable + index * pte_size) | MMU_NORMAL_PT_UPDATE;
324 	t.val = pteval;
325 	if (HYPERVISOR_mmu_update(&t, 1, &retcnt, DOMID_SELF) || retcnt != 1)
326 		dboot_panic("HYPERVISOR_mmu_update() failed");
327 #else /* __xpv */
328 	uintptr_t tab_addr = (uintptr_t)table;
329 
330 	if (pae_support)
331 		((x86pte_t *)tab_addr)[index] = pteval;
332 	else
333 		((x86pte32_t *)tab_addr)[index] = (x86pte32_t)pteval;
334 	if (level == top_level && level == 2)
335 		reload_cr3();
336 #endif /* __xpv */
337 }
338 
339 paddr_t
340 make_ptable(x86pte_t *pteval, uint_t level)
341 {
342 	paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
343 
344 	if (level == top_level && level == 2)
345 		*pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID;
346 	else
347 		*pteval = pa_to_ma((uintptr_t)new_table) | ptp_bits;
348 
349 #ifdef __xpv
350 	/* Remove write permission to the new page table. */
351 	if (HYPERVISOR_update_va_mapping(new_table,
352 	    *pteval & ~(x86pte_t)PT_WRITABLE, UVMF_INVLPG | UVMF_LOCAL))
353 		dboot_panic("HYP_update_va_mapping error");
354 #endif
355 
356 	if (map_debug)
357 		dboot_printf("new page table lvl=%d paddr=0x%lx ptp=0x%"
358 		    PRIx64 "\n", level, (ulong_t)new_table, *pteval);
359 	return (new_table);
360 }
361 
362 x86pte_t *
363 map_pte(paddr_t table, uint_t index)
364 {
365 	return ((x86pte_t *)(uintptr_t)(table + index * pte_size));
366 }
367 
368 #if !defined(__xpv)
369 #define	maddr_t paddr_t
370 #endif /* !__xpv */
371 
372 /*
373  * Add a mapping for the machine page at the given virtual address.
374  */
375 static void
376 map_ma_at_va(maddr_t ma, native_ptr_t va, uint_t level)
377 {
378 	x86pte_t *ptep;
379 	x86pte_t pteval;
380 
381 	pteval = ma | pte_bits;
382 	if (level > 0)
383 		pteval |= PT_PAGESIZE;
384 	if (va >= target_kernel_text && pge_support)
385 		pteval |= PT_GLOBAL;
386 
387 	if (map_debug && ma != va)
388 		dboot_printf("mapping ma=0x%" PRIx64 " va=0x%" PRIx64
389 		    " pte=0x%" PRIx64 " l=%d\n",
390 		    (uint64_t)ma, (uint64_t)va, pteval, level);
391 
392 #if defined(__xpv)
393 	/*
394 	 * see if we can avoid find_pte() on the hypervisor
395 	 */
396 	if (HYPERVISOR_update_va_mapping(va, pteval,
397 	    UVMF_INVLPG | UVMF_LOCAL) == 0)
398 		return;
399 #endif
400 
401 	/*
402 	 * Find the pte that will map this address. This creates any
403 	 * missing intermediate level page tables
404 	 */
405 	ptep = find_pte(va, NULL, level, 0);
406 
407 	/*
408 	 * When paravirtualized, we must use hypervisor calls to modify the
409 	 * PTE, since paging is active. On real hardware we just write to
410 	 * the pagetables which aren't in use yet.
411 	 */
412 #if defined(__xpv)
413 	ptep = ptep;	/* shut lint up */
414 	if (HYPERVISOR_update_va_mapping(va, pteval, UVMF_INVLPG | UVMF_LOCAL))
415 		dboot_panic("mmu_update failed-map_pa_at_va va=0x%" PRIx64
416 		    " l=%d ma=0x%" PRIx64 ", pte=0x%" PRIx64 "",
417 		    (uint64_t)va, level, (uint64_t)ma, pteval);
418 #else
419 	if (va < 1024 * 1024)
420 		pteval |= PT_NOCACHE;		/* for video RAM */
421 	if (pae_support)
422 		*ptep = pteval;
423 	else
424 		*((x86pte32_t *)ptep) = (x86pte32_t)pteval;
425 #endif
426 }
427 
428 /*
429  * Add a mapping for the physical page at the given virtual address.
430  */
431 static void
432 map_pa_at_va(paddr_t pa, native_ptr_t va, uint_t level)
433 {
434 	map_ma_at_va(pa_to_ma(pa), va, level);
435 }
436 
437 /*
438  * This is called to remove start..end from the
439  * possible range of PCI addresses.
440  */
441 const uint64_t pci_lo_limit = 0x00100000ul;
442 const uint64_t pci_hi_limit = 0xfff00000ul;
443 static void
444 exclude_from_pci(uint64_t start, uint64_t end)
445 {
446 	int i;
447 	int j;
448 	struct boot_memlist *ml;
449 
450 	for (i = 0; i < pcimemlists_used; ++i) {
451 		ml = &pcimemlists[i];
452 
453 		/* delete the entire range? */
454 		if (start <= ml->addr && ml->addr + ml->size <= end) {
455 			--pcimemlists_used;
456 			for (j = i; j < pcimemlists_used; ++j)
457 				pcimemlists[j] = pcimemlists[j + 1];
458 			--i;	/* to revisit the new one at this index */
459 		}
460 
461 		/* split a range? */
462 		else if (ml->addr < start && end < ml->addr + ml->size) {
463 
464 			++pcimemlists_used;
465 			if (pcimemlists_used > MAX_MEMLIST)
466 				dboot_panic("too many pcimemlists");
467 
468 			for (j = pcimemlists_used - 1; j > i; --j)
469 				pcimemlists[j] = pcimemlists[j - 1];
470 			ml->size = start - ml->addr;
471 
472 			++ml;
473 			ml->size = (ml->addr + ml->size) - end;
474 			ml->addr = end;
475 			++i;	/* skip on to next one */
476 		}
477 
478 		/* cut memory off the start? */
479 		else if (ml->addr < end && end < ml->addr + ml->size) {
480 			ml->size -= end - ml->addr;
481 			ml->addr = end;
482 		}
483 
484 		/* cut memory off the end? */
485 		else if (ml->addr <= start && start < ml->addr + ml->size) {
486 			ml->size = start - ml->addr;
487 		}
488 	}
489 }
490 
491 /*
492  * Xen strips the size field out of the mb_memory_map_t, see struct e820entry
493  * definition in Xen source.
494  */
#ifdef __xpv
/*
 * Memory map entry in the layout used under the hypervisor, i.e. without
 * a leading size field. Base and length are 64 bit values split into
 * 32 bit halves.
 */
typedef struct {
	uint32_t	base_addr_low;	/* bits 0..31 of the start address */
	uint32_t	base_addr_high;	/* bits 32..63 of the start address */
	uint32_t	length_low;	/* bits 0..31 of the length */
	uint32_t	length_high;	/* bits 32..63 of the length */
	uint32_t	type;		/* region type code */
} mmap_t;
#else
/* On the metal the multiboot memory map entry is used as is. */
typedef mb_memory_map_t mmap_t;
#endif
506 
507 static void
508 build_pcimemlists(mmap_t *mem, int num)
509 {
510 	mmap_t *mmap;
511 	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
512 	uint64_t start;
513 	uint64_t end;
514 	int i;
515 
516 	/*
517 	 * initialize
518 	 */
519 	pcimemlists[0].addr = pci_lo_limit;
520 	pcimemlists[0].size = pci_hi_limit - pci_lo_limit;
521 	pcimemlists_used = 1;
522 
523 	/*
524 	 * Fill in PCI memlists.
525 	 */
526 	for (mmap = mem, i = 0; i < num; ++i, ++mmap) {
527 		start = ((uint64_t)mmap->base_addr_high << 32) +
528 		    mmap->base_addr_low;
529 		end = start + ((uint64_t)mmap->length_high << 32) +
530 		    mmap->length_low;
531 
532 		if (prom_debug)
533 			dboot_printf("\ttype: %d %" PRIx64 "..%"
534 			    PRIx64 "\n", mmap->type, start, end);
535 
536 		/*
537 		 * page align start and end
538 		 */
539 		start = (start + page_offset) & ~page_offset;
540 		end &= ~page_offset;
541 		if (end <= start)
542 			continue;
543 
544 		exclude_from_pci(start, end);
545 	}
546 
547 	/*
548 	 * Finish off the pcimemlist
549 	 */
550 	if (prom_debug) {
551 		for (i = 0; i < pcimemlists_used; ++i) {
552 			dboot_printf("pcimemlist entry 0x%" PRIx64 "..0x%"
553 			    PRIx64 "\n", pcimemlists[i].addr,
554 			    pcimemlists[i].addr + pcimemlists[i].size);
555 		}
556 	}
557 	pcimemlists[0].next = 0;
558 	pcimemlists[0].prev = 0;
559 	for (i = 1; i < pcimemlists_used; ++i) {
560 		pcimemlists[i].prev =
561 		    (native_ptr_t)(uintptr_t)(pcimemlists + i - 1);
562 		pcimemlists[i].next = 0;
563 		pcimemlists[i - 1].next =
564 		    (native_ptr_t)(uintptr_t)(pcimemlists + i);
565 	}
566 	bi->bi_pcimem = (native_ptr_t)pcimemlists;
567 	DBG(bi->bi_pcimem);
568 }
569 
570 #if defined(__xpv)
571 /*
572  * Initialize memory allocator stuff from hypervisor-supplied start info.
573  *
574  * There is 512KB of scratch area after the boot stack page.
575  * We'll use that for everything except the kernel nucleus pages which are too
576  * big to fit there and are allocated last anyway.
577  */
578 #define	MAXMAPS	100
579 static mmap_t map_buffer[MAXMAPS];
580 static void
581 init_mem_alloc(void)
582 {
583 	int	local;	/* variables needed to find start region */
584 	paddr_t	scratch_start;
585 	xen_memory_map_t map;
586 
587 	DBG_MSG("Entered init_mem_alloc()\n");
588 
589 	/*
590 	 * Free memory follows the stack. There's at least 512KB of scratch
591 	 * space, rounded up to at least 2Mb alignment.  That should be enough
592 	 * for the page tables we'll need to build.  The nucleus memory is
593 	 * allocated last and will be outside the addressible range.  We'll
594 	 * switch to new page tables before we unpack the kernel
595 	 */
596 	scratch_start = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE);
597 	DBG(scratch_start);
598 	scratch_end = RNDUP((paddr_t)scratch_start + 512 * 1024, TWO_MEG);
599 	DBG(scratch_end);
600 
601 	/*
602 	 * For paranoia, leave some space between hypervisor data and ours.
603 	 * Use 500 instead of 512.
604 	 */
605 	next_avail_addr = scratch_end - 500 * 1024;
606 	DBG(next_avail_addr);
607 
608 	/*
609 	 * The domain builder gives us at most 1 module
610 	 */
611 	DBG(xen_info->mod_len);
612 	if (xen_info->mod_len > 0) {
613 		DBG(xen_info->mod_start);
614 		modules[0].bm_addr = xen_info->mod_start;
615 		modules[0].bm_size = xen_info->mod_len;
616 		bi->bi_module_cnt = 1;
617 		bi->bi_modules = (native_ptr_t)modules;
618 	} else {
619 		bi->bi_module_cnt = 0;
620 		bi->bi_modules = NULL;
621 	}
622 	DBG(bi->bi_module_cnt);
623 	DBG(bi->bi_modules);
624 
625 	DBG(xen_info->mfn_list);
626 	DBG(xen_info->nr_pages);
627 	max_mem = (paddr_t)xen_info->nr_pages << MMU_PAGESHIFT;
628 	DBG(max_mem);
629 
630 	/*
631 	 * Using pseudo-physical addresses, so only 1 memlist element
632 	 */
633 	memlists[0].addr = 0;
634 	DBG(memlists[0].addr);
635 	memlists[0].size = max_mem;
636 	DBG(memlists[0].size);
637 	memlists_used = 1;
638 	DBG(memlists_used);
639 
640 	/*
641 	 * finish building physinstall list
642 	 */
643 	sort_physinstall();
644 
645 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
646 		/*
647 		 * build PCI Memory list
648 		 */
649 		map.nr_entries = MAXMAPS;
650 		/*LINTED: constant in conditional context*/
651 		set_xen_guest_handle(map.buffer, map_buffer);
652 		if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &map) != 0)
653 			dboot_panic("getting XENMEM_machine_memory_map failed");
654 		build_pcimemlists(map_buffer, map.nr_entries);
655 	}
656 }
657 
658 #else	/* !__xpv */
659 
660 /*
661  * During memory allocation, find the highest address not used yet.
662  */
663 static void
664 check_higher(paddr_t a)
665 {
666 	if (a < next_avail_addr)
667 		return;
668 	next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE);
669 	DBG(next_avail_addr);
670 }
671 
672 /*
673  * Walk through the module information finding the last used address.
674  * The first available address will become the top level page table.
675  *
676  * We then build the phys_install memlist from the multiboot information.
677  */
678 static void
679 init_mem_alloc(void)
680 {
681 	mb_memory_map_t *mmap;
682 	mb_module_t *mod;
683 	uint64_t start;
684 	uint64_t end;
685 	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
686 	extern char _end[];
687 	int i;
688 
689 	DBG_MSG("Entered init_mem_alloc()\n");
690 	DBG((uintptr_t)mb_info);
691 
692 	/*
693 	 * search the modules to find the last used address
694 	 * we'll build the module list while we're walking through here
695 	 */
696 	DBG_MSG("\nFinding Modules\n");
697 	check_higher((paddr_t)&_end);
698 	for (mod = (mb_module_t *)(mb_info->mods_addr), i = 0;
699 	    i < mb_info->mods_count;
700 	    ++mod, ++i) {
701 		if (prom_debug) {
702 			dboot_printf("\tmodule #%d: %s at: 0x%lx, len 0x%lx\n",
703 			    i, (char *)(mod->mod_name),
704 			    (ulong_t)mod->mod_start, (ulong_t)mod->mod_end);
705 		}
706 		modules[i].bm_addr = mod->mod_start;
707 		modules[i].bm_size = mod->mod_end;
708 
709 		check_higher(mod->mod_end);
710 	}
711 	bi->bi_modules = (native_ptr_t)modules;
712 	DBG(bi->bi_modules);
713 	bi->bi_module_cnt = mb_info->mods_count;
714 	DBG(bi->bi_module_cnt);
715 
716 	/*
717 	 * Walk through the memory map from multiboot and build our memlist
718 	 * structures. Note these will have native format pointers.
719 	 */
720 	DBG_MSG("\nFinding Memory Map\n");
721 	DBG(mb_info->flags);
722 	max_mem = 0;
723 	if (mb_info->flags & 0x40) {
724 		int cnt = 0;
725 
726 		DBG(mb_info->mmap_addr);
727 		DBG(mb_info->mmap_length);
728 		check_higher(mb_info->mmap_addr + mb_info->mmap_length);
729 
730 		for (mmap = (mb_memory_map_t *)mb_info->mmap_addr;
731 		    (uint32_t)mmap < mb_info->mmap_addr + mb_info->mmap_length;
732 		    mmap = (mb_memory_map_t *)((uint32_t)mmap + mmap->size
733 		    + sizeof (mmap->size))) {
734 			++cnt;
735 			start = ((uint64_t)mmap->base_addr_high << 32) +
736 			    mmap->base_addr_low;
737 			end = start + ((uint64_t)mmap->length_high << 32) +
738 			    mmap->length_low;
739 
740 			if (prom_debug)
741 				dboot_printf("\ttype: %d %" PRIx64 "..%"
742 				    PRIx64 "\n", mmap->type, start, end);
743 
744 			/*
745 			 * page align start and end
746 			 */
747 			start = (start + page_offset) & ~page_offset;
748 			end &= ~page_offset;
749 			if (end <= start)
750 				continue;
751 
752 			/*
753 			 * only type 1 is usable RAM
754 			 */
755 			if (mmap->type != 1)
756 				continue;
757 
758 			if (end > max_mem)
759 				max_mem = end;
760 
761 			memlists[memlists_used].addr = start;
762 			memlists[memlists_used].size = end - start;
763 			++memlists_used;
764 			if (memlists_used > MAX_MEMLIST)
765 				dboot_panic("too many memlists");
766 		}
767 		build_pcimemlists((mb_memory_map_t *)mb_info->mmap_addr, cnt);
768 	} else if (mb_info->flags & 0x01) {
769 		DBG(mb_info->mem_lower);
770 		memlists[memlists_used].addr = 0;
771 		memlists[memlists_used].size = mb_info->mem_lower * 1024;
772 		++memlists_used;
773 		DBG(mb_info->mem_upper);
774 		memlists[memlists_used].addr = 1024 * 1024;
775 		memlists[memlists_used].size = mb_info->mem_upper * 1024;
776 		++memlists_used;
777 
778 		/*
779 		 * Old platform - assume I/O space at the end of memory.
780 		 */
781 		pcimemlists[0].addr =
782 		    (mb_info->mem_upper * 1024) + (1024 * 1024);
783 		pcimemlists[0].size = pci_hi_limit - pcimemlists[0].addr;
784 		pcimemlists[0].next = 0;
785 		pcimemlists[0].prev = 0;
786 		bi->bi_pcimem = (native_ptr_t)pcimemlists;
787 		DBG(bi->bi_pcimem);
788 	} else {
789 		dboot_panic("No memory info from boot loader!!!");
790 	}
791 
792 	check_higher(bi->bi_cmdline);
793 
794 	/*
795 	 * finish processing the physinstall list
796 	 */
797 	sort_physinstall();
798 }
799 #endif /* !__xpv */
800 
801 /*
802  * Simple memory allocator, allocates aligned physical memory.
803  * Note that startup_kernel() only allocates memory, never frees.
804  * Memory usage just grows in an upward direction.
805  */
806 static void *
807 do_mem_alloc(uint32_t size, uint32_t align)
808 {
809 	uint_t i;
810 	uint64_t best;
811 	uint64_t start;
812 	uint64_t end;
813 
814 	/*
815 	 * make sure size is a multiple of pagesize
816 	 */
817 	size = RNDUP(size, MMU_PAGESIZE);
818 	next_avail_addr = RNDUP(next_avail_addr, align);
819 
820 	/*
821 	 * XXPV fixme joe
822 	 *
823 	 * a really large bootarchive that causes you to run out of memory
824 	 * may cause this to blow up
825 	 */
826 	/* LINTED E_UNEXPECTED_UINT_PROMOTION */
827 	best = (uint64_t)-size;
828 	for (i = 0; i < memlists_used; ++i) {
829 		start = memlists[i].addr;
830 #if defined(__xpv)
831 		start += mfn_base;
832 #endif
833 		end = start + memlists[i].size;
834 
835 		/*
836 		 * did we find the desired address?
837 		 */
838 		if (start <= next_avail_addr && next_avail_addr + size <= end) {
839 			best = next_avail_addr;
840 			goto done;
841 		}
842 
843 		/*
844 		 * if not is this address the best so far?
845 		 */
846 		if (start > next_avail_addr && start < best &&
847 		    RNDUP(start, align) + size <= end)
848 			best = RNDUP(start, align);
849 	}
850 
851 	/*
852 	 * We didn't find exactly the address we wanted, due to going off the
853 	 * end of a memory region. Return the best found memory address.
854 	 */
855 done:
856 	next_avail_addr = best + size;
857 #if defined(__xpv)
858 	if (next_avail_addr > scratch_end)
859 		dboot_panic("Out of mem next_avail: 0x%lx, scratch_end: "
860 		    "0x%lx", (ulong_t)next_avail_addr,
861 		    (ulong_t)scratch_end);
862 #endif
863 	(void) memset((void *)(uintptr_t)best, 0, size);
864 	return ((void *)(uintptr_t)best);
865 }
866 
867 void *
868 mem_alloc(uint32_t size)
869 {
870 	return (do_mem_alloc(size, MMU_PAGESIZE));
871 }
872 
873 
874 /*
875  * Build page tables to map all of memory used so far as well as the kernel.
876  */
877 static void
878 build_page_tables(void)
879 {
880 	uint32_t psize;
881 	uint32_t level;
882 	uint32_t off;
883 	uint64_t start;
884 #if !defined(__xpv)
885 	uint32_t i;
886 	uint64_t end;
887 	uint64_t next_mapping;
888 #endif	/* __xpv */
889 
890 	/*
891 	 * If we're on metal, we need to create the top level pagetable.
892 	 */
893 #if defined(__xpv)
894 	top_page_table = (paddr_t)(uintptr_t)xen_info->pt_base;
895 #else /* __xpv */
896 	top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
897 #endif /* __xpv */
898 	DBG((uintptr_t)top_page_table);
899 
900 	/*
901 	 * Determine if we'll use large mappings for kernel, then map it.
902 	 */
903 	if (largepage_support) {
904 		psize = lpagesize;
905 		level = 1;
906 	} else {
907 		psize = MMU_PAGESIZE;
908 		level = 0;
909 	}
910 
911 	DBG_MSG("Mapping kernel\n");
912 	DBG(ktext_phys);
913 	DBG(target_kernel_text);
914 	DBG(ksize);
915 	DBG(psize);
916 	for (off = 0; off < ksize; off += psize)
917 		map_pa_at_va(ktext_phys + off, target_kernel_text + off, level);
918 
919 	/*
920 	 * The kernel will need a 1 page window to work with page tables
921 	 */
922 	bi->bi_pt_window = (uintptr_t)mem_alloc(MMU_PAGESIZE);
923 	DBG(bi->bi_pt_window);
924 	bi->bi_pte_to_pt_window =
925 	    (uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0);
926 	DBG(bi->bi_pte_to_pt_window);
927 
928 #if defined(__xpv)
929 	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
930 		/* If this is a domU we're done. */
931 		DBG_MSG("\nPage tables constructed\n");
932 		return;
933 	}
934 #endif /* __xpv */
935 
936 	/*
937 	 * We need 1:1 mappings for the lower 1M of memory to access
938 	 * BIOS tables used by a couple of drivers during boot.
939 	 *
940 	 * The following code works because our simple memory allocator
941 	 * only grows usage in an upwards direction.
942 	 *
943 	 * Note that by this point in boot some mappings for low memory
944 	 * may already exist because we've already accessed device in low
945 	 * memory.  (Specifically the video frame buffer and keyboard
946 	 * status ports.)  If we're booting on raw hardware then GRUB
947 	 * created these mappings for us.  If we're booting under a
948 	 * hypervisor then we went ahead and remapped these devices into
949 	 * memory allocated within dboot itself.
950 	 */
951 	if (map_debug)
952 		dboot_printf("1:1 map pa=0..1Meg\n");
953 	for (start = 0; start < 1024 * 1024; start += MMU_PAGESIZE) {
954 #if defined(__xpv)
955 		map_ma_at_va(start, start, 0);
956 #else /* __xpv */
957 		map_pa_at_va(start, start, 0);
958 #endif /* __xpv */
959 	}
960 
961 #if !defined(__xpv)
962 	/*
963 	 * Skip memory between 1M and _start, this acts as a reserve
964 	 * of memory usable for DMA.
965 	 */
966 	next_mapping = (uintptr_t)_start & MMU_PAGEMASK;
967 	for (i = 0; i < memlists_used; ++i) {
968 		start = memlists[i].addr;
969 		if (start < next_mapping)
970 			start = next_mapping;
971 
972 		end = start + memlists[i].size;
973 
974 		if (map_debug)
975 			dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n",
976 			    start, end);
977 		while (start < end && start < next_avail_addr) {
978 			map_pa_at_va(start, start, 0);
979 			start += MMU_PAGESIZE;
980 		}
981 	}
982 #endif /* !__xpv */
983 
984 	DBG_MSG("\nPage tables constructed\n");
985 }
986 
987 #define	NO_MULTIBOOT	\
988 "multiboot is no longer used to boot the Solaris Operating System.\n\
989 The grub entry should be changed to:\n\
990 kernel$ /platform/i86pc/kernel/$ISADIR/unix\n\
991 module$ /platform/i86pc/$ISADIR/boot_archive\n\
992 See http://www.sun.com/msg/SUNOS-8000-AK for details.\n"
993 
994 /*
995  * startup_kernel has a pretty simple job. It builds pagetables which reflect
996  * 1:1 mappings for all memory in use. It then also adds mappings for
997  * the kernel nucleus at virtual address of target_kernel_text using large page
998  * mappings. The page table pages are also accessible at 1:1 mapped
999  * virtual addresses.
1000  */
1001 /*ARGSUSED*/
1002 void
1003 startup_kernel(void)
1004 {
1005 	char *cmdline;
1006 	uintptr_t addr;
1007 #if defined(__xpv)
1008 	physdev_set_iopl_t set_iopl;
1009 #endif /* __xpv */
1010 
1011 	/*
1012 	 * At this point we are executing in a 32 bit real mode.
1013 	 */
1014 #if defined(__xpv)
1015 	cmdline = (char *)xen_info->cmd_line;
1016 #else /* __xpv */
1017 	cmdline = (char *)mb_info->cmdline;
1018 #endif /* __xpv */
1019 
1020 	prom_debug = (strstr(cmdline, "prom_debug") != NULL);
1021 	map_debug = (strstr(cmdline, "map_debug") != NULL);
1022 
1023 #if defined(__xpv)
1024 	/*
1025 	 * For dom0, before we initialize the console subsystem we'll
	 * need to enable io operations, so set I/O privilege level to 1.
1027 	 */
1028 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
1029 		set_iopl.iopl = 1;
1030 		(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
1031 	}
1032 #endif /* __xpv */
1033 
1034 	bcons_init(cmdline);
1035 	DBG_MSG("\n\nSolaris prekernel set: ");
1036 	DBG_MSG(cmdline);
1037 	DBG_MSG("\n");
1038 
1039 	if (strstr(cmdline, "multiboot") != NULL) {
1040 		dboot_panic(NO_MULTIBOOT);
1041 	}
1042 
1043 	/*
1044 	 * boot info must be 16 byte aligned for 64 bit kernel ABI
1045 	 */
1046 	addr = (uintptr_t)boot_info;
1047 	addr = (addr + 0xf) & ~0xf;
1048 	bi = (struct xboot_info *)addr;
1049 	DBG((uintptr_t)bi);
1050 	bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline;
1051 
1052 	/*
1053 	 * Need correct target_kernel_text value
1054 	 */
1055 #if defined(_BOOT_TARGET_amd64)
1056 	target_kernel_text = KERNEL_TEXT_amd64;
1057 #elif defined(__xpv)
1058 	target_kernel_text = KERNEL_TEXT_i386_xpv;
1059 #else
1060 	target_kernel_text = KERNEL_TEXT_i386;
1061 #endif
1062 	DBG(target_kernel_text);
1063 
1064 #if defined(__xpv)
1065 
1066 	/*
1067 	 * XXPV	Derive this stuff from CPUID / what the hypervisor has enabled
1068 	 */
1069 
1070 #if defined(_BOOT_TARGET_amd64)
1071 	/*
1072 	 * 64-bit hypervisor.
1073 	 */
1074 	amd64_support = 1;
1075 	pae_support = 1;
1076 
1077 #else	/* _BOOT_TARGET_amd64 */
1078 
1079 	/*
1080 	 * See if we are running on a PAE Hypervisor
1081 	 */
1082 	{
1083 		xen_capabilities_info_t caps;
1084 
1085 		if (HYPERVISOR_xen_version(XENVER_capabilities, &caps) != 0)
1086 			dboot_panic("HYPERVISOR_xen_version(caps) failed");
1087 		caps[sizeof (caps) - 1] = 0;
1088 		if (prom_debug)
1089 			dboot_printf("xen capabilities %s\n", caps);
1090 		if (strstr(caps, "x86_32p") != NULL)
1091 			pae_support = 1;
1092 	}
1093 
1094 #endif	/* _BOOT_TARGET_amd64 */
1095 	{
1096 		xen_platform_parameters_t p;
1097 
1098 		if (HYPERVISOR_xen_version(XENVER_platform_parameters, &p) != 0)
1099 			dboot_panic("HYPERVISOR_xen_version(parms) failed");
1100 		DBG(p.virt_start);
1101 		mfn_to_pfn_mapping = (pfn_t *)(xen_virt_start = p.virt_start);
1102 	}
1103 
1104 	/*
1105 	 * The hypervisor loads stuff starting at 1Gig
1106 	 */
1107 	mfn_base = ONE_GIG;
1108 	DBG(mfn_base);
1109 
1110 	/*
1111 	 * enable writable page table mode for the hypervisor
1112 	 */
1113 	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
1114 	    VMASST_TYPE_writable_pagetables) < 0)
1115 		dboot_panic("HYPERVISOR_vm_assist(writable_pagetables) failed");
1116 
1117 	/*
1118 	 * check for NX support
1119 	 */
1120 	if (pae_support) {
1121 		uint32_t eax = 0x80000000;
1122 		uint32_t edx = get_cpuid_edx(&eax);
1123 
1124 		if (eax >= 0x80000001) {
1125 			eax = 0x80000001;
1126 			edx = get_cpuid_edx(&eax);
1127 			if (edx & CPUID_AMD_EDX_NX)
1128 				NX_support = 1;
1129 		}
1130 	}
1131 
1132 #if !defined(_BOOT_TARGET_amd64)
1133 
1134 	/*
1135 	 * The 32-bit hypervisor uses segmentation to protect itself from
1136 	 * guests. This means when a guest attempts to install a flat 4GB
1137 	 * code or data descriptor the 32-bit hypervisor will protect itself
1138 	 * by silently shrinking the segment such that if the guest attempts
1139 	 * any access where the hypervisor lives a #gp fault is generated.
1140 	 * The problem is that some applications expect a full 4GB flat
1141 	 * segment for their current thread pointer and will use negative
1142 	 * offset segment wrap around to access data. TLS support in linux
1143 	 * brand is one example of this.
1144 	 *
1145 	 * The 32-bit hypervisor can catch the #gp fault in these cases
1146 	 * and emulate the access without passing the #gp fault to the guest
1147 	 * but only if VMASST_TYPE_4gb_segments is explicitly turned on.
1148 	 * Seems like this should have been the default.
1149 	 * Either way, we want the hypervisor -- and not Solaris -- to deal
1150 	 * with emulating these accesses.
1151 	 */
1152 	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
1153 	    VMASST_TYPE_4gb_segments) < 0)
1154 		dboot_panic("HYPERVISOR_vm_assist(4gb_segments) failed");
1155 #endif	/* !_BOOT_TARGET_amd64 */
1156 
1157 #else	/* __xpv */
1158 
1159 	/*
1160 	 * use cpuid to enable MMU features
1161 	 */
1162 	if (have_cpuid()) {
1163 		uint32_t eax, edx;
1164 
1165 		eax = 1;
1166 		edx = get_cpuid_edx(&eax);
1167 		if (edx & CPUID_INTC_EDX_PSE)
1168 			largepage_support = 1;
1169 		if (edx & CPUID_INTC_EDX_PGE)
1170 			pge_support = 1;
1171 		if (edx & CPUID_INTC_EDX_PAE)
1172 			pae_support = 1;
1173 
1174 		eax = 0x80000000;
1175 		edx = get_cpuid_edx(&eax);
1176 		if (eax >= 0x80000001) {
1177 			eax = 0x80000001;
1178 			edx = get_cpuid_edx(&eax);
1179 			if (edx & CPUID_AMD_EDX_LM)
1180 				amd64_support = 1;
1181 			if (edx & CPUID_AMD_EDX_NX)
1182 				NX_support = 1;
1183 		}
1184 	} else {
1185 		dboot_printf("cpuid not supported\n");
1186 	}
1187 #endif /* __xpv */
1188 
1189 
1190 #if defined(_BOOT_TARGET_amd64)
1191 	if (amd64_support == 0)
1192 		dboot_panic("long mode not supported, rebooting");
1193 	else if (pae_support == 0)
1194 		dboot_panic("long mode, but no PAE; rebooting");
1195 #else
1196 	/*
1197 	 * Allow the command line to over-ride use of PAE for 32 bit.
1198 	 */
1199 	if (strstr(cmdline, "disablePAE=true") != NULL) {
1200 		pae_support = 0;
1201 		NX_support = 0;
1202 		amd64_support = 0;
1203 	}
1204 #endif
1205 
1206 	/*
1207 	 * initialize the simple memory allocator
1208 	 */
1209 	init_mem_alloc();
1210 
1211 #if !defined(__xpv) && !defined(_BOOT_TARGET_amd64)
1212 	/*
1213 	 * disable PAE on 32 bit h/w w/o NX and < 4Gig of memory
1214 	 */
1215 	if (max_mem < FOUR_GIG && NX_support == 0)
1216 		pae_support = 0;
1217 #endif
1218 
1219 	/*
1220 	 * configure mmu information
1221 	 */
1222 	if (pae_support) {
1223 		shift_amt = shift_amt_pae;
1224 		ptes_per_table = 512;
1225 		pte_size = 8;
1226 		lpagesize = TWO_MEG;
1227 #if defined(_BOOT_TARGET_amd64)
1228 		top_level = 3;
1229 #else
1230 		top_level = 2;
1231 #endif
1232 	} else {
1233 		pae_support = 0;
1234 		NX_support = 0;
1235 		shift_amt = shift_amt_nopae;
1236 		ptes_per_table = 1024;
1237 		pte_size = 4;
1238 		lpagesize = FOUR_MEG;
1239 		top_level = 1;
1240 	}
1241 
1242 	DBG(pge_support);
1243 	DBG(NX_support);
1244 	DBG(largepage_support);
1245 	DBG(amd64_support);
1246 	DBG(top_level);
1247 	DBG(pte_size);
1248 	DBG(ptes_per_table);
1249 	DBG(lpagesize);
1250 
1251 #if defined(__xpv)
1252 	ktext_phys = ONE_GIG;		/* from UNIX Mapfile */
1253 #else
1254 	ktext_phys = FOUR_MEG;		/* from UNIX Mapfile */
1255 #endif
1256 
1257 #if !defined(__xpv) && defined(_BOOT_TARGET_amd64)
1258 	/*
1259 	 * For grub, copy kernel bits from the ELF64 file to final place.
1260 	 */
1261 	DBG_MSG("\nAllocating nucleus pages.\n");
1262 	ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG);
1263 	if (ktext_phys == 0)
1264 		dboot_panic("failed to allocate aligned kernel memory");
1265 	if (dboot_elfload64(mb_header.load_addr) != 0)
1266 		dboot_panic("failed to parse kernel ELF image, rebooting");
1267 #endif
1268 
1269 	DBG(ktext_phys);
1270 
1271 	/*
1272 	 * Allocate page tables.
1273 	 */
1274 	build_page_tables();
1275 
1276 	/*
1277 	 * return to assembly code to switch to running kernel
1278 	 */
1279 	entry_addr_low = (uint32_t)target_kernel_text;
1280 	DBG(entry_addr_low);
1281 	bi->bi_use_largepage = largepage_support;
1282 	bi->bi_use_pae = pae_support;
1283 	bi->bi_use_pge = pge_support;
1284 	bi->bi_use_nx = NX_support;
1285 
1286 #if defined(__xpv)
1287 
1288 	bi->bi_next_paddr = next_avail_addr - mfn_base;
1289 	DBG(bi->bi_next_paddr);
1290 	bi->bi_next_vaddr = (native_ptr_t)next_avail_addr;
1291 	DBG(bi->bi_next_vaddr);
1292 
1293 	/*
1294 	 * unmap unused pages in start area to make them available for DMA
1295 	 */
1296 	while (next_avail_addr < scratch_end) {
1297 		(void) HYPERVISOR_update_va_mapping(next_avail_addr,
1298 		    0, UVMF_INVLPG | UVMF_LOCAL);
1299 		next_avail_addr += MMU_PAGESIZE;
1300 	}
1301 
1302 	bi->bi_xen_start_info = (uintptr_t)xen_info;
1303 	DBG((uintptr_t)HYPERVISOR_shared_info);
1304 	bi->bi_shared_info = (native_ptr_t)HYPERVISOR_shared_info;
1305 	bi->bi_top_page_table = (uintptr_t)top_page_table - mfn_base;
1306 
1307 #else /* __xpv */
1308 
1309 	bi->bi_next_paddr = next_avail_addr;
1310 	DBG(bi->bi_next_paddr);
1311 	bi->bi_next_vaddr = (uintptr_t)next_avail_addr;
1312 	DBG(bi->bi_next_vaddr);
1313 	bi->bi_mb_info = (uintptr_t)mb_info;
1314 	bi->bi_top_page_table = (uintptr_t)top_page_table;
1315 
1316 #endif /* __xpv */
1317 
1318 	bi->bi_kseg_size = FOUR_MEG;
1319 	DBG(bi->bi_kseg_size);
1320 
1321 	DBG_MSG("\n\n*** DBOOT DONE -- back to asm to jump to kernel\n\n");
1322 }
1323